The stock Dirbot example ships with a word-filtering pipeline. It turns out it can be adapted to filter Cyrillic, so it was not for nothing that I earlier collected all my exercises in the post "UTF-8 encoding, or how to make friends with the str and unicode objects of Python 2.x".
The day before, I had read two sections of the Scrapy documentation, Item Pipelines and Item Loaders. Below is the simplest possible hands-on exercise.
I decided to add a pipeline to the mail_csv_2.py spider in order to filter out the first two rows of the downloaded CSV file.
In [1]:
%load "C:\\Users\\kiss\\Documents\\GitMyScrapy\\scrapy_csv_2\\scrapy_csv_2\\spiders\\mail_csv_2.py"
In []:
from scrapy.contrib.spiders import CSVFeedSpider
from scrapy_csv_2.items import ScrapyCsv1Item
from scrapy import log

class MailCsvSpider(CSVFeedSpider):
    name = 'mail_csv_2'
    #allowed_domains = ['file://C:/Users/kiss/Documents/GitHub_2/scrapy_csv_2/']
    #start_urls = ['nissan_9_1_00.csv']
    headers = ['N', 'N100', 'purl']
    delimiter = ';'
    start_urls = ['file://C:/Users/kiss/Documents/GitMyScrapy/scrapy_csv_2/nissan_2.csv']

    # Do any adaptations you need here
    #def adapt_response(self, response):
    #    return response

    def parse_row(self, response, row):
        i = ScrapyCsv1Item()
        i['N'] = row['N']
        i['N100'] = row['N100']
        i['purl'] = row['purl']
        log.msg('Hi, this is a row!: %r' % row)
        return i
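The spider imports ScrapyCsv1Item from scrapy_csv_2.items. That file is not shown in this post, but judging by the three fields the spider fills, it presumably looks like this sketch:
In []:
from scrapy.item import Item, Field

class ScrapyCsv1Item(Item):
    # one Field per CSV column
    N = Field()
    N100 = Field()
    purl = Field()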
In [2]:
%load "C:\\Users\\kiss\\Documents\\GitMyScrapy\\scrapy_csv_2\\scrapy_csv_2\\pipelines.py"
In []:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem

class ScrapyCsv1Pipeline(object):
    """A pipeline for filtering out items which contain certain words in their
    description"""

    # put all words in lowercase
    words_to_filter = ['Авто@Mail.Ru', 'Просмотры']

    def process_item(self, item, spider):
        for word in self.words_to_filter:
            if word in unicode(item['N']):
                raise DropItem("Contains forbidden word: %s" % word)
        else:
            # for/else: the item is returned only after the loop has
            # checked every filter word without raising DropItem
            return item
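The log below shows "Enabled item pipelines: ScrapyCsv1Pipeline", which means the pipeline is registered in settings.py. For reference, in Scrapy 0.20 that setting looks roughly like this (the module path here is inferred from the project layout):
In []:
# settings.py (a sketch)
ITEM_PIPELINES = {
    'scrapy_csv_2.pipelines.ScrapyCsv1Pipeline': 300,  # order: 0-1000, lower runs first
}
Note also that return item sits in the for loop's else clause, so an item is passed through only after every filter word has been checked.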
After adding "# -*- coding: utf-8 -*-":
In []:
C:\Users\kiss\Documents\GitMyScrapy\scrapy_csv_2>scrapy crawl mail_csv_2_1 -o items_2_2.csv -t csv
2014-07-25 17:56:59+0400 [scrapy] INFO: Scrapy 0.20.1 started (bot: scrapy_csv_2)
2014-07-25 17:56:59+0400 [scrapy] DEBUG: Optional features available: ssl, http11, boto, django
2014-07-25 17:56:59+0400 [scrapy] DEBUG: Overridden settings: {'NEWSPIDER_MODULE': 'scrapy_csv_2.spiders', 'FEED_FORMAT': 'csv', 'SPIDER_MODULES': ['scrapy_csv_2.spiders'], 'FEED_URI': 'items_2_2.csv', 'BOT_NAME': 'scrapy_csv_2'}
2014-07-25 17:57:00+0400 [scrapy] DEBUG: Enabled extensions: FeedExporter, LogStats, TelnetConsole, CloseSpider, WebService, CoreStats, SpiderState
2014-07-25 17:57:01+0400 [scrapy] DEBUG: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2014-07-25 17:57:01+0400 [scrapy] DEBUG: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2014-07-25 17:57:01+0400 [scrapy] DEBUG: Enabled item pipelines: ScrapyCsv1Pipeline
2014-07-25 17:57:01+0400 [mail_csv_2_1] INFO: Spider opened
2014-07-25 17:57:01+0400 [mail_csv_2_1] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2014-07-25 17:57:01+0400 [scrapy] DEBUG: Telnet console listening on 0.0.0.0:6023
2014-07-25 17:57:01+0400 [scrapy] DEBUG: Web service listening on 0.0.0.0:6080
2014-07-25 17:57:01+0400 [mail_csv_2_1] DEBUG: Crawled (200) <GET file://C:/Users/kiss/Documents/GitMyScrapy/scrapy_csv_2/nissan_2.csv> (referer: None)
2014-07-25 17:57:01+0400 [scrapy] INFO: Hi, this is a row!: {'N100': u'85837', 'purl': u'http://auto.mail.ru', 'N': u'\u0410\u0432\u0442\u043e@Mail.Ru'}
2014-07-25 17:57:01+0400 [mail_csv_2_1] ERROR: Error processing {'N': u'\u0410\u0432\u0442\u043e@Mail.Ru',
'N100': u'85837',
'purl': u'http://auto.mail.ru'}
Traceback (most recent call last):
File "C:\Users\kiss\Anaconda\lib\site-packages\scrapy\middleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "C:\Users\kiss\Anaconda\lib\site-packages\scrapy\utils\defer.py", line 65, in process_chain
d.callback(input)
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 382, in callback
self._startRunCallbacks(result)
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 490, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 577, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "scrapy_csv_2\pipelines.py", line 16, in process_item
if word in unicode(item['N']):
exceptions.UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
2014-07-25 17:57:01+0400 [scrapy] WARNING: ignoring row 2 (length: 0, should be: 3)
2014-07-25 17:57:01+0400 [scrapy] INFO: Hi, this is a row!: {'N100': u'%', 'purl': u'\u0421\u0442\u0440\u0430\u043d\u0438\u0446\u044b', 'N': u'\u041f\u0440\u043e\u0441\u043c\u043e\u0442\u0440\u044b'}
2014-07-25 17:57:01+0400 [mail_csv_2_1] ERROR: Error processing {'N': u'\u041f\u0440\u043e\u0441\u043c\u043e\u0442\u0440\u044b',
'N100': u'%',
'purl': u'\u0421\u0442\u0440\u0430\u043d\u0438\u0446\u044b'}
Traceback (most recent call last):
File "C:\Users\kiss\Anaconda\lib\site-packages\scrapy\middleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "C:\Users\kiss\Anaconda\lib\site-packages\scrapy\utils\defer.py", line 65, in process_chain
d.callback(input)
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 382, in callback
self._startRunCallbacks(result)
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 490, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 577, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "scrapy_csv_2\pipelines.py", line 16, in process_item
if word in unicode(item['N']):
exceptions.UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
2014-07-25 17:57:01+0400 [scrapy] INFO: Hi, this is a row!: {'N100': u'39,46', 'purl': u'http://auto.mail.ru/catalogue/nissan/', 'N': u'7371'}
2014-07-25 17:57:01+0400 [mail_csv_2_1] ERROR: Error processing {'N': u'7371',
'N100': u'39,46',
'purl': u'http://auto.mail.ru/catalogue/nissan/'}
Traceback (most recent call last):
File "C:\Users\kiss\Anaconda\lib\site-packages\scrapy\middleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "C:\Users\kiss\Anaconda\lib\site-packages\scrapy\utils\defer.py", line 65, in process_chain
d.callback(input)
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 382, in callback
self._startRunCallbacks(result)
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 490, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 577, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "scrapy_csv_2\pipelines.py", line 16, in process_item
if word in unicode(item['N']):
exceptions.UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
2014-07-25 17:57:01+0400 [scrapy] INFO: Hi, this is a row!: {'N100': u'7,58', 'purl': u'http://auto.mail.ru/catalogue/nissan/qashqai/', 'N': u'1416'}
2014-07-25 17:57:01+0400 [mail_csv_2_1] ERROR: Error processing {'N': u'1416',
'N100': u'7,58',
'purl': u'http://auto.mail.ru/catalogue/nissan/qashqai/'}
Traceback (most recent call last):
File "C:\Users\kiss\Anaconda\lib\site-packages\scrapy\middleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "C:\Users\kiss\Anaconda\lib\site-packages\scrapy\utils\defer.py", line 65, in process_chain
d.callback(input)
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 382, in callback
self._startRunCallbacks(result)
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 490, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 577, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "scrapy_csv_2\pipelines.py", line 16, in process_item
if word in unicode(item['N']):
exceptions.UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
2014-07-25 17:57:01+0400 [scrapy] INFO: Hi, this is a row!: {'N100': u'6,31', 'purl': u'http://auto.mail.ru/catalogue/nissan/x-trail/', 'N': u'1179'}
2014-07-25 17:57:01+0400 [mail_csv_2_1] ERROR: Error processing {'N': u'1179',
'N100': u'6,31',
'purl': u'http://auto.mail.ru/catalogue/nissan/x-trail/'}
Traceback (most recent call last):
File "C:\Users\kiss\Anaconda\lib\site-packages\scrapy\middleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "C:\Users\kiss\Anaconda\lib\site-packages\scrapy\utils\defer.py", line 65, in process_chain
d.callback(input)
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 382, in callback
self._startRunCallbacks(result)
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 490, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "C:\Users\kiss\Anaconda\lib\site-packages\twisted\internet\defer.py", line 577, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "scrapy_csv_2\pipelines.py", line 16, in process_item
if word in unicode(item['N']):
exceptions.UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
2014-07-25 17:57:01+0400 [mail_csv_2_1] INFO: Closing spider (finished)
2014-07-25 17:57:01+0400 [mail_csv_2_1] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 265,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 294,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2014, 7, 25, 13, 57, 1, 451000),
'log_count/DEBUG': 7,
'log_count/ERROR': 5,
'log_count/INFO': 8,
'log_count/WARNING': 1,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start_time': datetime.datetime(2014, 7, 25, 13, 57, 1, 191000)}
2014-07-25 17:57:01+0400 [mail_csv_2_1] INFO: Spider closed (finished)
In []:
"""
Как и ожидалось, дело в том, что кодировку вот этих прекраных строк, Scrapy не хочет понимать
File "scrapy_csv_2\pipelines.py", line 16, in process_item
if word in unicode(item['N']):
exceptions.UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
"""
Вот он какой и откуда... этот байт "0xd0"
In [3]:
s='Авто@Mail.Ru'
s
Out[3]:
'\xd0\x90\xd0\xb2\xd1\x82\xd0\xbe@Mail.Ru'
Let's reproduce the exceptions.UnicodeDecodeError:
In [6]:
unicode(s)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 0: ordinal not in range(128)
And what was this byte string compared against? The unicode value u'\u0410\u0432\u0442\u043e@Mail.Ru' coming from the first item:
In []:
[scrapy] INFO: Hi, this is a row!: {'N100': u'85837', 'purl': u'http://auto.mail.ru', 'N': u'\u0410\u0432\u0442\u043e@Mail.Ru'}
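The whole mechanism in one minimal Python 2 snippet (a sketch; the variable names are mine):
In []:
# -*- coding: utf-8 -*-
needle = 'Авто@Mail.Ru'                          # str: UTF-8 bytes, first byte 0xd0
haystack = u'\u0410\u0432\u0442\u043e@Mail.Ru'   # unicode, as stored in the item

try:
    needle in haystack       # Python 2 tries to ascii-decode the str needle
except UnicodeDecodeError as e:
    print e                  # 'ascii' codec can't decode byte 0xd0 in position 0 ...

print needle.decode('utf-8') in haystack         # True: unicode vs unicode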
In [5]:
# As a reminder, this is the file that was being parsed
%load "C:/Users/kiss/Documents/GitMyScrapy/scrapy_csv_2/nissan_2.csv"
In []:
"Авто@Mail.Ru";85837;"http://auto.mail.ru"
"Просмотры";"%";"Страницы"
"7371";"39,46";"http://auto.mail.ru/catalogue/nissan/"
"1416";"7,58";"http://auto.mail.ru/catalogue/nissan/qashqai/"
"1179";"6,31";"http://auto.mail.ru/catalogue/nissan/x-trail/"
The simplest fix is to convert our byte strings to Unicode right in the code (the link to my post with the details is at the top). First let's check that a Unicode string really can be made from ours; here is the command:
In [7]:
ss=unicode(s,'utf-8')
ss
Out[7]:
u'\u0410\u0432\u0442\u043e@Mail.Ru'
Now replace one line with words_to_filter = [unicode('Авто@Mail.Ru','utf-8'), unicode('Просмотры','utf-8')]
In [8]:
# The fixed version (it still complains, but it does remove the two extra rows)
%load "C:\\Users\\kiss\\Documents\\GitMyScrapy\\scrapy_csv_2\\scrapy_csv_2\\pipelines.py"
In []:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem

class ScrapyCsv1Pipeline(object):
    """A pipeline for filtering out items which contain certain words in their
    description"""

    # the filter words are now unicode, so the `in` test compares
    # unicode with unicode and no implicit ascii decode happens
    words_to_filter = [unicode('Авто@Mail.Ru', 'utf-8'), unicode('Просмотры', 'utf-8')]

    def process_item(self, item, spider):
        for word in self.words_to_filter:
            if word in item['N']:
                raise DropItem("Contains forbidden word: %s" % word)
        else:
            return item
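Since the file now starts with the # -*- coding: utf-8 -*- declaration anyway, an equivalent and slightly tidier variant (a sketch) is to write the filter words as unicode literals; u'...' literals are decoded from UTF-8 at compile time, so the explicit unicode(..., 'utf-8') calls become unnecessary:
In []:
words_to_filter = [u'Авто@Mail.Ru', u'Просмотры']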
In []:
# This is how the spider worked without the pipeline (the rows it wrote to the output file)
purl,N100,N
http://auto.mail.ru,85837,Авто@Mail.Ru
Страницы,%,Просмотры
http://auto.mail.ru/catalogue/nissan/,"39,46",7371
http://auto.mail.ru/catalogue/nissan/qashqai/,"7,58",1416
http://auto.mail.ru/catalogue/nissan/x-trail/,"6,31",1179
In []:
# Now the first two rows are filtered out
purl,N100,N
http://auto.mail.ru/catalogue/nissan/,"39,46",7371
http://auto.mail.ru/catalogue/nissan/qashqai/,"7,58",1416
http://auto.mail.ru/catalogue/nissan/x-trail/,"6,31",1179