Поиск по блогу

среда, 14 января 2015 г.

Свалка удачных маленьких экспериментов, результаты которых я собираюсь часто использовать

Здесь я хотел подготовить код для краулера bmw_mail, не удержался и опробовал сразу несколько разных идей: import HTML, import HTML, ''.join(tree.xpath('//span[@class="rank"]/i/text()')), codecs, lxml, requests, .replace('\n','').encode('utf8'), print ''.join(this_link).encode('utf8')
Получилось всё, но этот пост не тянет даже на черновик. Здесь просто сумбур из фрагментов кода, но зато я здесь изобрёл сразу несколько велосипедов.

In [2]:
from IPython.display import Image, HTML
Image ('C:\\Users\\kiss\\Pictures\\pythonR\\bmw_mail.png')
Out[2]:
In []:
//div[@class="catalog-age__mod__item__box"]

//div[@class="catalog-age__mod__item__note"] 
#- Автомат,  задний привод,  бензин, 9.2 с до 100 км/ч  

//div[@class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp"]
#136 л.с.

//a[@class="catalog-age__mod__item__type__link"]/text()
//a[@class="catalog-age__mod__item__type__link"]/@href

//a[@class="catalog-age__mod__item__equip clear"]
In [4]:
from IPython.display import  HTML
In []:
/html/body/div[2]/div/div[2]/div[1]/div/div/div[2]/div/div[2]/div[2]/div/div[1]/div/div[1]/div[2]/div[7]/div/div[1]/div[2]
In [6]:
<div class="catalog-age__mod__item__box">
 <div class="catalog-age__mod__item__type clear">
 <div class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_title">
 <a class="catalog-age__mod__item__type__link" href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/specifications/?gear_type=204&amp;fuel=222&amp;modification_id=22938">316i AT</a>
 </div>
 <div class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp">136 л.с.</div>
 </div>

 <div class="catalog-age__mod__item__note">
 Автомат,
 задний&nbsp;привод,
 бензин, 9.2&nbsp;с&nbsp;до&nbsp;100&nbsp;км/ч
 </div>

 <a class="catalog-age__mod__item__equip clear" href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/32100/">
     <span>Special Edition SKD</span>

     <div class="catalog-age__mod__item__equip__price">
         <span class="catalog-age__mod__item__equip__price__value">
             <span class="rank"><i>1</i><i>398</i><i>000</i></span>
         </span>
     <span class="catalog-age__mod__item__equip__price__note">руб.</span>
     </div>
 </a>
 
 
 </div>
  File "<ipython-input-6-00b8b512da05>", line 1
    <div class="catalog-age__mod__item__box">
    ^
SyntaxError: invalid syntax
In [7]:
s=import  HTML
In [12]:
s.data
Out[12]:
u'<div class="catalog-age__mod__item__box">\n <div class="catalog-age__mod__item__type clear">\n <div class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_title">\n <a class="catalog-age__mod__item__type__link" href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/specifications/?gear_type=204&amp;fuel=222&amp;modification_id=22938">316i AT</a>\n </div>\n <div class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp">136 \u043b.\u0441.</div>\n </div>\n\n <div class="catalog-age__mod__item__note">\n \u0410\u0432\u0442\u043e\u043c\u0430\u0442,\n \u0437\u0430\u0434\u043d\u0438\u0439&nbsp;\u043f\u0440\u0438\u0432\u043e\u0434,\n \u0431\u0435\u043d\u0437\u0438\u043d, 9.2&nbsp;\u0441&nbsp;\u0434\u043e&nbsp;100&nbsp;\u043a\u043c/\u0447\n </div>\n\n <a class="catalog-age__mod__item__equip clear" href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/32100/">\n <span>Special Edition SKD</span>\n\n <div class="catalog-age__mod__item__equip__price">\n <span class="catalog-age__mod__item__equip__price__value">\n <span class="rank"><i>1</i><i>398</i><i>000</i></span>\n </span>\n <span class="catalog-age__mod__item__equip__price__note">\u0440\u0443\u0431.</span>\n </div>\n </a>\n \n \n </div>'
In [8]:
from lxml import html
import requests
In [13]:
tree=html.fromstring(s.data)
In [22]:
tree.
Out[22]:
<Element body at 0x43896d8>
In [14]:
#This will create a "Автомат,  задний привод,  бензин, 9.2 с до 100 км/ч" 
description = tree.xpath('//div[@class="catalog-age__mod__item__note"] /text()')
#This will create a list 136 л.с.
power = tree.xpath('//div[@class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp"]/text()')
In [17]:
print description, power
[u'\n \u0410\u0432\u0442\u043e\u043c\u0430\u0442,\n \u0437\u0430\u0434\u043d\u0438\u0439\xa0\u043f\u0440\u0438\u0432\u043e\u0434,\n \u0431\u0435\u043d\u0437\u0438\u043d, 9.2\xa0\u0441\xa0\u0434\u043e\xa0100\xa0\u043a\u043c/\u0447\n '] [u'136 \u043b.\u0441.']

А здесь меня заколдобило от этих юникодов, бросил все и нашел ''.join(description).encode('utf8')

In [39]:
# -*- coding: utf-8 -*-
# coding: utf8
''.join(power).encode('utf8')# Кодируем Unicode-строку в байты UTF-8 (кириллица отображается как шестнадцатеричные \x-последовательности)
Out[39]:
'136 \xd0\xbb.\xd1\x81.'
In [40]:
print '136 \xd0\xbb.\xd1\x81.'
136 л.с.

In [47]:
print ''.join(description).encode('utf8')
 Автомат,
 задний привод,
 бензин, 9.2 с до 100 км/ч
 

In [50]:
# Или берём просто первый элемент списка
# и попутно убираем разбивку на строки
print description[0].replace('\n','').encode('utf8')
 Автомат, задний привод, бензин, 9.2 с до 100 км/ч 

In []:
<div class="catalog-age__mod__item__box">
 <div class="catalog-age__mod__item__type clear">
 <div class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_title">
 <a href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/specifications/?gear_type=204&amp;fuel=59&amp;modification_id=22939" class="catalog-age__mod__item__type__link">320d 184hp xDrive AT</a>
 </div>
 <div class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp">184 л.с.</div>
 </div>

 <div class="catalog-age__mod__item__note">
 Автомат,
 полный&nbsp;привод,
 дизель, 7.4&nbsp;с&nbsp;до&nbsp;100&nbsp;км/ч
 </div>

 <a href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/30440/" class="catalog-age__mod__item__equip clear">
 <span>Sport Line SKD</span>

 <div class="catalog-age__mod__item__equip__price">
 <span class="catalog-age__mod__item__equip__price__value">
 <span class="rank"><i>1</i><i>752</i><i>000</i></span>
 </span>
 <span class="catalog-age__mod__item__equip__price__note">руб.</span>
 </div>
 </a>
 
 <a href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/35796/" class="catalog-age__mod__item__equip clear">
 <span>Modern Line SKD</span>

 <div class="catalog-age__mod__item__equip__price">
 <span class="catalog-age__mod__item__equip__price__value">
 <span class="rank"><i>1</i><i>883</i><i>000</i></span>
 </span>
 <span class="catalog-age__mod__item__equip__price__note">руб.</span>
 </div>
 </a>
 
 <a href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/30439/" class="catalog-age__mod__item__equip clear">
 <span>Luxury Line SKD</span>

 <div class="catalog-age__mod__item__equip__price">
 <span class="catalog-age__mod__item__equip__price__value">
 <span class="rank"><i>1</i><i>950</i><i>000</i></span>
 </span>
 <span class="catalog-age__mod__item__equip__price__note">руб.</span>
 </div>
 </a>
 
 
 </div>
In [51]:
#This will create a "Автомат,  задний привод,  бензин, 9.2 с до 100 км/ч" 
description = tree.xpath('//div[@class="catalog-age__mod__item__note"] /text()')
#This will create a list 136 л.с.
power = tree.xpath('//div[@class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp"]/text()')

#This will parse link to specifications folder and text in link
#"https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/specifications/?gear_type=204&amp;fuel=222&amp;modification_id=22938"
# "316i AT"
specification_link = tree.xpath('//a[@class="catalog-age__mod__item__type__link"]/@href')
specification_text = tree.xpath('//a[@class="catalog-age__mod__item__type__link"]/text()')

#
this_link = tree.xpath('//a[@class="catalog-age__mod__item__equip clear"]/@href')
this_text = tree.xpath('//a[@class="catalog-age__mod__item__equip clear"]/text()')
#Into link this tag <a
#<span>Special Edition SKD</span>
in_this_name=tree.xpath('//a[@class="catalog-age__mod__item__equip clear"]/span[0]/text()')
#<span class="rank"><i>1</i><i>398</i><i>000</i></span>
in_this_price=tree.xpath('//span[@class="rank"]/text()')
#

#This will create a list 136 л.с.
power = tree.xpath('//div[@class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp"]/text()')
In []:
mod__item__box = response.xpath('//div[@class="catalog-age__mod__item__box"]')
    
        items = []
    
        for sel in mod__item__box:
   i = CarmailpriceItem()
            #This will create a "Автомат,  задний привод,  бензин, 9.2 с до 100 км/ч" 
            i['description'] = sel.xpath('.//div[@class="catalog-age__mod__item__note"] /text()').extract()
            #This will create a list 136 л.с.
            i['power'] = sel.xpath('.//div[@class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp"]/text()').extract()
            
            #This will parse link to specifications folder and text in link
            #"https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/specifications/?gear_type=204&amp;fuel=222&amp;modification_id=22938"
            # "316i AT"
            i['specification_link'] = tree.xpath('//a[@class="catalog-age__mod__item__type__link"]/@href').extract()
            i['specification_text'] = tree.xpath('//a[@class="catalog-age__mod__item__type__link"]/text()').extract()
            
            #<span>Special Edition SKD</span>
            i['in_this_name'] = tree.xpath('//a[@class="catalog-age__mod__item__equip clear"]/span[0]/text()').extract()
            #<span class="rank"><i>1</i><i>398</i><i>000</i></span>
            i['in_this_price'] = tree.xpath('//span[@class="rank"]/text()').extract()

            links = sel.xpath('//div[@class="catalog-age__mod__item__equip clear"]')    
            
            
            item['name'] = sel.xpath('.//a[@class="catalog-generation__card__title"]/text()').extract()
   item['link'] = sel.xpath('.//a[@class="catalog-generation__card__title"]/@href').extract()
   item['price'] = sel.xpath('.//span[@class="rank"]/i/text()').extract()
   items.append(item)
        
        return items 
In [65]:
''.join(tree.xpath('//span[@class="rank"]/i/text()'))
Out[65]:
'1398000'
In []:
 
In [61]:
tree.xpath('//a[@class="catalog-age__mod__item__equip clear"]/span[1]/text()')
Out[61]:
['Special Edition SKD']
In [54]:
print ''.join(specification_link).encode('utf8'), ''.join(specification_text).encode('utf8')
https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/specifications/?gear_type=204&fuel=222&modification_id=22938 316i AT

In [56]:
print ''.join(this_link).encode('utf8'),  this_text
https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/32100/ ['\n ', '\n\n ', '\n ']

In [57]:
print ''.join(in_this_name).encode('utf8'), ''.join(in_this_price).encode('utf8')
 

In [58]:
in_this_name, in_this_price
Out[58]:
([], [])
In []:
 

Приложения

In [33]:
import codecs
In [42]:
help(string.join)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-42-6e696091e0d0> in <module>()
----> 1 help(string.join)

NameError: name 'string' is not defined
In [24]:
dir(tree) #tree=html.fromstring(s.data)
Out[24]:
['__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__getitem__',
 '__hash__',
 '__init__',
 '__iter__',
 '__len__',
 '__module__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_init',
 '_label__del',
 '_label__get',
 '_label__set',
 'addnext',
 'addprevious',
 'append',
 'attrib',
 'base',
 'base_url',
 'body',
 'clear',
 'cssselect',
 'drop_tag',
 'drop_tree',
 'extend',
 'find',
 'find_class',
 'find_rel_links',
 'findall',
 'findtext',
 'forms',
 'get',
 'get_element_by_id',
 'getchildren',
 'getiterator',
 'getnext',
 'getparent',
 'getprevious',
 'getroottree',
 'head',
 'index',
 'insert',
 'items',
 'iter',
 'iterancestors',
 'iterchildren',
 'iterdescendants',
 'iterfind',
 'iterlinks',
 'itersiblings',
 'itertext',
 'keys',
 'label',
 'make_links_absolute',
 'makeelement',
 'nsmap',
 'prefix',
 'remove',
 'replace',
 'resolve_base_href',
 'rewrite_links',
 'set',
 'sourceline',
 'tag',
 'tail',
 'text',
 'text_content',
 'values',
 'xpath']
In [34]:
dir(codecs)
Out[34]:
['BOM',
 'BOM32_BE',
 'BOM32_LE',
 'BOM64_BE',
 'BOM64_LE',
 'BOM_BE',
 'BOM_LE',
 'BOM_UTF16',
 'BOM_UTF16_BE',
 'BOM_UTF16_LE',
 'BOM_UTF32',
 'BOM_UTF32_BE',
 'BOM_UTF32_LE',
 'BOM_UTF8',
 'BufferedIncrementalDecoder',
 'BufferedIncrementalEncoder',
 'Codec',
 'CodecInfo',
 'EncodedFile',
 'IncrementalDecoder',
 'IncrementalEncoder',
 'StreamReader',
 'StreamReaderWriter',
 'StreamRecoder',
 'StreamWriter',
 '__all__',
 '__builtin__',
 '__builtins__',
 '__doc__',
 '__file__',
 '__name__',
 '__package__',
 '_false',
 'ascii_decode',
 'ascii_encode',
 'backslashreplace_errors',
 'charbuffer_encode',
 'charmap_build',
 'charmap_decode',
 'charmap_encode',
 'decode',
 'encode',
 'escape_decode',
 'escape_encode',
 'getdecoder',
 'getencoder',
 'getincrementaldecoder',
 'getincrementalencoder',
 'getreader',
 'getwriter',
 'ignore_errors',
 'iterdecode',
 'iterencode',
 'latin_1_decode',
 'latin_1_encode',
 'lookup',
 'lookup_error',
 'make_encoding_map',
 'make_identity_dict',
 'mbcs_decode',
 'mbcs_encode',
 'open',
 'raw_unicode_escape_decode',
 'raw_unicode_escape_encode',
 'readbuffer_encode',
 'register',
 'register_error',
 'replace_errors',
 'strict_errors',
 'sys',
 'unicode_escape_decode',
 'unicode_escape_encode',
 'unicode_internal_decode',
 'unicode_internal_encode',
 'utf_16_be_decode',
 'utf_16_be_encode',
 'utf_16_decode',
 'utf_16_encode',
 'utf_16_ex_decode',
 'utf_16_le_decode',
 'utf_16_le_encode',
 'utf_32_be_decode',
 'utf_32_be_encode',
 'utf_32_decode',
 'utf_32_encode',
 'utf_32_ex_decode',
 'utf_32_le_decode',
 'utf_32_le_encode',
 'utf_7_decode',
 'utf_7_encode',
 'utf_8_decode',
 'utf_8_encode',
 'xmlcharrefreplace_errors']
In [35]:
help(codecs)
Help on module codecs:

NAME
    codecs - codecs -- Python Codec Registry, API and helpers.

FILE
    c:\users\kiss\anaconda\lib\codecs.py

DESCRIPTION
    
    Written by Marc-Andre Lemburg (mal@lemburg.com).
    
    (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

FUNCTIONS
    EncodedFile(file, data_encoding, file_encoding=None, errors='strict')
        Return a wrapped version of file which provides transparent
        encoding translation.
        
        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.
        
        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.
        
        If file_encoding is not given, it defaults to data_encoding.
        
        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.
        
        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.
    
    ignore_errors(...)
        Implements the 'ignore' error handling, which ignores malformed data and continues.
    
    lookup(...)
        lookup(encoding) -> CodecInfo
        
        Looks up a codec tuple in the Python codec registry and returns
        a CodecInfo object.
    
    lookup_error(...)
        lookup_error(errors) -> handler
        
        Return the error handler for the specified error handling name
        or raise a LookupError, if no handler exists under this name.
    
    open(filename, mode='rb', encoding=None, errors='strict', buffering=1)
        Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.
        
        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.
        
        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.
        
        encoding specifies the encoding which is to be used for the
        file.
        
        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.
        
        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.
        
        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.
    
    register(...)
        register(search_function)
        
        Register a codec search function. Search functions are expected to take
        one argument, the encoding name in all lower case letters, and return
        a tuple of functions (encoder, decoder, stream_reader, stream_writer)
        (or a CodecInfo object).
    
    register_error(...)
        register_error(errors, handler)
        
        Register the specified error handler under the name
        errors. handler must be a callable object, that
        will be called with an exception instance containing
        information about the location of the encoding/decoding
        error and must return a (replacement, new position) tuple.
    
    replace_errors(...)
        Implements the 'replace' error handling, which replaces malformed data with a replacement marker.
    
    strict_errors(...)
        Implements the 'strict' error handling, which raises a UnicodeError on coding errors.
    
    xmlcharrefreplace_errors(...)
        Implements the 'xmlcharrefreplace' error handling, which replaces an unencodable character with the appropriate XML character reference.

DATA
    BOM = '\xff\xfe'
    BOM32_BE = '\xfe\xff'
    BOM32_LE = '\xff\xfe'
    BOM64_BE = '\x00\x00\xfe\xff'
    BOM64_LE = '\xff\xfe\x00\x00'
    BOM_BE = '\xfe\xff'
    BOM_LE = '\xff\xfe'
    BOM_UTF16 = '\xff\xfe'
    BOM_UTF16_BE = '\xfe\xff'
    BOM_UTF16_LE = '\xff\xfe'
    BOM_UTF32 = '\xff\xfe\x00\x00'
    BOM_UTF32_BE = '\x00\x00\xfe\xff'
    BOM_UTF32_LE = '\xff\xfe\x00\x00'
    BOM_UTF8 = '\xef\xbb\xbf'
    __all__ = ['register', 'lookup', 'open', 'EncodedFile', 'BOM', 'BOM_BE...



In [26]:
tree.text_content()
Out[26]:
u'\n \n \n 316i AT\n \n 136 \u043b.\u0441.\n \n\n \n \u0410\u0432\u0442\u043e\u043c\u0430\u0442,\n \u0437\u0430\u0434\u043d\u0438\u0439\xa0\u043f\u0440\u0438\u0432\u043e\u0434,\n \u0431\u0435\u043d\u0437\u0438\u043d, 9.2\xa0\u0441\xa0\u0434\u043e\xa0100\xa0\u043a\u043c/\u0447\n \n\n \n Special Edition SKD\n\n \n \n 1398000\n \n \u0440\u0443\u0431.\n \n \n \n \n '
In [27]:
tree.text()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-27-005d59e4a3f3> in <module>()
----> 1 tree.text()

TypeError: 'str' object is not callable


Посты чуть ниже также могут вас заинтересовать

Комментариев нет:

Отправить комментарий