Здесь я хотел подготовить код для краулера bmw_mail, но не удержался и опробовал сразу несколько разных идей: import HTML,
''.join(tree.xpath('//span[@class="rank"]/i/text()')), codecs, lxml, requests, .replace('\n','').encode('utf8'), print ''.join(this_link).encode('utf8')
Получилось всё, но этот пост не тянет даже на черновик. Здесь просто сумбур из фрагментов кода, но зато я здесь изобрел сразу несколько велосипедов.
In [2]:
from IPython.display import Image, HTML
Image ('C:\\Users\\kiss\\Pictures\\pythonR\\bmw_mail.png')
Out[2]:
In []:
//div[@class="catalog-age__mod__item__box"]
//div[@class="catalog-age__mod__item__note"]
#- Автомат, задний привод, бензин, 9.2 с до 100 км/ч
//div[@class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp"]
#136 л.с.
//a[@class="catalog-age__mod__item__type__link"]/text()
//a[@class="catalog-age__mod__item__type__link"]/@href
//a[@class="catalog-age__mod__item__equip clear"]
In [4]:
from IPython.display import HTML
In []:
/html/body/div[2]/div/div[2]/div[1]/div/div/div[2]/div/div[2]/div[2]/div/div[1]/div/div[1]/div[2]/div[7]/div/div[1]/div[2]
In [6]:
<div class="catalog-age__mod__item__box">
<div class="catalog-age__mod__item__type clear">
<div class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_title">
<a class="catalog-age__mod__item__type__link" href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/specifications/?gear_type=204&fuel=222&modification_id=22938">316i AT</a>
</div>
<div class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp">136 л.с.</div>
</div>
<div class="catalog-age__mod__item__note">
Автомат,
задний привод,
бензин, 9.2 с до 100 км/ч
</div>
<a class="catalog-age__mod__item__equip clear" href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/32100/">
<span>Special Edition SKD</span>
<div class="catalog-age__mod__item__equip__price">
<span class="catalog-age__mod__item__equip__price__value">
<span class="rank"><i>1</i><i>398</i><i>000</i></span>
</span>
<span class="catalog-age__mod__item__equip__price__note">руб.</span>
</div>
</a>
</div>
In [7]:
s=import HTML
In [12]:
s.data
Out[12]:
In [8]:
from lxml import html
import requests
In [13]:
tree=html.fromstring(s.data)
In [22]:
tree.
Out[22]:
In [14]:
#This will create a "Автомат, задний привод, бензин, 9.2 с до 100 км/ч"
description = tree.xpath('//div[@class="catalog-age__mod__item__note"] /text()')
#This will create a list 136 л.с.
power = tree.xpath('//div[@class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp"]/text()')
In [17]:
print description, power
А здесь меня заколдобило от этих юникодов, бросил все и нашел ''.join(description).encode('utf8')
In [39]:
# -*- coding: utf-8 -*-
# coding: utf8
''.join(power).encode('utf8')# Кодируем Unicode-строку в байты UTF-8 (кириллица в выводе показывается как \x-последовательности)
Out[39]:
In [40]:
print '136 \xd0\xbb.\xd1\x81.'
In [47]:
print ''.join(description).encode('utf8')
In [50]:
# Или берем просто первый элемент списка
# и по пути убираем разбивку на строки
print description[0].replace('\n','').encode('utf8')
In []:
<div class="catalog-age__mod__item__box">
<div class="catalog-age__mod__item__type clear">
<div class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_title">
<a href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/specifications/?gear_type=204&fuel=59&modification_id=22939" class="catalog-age__mod__item__type__link">320d 184hp xDrive AT</a>
</div>
<div class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp">184 л.с.</div>
</div>
<div class="catalog-age__mod__item__note">
Автомат,
полный привод,
дизель, 7.4 с до 100 км/ч
</div>
<a href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/30440/" class="catalog-age__mod__item__equip clear">
<span>Sport Line SKD</span>
<div class="catalog-age__mod__item__equip__price">
<span class="catalog-age__mod__item__equip__price__value">
<span class="rank"><i>1</i><i>752</i><i>000</i></span>
</span>
<span class="catalog-age__mod__item__equip__price__note">руб.</span>
</div>
</a>
<a href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/35796/" class="catalog-age__mod__item__equip clear">
<span>Modern Line SKD</span>
<div class="catalog-age__mod__item__equip__price">
<span class="catalog-age__mod__item__equip__price__value">
<span class="rank"><i>1</i><i>883</i><i>000</i></span>
</span>
<span class="catalog-age__mod__item__equip__price__note">руб.</span>
</div>
</a>
<a href="https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/30439/" class="catalog-age__mod__item__equip clear">
<span>Luxury Line SKD</span>
<div class="catalog-age__mod__item__equip__price">
<span class="catalog-age__mod__item__equip__price__value">
<span class="rank"><i>1</i><i>950</i><i>000</i></span>
</span>
<span class="catalog-age__mod__item__equip__price__note">руб.</span>
</div>
</a>
</div>
In [51]:
#This will create a "Автомат, задний привод, бензин, 9.2 с до 100 км/ч"
description = tree.xpath('//div[@class="catalog-age__mod__item__note"] /text()')
#This will create a list 136 л.с.
power = tree.xpath('//div[@class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp"]/text()')
#This will parse link to specifications folder and text in link
#"https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/specifications/?gear_type=204&fuel=222&modification_id=22938"
# "316i AT"
specification_link = tree.xpath('//a[@class="catalog-age__mod__item__type__link"]/@href')
specification_text = tree.xpath('//a[@class="catalog-age__mod__item__type__link"]/text()')
#
this_link = tree.xpath('//a[@class="catalog-age__mod__item__equip clear"]/@href')
this_text = tree.xpath('//a[@class="catalog-age__mod__item__equip clear"]/text()')
#Inside this <a> link tag:
#<span>Special Edition SKD</span>
in_this_name=tree.xpath('//a[@class="catalog-age__mod__item__equip clear"]/span[0]/text()')
#<span class="rank"><i>1</i><i>398</i><i>000</i></span>
in_this_price=tree.xpath('//span[@class="rank"]/text()')
#
#This will create a list 136 л.с.
power = tree.xpath('//div[@class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp"]/text()')
In []:
mod__item__box = response.xpath('//div[@class="catalog-age__mod__item__box"]')
items = []
for sel in mod__item__box:
i = CarmailpriceItem()
#This will create a "Автомат, задний привод, бензин, 9.2 с до 100 км/ч"
i['description'] = sel.xpath('.//div[@class="catalog-age__mod__item__note"] /text()').extract()
#This will create a list 136 л.с.
i['power'] = sel.xpath('.//div[@class="catalog-age__mod__item__type__text catalog-age__mod__item__type__text_hp"]/text()').extract()
#This will parse link to specifications folder and text in link
#"https://cars.mail.ru/catalog/bmw/3/f30_31/sedan/specifications/?gear_type=204&fuel=222&modification_id=22938"
# "316i AT"
i['specification_link'] = tree.xpath('//a[@class="catalog-age__mod__item__type__link"]/@href').extract()
i['specification_text'] = tree.xpath('//a[@class="catalog-age__mod__item__type__link"]/text()').extract()
#<span>Special Edition SKD</span>
i['in_this_name'] = tree.xpath('//a[@class="catalog-age__mod__item__equip clear"]/span[0]/text()').extract()
#<span class="rank"><i>1</i><i>398</i><i>000</i></span>
i['in_this_price'] = tree.xpath('//span[@class="rank"]/text()').extract()
links = sel.xpath('//div[@class="catalog-age__mod__item__equip clear"]')
item['name'] = sel.xpath('.//a[@class="catalog-generation__card__title"]/text()').extract()
item['link'] = sel.xpath('.//a[@class="catalog-generation__card__title"]/@href').extract()
item['price'] = sel.xpath('.//span[@class="rank"]/i/text()').extract()
items.append(item)
return items
In [65]:
''.join(tree.xpath('//span[@class="rank"]/i/text()'))
Out[65]:
In []:
In [61]:
tree.xpath('//a[@class="catalog-age__mod__item__equip clear"]/span[1]/text()')
Out[61]:
In [54]:
print ''.join(specification_link).encode('utf8'), ''.join(specification_text).encode('utf8')
In [56]:
print ''.join(this_link).encode('utf8'), this_text
In [57]:
print ''.join(in_this_name).encode('utf8'), ''.join(in_this_price).encode('utf8')
In [58]:
in_this_name, in_this_price
Out[58]:
In []:
Приложения
In [33]:
import codecs
In [42]:
help(string.join)
In [24]:
dir(tree) #tree=html.fromstring(s.data)
Out[24]:
In [34]:
dir(codecs)
Out[34]:
In [35]:
help(codecs)
In [26]:
tree.text_content()
Out[26]:
In [27]:
tree.text()
Посты чуть ниже также могут вас заинтересовать
Комментариев нет:
Отправить комментарий