1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
|
# Fragment of a page-parsing routine. `self`, `response`, `Selector` and
# `sleep` are not defined here and must come from the enclosing method/module
# (presumably a Scrapy spider callback -- confirm against the full file).

# Imports hoisted out of the loop below, where the original re-executed them
# on every iteration. `bs` is only used by the disabled BeautifulSoup
# experiment kept further down.
import re
from bs4 import BeautifulSoup as bs

# Build a selector over the product page.
productselector = Selector(response)

# Extract the item's short name from the <h1> text.
# NOTE(review): `.extract()[0]` raises IndexError when the XPath matches
# nothing; decide on a fallback if pages without this node are possible.
productshortname = productselector.xpath(
    '//div[@id="vmMainPage"]/*/*/*/*/h1/text()').extract()[0]
self.log('Extracting short name : %s' % (productshortname))
self.currentitem['shortname'] = productshortname

# Extract the item's full name: text of the first and second <h3>, joined by
# a single space.
name_first_part = productselector.xpath(
    '//div[@id="vmMainPage"]/*/*/*/*/h3[1]/text()').extract()[0]
name_second_part = productselector.xpath(
    '//div[@id="vmMainPage"]/*/*/*/*/h3[2]/text()').extract()[0]
productname = name_first_part + " " + name_second_part
self.log('Extracting name : %s' % (productname))
self.currentitem['name'] = productname

# Select the technical-specification table(s) and keep the selection on the
# item; the disabled lxml experiment below reads it back from there.
tableproduits = productselector.xpath('//table[@id="produits-titre"]')
self.currentitem['table_spec'] = tableproduits

# Debug pass over each matched table: dump its raw markup, then the same
# markup with every literal '<br>' removed.
for eachspectable in tableproduits:
    raw_table_html = eachspectable.extract()  # extracted once, reused below
    # Single-argument print form: same output on Python 2, valid on Python 3.
    print("eachspectable.extract() %s" % (raw_table_html,))
    # Presumably a pause so the debug output can be read -- confirm.
    sleep(2)
    eachspectablesanitized = re.sub('<br>', '', raw_table_html)
    print(eachspectablesanitized)
    sleep(5)

# The three string literals below are disabled experiments kept for
# reference. They are bare string expressions, so they have no effect at
# runtime, and their content is preserved exactly as found (the original
# indentation inside them was already lost).

# Disabled experiment 1: walk the table rows with BeautifulSoup.
"""
techsoup = bs(eachspectable.extract())
print techsoup.find('tbody').find_all('tr')
sleep(10)
for row in techsoup.find('tbody').find_all('tr'):
cells=row.find_all('td',text=True)
tech_name=cells[0].text
tech_value=cells[1].text
print row,len(cells),tech_name, tech_value
sleep(5)
"""

# Disabled experiment 2: walk the rows and cells with selector XPath queries.
"""
#Avec scrapy
print len(tableproduits.extract())
for eachtable in tableproduits:
print eachtable.xpath('.//tr')
for eachspec in eachtable.xpath('.//tr'):
print "eachspec",eachspec.xpath('.//td')
sleep(20)
break;
"""

# Disabled experiment 3: clean the table markup and query it with lxml.
"""
#avec lxml
from lxml.html import fromstring
from lxml.html.clean import clean_html
for eachtable in self.currentitem['table_spec']:
TableSpecTree=fromstring(clean_html(eachtable.extract()))
for eachtr in TableSpecTree.xpath('//tr/td/text()'):
print eachtr
"""
Partager