Good evening,

I am a first-year Master's student in digital innovation, and I have to hand in a small project on web scraping. We decided to scrape the restaurants listed on a website; the problem is that when the results are exported to xlsx, the following message appears:

Traceback (most recent call last):
  File "C:\Users\UC\Desktop\PYTHON DOSSIER FINAL\programme.py", line 197, in <module>
    workbook.close()
  File "C:\Users\UC\AppData\Local\Programs\Python\Python36-32\lib\site-packages\xlsxwriter\workbook.py", line 310, in close
    self._store_workbook()
  File "C:\Users\UC\AppData\Local\Programs\Python\Python36-32\lib\site-packages\xlsxwriter\workbook.py", line 624, in _store_workbook
    allowZip64=self.allow_zip64)
  File "C:\Users\UC\AppData\Local\Programs\Python\Python36-32\lib\zipfile.py", line 1090, in __init__
    self.fp = io.open(file, filemode)
OSError: [Errno 22] Invalid argument: 'yelpscrap-2018-01-05 17:53:24.495625.xlsx'
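
From what I have found, Windows does not allow ':' characters in file names, and the name in the message ('yelpscrap-2018-01-05 17:53:24.495625.xlsx') contains two of them, coming from str(datetime.now()). So I suspect the date part of the file name is the problem. Here is a minimal sketch of what I think I could try instead (assuming a colon-free strftime pattern is the right approach):

from datetime import datetime

# str(datetime.now()) gives e.g. '2018-01-05 17:53:24.495625'; the ':'
# characters are not valid in a Windows file name (hence the Errno 22).
# strftime can build a colon-free timestamp instead.
stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
filename = "yelpscrap-{0}.xlsx".format(stamp)  # e.g. yelpscrap-2018-01-05_17-53-24.xlsx

Is that the right direction, or is something else going on?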



And here is the full program:

# -*- coding: utf-8 -*-
 
from bs4 import BeautifulSoup
from random import randint
from time import sleep
from datetime import datetime
import re
import requests
import xlsxwriter
 
CITY = "Rennes"
 
CFLTS = ["Restaurants"]
 
DEBUG = False
MAX_SLEEP = 30000  # upper bound in milliseconds for r_sleep()
 
class YelpShop(object):
 
 
    def __init__(self, name="", address="", zipcode="", phone="", url="", reviewCount="", ratingValue="", categories=None):
 
        self.name = name
        self.address = address
        self.zipcode = zipcode
        self.phone = phone
        self.url = url
        self.reviewCount = reviewCount
        self.ratingValue = ratingValue
        # Avoid a mutable default argument: each shop gets its own list
        self.categories = categories if categories is not None else []
 
    def __str__(self):
        return "{0} ({1})".format(self.name, self.phone)
 
def mylog(message):
    print("[LOG] " + message)
 
def page_to_index(page_num):
    # Yelp paginates 10 results per page: page 1 -> start=0, page 2 -> start=10
    return (page_num - 1) * 10
 
def build_arglist(elts):
 
    # Build a bracketed, comma-separated string, e.g. ['a', 'b'] -> "[a,b]"
    return "[" + ",".join(elts) + "]"
 
def build_yelp_url(page, c):
 
    url = "http://www.yelp.fr/search?&start={0}".format(page_to_index(page))
    if CITY:
        url += "&find_loc={0}".format(CITY)
    url += "&cflt={0}".format(c)
 
    return url
 
def extract_zipcode(adr):
 
    try:
        res = re.findall(r'\d{5}', adr)[0]
    except IndexError:
        res = ""
 
    return res
 
def is_advertisement(search_result):
 
    # Sponsored results carry a span with class "yloca-tip"
    return search_result.find('span', attrs={"class":u"yloca-tip"}) is not None
 
def r_sleep():
 
    length = float(randint(2000, MAX_SLEEP)) / 1000
    mylog("Safety Random Sleep has started for {0} sec".format(length))
    sleep(length)
    mylog("Safety Random Sleep is over")
 
def write_query():
 
    res = ""
    if CITY:
        res += "City: {0} - ".format(CITY)
 
    return res
 
#SCRIPT
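# Crawl each category page by page; a results page with no shops ends the loop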
 
mylog("Script has started")
 
shops = []
 
for cflt in CFLTS:
    cur_page = 0
    while True:
        cur_page += 1
 
        cur_url = build_yelp_url(page=cur_page, c=cflt)
        mylog("Start scraping {0} at {1}".format(cur_page, cur_url))
 
 
        # Browser-like headers, so the request looks like an ordinary visitor
        fake_headers = {
 
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'fr,en-US;q=0.8,en;q=0.6'}
 
        r = requests.get(cur_url, headers=fake_headers)
        soup = BeautifulSoup(r.text, "html.parser")
 
 
        cpt = 0
 
        # Each shop on the results page sits in a div with class "search-result"
        for sr in soup.find_all('div', attrs={"class":u"search-result"}):
 
            if is_advertisement(sr):
                continue
 
            try:
                cpt += 1
                ext_name = sr.find('a', attrs={"class":u"biz-name"}) \
                           .get_text().strip()
                ext_address = sr.find('address').get_text().strip()
                ext_phone = sr.find('span', attrs={"class":u"biz-phone"}) \
                            .get_text().strip()
                ext_url = sr.find('a', attrs={"class":u"biz-name"})['href']
                ext_categories = [e.get_text().strip() for e in sr.find(
                    'span', attrs={"class":u"category-str-list"}).find_all('a')]
            except (AttributeError, TypeError):
                mylog("A shop has been ignored because of a parsing error")
                continue
 
            if ext_url not in [s.url for s in shops]:
                shops.append(YelpShop(
                    name=ext_name,
                    address=ext_address,
                    zipcode=extract_zipcode(ext_address),
                    phone=ext_phone,
                    url=ext_url,
                    categories=ext_categories))
                mylog("New shop created: {0}".format(ext_name))
 
        if cpt == 0: # No more shops to scrape, time to exit
            break
 
        mylog("Finish scraping page {0} ({1} shops aspirated)".format(cur_page,
                                                                      cpt))
 
        # Time to sleep for safety
        r_sleep()
 
mylog("Scraping finished")
 
# -- XLSX EXPORT
mylog("Start XLSX export, there is {0} shops to write".format(len(shops)))
 
# Init workbook/worksheet
now = datetime.now()
filename = "yelpscrap.xlsx".format(date=str(now))
workbook = xlsxwriter.Workbook(filename)
worksheet = workbook.add_worksheet()
 
# Write Metadata
 
# -- Query
row = 0
col = 0
worksheet.write(row, col, write_query())
 
# -- Headers
row = 1
col = 0
heads = ("Shop name", "Address", "ZipCode", "Phone", "Categories")
for head in heads:
    worksheet.write(row, col, head)
    col += 1
 
# Write Data
row = 2
col = 0
url_format = workbook.add_format({'font_color': 'blue',
                                  'underline':  1}) # for URLs
for shop in shops:
    worksheet.write_url(row, col, "http://www.yelp.fr{0}".format(shop.url),
                        url_format, shop.name)
    worksheet.write(row, col+1, shop.address)
    worksheet.write(row, col+2, shop.zipcode)
    worksheet.write(row, col+3, shop.phone)
    worksheet.write(row, col+4, ';'.join(shop.categories)) # Clean display
    row += 1
 
workbook.close()
mylog("Finish XLSX export at {0}".format(filename))

Thank you in advance,

Goodbye, and Happy New Year 2018!