# Multi-threaded scraper: downloads the BULATS agent-directory listing pages,
# parses the agent table from each page, and writes the rows to an Excel
# workbook with xlwt. (Web-paste line-number artifacts removed.)
import Queue
import threading
import urllib2
from urllib2 import urlopen
import time
from bs4 import BeautifulSoup as BeautifulSoup
from urllib3 import HTTPConnectionPool
#from urllib3 import urlopen
import xlwt
import time
import socket
# Global 5-second timeout for every socket operation (including urlopen),
# so a stalled page fetch fails fast and the Retry decorator can re-attempt it.
socket.setdefaulttimeout(5.0)
class Retry(object):
    """Decorator that retries the wrapped callable when it raises.

    tries      -- maximum number of attempts
    exceptions -- exception classes to catch (default: any Exception)
    delay      -- seconds to sleep between failed attempts
    """
    default_exceptions = (Exception,)

    def __init__(self, tries, exceptions=None, delay=0):
        self.tries = tries
        if exceptions is None:
            exceptions = Retry.default_exceptions
        self.exceptions = exceptions
        self.delay = delay

    def __call__(self, f):
        # Local import keeps the top-of-file import block untouched.
        from functools import wraps

        @wraps(f)  # preserve f's __name__/__doc__ on the wrapper
        def fn(*args, **kwargs):
            exception = None
            for _ in range(self.tries):
                try:
                    return f(*args, **kwargs)
                # 'except X as e' replaces the Python-2-only 'except X, e'
                # form: valid on Python 2.6+ and required on Python 3.
                except self.exceptions as e:
                    print("Retry, exception: " + str(e))
                    time.sleep(self.delay)
                    exception = e
            # No attempt succeeded: surface the last exception to the caller.
            raise exception
        return fn
@Retry(5)
def open_url(source):
    """Fetch *source* and return the raw response body.

    The Retry decorator re-invokes this up to 5 times if urlopen (or the
    read) raises, e.g. on the 5 s socket timeout set above.
    """
    print("Retrying to open and read the page")
    response = urlopen(source)
    return response.read()
# Landing page plus the 12 paginated agent-directory listing pages.
_AGENT_PAGE_FMT = ("http://bulats.org//agents/find-an-agent"
                   "?field_continent_tid=All&field_country_tid=All&page=%d")
hosts = (["http://www.bulats.org/agents/find-an-agent"]
         + [_AGENT_PAGE_FMT % page for page in range(1, 13)])
# Work queues connecting the two pipeline stages:
#   queue     -- URLs waiting to be downloaded (fed by main()).
#   out_queue -- raw HTML pages waiting to be parsed by DatamineThread.
queue = Queue.Queue()
out_queue = Queue.Queue()
class ThreadUrl(threading.Thread):
    """Downloader worker: pulls URLs off *queue*, fetches each page, and
    pushes the raw HTML onto *out_queue* for the parser stage."""

    def __init__(self, queue, out_queue):
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):
        while True:
            # Block until a URL is available.
            url = self.queue.get()
            # Download the page body (open_url retries on failure).
            page = open_url(url)
            # Hand the raw HTML to the parsing stage.
            self.out_queue.put(page)
            # Mark this work item finished so queue.join() can return.
            self.queue.task_done()
class DatamineThread(threading.Thread):
    """Parser worker: takes one raw HTML page from *out_queue*, extracts the
    first <table>'s rows, and appends each cell to the module-level xlwt
    sheet `ws`, saving the workbook `wb` after the page is written."""

    def __init__(self, out_queue):
        threading.Thread.__init__(self)
        # Queue of raw HTML pages produced by the ThreadUrl workers.
        self.out_queue = out_queue

    def run(self):
        # `x` is the shared global row counter across all parser threads.
        # NOTE(review): unsynchronized — concurrent threads can race on it.
        global x
        while True:
            # Grab one downloaded page from the queue.
            chunk = self.out_queue.get()
            # Parse the page and locate the first table (the agent listing).
            soup = BeautifulSoup(chunk)
            tableau = soup.find('table')
            rows = tableau.findAll('tr')
            for tr in rows:
                cols = tr.findAll('td')
                y = 0          # column index within the current sheet row
                x = x + 1      # advance to the next sheet row
                for td in cols:
                    texte_bu = td.text
                    # Encoded value is computed but not used below;
                    # ws.write receives the unencoded td.text.
                    texte_bu = texte_bu.encode('utf-8')
                    print("1")
                    ws.write(x,y,td.text)
                    y = y + 1
            # Persist the workbook after each page's rows are written.
            wb.save("BULATS_TTTTT.xls")
            # Mark this page as processed for out_queue.join().
            self.out_queue.task_done()
            # NOTE(review): this break makes each parser thread handle exactly
            # one page then exit — presumably why 13 threads are spawned for
            # 13 pages; confirm before changing the pool size.
            break
start = time.time()  # wall-clock start for the elapsed-time report at the end
def main():
    """Spawn the downloader and parser thread pools, feed the URL queue,
    and block until both pipeline stages have drained."""
    # Downloader pool: one daemon thread per listing page (13 URLs).
    for _ in range(13):
        downloader = ThreadUrl(queue, out_queue)
        downloader.setDaemon(True)
        downloader.start()
    # Feed every listing URL to the downloader pool.
    for url in hosts:
        queue.put(url)
    # Parser pool: each DatamineThread consumes one page then exits.
    for _ in range(13):
        parser = DatamineThread(out_queue)
        parser.setDaemon(True)
        parser.start()
    # Wait until both stages report every item done.
    queue.join()
    out_queue.join()
# --- Script entry point: shared state used by the worker threads ---
# (The original module-level `global x` statement was a no-op and is dropped.)
x = 0  # shared row counter incremented by DatamineThread for each table row
wb = xlwt.Workbook(encoding='utf-8')       # output workbook
ws = wb.add_sheet("BULATS_IA_PARSED")      # sheet the parser threads write to
main()
# Parenthesized print works as both a Py2 print statement and a Py3 call;
# the stray trailing '|' (web-paste artifact) that broke the syntax is removed.
print("Elapsed Time: %s" % (time.time() - start))
# (end of script — forum "Partager"/share-button artifact removed)