import time
import urllib.request
from html.parser import HTMLParser
from threading import Thread, Timer
def get_page(url):
    """Fetch *url* and return its body as decoded text, or False on any error.

    A desktop-browser User-Agent is sent because some servers reject
    urllib's default agent string.
    """
    user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0)'\
        ' Gecko/20100101 Firefox/49.0'
    req = urllib.request.Request(url, data=None,
                                 headers={"User-Agent": user_agent})
    try:
        # Context manager guarantees the response is closed (the original
        # leaked the connection object).
        with urllib.request.urlopen(req) as resp:
            # Decode the body properly: the original `str(content.read())`
            # produced the repr of a bytes object ("b'...'"), not the HTML.
            charset = resp.headers.get_content_charset() or "utf-8"
            return resp.read().decode(charset, errors="replace")
    except Exception as why:
        # Best-effort: report and signal failure so the caller can skip
        # this page rather than abort the whole scan.
        print('urllib error: %s, %s' % (url, why))
        return False
class ImgParser(HTMLParser):
    """Collect href targets ending in ".IMG" from Apache-style directory
    listings, where each link sits in a <td> table cell."""

    def __init__(self):
        super().__init__()
        self.in_td = False   # a <td> was seen and not yet consumed by an <a>
        self.imgs = []       # harvested .IMG hrefs, in document order

    def handle_starttag(self, tag, attrs):
        if tag == "td":
            self.in_td = True
        if tag == "a" and self.in_td:
            for name, value in attrs:
                if name == "href" and value.endswith(".IMG"):
                    self.imgs.append(value)
                    break
            # Each cell's anchor is examined once; arm again on the next <td>.
            self.in_td = False
def rerun(threads, lst_0, lst_1, lst_2):
    """Timer callback shim: re-enter the completion poll with the same state."""
    wait_for(threads, lst_0, lst_1, lst_2)
def wait_for(threads, lst_0, lst_1, lst_2):
    """Poll until all three scanner threads have reported in.

    *threads* is the shared flag list each worker appends to on completion;
    once it holds three entries, print the total count and elapsed time
    (measured from the module-level `begin` timestamp).  Until then,
    schedule another check in 10 seconds via a Timer.
    """
    if len(threads) >= 3:
        tot = len(lst_0) + len(lst_1) + len(lst_2)
        end = time.perf_counter()
        print("Found %s IMG at: %s" %(tot, end-begin))
        # TODO Save the lists into a file !
        return
    Timer(10, rerun, args=(threads, lst_0, lst_1, lst_2)).start()
# Pull the six-character sol directory names out of the locally saved
# Apache index page (rows marked with the [DIR] icon).
with open("index.html.1", "r") as inf:
    indexes = [line.split('href="')[1][:6]
               for line in inf if 'alt="[DIR]"' in line]

base = "https://pds-imaging.jpl.nasa.gov/data/mars2020/mars2020_imgops/data_rmi_imgops/sol/"
# Each sol exposes three SuperCam product directories to scan.
urls = []
for i in indexes:
    for kind in ("edr", "fdr", "rdr"):
        urls.append("%s%sids/%s/scam/" %(base, i, kind))

image_urls = []
begin = time.perf_counter()   # shared start time, read by wait_for()
print("Scan %s pages" % len(urls))
part = int(len(urls) / 3)     # slice size: the work is split across 3 threads
flags = []                    # each worker appends 1 here when finished
img_part_0 = []
img_part_1 = []
img_part_2 = []
def scan_pages(*args):
    """Worker: fetch every listing page, harvest absolute .IMG links into
    the shared *images* list, then append a completion flag to *done*."""
    page_urls, images, done = args
    for page in page_urls:
        html = get_page(page)
        if html:
            parser = ImgParser()
            parser.feed(html)
            # Hrefs are relative file names; prefix the page URL.
            images.extend(page + img for img in parser.imgs)
    done.append(1)
# Fan the URL list out over three worker threads, then poll for completion.
slices = (urls[:part], urls[part:part * 2], urls[part * 2:])
buckets = (img_part_0, img_part_1, img_part_2)
for url_slice, bucket in zip(slices, buckets):
    Thread(target=scan_pages, args=(url_slice, bucket, flags)).start()
wait_for(flags, img_part_0, img_part_1, img_part_2)