#!/usr/bin/env python
import requests
from BeautifulSoup import BeautifulSoup
# Page whose HTML is downloaded and searched for links.
url = "https://www.jumia.tn"
# Bound the wait: requests.get() applies no timeout by default, so an
# unresponsive server would otherwise block the script indefinitely.
response = requests.get(url, timeout=30)
# Parse the HTML, then serialise it back to one plain string so it can be
# scanned with str.find().
page = str(BeautifulSoup(response.content))
def getURL(page):
    """Find the first ``a href`` link in *page*.

    The function only locates the link; it does not print or filter it, so
    the caller decides what to do with each result.

    :param page: HTML of a web page, as a string
    :return: ``(url, end_quote)`` where ``url`` is the text between the
        first pair of double quotes after ``a href`` and ``end_quote`` is
        the index of the closing quote (slice ``page[end_quote:]`` to keep
        scanning). ``(None, 0)`` when there is no further link or its
        quotes are not both present.
    """
    start_link = page.find("a href")
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    if start_quote == -1:
        # "a href" without an opening quote: nothing usable to extract.
        return None, 0
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        # Unterminated attribute value; slicing with -1 would silently
        # drop the last character and hand back a bogus offset.
        return None, 0
    # Always return a 2-tuple: callers unpack the result.
    return page[start_quote + 1:end_quote], end_quote
# Walk the page one link at a time: each call returns the next URL and the
# offset at which to resume, and the already-scanned prefix is dropped.
while True:
    url, n = getURL(page)
    # None (not merely an empty string) means the page is exhausted; an
    # empty href="" must not end the scan. The offset check stops the loop
    # if a result would not advance through the page.
    if url is None or n <= 0:
        break
    page = page[n:]
    # Report only links that contain 'html'.
    if 'html' in url:
        # Parenthesised form is valid on both Python 2 and Python 3.
        print(url)
# (stray page text from the original paste: "Partager", i.e. "Share")