from datetime import datetime, timedelta
import io
import time

import lxml
import pandas
import requests
from lxml import html
def format_date(date_datetime):
    """Convert a datetime to a Unix-epoch timestamp string.

    Uses ``time.mktime``, so the conversion is interpreted in the
    machine's local timezone; sub-second precision is discarded.

    Parameters
    ----------
    date_datetime : datetime.datetime
        The moment to convert.

    Returns
    -------
    str
        Whole seconds since the epoch, as a decimal string.
    """
    epoch_seconds = time.mktime(date_datetime.timetuple())
    return str(int(epoch_seconds))
def subdomain(symbol, start, end, filter='history'):
    """Build the Yahoo Finance history-page path for *symbol*.

    Parameters
    ----------
    symbol : str
        Ticker symbol, e.g. ``'BB'``.
    start, end : str
        Epoch-timestamp strings bounding the requested range.
    filter : str, optional
        History view to request (default ``'history'``).

    Returns
    -------
    str
        URL path (no scheme/host) for the daily-interval history page.
    """
    template = ("/quote/{0}/history?period1={1}&period2={2}"
                "&interval=1d&filter={3}&frequency=1d")
    return template.format(symbol, start, end, filter)
def header_function(subdomain):
    """Assemble browser-like HTTP request headers for a Yahoo Finance page.

    Parameters
    ----------
    subdomain : str
        URL path (as built by ``subdomain()``) placed in the ``path`` header.

    Returns
    -------
    dict
        Header-name -> value mapping mimicking a Chrome request.
    """
    # NOTE(review): the "cookie" value is a placeholder ("Cookie:identifier")
    # — presumably meant to be replaced with a real session cookie; confirm.
    headers = {
        "authority": "finance.yahoo.com",
        "method": "GET",
        "path": subdomain,
        "scheme": "https",
        "accept": "text/html",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-US,en;q=0.9",
        "cache-control": "no-cache",
        "cookie": "Cookie:identifier",
        "dnt": "1",
        "pragma": "no-cache",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36",
    }
    return headers
def scrape_page(url, header):
    """Fetch *url* and parse its first HTML ``<table>`` into DataFrames.

    Parameters
    ----------
    url : str
        Fully-qualified page URL.
    header : dict
        HTTP request headers (see ``header_function``).

    Returns
    -------
    list of pandas.DataFrame
        Result of ``pandas.read_html`` on the first ``<table>`` element.

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status.
    ValueError
        If the page contains no ``<table>`` element.
    """
    page = requests.get(url, headers=header)
    # Fail fast on an error status instead of parsing an error page.
    page.raise_for_status()
    element_html = html.fromstring(page.content)
    tables = element_html.xpath('//table')
    if not tables:
        # Original code crashed with IndexError here (after a debug print).
        raise ValueError("no <table> element found at " + url)
    # Serialize the table back to markup and wrap it in StringIO: passing a
    # literal HTML string directly to read_html is deprecated in pandas.
    table_markup = lxml.etree.tostring(tables[0], method='xml').decode()
    return pandas.read_html(io.StringIO(table_markup))
if __name__ == '__main__':
    # Download one year of daily price history for BlackBerry (BB).
    symbol = 'BB'
    dt_start = datetime.today() - timedelta(days=365)
    dt_end = datetime.today()

    # Yahoo expects the date range as epoch-second strings.
    start = format_date(dt_start)
    end = format_date(dt_end)

    sub = subdomain(symbol, start, end)
    header = header_function(sub)
    print(header)

    base_url = 'https://finance.yahoo.com'
    url = base_url + sub
    print(url)

    price_history = scrape_page(url, header)