Hello, I am trying to scrape SEC EDGAR filings to get financial data on companies.
Below are my lines of code and the error I get.
Thanks in advance for your help.
#Import libraries
import edgar
import pandas as pd
import requests
import html5lib
import lxml
# access the SEC EDGAR directory of quarterly index files since 2018
edgar.download_index("C:/Users/XXXX/Desktop/SEC_Files", 2018, "XXXXXX@orange.fr", skip_all_present_except_last=False)
# we look at Apple through its quarterly reports
Selectedcompany = 'Apple Inc.'
Selectedreport = '10-Q'
# get the information needed to build the right URL for read_html
csv = pd.read_csv('C:/Users/Laurent/Desktop/SEC_Files/2022-QTR3.tsv', sep='\t', lineterminator='\r', names=None)
csv.columns.values[0] = 'Item'
companyreport = csv[(csv['Item'].str.contains(Selectedcompany)) & (csv['Item'].str.contains(Selectedreport))]
Filing = companyreport['Item'].str.split('|')
Filing = Filing.to_list()
# select the HTML index file of Apple's filing (it contains the financial reports)
for item in Filing[0]:
    if 'html' in item:
        report = item
url = 'https://www.sec.gov/Archives/' + report
my_header = { "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36", "X-Requested-With": "XMLHttpRequest" }
html_page_text = requests.get(url, headers=my_header)
# parse the filing index page to get the document table
dfTables = pd.read_html(html_page_text.text)
document_index = dfTables[0].dropna()
document_name = document_index[document_index['Description'].str.contains(Selectedreport)]
document_name = document_name['Document'].str.split(' ')
document_name = document_name[0][0]
# create the URL of the 10-Q document itself
report = report.replace('-', '').replace('index.html', '')
url = 'https://www.sec.gov/Archives/' + report + '/' + document_name
url
# this is where I get the error => HTTPError: HTTP Error 403: Forbidden
df = pd.read_html(url)
for item in df:
    BS = (item[0].str.contains('Total current liabilities') | item[0].str.contains('Total assets'))
    if BS.any():
        Balance_Sheet = item
Balance_Sheet
HTTPError Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_7980\1123660223.py in <module>
5
6
----> 7 df = pd.read_html(url)
8 for item in df:
9 BS=(item[0].str.contains('Total current liabilities') | item[0].str.contains('Total assets'))
~\anaconda3\lib\site-packages\pandas\util\_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~\anaconda3\lib\site-packages\pandas\io\html.py in read_html(io, match, flavor, header, index_col, skiprows, attrs, parse_dates, thousands, encoding, decimal, converters, na_values, keep_default_na, displayed_only)
1111 io = stringify_path(io)
1112
-> 1113 return _parse(
1114 flavor=flavor,
1115 io=io,
~\anaconda3\lib\site-packages\pandas\io\html.py in _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs)
917
918 try:
--> 919 tables = p.parse_tables()
920 except ValueError as caught:
921 # if `io` is an io-like object, check if it's seekable
~\anaconda3\lib\site-packages\pandas\io\html.py in parse_tables(self)
237 list of parsed (header, body, footer) tuples from tables.
238 """
--> 239 tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
240 return (self._parse_thead_tbody_tfoot(table) for table in tables)
241
~\anaconda3\lib\site-packages\pandas\io\html.py in _build_doc(self)
756 pass
757 else:
--> 758 raise e
759 else:
760 if not hasattr(r, "text_content"):
~\anaconda3\lib\site-packages\pandas\io\html.py in _build_doc(self)
737 try:
738 if is_url(self.io):
--> 739 with urlopen(self.io) as f:
740 r = parse(f, parser=parser)
741 else:
~\anaconda3\lib\site-packages\pandas\io\common.py in urlopen(*args, **kwargs)
234 import urllib.request
235
--> 236 return urllib.request.urlopen(*args, **kwargs)
237
238
~\anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
212 else:
213 opener = _opener
--> 214 return opener.open(url, data, timeout)
215
216 def install_opener(opener):
~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
521 for processor in self.process_response.get(protocol, []):
522 meth = getattr(processor, meth_name)
--> 523 response = meth(req, response)
524
525 return response
~\anaconda3\lib\urllib\request.py in http_response(self, request, response)
630 # request was successfully received, understood, and accepted.
631 if not (200 <= code < 300):
--> 632 response = self.parent.error(
633 'http', request, response, code, msg, hdrs)
634
~\anaconda3\lib\urllib\request.py in error(self, proto, *args)
559 if http_err:
560 args = (dict, 'default', 'http_error_default') + orig_args
--> 561 return self._call_chain(*args)
562
563 # XXX probably also want an abstract factory that knows when it makes
~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
492 for handler in handlers:
493 func = getattr(handler, meth_name)
--> 494 result = func(*args)
495 if result is not None:
496 return result
~\anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
639 class HTTPDefaultErrorHandler(BaseHandler):
640 def http_error_default(self, req, fp, code, msg, hdrs):
--> 641 raise HTTPError(req.full_url, code, msg, hdrs, fp)
642
643 class HTTPRedirectHandler(BaseHandler):
HTTPError: HTTP Error 403: Forbidden
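Note: from what I understand, pd.read_html(url) fetches the page itself through urllib, without my User-Agent header, and SEC EDGAR seems to reject requests that do not identify the client (their fair-access guidelines ask automated tools to declare a User-Agent with contact details). I suspect downloading the page with requests first, as I already do for the index page, and parsing the text would avoid the 403. A minimal sketch, reusing the my_header dict from above (filing_page is just a name I introduce here, and I have not tested this end to end):

# fetch the filing with the declared User-Agent, then let pandas parse the HTML text
filing_page = requests.get(url, headers=my_header)
filing_page.raise_for_status()  # would surface the 403 here, with the real response
df = pd.read_html(filing_page.text)

Can anyone confirm whether this is the right way around the 403, or whether something else in the URL construction is wrong?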