import time
from datetime import timedelta
import keyboard
import yaml
import requests
from random import choice
from bs4 import BeautifulSoup
import os
# URL of the site to scrape.
# NOTE(review): "https://" has no host -- looks like a placeholder; set the real URL.
url = "https://"

# Reference timestamps captured once at start-up. The polling loop measures
# elapsed time against Nowdate and uses Heure/Min in its entry condition.
Nowdate = time.time()
Now = time.localtime()
Heure = Now.tm_hour
Min = Now.tm_min
Sec = Now.tm_sec
print(time.strftime("%Y-%m-%d %H:%M:%S %Z", Now))
start = 0  # set to 1 by the loop once the 'start' banner has been printed

# Schedule (local time): scraping window opens at 10:15 and closes at 23:05.
Heure_début = 10
Minute_début = 15
Heure_fin = 23
Minute_fin = 5

# Timeout, in seconds, passed to requests.get when waiting for the server.
timeout = 0.45

# Load the proxy list, one proxy per line. The file is closed deterministically
# and blank lines are skipped so an empty string is never picked as a proxy.
with open("proxy_liste.txt", encoding="utf-8") as f_proxies:
    proxies = [line.strip() for line in f_proxies if line.strip()]

# Header sets used to make requests look like they come from a real browser.
with open("headers.yml") as f_headers:
    browser_headers = yaml.safe_load(f_headers)

# Browser names used as keys into browser_headers
# (presumably the top-level keys of headers.yml -- verify against that file).
mylist = ["Firefox", "Chrome", "Edge", "IE", "Brave"]
# Polling loop.
# Heure and Min are never reassigned inside the loop, so this condition is
# fixed at start-up: it only decides whether the loop is entered at all.
# The loop itself ends through one of the two `break` statements below.
while (Heure == Heure_début and Min >= Minute_début) or Heure > Heure_début:
    Nowdate_bis = time.time()
    Now_bis = time.localtime()
    Heure_bis = Now_bis.tm_hour
    Min_bis = Now_bis.tm_min
    egal_mlsec = Nowdate_bis - Nowdate  # seconds since the last scrape attempt

    # Print the start banner once, provided the end time has not passed yet.
    if start == 0 and ((Heure_bis == Heure_fin and Min_bis <= Minute_fin) or Heure_bis < Heure_fin):
        print('start')
        print('egal_mlsec : ', egal_mlsec)
        start = 1

    # Throttle: at most one request every 0.5 s.
    if egal_mlsec >= 0.5:
        print('mlsec_bis : ', egal_mlsec)
        proxy_address = choice(proxies)
        browser_headers_rand = browser_headers[choice(mylist)]
        # Requests selects the proxy by URL scheme. `url` is https, so the
        # mapping needs an "https" entry or the proxy would not be used.
        proxies_rand = {"http": proxy_address, "https": proxy_address}
        print('Proxies : ', proxies_rand)
        try:
            # Keep the try body minimal: only the network call can signal a
            # proxy/connection failure.
            response = requests.get(url, headers=browser_headers_rand, proxies=proxies_rand, timeout=timeout)
        except requests.RequestException:
            print(f"Proxy {proxies_rand} failed, trying another one")
        else:
            if response.ok:
                soup = BeautifulSoup(response.text, 'html.parser')
                # Normalise each <td> text: decimal comma -> dot, and drop
                # the month suffixes.
                annonces = [
                    td.text.replace(',', '.').replace(' (Jan.)', '').replace(' (Févr.)', '')
                    for td in soup.find_all('td')
                ]
                try:
                    # The output file is overwritten on every successful
                    # scrape; fields are ';'-terminated.
                    with open('scraping.txt', 'w', encoding='utf-8') as file:
                        file.writelines(annonce + ';' for annonce in annonces)
                except OSError as err:
                    # Best effort: report the write failure separately
                    # instead of blaming the proxy, then keep polling.
                    print(f"Could not write scraping.txt: {err}")
        # Restart the 0.5 s throttle window whether or not the attempt worked.
        Nowdate = Nowdate_bis

    # Stop once the configured end time has been reached.
    if (Heure_bis == Heure_fin and Min_bis >= Minute_fin) or Heure_bis > Heure_fin:
        print('stop')
        break

    # Manual stop via keyboard shortcut.
    if keyboard.is_pressed("ctrl + 9"):
        print("stop by pressing ctrl + 9")
        break

    # Yield the CPU briefly instead of spinning at 100 % between polls.
    time.sleep(0.01)