1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57
|
### ------ 404 errors -----------------
# Split the string form of the page's first <h1> on single spaces and look
# for tokens that are typical of "not found" / "forbidden" pages.
h1Tag = soup2.h1
maListe = str(h1Tag).split(" ")
maListeNonPertinente = ["404","trouvée</h1>","Error", "<h1>Forbidden</h1>","Forbidden</h1>","404</h1>","...</h1>","Forbidden"]
# One message and one counter increment per marker found in the heading.
# NOTE(review): a heading containing several markers is reported and counted
# several times -- confirm this is what cpt is meant to measure.
for marqueur in maListeNonPertinente:
    if marqueur not in maListe:
        continue
    print('ERREUR 404')
    cpt += 1
#--------------------------------------
### WIKTIONARY
# Text of the first <ol> found inside <div id="mw-content-text">.
# pwiktionary stays '' when the container or the list is missing.
pwiktionary = ''
key = "ol"
for txt in soup2.findAll('div', {'id': 'mw-content-text'}):
    # Keep the search result in a temporary so pwiktionary only ever holds
    # a string; testing for emptiness replaces the former catch-all except
    # around the [0] lookup.
    listes = txt.findAll(key)
    if listes:
        pwiktionary = listes[0].get_text()
#------------------------------------------
#### LAROUSSE
# Text of the last <ul class="Definitions"> in the document, or '' when the
# page has none.
blocs_larousse = soup2.findAll('ul', {"class": "Definitions"})
plarousse = blocs_larousse[-1].get_text() if blocs_larousse else ''
#--------------------------------------------
#### CNRTL
# Text of the last <div id="contentbox"> in the document, or '' when the
# page has none.
blocs_cnrtl = soup2.findAll('div', {"id": "contentbox"})
cnrtl = blocs_cnrtl[-1].get_text() if blocs_cnrtl else ''
#-----------------------------------------------
#### LINTERNAUTE
# Text of the last <section> carrying the definition class string below, or
# '' when the page has none.
blocs_lint = soup2.findAll('section', {"class": "grid_line dico_definition tabsContent jContentDefinition"})
lint = blocs_lint[-1].get_text() if blocs_lint else ''
### AMELIORER SA SANTE
# Text of the last <div class="headline-article"> in the document, or ''
# when the page has none.
blocs_amel = soup2.findAll('div', {"class": "headline-article"})
amel = blocs_amel[-1].get_text() if blocs_amel else ''
### MR-PLANTES.COM
# Detach every <p class="postmetadata"> from the parsed tree.
for p in soup2.findAll('p', {"class": "postmetadata"}):
    p.extract()
# NOTE(review): this filter selects <p> tags whose attribute named "a" has
# the value "href"; presumably link tags (<a href=...>) were intended --
# confirm before changing. The selection is kept as written.
for a in soup2.findAll('p', {"a": "href"}):
    a.extract()
Partager