# Benchmark: time 1000 repetitions of four ways of extracting the contents of <b> tags from an HTML file (BeautifulSoup, regex, lxml, str.split).
import re
import time

from bs4 import BeautifulSoup
from lxml.html import fromstring
# Load the whole HTML document into memory once; every benchmark below
# searches this same string.
htmlfile = "d:/temp/test.html"
# The context manager closes the file even if read() raises.
# NOTE(review): no encoding is given, so the platform default is used --
# confirm it matches the encoding of the test file.
with open(htmlfile) as markup:
    htmlString = markup.read()
# --- Benchmark 1: BeautifulSoup -------------------------------------------
# Parse once, outside the timed region, so that only the tag search is
# measured. This matches the printed label ("temps de recherche") and the
# lxml benchmark in this file, which also parses before starting its timer.
soup = BeautifulSoup(htmlString, 'html.parser')
# perf_counter() is the clock intended for measuring short durations.
start = time.perf_counter()
for _ in range(1000):
    # find_all is the current method name; findAll is its legacy alias.
    motsEnGras = soup.find_all('b')
end = time.perf_counter()
print("temps de recherche beautifulsoup : " + str(end-start) + " secondes")
print("beautifulsoup a trouvé " + str(len(motsEnGras)) + " balises")
print(motsEnGras)
# --- Benchmark 2: regular expression --------------------------------------
# Non-greedy capture of the text between a bare "<b>" and the next "</b>".
# Limitations of this pattern: "." does not cross newlines, "+" needs at
# least one character, and "<b ...>" tags carrying attributes do not match.
regex = r'<b>(.+?)</b>'
# Compile once so the timed loop measures only the search itself.
pattern = re.compile(regex)
# perf_counter() is the clock intended for measuring short durations.
start = time.perf_counter()
for _ in range(1000):
    match = pattern.findall(htmlString)
end = time.perf_counter()
print("temps de recherche regex : " + str(end-start) + " secondes")
print("regex a trouvé " + str(len(match)) + " balises")
print(match)
# --- Benchmark 3: lxml ----------------------------------------------------
# Build the element tree once, outside the timed region; only the CSS
# selector lookup is repeated and measured.
innerTree = fromstring(htmlString)
# perf_counter() is the clock intended for measuring short durations.
start = time.perf_counter()
for _ in range(1000):
    bolds = innerTree.cssselect('b')
end = time.perf_counter()
print("temps de recherche lxml : " + str(end-start) + " secondes")
print("lxml a trouvé " + str(len(bolds)) + " balises" )
# Print the text of each element rather than the element objects.
print([mot.text_content() for mot in bolds])
# --- Benchmark 4: plain string splitting ("code de LeNarvalo") ------------
# Split the document on "</b>"; every chunk except the last one ends right
# before a closing tag, so the text after the chunk's first "<b>" is the
# content of that bold element.
open_tag = '<b>'
# perf_counter() is the clock intended for measuring short durations.
start = time.perf_counter()
for _ in range(1000):
    listeMots = [
        chunk[chunk.index(open_tag) + len(open_tag):]
        for chunk in htmlString.split('</b>')[:-1]
        # Skip chunks with no bare "<b>" (e.g. a tag with attributes or a
        # stray "</b>"); index() would otherwise raise ValueError.
        if open_tag in chunk
    ]
end = time.perf_counter()
print("temps de recherche du code de LeNarvalo : " + str(end-start) + " secondes")
print("Le code de LeNarvalo a trouvé " + str(len(listeMots)) + " balises")
print(listeMots)