script python Web scrapping
Code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
| import requests as rq
from bs4 import BeautifulSoup
import pandas as pd
from random import randint
from time import sleep
headers = ({'User-Agent':
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})
def initial_scrape(url): # scrapes initial page - used to find number of pages
r = rq.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
return soup
def find_pages(soup): # finds total number of pages
pages_html = soup.select('.page-link')
totalPages = pages_html[-4].get_text()
return totalPages
def convertUnit(price): # converts price format from string to integer e.g. 100K -> 100000
if 'K' in price:
price = price.partition('K')[0]
return int(float(price) * 1000)
elif 'M' in price:
price = price.partition('M')[0]
return int((float(price) * 1000000))
else:
return int(price)
def gather_data(totalPages): # gathers the data of each player from each page
database = []
playerContainers = []
pageCounter = 0
totalCounter = 0
for page in range(int(totalPages)): # for every page
url = f'https://www.futbin.com/players?page=1&xbox_price=10000-50000&version=if_gold' # change url accordingly
r = rq.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
playerContainers = soup.select('tbody > tr')[2:]
counter = 0
for player in playerContainers: # for every player
stats = {} # adds stats to a dictionary (one for each player), which is then added to a list
values = player.select('.num_td')
name_position = player.select('.player_name_players_table')
stats['name'] = name_position[0].get_text().partition('(')[0].strip()
stats['position'] = name_position[0].get_text().partition('(')[2].strip(')')
stats['rating'] = int(player.select('span[class*="form rating"]')[0].get_text())
stats['price'] = convertUnit(player.select('.ps4_color')[0].get_text())
database.append(stats)
counter += 1
print(f'Player: {counter}/{len(playerContainers)} ; {stats["name"]}') # tracks progress
totalCounter += counter
pageCounter += 1
print(f'Page: {pageCounter}/{totalPages}') # more progress tracking
sleep(randint(1, 3)) # timeout - can be removed
return database |
Bonjour, je suis débutant en python et je cherche a comprendre pourquoi le script voit le nombre de page du site web mais ne scrap seulement que la première. je ne cherche pas une réponse toute faite plus de l'aiguillage pour comprendre ou se situe le problème
merci d'avance