Bonjour, je suis débutant en python et je cherche a comprendre pourquoi le script voit le nombre de page du site web mais ne scrap seulement que la première. je ne cherche pas une réponse toute faite plus de l'aiguillage pour comprendre ou se situe le problème
Code : Sélectionner tout - Visualiser dans une fenêtre à part
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71 import requests as rq from bs4 import BeautifulSoup import pandas as pd from random import randint from time import sleep headers = ({'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}) def initial_scrape(url): # scrapes initial page - used to find number of pages r = rq.get(url, headers=headers) soup = BeautifulSoup(r.text, 'html.parser') return soup def find_pages(soup): # finds total number of pages pages_html = soup.select('.page-link') totalPages = pages_html[-4].get_text() return totalPages def convertUnit(price): # converts price format from string to integer e.g. 100K -> 100000 if 'K' in price: price = price.partition('K')[0] return int(float(price) * 1000) elif 'M' in price: price = price.partition('M')[0] return int((float(price) * 1000000)) else: return int(price) def gather_data(totalPages): # gathers the data of each player from each page database = [] playerContainers = [] pageCounter = 0 totalCounter = 0 for page in range(int(totalPages)): # for every page url = f'https://www.futbin.com/players?page=1&xbox_price=10000-50000&version=if_gold' # change url accordingly r = rq.get(url, headers=headers) soup = BeautifulSoup(r.text, 'html.parser') playerContainers = soup.select('tbody > tr')[2:] counter = 0 for player in playerContainers: # for every player stats = {} # adds stats to a dictionary (one for each player), which is then added to a list values = player.select('.num_td') name_position = player.select('.player_name_players_table') stats['name'] = name_position[0].get_text().partition('(')[0].strip() stats['position'] = name_position[0].get_text().partition('(')[2].strip(')') stats['rating'] = int(player.select('span[class*="form rating"]')[0].get_text()) stats['price'] = convertUnit(player.select('.ps4_color')[0].get_text()) database.append(stats) counter += 1 print(f'Player: {counter}/{len(playerContainers)} ; {stats["name"]}') # tracks progress totalCounter += counter pageCounter += 1 print(f'Page: {pageCounter}/{totalPages}') # more progress tracking sleep(randint(1, 3)) # timeout - can be removed return database
merci d'avance
Partager