Issue
I am trying to scrape https://www.sports-reference.com/cbb/seasons/men/2024-school-stats.html and grab the "per game" stats, which are only shown in the table after the user switches a toggle on. I am able to scrape the total stats, but I am struggling to figure out how to scrape the per-game stats, since they only appear once that toggle within the table has been switched on.
Do I have to use Selenium + BeautifulSoup? Or am I able to accomplish this with just BeautifulSoup4? (I am a novice who knows enough to build a simple scraper but this is a bit beyond my reach.) Thanks for any help!
This is what I've tried so far...
!pip install requests beautifulsoup4 pandas
import pandas as pd
import requests
from bs4 import BeautifulSoup
import csv
def scrape_team_stats(url):
    """Scrape one row of basic team stats per school from a stats page.

    Downloads ``url``, selects the stats table and reads a fixed set of cells
    from every table row by their ``data-stat`` attribute.

    NOTE(review): the cell text is returned exactly as it appears in the
    downloaded HTML. If the page's "per game" view is produced by JavaScript
    in the browser, these are presumably season totals and still have to be
    divided by games played -- confirm against the live page.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of rows of the form
        ``[team, fg_pct, orb, ast, stl, blk, tov, pf]`` holding the stripped
        cell text, with ``'N/A'`` for any stat cell missing from a row.
        An empty list is returned when the page contains no table.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the call from blocking forever on an unresponsive
    # server; raise_for_status() stops an error page from being parsed as if
    # it were the stats page (which would silently yield an empty result).
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Prefer the table with the specific class, otherwise take the first table.
    table = soup.find('table', class_='modified')
    if table is None:
        table = soup.find('table')
    if table is None:
        return []

    # data-stat keys, in the order of the columns of the returned rows.
    stat_keys = ('fg_pct', 'orb', 'ast', 'stl', 'blk', 'tov', 'pf')

    team_stats = []
    for row in table.find_all('tr'):
        team_name = row.find('td', {'data-stat': 'school_name'})
        if team_name is None:
            # Rows without a school-name cell (e.g. header rows) carry no team.
            continue

        record = [team_name.text.strip()]
        for key in stat_keys:
            # Every stat, including 'orb', is looked up by data-stat only.
            # The original passed the data-stat dict as find()'s third
            # positional argument, which is `recursive`, so it never
            # filtered on it.
            cell = row.find('td', {'data-stat': key})
            record.append(cell.text.strip() if cell is not None else 'N/A')
        team_stats.append(record)

    return team_stats
def write_to_csv(data, filename):
    """Write the scraped team rows to ``filename`` as a CSV file.

    The file is created or overwritten, starts with a fixed header row and
    then contains one line per entry of ``data``.

    Args:
        data: Iterable of rows, each an iterable of cell values.
        filename: Path of the CSV file to write.
    """
    header = ['Team', 'FG%', 'ORB/G', 'AST/G', 'STL/G', 'BLK/G', 'TOV/G', 'PF/G']
    # newline='' lets the csv module control the line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(data)
# Destination file and source page for the scrape.
csv_filename = 'team_per_game_stats.csv'
url = 'https://www.sports-reference.com/cbb/seasons/men/2024-school-stats.html'

# Scrape the page, save the rows, then report where they were written.
team_stats = scrape_team_stats(url)
write_to_csv(team_stats, csv_filename)
print(f"Data has been written to {csv_filename}")
Solution
Don't use Selenium if you don't need to. It is much quicker to get the data with a plain request. The "Per Game" toggle is just JavaScript that takes the given columns and divides them by games played. You can do that easily with pandas.
Code:
import pandas as pd
url = 'https://www.sports-reference.com/cbb/seasons/men/2024-school-stats.html'

# Take the first table on the page; header=1 uses the second header row as
# the column labels.
df = pd.read_html(url, header=1)[0]

# Rows whose 'Rk' is not a number become NaN and are dropped (presumably the
# header rows repeated inside the table body -- confirm against the page).
df['Rk'] = pd.to_numeric(df['Rk'], errors='coerce')
df = df.dropna(subset=['Rk'])

# Counting columns that get converted to a per-game value.
cols = [
    'Tm.', 'Opp.', 'MP', 'FG', 'FGA',
    '3P', '3PA', 'FT', 'FTA', 'ORB', 'TRB', 'AST', 'STL',
    'BLK', 'TOV', 'PF',
]

# Cast games and the counting columns to integers, then divide each counting
# column row-wise by the number of games.
int_cols = ['G'] + cols
df[int_cols] = df[int_cols].astype(int)
df[cols] = df[cols].div(df['G'], axis='index')
Output:
# Print the first rows of the frame as one untruncated text table.
print(df.head().to_string())
Rk School G W L W-L% SRS SOS Unnamed: 8 W.1 L.1 Unnamed: 11 W.2 L.2 Unnamed: 14 W.3 L.3 Unnamed: 17 Tm. Opp. Unnamed: 20 MP FG FGA FG% 3P 3PA 3P% FT FTA FT% ORB TRB AST STL BLK TOV PF
0 1.0 Abilene Christian 12 5 7 .417 -3.40 0.60 NaN 0 2 NaN 2 3 NaN 1 3 NaN 75.750000 75.166667 NaN 40.416667 25.666667 58.916667 .436 5.583333 17.000000 .328 18.833333 26.166667 .720 9.750000 33.750000 13.833333 8.333333 1.416667 13.583333 20.916667
1 2.0 Air Force 12 7 5 .583 -4.31 -8.48 NaN NaN NaN NaN 4 3 NaN 3 1 NaN 69.166667 65.000000 NaN 40.416667 24.083333 52.166667 .462 8.583333 23.833333 .360 12.416667 18.250000 .680 7.000000 31.666667 15.583333 7.916667 5.000000 12.583333 16.666667
2 3.0 Akron 11 8 3 .727 3.29 0.51 NaN NaN NaN NaN 5 0 NaN 2 1 NaN 78.272727 66.818182 NaN 40.454545 27.454545 58.090909 .473 9.545455 26.636364 .358 13.818182 19.090909 .724 11.090909 36.909091 15.545455 6.545455 2.545455 13.272727 15.454545
3 4.0 Alabama 12 7 5 .583 24.55 9.89 NaN NaN NaN NaN 6 1 NaN 0 1 NaN 92.166667 77.500000 NaN 40.000000 30.750000 63.333333 .486 11.250000 29.500000 .381 19.416667 25.083333 .774 12.750000 39.750000 16.083333 7.666667 4.333333 11.583333 19.166667
4 5.0 Alabama A&M 12 1 11 .083 -18.58 -0.49 NaN NaN NaN NaN 1 2 NaN 0 8 NaN 69.750000 87.833333 NaN 40.416667 23.250000 58.916667 .395 4.000000 16.000000 .250 19.250000 26.666667 .722 12.000000 34.500000 9.750000 8.250000 3.250000 15.666667 21.666667
Answered By - chitown88
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.