Issue
So I have been able to scrape the first 50 teams in the team rankings webpage from 247sports.
I was able to get the following results:
index Rank Team Total Recruits Average Rating Total Rating
0 0 1 Ohio State 17 94.35 286.75
1 10 11 Alabama 10 94.16 210.61
2 8 9 Georgia 11 93.38 219.60
3 31 32 Clemson 8 92.02 161.74
4 3 4 LSU 14 91.92 240.57
5 4 5 Oklahoma 13 91.81 229.03
6 22 23 USC 9 91.60 174.69
7 11 12 Texas A&M 11 91.59 203.03
8 1 2 Notre Dame 18 91.01 250.35
9 2 3 Penn State 18 90.04 243.95
10 6 7 Texas 14 90.04 222.03
11 14 15 Missouri 12 89.94 196.37
12 7 8 Oregon 15 89.91 220.66
13 5 6 Florida State 15 89.88 224.51
14 25 26 Florida 10 89.15 167.89
15 37 38 North Carolina 9 88.94 152.79
16 9 10 Michigan 16 88.76 216.07
17 33 34 UCLA 10 88.49 160.00
18 23 24 Kentucky 11 88.46 173.12
19 12 13 Rutgers 14 88.44 198.56
20 19 20 Indiana 12 88.41 181.20
21 49 50 Washington 8 88.21 132.55
22 20 21 Oklahoma State 13 88.18 177.91
23 43 44 Ole Miss 10 87.80 143.35
24 44 45 California 9 87.78 141.80
25 17 18 Arkansas 15 87.75 188.64
26 16 17 South Carolina 15 87.61 190.84
27 32 33 Georgia Tech 11 87.30 161.33
28 35 36 Tennessee 11 87.25 157.77
29 39 40 NC State 11 87.18 150.18
30 46 47 SMU 9 87.08 138.50
31 36 37 Wisconsin 11 87.00 157.55
32 21 22 Mississippi State 15 86.96 177.33
33 24 25 West Virginia 13 86.78 171.72
34 30 31 Northwestern 14 86.76 162.66
35 40 41 Maryland 12 86.31 149.77
36 15 16 Virginia Tech 18 86.23 191.06
37 18 19 Baylor 19 85.90 184.68
38 13 14 Boston College 22 85.88 197.15
39 26 27 Michigan State 14 85.85 167.60
40 29 30 Cincinnati 14 85.68 164.90
41 34 35 Minnesota 13 85.55 159.35
42 28 29 Iowa State 14 85.54 166.50
43 48 49 Virginia 10 85.39 133.93
44 45 46 Arizona 11 85.27 140.90
45 41 42 Pittsburgh 12 85.10 147.58
46 47 48 Duke 13 85.02 137.40
47 27 28 Vanderbilt 16 85.01 166.77
48 38 39 Purdue 13 84.83 152.55
49 42 43 Illinois 13 84.15 143.86
From the following script:
# Scrape the first page of the 247Sports composite team rankings for one
# season and load the visible columns into a DataFrame.
year = '2022'
url = 'https://247sports.com/Season/' + str(year) + '-Football/CompositeTeamRankings/'
print(url)

# Add the `user-agent` otherwise we will get blocked when sending the request
headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"}
# A timeout keeps the script from hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=30).content
soup = BeautifulSoup(response, "html.parser")

data = []
# Each ranked team is rendered as one <li class="rankings-page__list-item">.
for tag in soup.find_all("li", class_="rankings-page__list-item"):
    rank = tag.find('div', {'class': 'primary'}).text.strip()
    team = tag.find('div', {'class': 'team'}).find('a').text.strip()
    # The link text starts with the recruit count; keep only that first token.
    total_recruits = tag.find('div', {'class': 'total'}).find('a').text.split(' ')[0].strip()
    # five_stars = tag.find('div',{'class':'gold'}).text.strip()
    # four_stars = tag.find('div',{'class':'gold'}).text.strip()
    # three_stars = tag.find('div',{'class':'metrics'}).text.strip()
    avg_rating = tag.find('div', {'class': 'avg'}).text.strip()
    total_rating = tag.find('div', {'class': 'points'}).text.strip()
    data.append(
        {
            "Rank": rank,
            "Team": team,
            "Total Recruits": total_recruits,
            # "Five-Star Recruits": five_stars,
            # "Four-Star Recruits": four_stars,
            # "Three-Star Recruits": three_stars,
            "Average Rating": avg_rating,
            "Total Rating": total_rating,
        }
    )

df = pd.DataFrame(data)
numeric_columns = ['Rank', 'Total Recruits', 'Average Rating', 'Total Rating']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)
# sort_values()/reset_index() return new frames, so keep the result instead of
# discarding it. reset_index() without drop=True retains the pre-sort position
# as an "index" column.
df = df.sort_values('Average Rating', ascending=False).reset_index()
print(df)
# soup
However, I would like to achieve three things.
- I would like to grab the data from the "5-stars", "4-stars", "3-stars" columns in the webpage.
- I would like to not just get the first 50 schools, but also tell the webpage to click "load more" enough times so that I can get the table with ALL schools in it.
- I want to not only get the 2022 team rankings, but every team ranking that 247sports has to offer (2000 through 2024).
I tried the following script, but it keeps printing the same top-50 schools over and over from the `print(row)` line of the code:
# Attempt to page through the team rankings for each year in `years`,
# collecting one dict per team into `rows` and building a DataFrame at the end.
# NOTE(review): the indentation of this snippet was lost, so the block nesting
# described in the notes below is inferred from the statement order — confirm
# against the original script.
print(datetime.datetime.now().time())
# years = ['2000', '2001', '2002', '2003', '2004',
# '2005', '2006', '2007', '2008', '2009',
# '2010', '2011', '2012', '2013', '2014',
# '2015', '2016', '2017', '2018', '2019',
# '2020', '2021', '2022', '2023']
years = ['2022']
rows = []
page_totals = []
# recruits_final = []
for year in years:
url = 'https://247sports.com/Season/' + str(year) + '-Football/CompositeTeamRankings/'
print(url)
headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Mobile Safari/537.36'}
page = 0
# Request successive pages until one comes back with no ranking items.
while True:
page +=1
# The page number is sent as the `Page` query-string parameter.
payload = {'Page': '%s' %page}
response = requests.get(url, headers=headers, params=payload)
soup = BeautifulSoup(response.text, 'html.parser')
tags = soup.find_all('li',{'class':'rankings-page__list-item'})
# An empty result is treated as "past the last page": record the page count and stop.
if len(tags) == 0:
print('Page: %s' %page)
page_totals.append(page)
break
continue_loop = True
# NOTE(review): continue_loop only becomes False when some tag's stripped text
# is exactly 'Load More'. If no such tag is in `tags`, this loop presumably
# never exits and keeps re-processing the same tags, which would match the
# repeated top-50 output described in the question.
while continue_loop == True:
for tag in tags:
if tag.text.strip() == 'Load More':
continue_loop = False
continue
# primary_rank = tag.find('div',{'class':'rank-column'}).find('div',{'class':'primary'}).text.strip()
# try:
# other_rank = tag.find('div',{'class':'rank-column'}).find('div',{'class':'other'}).text.strip()
# except:
# other_rank = ''
rank = tag.find('div',{'class':'primary'}).text.strip()
team = tag.find('div',{'class':'team'}).find('a').text.strip()
# Keep only the first space-separated token of the link text.
total_recruits = tag.find('div',{'class':'total'}).find('a').text.split(' ')[0].strip()
# five_stars = tag.find('div',{'class':'gold'}).text.strip()
# four_stars = tag.find('div',{'class':'gold'}).text.strip()
# three_stars = tag.find('div',{'class':'metrics'}).text.strip()
avg_rating = tag.find('div',{'class':'avg'}).text.strip()
total_rating = tag.find('div',{'class':'points'}).text.strip()
# NOTE(review): `athlete` is not defined anywhere in this snippet. If it is
# undefined at runtime, the bare `except` swallows the NameError and the
# `team` value parsed above is overwritten with ''.
try:
team = athlete.find('div',{'class':'status'}).find('img')['title']
except:
team = ''
row = {'Rank': rank,
'Team': team,
'Total Recruits': total_recruits,
'Average Rating': avg_rating,
'Total Rating': total_rating,
'Year': year}
print(row)
rows.append(row)
# Build the final table from every collected row and print the finish time.
recruits = pd.DataFrame(rows)
print(datetime.datetime.now().time())
Any assistance on this is truly appreciated. Thanks in advance.
Solution
First, you can extract the year links from the year dropdown with BeautifulSoup (no need to click the button, as the dropdown is already on the page). Then navigate to each link with selenium, use it to click the "load more" toggle until it disappears, and finally scrape the resulting tables:
import re
import time
import urllib.parse

from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 takes the chromedriver location through a Service object rather
# than as a positional path argument.
d = webdriver.Chrome(service=Service('path/to/chromedriver'))
# Maps each season (int) to the list of row dicts scraped for that season.
result = {}
try:
    d.get((url := 'https://247sports.com/Season/2022-Football/CompositeTeamRankings/'))
    # The year dropdown is already in the page source, so its links can be
    # read without clicking anything.
    year_links = soup(d.page_source, 'html.parser').select('.rankings-page__header-nav > .rankings-page__nav-block .flyout_cmp.year.tooltip li a')
    for link in year_links:
        if (y := int(link.get_text(strip=True))) > 1999:
            # Dropdown hrefs may be relative; resolve them against the start URL.
            d.get(urllib.parse.urljoin(url, link['href']))
            # Keep clicking the "show more" anchor until it is no longer in
            # the DOM, pausing so each batch of rows has time to load.
            while d.execute_script("""return document.querySelector('a[data-js="showmore"]') != null"""):
                d.execute_script("""document.querySelector('a[data-js="showmore"]').click()""")
                time.sleep(1)
            result[y] = [
                {
                    "Rank": row.select_one('div.wrapper .rank-column .other').get_text(strip=True),
                    "Team": row.select_one('.team').get_text(strip=True),
                    # Keep only the first whitespace-separated token of the total text.
                    "Total": row.select_one('.total').get_text(strip=True).split()[0],
                    "5-Stars": row.select_one('.star-commits-list li:nth-of-type(1) div').get_text(strip=True),
                    "4-Stars": row.select_one('.star-commits-list li:nth-of-type(2) div').get_text(strip=True),
                    "3-Stars": row.select_one('.star-commits-list li:nth-of-type(3) div').get_text(strip=True),
                    "Ave": row.select_one('.avg').get_text(strip=True),
                    "Points": row.select_one('.points').get_text(strip=True),
                }
                for row in soup(d.page_source, 'html.parser').select("""ul[data-js="rankings-list"].rankings-page__list li.rankings-page__list-item""")
            ]
finally:
    # Always shut the browser down, even if a page fails to load or parse.
    d.quit()
`result` stores all the team rankings keyed by year, 2000-2024 (`list(result)` produces `[2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003, 2002, 2001, 2000]`). To convert the results to a `pandas.DataFrame`:
import pandas as pd
# Flatten the per-season lists in `result` into one record per row, tagging
# each record with the season it came from.
records = []
for season, season_rows in result.items():
    for entry in season_rows:
        records.append({'Year': season, **entry})
df = pd.DataFrame(records)
print(df)
Output:
Year Rank Team Total 5-Stars 4-Stars 3-Stars Ave Points
0 2024 N/A Iowa 1 0 0 0 0.00 0.00
1 2024 N/A Florida State 3 0 0 0 0.00 0.00
2 2024 N/A BYU 1 0 0 0 0.00 0.00
3 2023 1 Georgia 4 0 4 0 93.86 93.65
4 2023 3 Notre Dame 2 1 1 0 95.98 51.82
... ... ... ... ... ... ... ... ... ...
3543 2000 N/A NC State 18 0 0 0 70.00 0.00
3544 2000 N/A Colorado State 14 0 0 0 70.00 0.00
3545 2000 N/A Oregon 27 0 0 0 70.00 0.00
3546 2000 N/A California 25 0 0 0 70.00 0.00
3547 2000 N/A Texas Tech 20 0 0 0 70.00 0.00
[3548 rows x 9 columns]
Edit: instead of using selenium, you can send requests directly to the API endpoint that the site uses to retrieve and display each page of ranking data:
import requests, pandas as pd
from bs4 import BeautifulSoup as soup
def _select_text(item, selector):
    """Return the stripped text of the first match for *selector*, or ''."""
    # select_one() returns None when nothing matches; fall back to an empty
    # string so one malformed row cannot abort the whole scrape.
    node = item.select_one(selector)
    return node.get_text(strip=True) if node is not None else ''


def extract_rankings(source):
    """Parse one page of ranking HTML into a list of row dicts.

    Args:
        source: HTML text containing ``li.rankings-page__list-item`` elements.

    Returns:
        One dict per list item with the keys "Rank", "Team", "Total",
        "5-Stars", "4-Stars", "3-Stars", "Ave" and "Points". All values are
        strings; a field whose element is missing is returned as ''. The
        list is empty when the page has no ranking items.
    """
    rankings = []
    for item in soup(source, 'html.parser').select("""li.rankings-page__list-item"""):
        # Keep only the first whitespace-separated token of the total text.
        total_tokens = _select_text(item, '.total').split()
        rankings.append({
            "Rank": _select_text(item, 'div.wrapper .rank-column .other'),
            "Team": _select_text(item, '.team'),
            "Total": total_tokens[0] if total_tokens else '',
            "5-Stars": _select_text(item, '.star-commits-list li:nth-of-type(1) div'),
            "4-Stars": _select_text(item, '.star-commits-list li:nth-of-type(2) div'),
            "3-Stars": _select_text(item, '.star-commits-list li:nth-of-type(3) div'),
            "Ave": _select_text(item, '.avg'),
            "Points": _select_text(item, '.points'),
        })
    return rankings
def year_rankings(year):
    """Collect every ranking row for one season by paging through the endpoint.

    Args:
        year: Season to fetch; interpolated into the URL path.

    Returns:
        List of row dicts as returned by ``extract_rankings``, in page order.
        Paging stops at the first page that yields no rows.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Mobile Safari/537.36'}
    results = []
    page = 1
    while True:
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(
            f'https://247sports.com/Season/{year}-Football/CompositeTeamRankings/?ViewPath=~%2FViews%2FSkyNet%2FInstitutionRanking%2F_SimpleSetForSeason.ascx&Page={page}',
            headers=headers,
            timeout=30,
        )
        vals = extract_rankings(response.text)
        # An empty page means we have run past the last page of results.
        if not vals:
            return results
        results.extend(vals)
        page += 1
# Fetch every season from 2000 through 2024, then flatten the per-season
# lists into a single table with a leading Year column.
results = {}
for season in range(2000, 2025):
    results[season] = year_rankings(season)
records = [
    {'Year': season, **entry}
    for season, season_rows in results.items()
    for entry in season_rows
]
df = pd.DataFrame(records)
print(df)
Answered By - Ajax1234
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.