Issue
I am having a heck of a time turning data that I have into a DataFrame with pandas. I feel like this is far from a difficult task, but I can't seem to figure it out. I have the headers I want for the DataFrame and I have the data, but this is data from the web. I know I need to turn it into a list and then pass that list to the DataFrame function, but I am unable to figure out how to put the data into a list.
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import pandas as pd
# Location of the local ChromeDriver binary.  A raw string keeps the Windows
# backslashes literal; the non-raw form only worked because \P, \C and \c
# are not recognised escapes, and newer Python versions warn about that.
PATH = r"C:\Program Files (x86)\Chrome\chromedriver_win32\chromedriver.exe"
# NOTE(review): passing the driver path positionally is version-dependent in
# Selenium -- confirm against the installed release.
driver = webdriver.Chrome(PATH)
driver.get("https://www.espn.com/golf/leaderboard?tournamentId=401353232")

number_of_players = 52  # upper bound on how many matched elements are used
round_to_select = 3     # round number chosen in each dropdown below

# Click the first cell of (at most) the first `number_of_players` table rows,
# pausing briefly after each click.  Presumably this expands the player's
# scorecard -- TODO confirm against the live page.
first_cells = driver.find_elements(By.CSS_SELECTOR, '.Table__TD:first-child')
for down_arrow in first_cells[:number_of_players]:
    down_arrow.click()
    time.sleep(.5)

# Only touch the dropdowns for rounds 1-3 (presumably round 4 is what the page
# shows by default -- TODO confirm).
if round_to_select < 4:
    round_menus = driver.find_elements(
        By.CSS_SELECTOR, '.competitors select[class=dropdown__select]')
    for menu in round_menus[:number_of_players]:
        Select(menu).select_by_visible_text(f'Round {round_to_select}')
        time.sleep(.5)
# Parse the page as currently rendered by the Selenium-driven browser.
R1_page_source = driver.page_source
R1_soup = BeautifulSoup(R1_page_source, 'html.parser')
R1_leaderboard = R1_soup.find('table', class_='Table Table--align-right Full__Table')

# The tournament title and course are looked up on the whole page, not on a
# row, so fetch them once instead of once per player row.
R1_Tournament = R1_soup.find('h1', class_='headline headline__h1 Leaderboard__Event__Title').text
R1_Course = R1_soup.find('div', class_='Leaderboard__Course__Location n8 clr-gray-04').text

# Positions of the <span> elements read for holes 1-18.  Index 10 is skipped
# (presumably a front-nine subtotal -- TODO confirm against the page markup).
R1_hole_span_positions = list(range(1, 10)) + list(range(11, 20))

R1_columns = (['Player', 'Tournament', 'Course', 'Round']
              + [f'H{hole}' for hole in range(1, 19)])

# One record per player-detail row; these become the DataFrame rows.
R1_records = []
for R1_player in R1_leaderboard.find_all('tbody'):
    R1_rows = R1_player.find_all('tr', class_='Table__TD--PlayerDetail Table__TR Table__even')
    for R1_row in R1_rows:
        R1_Players = R1_row.find('a').text
        R1_Round = R1_row.find_all("select")[1].text
        # Collect the spans once per row rather than re-scanning for each hole.
        R1_spans = R1_row.find_all('span')
        R1_holes = [R1_spans[pos].text for pos in R1_hole_span_positions]
        print(R1_Players, R1_Tournament, R1_Course, R1_Round, *R1_holes)
        R1_records.append([R1_Players, R1_Tournament, R1_Course, R1_Round, *R1_holes])

# A list of equal-length rows plus the header names is all DataFrame needs.
R1_df = pd.DataFrame(R1_records, columns=R1_columns)
Solution
As stated, you could have Selenium click through each, then use pandas' `.read_html()` to parse the tables. However, there's an ESPN API, and if there is an API available, it's far better (more robust and efficient) to fetch the data that way as opposed to using Selenium. There's also far more data than what's shown on the site that you can get, too. Basically, as long as you have the tournament number/ID, you feed that in, get the player IDs from the leaderboard, then iterate through those:
import requests
import pandas as pd
# Fetch the leaderboard JSON for one tournament and build a
# course-id -> course-name lookup from it.
tournamentId = '401353232'
url = 'https://site.web.api.espn.com/apis/site/v2/sports/golf/leaderboard'
payload = {
    'league': 'pga',
    'region': 'us',
    'lang': 'en',
    'event': tournamentId,
}

# A timeout stops the script hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# KeyError/JSON error further down.
response = requests.get(url, params=payload, timeout=30)
response.raise_for_status()
jsonData = response.json()

tournament = jsonData['events'][0]['name']

# Keys are stringified so later lookups can use str(...) of a course id.
courses_dict = {}
for event in jsonData['events']:
    for course in event['courses']:
        courses_dict[str(course['id'])] = course['name']

print(tournament)
# Query parameters sent with every per-competitor summary request.
payload = {
    'region': 'us',
    'lang': 'en',
    'season': '2022',
}

# Per-round frames are gathered in a list and concatenated once at the end;
# calling pd.concat inside the loop re-copies the accumulated data each time.
round_frames = []
for event in jsonData['events']:
    for competition in event['competitions']:
        for competitor in competition['competitors']:
            playerName = competitor['athlete']['displayName']
            playerId = competitor['athlete']['id']
            status = competitor['status']
            lastRound = status['period']
            lastStatus = status['type']['detail']

            url = f'https://site.web.api.espn.com/apis/site/v2/sports/golf/pga/leaderboard/{tournamentId}/competitorsummary/{playerId}'
            summary = requests.get(url, params=payload, timeout=30)
            summary.raise_for_status()
            rounds = summary.json()['rounds']

            for r in rounds:
                courseName = courses_dict[str(r['courseId'])]
                teeTime = r['teeTime']
                period = r['period']

                # Rounds before the competitor's current period are labelled
                # 'Finish'; otherwise the competitor's status detail is used.
                roundStatus = 'Finish' if period < lastRound else lastStatus

                linescores = r['linescores']
                if not linescores:
                    # No linescores for this round: keep a row with just the
                    # identifying columns.
                    temp_df = pd.DataFrame([{
                        'player': playerName,
                        'tournament': tournament,
                        'round': period,
                    }])
                else:
                    temp_df = pd.DataFrame(linescores)
                    temp_df['player'] = playerName
                    temp_df['tournament'] = tournament
                    temp_df['round'] = period
                    # Reshape to a single row for the round, with one column
                    # per linescore 'period' (presumably the hole number --
                    # TODO confirm) holding that entry's 'value'.
                    temp_df = temp_df.pivot(
                        index=['player', 'tournament', 'round'],
                        columns='period',
                        values='value').reset_index(drop=False)

                temp_df['course'] = courseName
                temp_df['teeTime'] = teeTime
                temp_df['status'] = roundStatus
                round_frames.append(temp_df)

            print('Collected: ', playerName)

# pd.concat raises on an empty list, so fall back to an empty frame.
if round_frames:
    results = pd.concat(round_frames, axis=0, ignore_index=True)
else:
    results = pd.DataFrame()
Output:
print(results.head(10).to_string())
period player tournament course round 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
0 Scottie Scheffler Masters Tournament Augusta National Golf Club 1 4.0 5.0 4.0 3.0 4.0 3.0 4.0 4.0 3.0 4.0 4.0 2.0 5.0 4.0 5.0 3.0 3.0 5.0
1 Scottie Scheffler Masters Tournament Augusta National Golf Club 2 5.0 4.0 5.0 3.0 4.0 3.0 3.0 4.0 4.0 4.0 4.0 2.0 4.0 4.0 4.0 2.0 4.0 4.0
2 Scottie Scheffler Masters Tournament Augusta National Golf Club 3 4.0 4.0 3.0 4.0 4.0 2.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 5.0 6.0 3.0 3.0 5.0
3 Scottie Scheffler Masters Tournament Augusta National Golf Club 4 4.0 5.0 3.0 3.0 4.0 3.0 3.0 5.0 4.0 5.0 4.0 3.0 5.0 3.0 4.0 3.0 4.0 6.0
4 Rory McIlroy Masters Tournament Augusta National Golf Club 1 4.0 4.0 4.0 3.0 4.0 4.0 4.0 5.0 4.0 4.0 3.0 3.0 5.0 5.0 5.0 4.0 4.0 4.0
5 Rory McIlroy Masters Tournament Augusta National Golf Club 2 4.0 4.0 4.0 3.0 5.0 3.0 4.0 5.0 4.0 5.0 6.0 3.0 4.0 4.0 5.0 2.0 4.0 4.0
6 Rory McIlroy Masters Tournament Augusta National Golf Club 3 5.0 5.0 4.0 2.0 4.0 4.0 3.0 5.0 4.0 4.0 4.0 4.0 4.0 4.0 4.0 3.0 4.0 4.0
7 Rory McIlroy Masters Tournament Augusta National Golf Club 4 3.0 5.0 3.0 3.0 4.0 3.0 3.0 4.0 4.0 3.0 4.0 3.0 3.0 4.0 5.0 3.0 4.0 3.0
8 Shane Lowry Masters Tournament Augusta National Golf Club 1 4.0 5.0 5.0 3.0 4.0 3.0 4.0 5.0 4.0 5.0 4.0 3.0 3.0 3.0 7.0 3.0 4.0 4.0
9 Shane Lowry Masters Tournament Augusta National Golf Club 2 5.0 4.0 4.0 3.0 4.0 3.0 3.0 5.0 4.0 3.0 4.0 3.0 4.0 4.0 4.0 3.0 4.0 4.0
....
Answered By - chitown88
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.