Issue
Good day, everyone.
I'm trying to get the table on each page from the links appended to 'player_page.' I want each player's per-game stats for that season; the table I need is on each player's individual page. Each appended link is correct, but my loops aren't capturing the right data.
Any idea what I'm doing wrong here?
Any help is appreciated.
from bs4 import BeautifulSoup
import requests
import pandas as pd

url = 'https://www.pro-football-reference.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
}
year = 2018

r = requests.get(url + '/years/' + str(year) + '/fantasy.htm', headers=headers)
soup = BeautifulSoup(r.content, 'lxml')

# Every player cell in the season fantasy table
player_list = soup.find_all('td', attrs={'class': 'left', 'data-stat': 'player'})
player_page = []

# Build each player's game log URL for the season
for player in player_list:
    for link in player.find_all('a', href=True):
        link = str(link['href'].strip('.htm'))
        player_page.append(url + link + '/gamelog' + '/' + str(year))

# Read every table from each page and write the result out
for page in player_page:
    dfs = pd.read_html(page)
    yearly_stats = []
    for df in dfs:
        yearly_stats.append(df)
    final_stats = pd.concat(yearly_stats)
    final_stats.to_excel('Fantasy2018.xlsx')
Solution
This works. The table columns vary with the player's position, I believe; not every player has tackle columns, for example.
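For context on the header handling below: pd.read_html returns these game logs with two-level column headers, and unspanned columns come back with an 'Unnamed: ...' top level. A minimal sketch of the flattening on a hand-built toy frame (the column labels here are illustrative, not pulled from the site):

import numpy as np
import pandas as pd

# Toy frame mimicking the two-level headers pd.read_html produces:
# spanned groups keep their label, unspanned ones read as 'Unnamed: ...'
df = pd.DataFrame(
    [[1, 10, 2]],
    columns=pd.MultiIndex.from_tuples([
        ('Unnamed: 0_level_0', 'Date'),
        ('Passing', 'Yds'),
        ('Passing', 'TD'),
    ]),
)

# Blank out the 'Unnamed' labels level by level, then join with '|'
for i, columns_old in enumerate(df.columns.levels):
    columns_new = np.where(columns_old.str.contains('Unnamed'), '', columns_old)
    df.rename(columns=dict(zip(columns_old, columns_new)), level=i, inplace=True)
df.columns = df.columns.map('|'.join).str.strip('|')

print(list(df.columns))  # ['Date', 'Passing|Yds', 'Passing|TD']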
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'https://www.pro-football-reference.com'
year = 2018

r = requests.get(url + '/years/' + str(year) + '/fantasy.htm')
soup = BeautifulSoup(r.content, 'lxml')

player_list = soup.find_all('td', attrs={'class': 'left', 'data-stat': 'player'})
dfs = []

for player in player_list:
    for link in player.find_all('a', href=True):
        name = link.getText()
        # Drop the '.htm' suffix to build the game log URL
        link = link['href'].replace('.htm', '')
        try:
            tables = pd.read_html(url + link + '/gamelog' + '/' + str(year))
        except Exception:
            continue  # no game log page for this player
        # Table 0 is the regular season; table 1, when present, the playoffs
        for idx, moment in enumerate(['Regular Season', 'Playoffs']):
            try:
                df = tables[idx]
                # Flatten the two-level header: blank out 'Unnamed' levels,
                # then join the remaining levels with '|'
                for i, columns_old in enumerate(df.columns.levels):
                    columns_new = np.where(columns_old.str.contains('Unnamed'), '', columns_old)
                    df.rename(columns=dict(zip(columns_old, columns_new)), level=i, inplace=True)
                df.columns = df.columns.map('|'.join).str.strip('|')
                # Repeated header rows parse as NaT and get dropped
                df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
                df = df.dropna(subset=['Date'])
                df.insert(0, 'Name', name)
                df.insert(1, 'Moment', moment)
                dfs.append(df)
            except Exception:
                pass  # table missing or shaped unexpectedly

dfall = pd.concat(dfs)
dfall.to_excel('Fantasy2018.xlsx')
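A side note on the varying columns: pd.concat aligns frames by column name and fills missing stats with NaN, which is why a quarterback's passing columns and a defender's tackle columns can coexist in one sheet. A quick illustration with made-up players and numbers:

import pandas as pd

# Two players with position-specific columns (hypothetical data)
qb = pd.DataFrame({'Name': ['QB A'], 'Passing|Yds': [312]})
lb = pd.DataFrame({'Name': ['LB B'], 'Tackles|Solo': [7]})

combined = pd.concat([qb, lb])
print(combined)
#    Name  Passing|Yds  Tackles|Solo
# 0  QB A        312.0           NaN
# 0  LB B          NaN           7.0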
Answered By - Wilian