Issue
I am trying to scrape the schedule/scores from https://www.baseball-reference.com/leagues/majors/2023-schedule.shtml and save them in a csv file that I can import to google drive. For some reason my parsed_data.csv is only displaying "Date,Away Team,Away Score,Home Team,Home Score" and not the actual data needed.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import csv
# First script: Fetch the HTML from baseball-reference.com and save it to a file
url = "https://www.baseball-reference.com/leagues/majors/2023-schedule.shtml"
response = requests.get(url)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
with open("2023 MLB Schedule _ Baseball-Reference.com.html", "w") as file:
file.write(soup.prettify())
else:
print(f"Failed to fetch URL with status code {response.status_code}")
# Second script: Load the HTML file, parse it, and save the extracted data to a CSV file
with open('2023 MLB Schedule _ Baseball-Reference.com.html', 'r') as html_file:
html_content = html_file.read()
soup = BeautifulSoup(html_content, 'html.parser')
div = soup.find('div', {'id': 'div_655919807'})
if div:
div_text = div.get_text(separator='\n')
df = pd.DataFrame(div_text.split('\n'), columns=['Text'])
df.to_csv('extracted_data.csv', index=False)
print('Data saved to data.csv')
else:
print('Div not found')
# Third script: Load the CSV file, parse the data, and save it to a new CSV file
df = pd.read_csv('extracted_data.csv')
parsed_data = pd.DataFrame(columns=['Date', 'Away Team', 'Away Score', 'Home Team', 'Home Score'])
for index, row in df.iterrows():
soup = BeautifulSoup(str(row['Text']), 'html.parser')
date_text = soup.find(string=re.compile(r'\w+, [A-Za-z]+ \d+, \d{4}'))
if date_text is None:
continue
date = date_text.strip()
games = soup.find_all(string=re.compile(r'Boxscore'))
for game in games:
away_team = game.find_previous(string=re.compile(r'\w')).strip()
away_score = game.find_previous(string=away_team).find_previous(string=re.compile(r'\d')).strip()
home_team = game.find_previous(string=away_score).find_previous(string=re.compile(r'\w')).strip()
home_score = game.find_previous(string=home_team).find_previous(string=re.compile(r'\d')).strip()
parsed_data = parsed_data.append({'Date': date, 'Away Team': away_team, 'Away Score': away_score, 'Home Team': home_team, 'Home Score': home_score}, ignore_index=True)
parsed_data.to_csv('parsed_data.csv', index=False)
print('Parsed data saved to parsed_data.csv')
# Fourth script: Fetch data from pine-sports.com and save it to a CSV file
url = 'https://www.pine-sports.com/stats/props/MLB/'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table', {'id': 'myTable'})
headers = [th.text.strip() for th in table.find_all('th')]
rows = []
for tr in table.find_all('tr'):
row = [td.text.strip() for td in tr.find_all('td')]
if row:
rows.append(row)
with open('mlb_stats.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerow(headers)
writer.writerows(rows)
parsed_data.csv should "clean" and save a csv file so its organized & clean in google sheets.
this is what I would like it to look like when i import it to sheets.
Solution
To parse the data into a pandas dataframe you can use next example (you can then save the dataframe into a CSV/XLS):
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://www.baseball-reference.com/leagues/majors/2023-schedule.shtml"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
all_data = []
for g in soup.select(".game"):
data = g.get_text(strip=True, separator="|").split("|")
if "@" in data[1]:
data = [data[0], *data[1].split("\n"), *data[2:]]
if len(data) == 5:
time, home, _, away, _ = data
score_home, score_away = None, None
elif len(data) == 6:
time = None
home, score_home, _, away, score_away, _ = data
score_home = score_home.strip("()")
score_away = score_away.strip("()")
else:
raise Exception("Invalid data")
date = g.find_previous("h3").text
all_data.append((date, time, home, away, score_home, score_away))
df = pd.DataFrame(
all_data, columns=["Date", "Time", "Home", "Away", "Score Home", "Score Away"]
)
print(df)
Prints:
Date Time Home Away Score Home Score Away
0 Thursday, March 30, 2023 None Baltimore Orioles Boston Red Sox 10 9
1 Thursday, March 30, 2023 None Milwaukee Brewers Chicago Cubs 0 4
2 Thursday, March 30, 2023 None Pittsburgh Pirates Cincinnati Reds 5 4
3 Thursday, March 30, 2023 None Chicago White Sox Houston Astros 3 2
...
2425 Sunday, October 1, 2023 3:10 pm Chicago Cubs Milwaukee Brewers None None
2426 Sunday, October 1, 2023 3:10 pm Philadelphia Phillies New York Mets None None
2427 Sunday, October 1, 2023 3:10 pm Texas Rangers Seattle Mariners None None
2428 Sunday, October 1, 2023 3:15 pm Cincinnati Reds St. Louis Cardinals None None
Answered By - Andrej Kesely
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.