Issue
I am trying to extract game data from the site https://www.oddsportal.com/matches/football/
The selectors I have constructed are as follows:
def generate_matches(pgSoup, defaultVal=None):
    """Build one dict per match row found in a parsed matches page.

    pgSoup is queried only through ``select`` / ``select_one`` with CSS
    selectors (presumably a BeautifulSoup document -- confirm with caller).
    Every selector that matches nothing yields ``defaultVal`` for its column,
    except ``game``, which is always the joined team titles (an empty string
    when no team anchors are found).

    Returns a list of dicts keyed by date, time, game, score, home_odds,
    draw_odds, away_odds, country and league.
    """
    # Per-row columns and the CSS selector used to locate each one.
    field_selectors = {
        'time': 'div.main-row p.flex',
        'game': 'div.main-row a[title]',
        'score': 'a a[title]+div:has(+a[title])',
        'home_odds': 'a:has(a[title])~div:not(.hidden)',
        'draw_odds': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)',
        'away_odds': 'a:has(a[title])~div:nth-last-of-type(2)',
    }
    # Group-level values, read from the row's parent when it has a header.
    group_selectors = {
        'date': ':scope>div:first-child+div>div:first-child',
        'country': ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)',
        'league': ':scope>div:first-child>a:nth-of-type(3):last-of-type',
    }

    # Page-level date: the text after the first comma of the heading.
    page_date = pgSoup.select_one('h1.title[id="next-matches-h1"]')
    if page_date:
        page_date = page_date.get_text().split(',', 1)[-1].strip()

    rows = []
    group = {}
    for row_tag in pgSoup.select('div[set]>div:last-child'):
        wrapper = row_tag.parent
        # A wrapper with at least three child divs starts a new group;
        # otherwise the previous group's values carry over to this row.
        if wrapper.select(':scope>div:first-child+div+div'):
            group = {}
            for label, css in group_selectors.items():
                node = wrapper.select_one(css)
                group[label] = node.get_text(' ').strip() if node else defaultVal
            if page_date:
                group['date'] = page_date

        record = {'date': group.get('date', defaultVal)}
        for column, css in field_selectors.items():
            node = row_tag.select_one(css)
            text = node.get_text(' ') if node else defaultVal
            # Collapse runs of whitespace in any string value.
            if isinstance(text, str):
                text = ' '.join(text.split())
            record[column] = text

        # 'game' is replaced by the team anchors' title attributes.
        team_links = row_tag.select('a div>a[title]')
        record['game'] = ' – '.join(link['title'] for link in team_links)
        record['country'] = group.get('country', defaultVal)
        record['league'] = group.get('league', defaultVal)
        rows.append(record)
    return rows
I am getting the below dataframe:
date time game score home_odds draw_odds away_odds country league
0 25 July 2023 NaN NaN NaN NaN NaN Argentina Reserve League
1 25 July 2023 NaN NaN NaN NaN NaN Argentina Reserve League
2 25 July 2023 NaN NaN NaN NaN NaN Argentina Reserve League
3 25 July 2023 NaN NaN NaN NaN NaN Argentina Reserve League
4 25 July 2023 NaN NaN NaN NaN NaN Norway Division 3 - Group 6
I am getting NaN values for time, game, score, home_odds, draw_odds and away_odds. What would be the correct element selector to get all the rows populated?
example dataframe:
Unnamed: 0 date time game score home_odds draw_odds away_odds country league
0 0 01 Apr 2023 01:00 Widad Adabi de Boufarik – Temouchent 1 – 1 2.38 2.93 3.06 Algeria Ligue 2
1 1 01 Apr 2023 01:00 Relizane – Oued Sly 1 – 2 10.02 5.00 1.28 Algeria Ligue 2
Solution
To answer the additional question you asked in your previous post, here's the complete solution:
import time
import threading
import pandas as pd
from math import nan
from datetime import datetime, timedelta
from multiprocessing.pool import ThreadPool
from bs4 import BeautifulSoup as bs
import undetected_chromedriver as uc
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
# Raise pandas' display limits (rows, columns, line width) used when a
# DataFrame is printed to the console.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
class Driver:
    """Own one Chrome instance and quit it when this wrapper is destroyed.

    The browser is exposed as the ``driver`` attribute.
    """

    def __init__(self):
        options = webdriver.ChromeOptions()
        self.driver = uc.Chrome(options=options)

    def __del__(self):
        # __del__ also runs when __init__ raised before ``self.driver`` was
        # assigned, so look the attribute up defensively instead of assuming
        # it exists.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            driver.quit()  # clean up driver when we are cleaned up
# Per-thread storage: each thread that asks for a driver gets its own.
threadLocal = threading.local()


def create_driver():
    """Return the calling thread's browser, creating it on first use.

    The ``Driver`` wrapper is cached on ``threadLocal`` under ``the_driver``;
    the value returned is that wrapper's ``driver`` attribute.
    """
    cached = getattr(threadLocal, 'the_driver', None)
    if cached is not None:
        return cached.driver
    fresh = Driver()
    threadLocal.the_driver = fresh
    return fresh.driver
class GameData:
    """Column-oriented container: one independent list per match field."""

    def __init__(self):
        # Attribute order is significant: callers read ``__dict__`` directly,
        # so it determines the column order they see.
        for column in ('date', 'time', 'game', 'score', 'home_odds',
                       'draw_odds', 'away_odds', 'country', 'league'):
            setattr(self, column, [])
def generate_matches(pgSoup, defaultVal=None):
    """Build one dict per match row found in a parsed matches page.

    Args:
        pgSoup: parsed page, queried only through ``select`` / ``select_one``
            with CSS selectors (presumably a BeautifulSoup document --
            confirm with caller).
        defaultVal: value stored for any column whose selector matches
            nothing in a row.

    Returns:
        A list of dicts keyed by date, time, game, score, home_odds,
        draw_odds, away_odds, country and league, in that order.
    """
    evtSel = {
        'time': 'div>div>div[class="flex basis-[10%]"]',
        'game': 'a div:has(>a[title])',
        'score': 'a[title]~div:not(.hidden)',
        'home_odds': 'div[class^="flex-center flex-col gap-1 border-l border-black-ma"]:nth-child(2)',
        'draw_odds': 'div[class^="flex-center flex-col gap-1 border-l border-black-ma"]:nth-child(3)',
        'away_odds': 'div[class^="flex-center flex-col gap-1 border-l border-black-ma"]:nth-child(4)',
    }
    events, current_group = [], {}

    # Page-level date: the text after the first comma of the heading.
    pgDate = pgSoup.select_one('h1.title[id="next-matches-h1"]')
    if pgDate:
        pgDate = pgDate.get_text().split(',', 1)[-1].strip()

    for evt in pgSoup.select('div[set]>div:last-child'):
        # A parent with at least three child divs starts a new group;
        # otherwise the previous group's values carry over to this row.
        if evt.parent.select(':scope>div:first-child+div+div'):
            cgVals = [v.get_text(' ').strip() if v else defaultVal for v in [
                evt.parent.select_one(s) for s in
                [':scope>div:first-child+div>div:first-child',
                 ':scope>div:first-child>a:nth-of-type(2):nth-last-of-type(2)',
                 ':scope>div:first-child>a:nth-of-type(3):last-of-type']]]
            current_group = dict(zip(['date', 'country', 'league'], cgVals))
            if pgDate:
                current_group['date'] = pgDate

        evtRow = {'date': current_group.get('date', defaultVal)}
        for k, sel in evtSel.items():
            # Query once per selector instead of once to test and once to read.
            cell = evt.select_one(sel)
            v = cell.get_text(' ') if cell else defaultVal
            # Collapse runs of whitespace in any string value.
            evtRow[k] = ' '.join(v.split()) if isinstance(v, str) else v

        # Prefer the team anchors' title attributes for 'game'. The selector
        # does not require a title, so anchors without one are skipped rather
        # than raising KeyError. When no titled anchor is found, the value
        # selected above (or defaultVal) is kept instead of an empty string.
        evtTeams = evt.select('div[class^="relative w-full flex-col"]>a')
        titles = [a['title'] for a in evtTeams if a.has_attr('title')]
        if titles:
            evtRow['game'] = ' – '.join(titles)

        evtRow['country'] = current_group.get('country', defaultVal)
        evtRow['league'] = current_group.get('league', defaultVal)
        events.append(evtRow)
    return events
def parse_data(url, return_urls=False):
    """Load ``url`` in this thread's browser and collect its match rows.

    The page is scrolled to the bottom repeatedly until its height stops
    changing, then its source is parsed and every row produced by
    ``generate_matches`` is appended column-by-column to a ``GameData``.

    Returns the ``GameData`` alone, or ``(GameData, urls)`` when
    ``return_urls`` is true. ``urls`` holds the absolute links found next to
    the active calendar item, followed by three generated URLs for the days
    preceding the date in the first link (empty when no links are found).
    """
    print(f'Parsing URL: {url}\n')
    browser = create_driver()
    browser.get(url)
    row_locator = (By.CSS_SELECTOR, "div[set]>div:last-child")
    WebDriverWait(browser, 20).until(
        EC.presence_of_all_elements_located(row_locator))

    # ---- scroll until the document height stops growing ----
    scroll_pause_time = 2
    height_script = "return document.body.scrollHeight"
    seen_height = browser.execute_script(height_script)
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)  # give the page time to load more rows
        current_height = browser.execute_script(height_script)
        if current_height == seen_height:
            break
        seen_height = current_height

    time.sleep(5)
    soup = bs(browser.page_source, "lxml")

    # Copy each row into the list-valued attributes of GameData.
    game_data = GameData()
    list_columns = [name for name, value in vars(game_data).items()
                    if isinstance(value, list)]
    for row in generate_matches(soup, defaultVal=nan):
        for name in list_columns:
            getattr(game_data, name).append(row.get(name, nan))

    if not return_urls:
        return game_data

    # Links that share a container with the active calendar item, excluding
    # the active item itself and in-page '#' anchors.
    ac_sel = 'div:has(>a.active-item-calendar)'  # a_cont selector
    a_sel = f'{ac_sel}>a[href]:not([href^="#"]):not(.active-item-calendar)'
    day_links = soup.select(a_sel)

    urls = []
    if day_links:
        urls = ['https://www.oddsportal.com' + link['href'] for link in day_links]
        print(f'urls after initial creation: {urls}')

        # The second-to-last path segment of the first URL is parsed as a date.
        last_date_str = urls[0].split('/')[-2]
        print(f'last date str: {last_date_str}')
        last_date = datetime.strptime(last_date_str, '%Y%m%d')

        # Add URLs for the three days before that date.
        for i in range(1, 4):
            new_date_str = (last_date - timedelta(days=i)).strftime('%Y%m%d')
            urls.append(f'https://www.oddsportal.com/matches/football/{new_date_str}/')
            print(f'urls after generating additional URL #{i}: {urls}')

    print(f'final urls: {urls}')

    if urls and urls[-1].startswith('https://www.oddsportal.com/matches/football/'):
        # Report the date segment of the first URL.
        last_date_str = urls[0].split('/')[-2]
        print(last_date_str)
    else:
        print('No valid URLs found')
    return game_data, urls
if __name__ == '__main__':
    # Script entry point: scrape today's page, then the other days' pages in
    # a thread pool, and combine everything into one DataFrame.
    games = None
    game_data_dfList, added_todayGame = [], False
    pool = ThreadPool(5)
    try:
        # Get today's data and the Urls for the other days:
        url_today = 'https://www.oddsportal.com/matches/soccer'
        game_data_today, urls = pool.apply(parse_data, args=(url_today, True))
        game_data_results = pool.imap(parse_data, urls)

        # ######################### BUILD DATAFRAME ##########################
        for game_data in game_data_results:
            try:
                game_data_dfList.append(pd.DataFrame(game_data.__dict__))
                if not added_todayGame:
                    game_data_dfList += [pd.DataFrame(game_data_today.__dict__)]
                    added_todayGame = True
            except Exception as e:
                game_n = len(game_data_dfList) + 1
                print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')

        # Today's data is normally added inside the loop above; when no other
        # URL produced a frame (e.g. ``urls`` is empty) add it here so it is
        # not silently dropped.
        if not added_todayGame:
            try:
                game_data_dfList.append(pd.DataFrame(game_data_today.__dict__))
                added_todayGame = True
            except Exception as e:
                game_n = len(game_data_dfList) + 1
                print(f'Error tabulating game_data_df#{game_n}:\n{repr(e)}')
    finally:
        # Release the pool's worker threads once all results are consumed.
        pool.close()
        pool.join()

    try:
        games = pd.concat(game_data_dfList, ignore_index=True)
    except Exception as e:
        print('Error concatenating DataFrames:', repr(e))
    # ######################################################################
    print('!?NO GAMES?!' if games is None else games)

    # ensure all the drivers are "quitted":
    del threadLocal  # a little extra insurance
    import gc
    gc.collect()

    # ``games`` stays None when concatenation failed; calling a method on it
    # would raise AttributeError.
    if games is not None:
        # NOTE(review): to_csv() without a path only returns the CSV text,
        # which is discarded here -- pass a file path to persist the data.
        games.to_csv()
output:
date time game score home_odds draw_odds away_odds country league
0 25 July 2023 00:00 Nordsjaelland – Viborg 4 – 1 1.84 3.86 4.08 Denmark Superliga
1 25 July 2023 00:00 FK Panevezys – Siauliai FA 2 – 0 2.02 3.24 3.69 Lithuania A Lyga
2 25 July 2023 00:00 Sandefjord 2 – Mjondalen 2 3 – 1 1.43 5.27 5.07 Norway Division 3 - Group 4 2023
3 25 July 2023 00:00 Rosenborg 2 – Tiller 0 – 2 1.24 6.70 8.02 Norway Division 3 - Group 5
4 25 July 2023 00:00 Asker – Sarpsborg 08 2 5 – 1 1.92 4.18 3.01 Norway Division 3 - Group 6
5 25 July 2023 00:00 Viking – Aalesund 3 – 1 1.24 6.44 10.03 Norway Eliteserien
6 25 July 2023 00:00 Korona Kielce – Slask Wroclaw 1 – 1 2.27 3.34 3.06 Poland Ekstraklasa
7 25 July 2023 00:00 Sirius – Mjallby 2 – 3 2.25 3.37 3.20 Sweden Allsvenskan
8 25 July 2023 00:00 GAIS – Vasteras SK 0 – 2 1.77 3.71 4.12 Sweden Superettan
9 25 July 2023 00:00 Helsingborg – Trelleborg 1 – 3 1.99 3.41 3.57 Sweden Superettan
10 26 July 2023 00:00 Gebenbach – Cham 2 – 2 1.82 3.93 3.49 Germany Oberliga Bayern Nord
11 26 July 2023 00:00 Ingolstadt II – ATSV 2 – 0 2.03 3.65 3.13 Germany Oberliga Bayern Nord
12 26 July 2023 00:00 Hottur / Huginn – Volsungur 2 – 1 1.97 4.09 3.06 Iceland Division 2
13 26 July 2023 00:00 Heracles (Ned) – FC Emmen (Ned) 1 – 0 1.84 3.76 3.67 World Club Friendly
14 26 July 2023 00:00 Koln II (Ger) – TuS Koblenz (Ger) 1 – 0 1.71 3.91 4.10 World Club Friendly
15 26 July 2023 00:15 Lahr (Ger) – Bahlinger (Ger) 0 – 4 6.58 5.27 1.36 World Club Friendly
16 26 July 2023 00:30 Erlbach – Kirchanschoring 1 – 0 2.82 3.33 2.31 Germany Oberliga Bayern Süd
17 26 July 2023 00:30 Baumberg (Ger) – Bonner (Ger) 2 – 0 2.73 3.65 2.28 World Club Friendly
18 26 July 2023 00:45 Zenit – Akhmat Grozny 2 – 0 1.51 4.34 5.81 Russia Russian Cup
19 26 July 2023 01:00 Dnipro-1 (Ukr) – Panathinaikos (Gre) 1 – 3 3.44 3.06 2.25 Europe Champions League
20 26 July 2023 01:00 D. Zagreb (Cro) – FC Astana (Kaz) 4 – 0 1.33 4.83 10.01 Europe Champions League
21 26 July 2023 01:00 TNS (Wal) – Hesperange (Lux) 1 – 1 2.53 3.26 2.75 Europe Europa Conference League
22 26 July 2023 01:00 Hamrun (Mlt) – Dinamo Tbilisi (Geo) 2 – 1 3.11 3.17 2.31 Europe Europa Conference League
23 26 July 2023 01:00 Thor Akureyri – Grotta 3 – 1 2.33 3.70 2.59 Iceland Division 1
24 26 July 2023 01:00 Fjallabyggd – Fjardabyggd/Leiknir 1 – 3 3.24 3.86 1.91 Iceland Division 2
25 26 July 2023 01:00 U. San Martin – Comerciantes 1 – 0 1.94 3.45 3.55 Peru Liga 2
26 26 July 2023 01:00 Fleetwood (Eng) – Preston (Eng) 0 – 0 2.95 3.42 2.24 World Club Friendly
27 26 July 2023 01:00 AFC Wimbledon (Eng) – Portsmouth (Eng) 1 – 0 3.44 3.56 1.96 World Club Friendly
28 26 July 2023 01:00 MK Dons (Eng) – Coventry (Eng) 1 – 5 3.95 3.60 1.83 World Club Friendly
29 26 July 2023 01:00 Rotherham (Eng) – Sheffield Utd (Eng) 1 – 0 4.02 3.75 1.78 World Club Friendly
30 26 July 2023 01:00 Swansea (Wal) – Bristol Rovers (Eng) 0 – 2 1.62 4.02 4.71 World Club Friendly
31 26 July 2023 01:00 Telstar (Ned) – Rijnsburgse Boys (Ned) 1 – 0 1.81 4.01 3.52 World Club Friendly
32 26 July 2023 01:00 Murcia (Esp) – Zaragoza (Esp) 0 – 1 2.85 3.23 2.41 World Club Friendly
33 26 July 2023 01:00 St. Gallen (Sui) – Valencia (Esp) 1 – 3 3.32 3.62 1.99 World Club Friendly
34 26 July 2023 01:00 FC Porto (Por) – Wolves (Eng) 0 – 1 1.78 3.92 3.86 World Club Friendly
35 26 July 2023 01:00 Kidderminster (Eng) – Northampton (Eng) 2 – 2 3.45 3.45 2.00 World Club Friendly
36 26 July 2023 01:00 Doncaster (Eng) – Sheffield Wed (Eng) 1 – 0 4.62 4.07 1.62 World Club Friendly
.
.
.
.
333 01 Aug 2023 00:00 Orebro – GAIS – 3.59 3.41 1.98 Sweden Superettan
334 01 Aug 2023 00:00 Utsikten – Helsingborg – 1.98 3.44 3.57 Sweden Superettan
335 01 Aug 2023 01:00 Def. de Belgrano – Flandria – 1.74 3.21 5.14 Argentina Primera Nacional
336 01 Aug 2023 01:00 Debrecen – Mezokovesd-Zsory – 1.87 3.63 3.72 Hungary OTP Bank Liga
337 24 July 2023 00:00 Flora – Narva 1 – 4 1.09 9.19 21.37 Estonia Meistriliiga
338 24 July 2023 00:00 Dalvik/Reynir – Olafsvik 3 – 0 2.20 3.79 2.73 Iceland Division 2
339 24 July 2023 00:00 Longford – St. Patricks 1 – 2 15.46 6.96 1.15 Ireland FAI Cup
340 24 July 2023 00:00 Rudar – Arsenal Tivat 0 – 1 2.26 3.16 3.08 Montenegro Prva Crnogorska Liga
341 24 July 2023 00:00 Vilaverdense – Casa Pia 0 – 2 4.19 3.41 1.85 Portugal League Cup
342 24 July 2023 00:00 Os Belenenses – Famalicao 3 – 2 3.48 3.43 2.03 Portugal League Cup
343 24 July 2023 00:00 FK Rostov – Fakel Voronezh 2 – 1 2.11 3.14 3.93 Russia Premier League
344 24 July 2023 00:00 Sochi – Baltika 2 – 0 2.05 3.43 3.78 Russia Premier League
.
.
.
362 22 July 2023 00:00 Vendsyssel – Sonderjyske 1 – 1 2.45 3.52 2.73 Denmark 1st Division
363 22 July 2023 00:00 Hillerod – Koge 0 – 4 2.28 3.50 2.92 Denmark 1st Division
364 22 July 2023 00:00 Midtjylland – Hvidovre IF 1 – 0 1.32 5.45 9.05 Denmark Superliga
365 22 July 2023 00:00 Atlantis – P-Iirot Rauma 2 – 1 1.35 5.86 6.07 Finland Kakkonen Group B
366 22 July 2023 00:00 Burghausen – Bayern II 3 – 3 4.27 4.26 1.63 Germany Regionalliga Bayern
A few things to note:
- Use these selectors:
evtSel = { 'time': 'div>div>div[class="flex basis-[10%]"]', 'game': 'a div:has(>a[title])', 'score': 'a[title]~div:not(.hidden)', 'home_odds': 'div[class^="flex-center flex-col gap-1 border-l border-black-ma"]:nth-child(2)', 'draw_odds': 'div[class^="flex-center flex-col gap-1 border-l border-black-ma"]:nth-child(3)', 'away_odds': 'div[class^="flex-center flex-col gap-1 border-l border-black-ma"]:nth-child(4)' } evtTeams = evt.select('div[class^="relative w-full flex-col"]>a')
Instead of:
evtSel = { 'time': 'div.main-row p.flex', 'game': 'div.main-row a[title]', 'score': 'a a[title]+div:has(+a[title])', 'home_odds': 'a:has(a[title])~div:not(.hidden)', 'draw_odds': 'a:has(a[title])~div:not(.hidden)+div:nth-last-of-type(3)', 'away_odds': 'a:has(a[title])~div:nth-last-of-type(2)', } evtTeams = evt.select('a div>a[title]')
Answered By - Ajeet Verma
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.