Issue
This code gets data from www.oddsportal.com.
How can I handle the case where no score is present for an event in this code?
Currently, the code scrapes all data from the pages:
import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool
import os
import re

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment next line to suppress logging:
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        # print('The driver has been "quitted".')

threadLocal = threading.local()

def create_driver():
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver

class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

def generate_matches(table):
    tr_tags = table.findAll('tr')
    for tr_tag in tr_tags:
        if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.findAll('a')
            country = a_tags[0].text
            league = a_tags[1].text
        else:
            td_tags = tr_tag.findAll('td')
            yield td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text, \
                  td_tags[4].text, td_tags[5].text, country, league

def parse_data(url, return_urls=False):
    browser = create_driver()
    browser.get(url)
    soup = bs(browser.page_source, "lxml")
    div = soup.find('div', {'id': 'col-content'})
    table = div.find('table', {'class': 'table-main'})
    h1 = soup.find('h1').text
    print(h1)
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    game_date = m[0]
    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        game_data.score.append(row[2])
        game_data.home_odds.append(row[3])
        game_data.draw_odds.append(row[4])
        game_data.away_odds.append(row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])
    if return_urls:
        span = soup.find('span', {'class': 'next-games-date'})
        a_tags = span.findAll('a')
        urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags]
        return game_data, urls
    return game_data

if __name__ == '__main__':
    results = None
    pool = ThreadPool(5)  # We will be getting, however, 7 URLs
    # Get today's data and the Urls for the other days:
    game_data_today, urls = pool.apply(parse_data, args=('https://www.oddsportal.com/matches/soccer', True))
    urls.pop(1)  # Remove url for today: We already have the data for that
    game_data_results = pool.imap(parse_data, urls)
    for i in range(8):
        game_data = game_data_today if i == 1 else next(game_data_results)
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            results = results.append(result, ignore_index=True)
    print(results)
    # print(results.head())
    # ensure all the drivers are "quitted":
    del threadLocal
    import gc
    gc.collect()  # a little extra insurance
When scores are present, table-score is populated. When scores are not present, table-score is absent from the row.
Right now, the column values for home_odds, away_odds and draw_odds shift when table-score is not present, so the data ends up in the wrong columns.
How can I change

    game_data.score.append(row[2])
    game_data.home_odds.append(row[3])
    game_data.draw_odds.append(row[4])
    game_data.away_odds.append(row[5])

so that when table-score is not present, game_data.score gets NaN and the odds come from

    game_data.home_odds.append(row[2])
    game_data.draw_odds.append(row[3])
    game_data.away_odds.append(row[4])

while everything else stays as in the current output?
Solution
You first need to add:

    from numpy import nan

and then modify the code as follows:
...
# Score present?
if ':' not in row[2]:
    # No, shift a few columns right:
    row[5], row[4], row[3], row[2] = row[4], row[3], row[2], nan
game_data.score.append(row[2])
game_data.home_odds.append(nan if row[3] == '-' else row[3])
game_data.draw_odds.append(nan if row[4] == '-' else row[4])
game_data.away_odds.append(nan if row[5] == '-' else row[5])
...
Note that generate_matches has to be modified to return list instances rather than tuple instances, since the above code now requires that the returned values, i.e. row, be modifiable.
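A minimal sketch of why that matters (the row values below are hypothetical): the column shift above assigns to elements of row, which raises a TypeError on a tuple but works on a list.

# Illustrative sketch only; the row values are hypothetical.
from numpy import nan

row = ('19:00', 'Home - Away', '-169', '+263', '+462')  # no score cell, so odds start at index 2
try:
    row[4], row[3], row[2] = row[3], row[2], nan
except TypeError as err:
    print(err)  # 'tuple' object does not support item assignment

row = list(row)  # a list can be modified in place
row[4], row[3], row[2] = row[3], row[2], nan
print(row)       # ['19:00', 'Home - Away', nan, '-169', '+263']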
Putting it all together:
import pandas as pd
from numpy import nan
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import threading
from multiprocessing.pool import ThreadPool, Pool
from functools import partial
import os
import re

class Driver:
    def __init__(self):
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # Un-comment next line to suppress logging:
        options.add_experimental_option('excludeSwitches', ['enable-logging'])
        self.driver = webdriver.Chrome(options=options)

    def __del__(self):
        self.driver.quit()  # clean up driver when we are cleaned up
        # print('The driver has been "quitted".')

threadLocal = threading.local()

def create_driver():
    the_driver = getattr(threadLocal, 'the_driver', None)
    if the_driver is None:
        the_driver = Driver()
        setattr(threadLocal, 'the_driver', the_driver)
    return the_driver.driver

class GameData:
    def __init__(self):
        self.date = []
        self.time = []
        self.game = []
        self.score = []
        self.home_odds = []
        self.draw_odds = []
        self.away_odds = []
        self.country = []
        self.league = []

def generate_matches(table):
    tr_tags = table.findAll('tr')
    for tr_tag in tr_tags:
        if 'class' in tr_tag.attrs and 'dark' in tr_tag['class']:
            th_tag = tr_tag.find('th', {'class': 'first2 tl'})
            a_tags = th_tag.findAll('a')
            country = a_tags[0].text
            league = a_tags[1].text
        else:
            td_tags = tr_tag.findAll('td')
            yield [td_tags[0].text, td_tags[1].text, td_tags[2].text, td_tags[3].text,
                   td_tags[4].text, td_tags[5].text, country, league]

def parse_data(process_pool, url, return_urls=False):
    browser = create_driver()
    browser.get(url)
    # Wait for initial content to be dynamically updated with scores:
    browser.implicitly_wait(5)
    table = browser.find_element_by_xpath('//*[@id="table-matches"]/table')
    # If you do not pass a Pool instance to this function to use
    # multiprocessing for the more CPU-intensive work,
    # then just replace the next statement with: return process_page(browser.page_source, return_urls)
    return process_pool.apply(process_page, args=(browser.page_source, return_urls))

def process_page(page_source, return_urls):
    soup = bs(page_source, "lxml")
    div = soup.find('div', {'id': 'table-matches'})
    table = div.find('table', {'class': 'table-main'})
    h1 = soup.find('h1').text
    print(h1)
    m = re.search(r'\d+ \w+ \d{4}$', h1)
    game_date = m[0]
    game_data = GameData()
    for row in generate_matches(table):
        game_data.date.append(game_date)
        game_data.time.append(row[0])
        game_data.game.append(row[1])
        # Score present?
        if ':' not in row[2]:
            # No, shift a few columns right:
            row[5], row[4], row[3], row[2] = row[4], row[3], row[2], nan
        game_data.score.append(row[2])
        game_data.home_odds.append(nan if row[3] == '-' else row[3])
        game_data.draw_odds.append(nan if row[4] == '-' else row[4])
        game_data.away_odds.append(nan if row[5] == '-' else row[5])
        game_data.country.append(row[6])
        game_data.league.append(row[7])
    if return_urls:
        span = soup.find('span', {'class': 'next-games-date'})
        a_tags = span.findAll('a')
        urls = ['https://www.oddsportal.com' + a_tag['href'] for a_tag in a_tags]
        return game_data, urls
    return game_data

if __name__ == '__main__':
    results = None
    pool = ThreadPool(3)  # This seems to be optimal for this application
    # Create multiprocessing pool to do the CPU-intensive processing:
    process_pool = Pool(min(5, os.cpu_count()))  # 5 seems to be optimal for this application
    # Get today's data and the Urls for the other days:
    game_data_today, urls = pool.apply(parse_data, args=(process_pool, 'https://www.oddsportal.com/matches/soccer', True))
    urls.pop(1)  # Remove url for today: We already have the data for that
    game_data_results = pool.imap(partial(parse_data, process_pool), urls)
    for i in range(8):
        game_data = game_data_today if i == 1 else next(game_data_results)
        result = pd.DataFrame(game_data.__dict__)
        if results is None:
            results = result
        else:
            results = results.append(result, ignore_index=True)
    print(results)
    # print(results.head())
    # ensure all the drivers are "quitted":
    del threadLocal
Prints:
Next Soccer Matches: Today, 10 Sep 2021
Next Soccer Matches: Tuesday, 14 Sep 2021
Next Soccer Matches: Wednesday, 15 Sep 2021
Next Soccer Matches: Thursday, 16 Sep 2021
Next Soccer Matches: Yesterday, 09 Sep 2021
Next Soccer Matches: Sunday, 12 Sep 2021
Next Soccer Matches: Monday, 13 Sep 2021
Next Soccer Matches: Tomorrow, 11 Sep 2021
date time game score home_odds draw_odds away_odds country league
0 09 Sep 2021 00:00 Cumbaya - Guayaquil SC 1:0 -169 +263 +462 Ecuador Serie B
1 09 Sep 2021 00:00 FC Tulsa - Indy Eleven 2:1 -104 +265 +237 USA USL Championship
2 09 Sep 2021 00:05 Pumas Tabasco - Atlante 0:2 +221 +186 +134 Mexico Liga de Expansion MX
3 09 Sep 2021 00:05 Panama - Mexico 1:1 +518 +250 -156 World World Cup 2022
4 09 Sep 2021 00:10 Defensa y Justicia - Tigre 0:1 pen. +138 +199 +214 Argentina Copa Argentina
... ... ... ... ... ... ... ... ... ...
1987 16 Sep 2021 19:00 Olympiacos Piraeus - Antwerp NaN -137 +296 +371 Europe Europa League
1988 16 Sep 2021 19:15 Academica - Estrela NaN -106 +231 +290 Portugal Liga Portugal 2
1989 16 Sep 2021 21:00 Barnechea - Rangers NaN +202 +202 +127 Chile Primera B
1990 16 Sep 2021 22:00 San Marcos de Arica - S. Morning NaN +212 +214 +122 Chile Primera B
1991 16 Sep 2021 23:30 U. De Concepcion - Coquimbo NaN +158 +198 +162 Chile Primera B
[1992 rows x 9 columns]
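As the comment inside parse_data notes, if you do not want a separate multiprocessing pool for the CPU-intensive parsing, a minimal sketch of the thread-only variant (everything else unchanged) would be:

# Sketch only: thread-only variant, per the comment inside parse_data above.
# It calls process_page directly instead of handing the page source to a Pool.
def parse_data(url, return_urls=False):
    browser = create_driver()
    browser.get(url)
    # Wait for initial content to be dynamically updated with scores:
    browser.implicitly_wait(5)
    browser.find_element_by_xpath('//*[@id="table-matches"]/table')
    return process_page(browser.page_source, return_urls)

The __main__ block would then call parse_data without the process_pool argument (and without partial), as in the question's original code.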
Answered By - Booboo