Issue
I'm scraping a page with Selenium and Beautiful Soup, and I'm getting duplicates when I use a for loop to change the page URL. I have no idea why. This is my code:
import json
import requests
import time
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from selenium import webdriver
from chromedriver_py import binary_path # this will get you the path variable
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
teams = []
home_team = []
away_team = []
results_away = []
results_home = []
results_away1 = []
results_home1 = []
list3_away = []
list3_home = []
odds = []
odds_sub = []
odds_sub1 = []
days = ['20160826',
        '20160827',
        '20160901',
        ]

for date in days:
    header = {"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
    url = 'https://www.sportsbookreview.com/betting-odds/college-football/totals/1st-half/?date=' + date
    page = requests.get(url, headers=header)
    soup = BeautifulSoup(page.content, 'html.parser')

    #### Selenium scraper for odds
    driver = webdriver.Chrome(executable_path=binary_path)
    driver.get(url)
    time.sleep(5)
    elements = WebDriverWait(driver, 5).until(lambda d: d.find_elements_by_xpath('//*[@data-vertical-sbid="238"]'))
    for a in elements:
        odds.append(a.text)
    for a in odds:
        string = a.split('\n', 1)[0]
        mod_string = string[:-4]
        odds_sub.append(mod_string)
    driver.quit()

    ## All teams
    for el in soup.find_all('span', class_='participantBox-3ar9Y'):
        teams.append(el.text)

    ### Away result
    for s in soup.find_all(class_="scoreboard-1TXQV"):
        for a in s.select(".scoreboardColumn-2OtpR > div:nth-of-type(1)")[:2]:
            results_away.append(a.text)
    for a in results_away:
        if a == '-':
            a = a.replace(a, '10000')
        results_away1.append(a)
    results_away1 = [int(i) for i in results_away1]
    for i in range(0, len(results_away1) - 1, 2):
        firstnum = results_away1[i]
        secondnum = results_away1[i + 1]
        sumnum = firstnum + secondnum
        list3_away.append(sumnum)

    ### Home results
    for s in soup.find_all(class_="scoreboard-1TXQV"):
        for a in s.select(".scoreboardColumn-2OtpR > div:nth-of-type(2)")[:2]:
            results_home.append(a.text)
    for a in results_home:
        if a == '-':
            a = a.replace(a, '10000')
        results_home1.append(a)
    results_home1 = [int(i) for i in results_home1]
    for i in range(0, len(results_home1) - 1, 2):
        firstnum = results_home1[i]
        secondnum = results_home1[i + 1]
        sumnum = firstnum + secondnum
        list3_home.append(sumnum)

    time.sleep(7)

#### Create lists for away and home teams
for team in teams[::2]:
    away_team.append(team)
for team in teams[1::2]:
    home_team.append(team)

print(home_team)
print('*****')
print(away_team)
print('*****')
print(list3_home)
print('*****')
print(list3_away)
print('*****')
print(odds_sub)
print('*****')
I receive results like this:
['California', 'North Dakota State', '(13) Louisville', 'Wake Forest', 'Central Michigan', 'Cincinnati', 'Connecticut', 'Florida International', '(21) Tennessee', 'North Carolina State', 'Western Kentucky', 'Vanderbilt', '(19) Utah', 'Utah State', 'Minnesota', 'New Mexico', 'Idaho', 'UNLV']
*****
['Hawaii', 'Charleston Southern', 'Charlotte', 'Tulane', 'Presbyterian', 'Tennessee-Martin', 'Maine', 'Indiana', 'Appalachian State', 'William & Mary', 'Rice', 'South Carolina', 'Southern Utah', 'Weber State', 'Oregon State', 'South Dakota', 'Montana State', 'Jackson State']
*****
[34, 34, 34, 3, 34, 34, 3, 34, 3, 56, 7, 14, 6, 7, 10, 3, 28, 30, 10, 17, 21, 17, 35, 20, 42]
*****
[14, 14, 14, 3, 14, 14, 3, 14, 3, 0, 3, 3, 7, 7, 12, 13, 7, 7, 0, 0, 6, 14, 14, 10, 10]
*****
['34', '34', '', '34', '', '34½', '21½', '', '', '', '32', '31', '', '33½', '20', '', '', '28', '', '', '']
*****
For example, list3_home has its first 7 results duplicated, the odds_sub list has 3 duplicates at the beginning, and so on.
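For what it's worth, the duplicates most likely come from the collector lists being created once, outside the for date loop, while the post-processing loops (for a in odds:, the pairwise score summation, etc.) run inside it, so every date re-processes everything scraped so far. A minimal sketch of that effect, with made-up data instead of the real scraper:

odds = []
odds_sub = []
for date in ['20160826', '20160827', '20160901']:
    odds.append(date + '-line')   # keeps growing across iterations
    for a in odds:                # walks the WHOLE accumulated list each date
        odds_sub.append(a)
print(odds_sub)  # entries for earlier dates show up again and again

Re-initializing the intermediate lists at the top of each iteration (or processing only the newly appended slice) would remove the duplicates. The answer below sidesteps the scraping entirely.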
Solution
OK, so see if this works. Instead of rendering the page with Selenium, it queries the site's backend odds service directly. There are quite a few parameter IDs you can use in the query if you want other data, but this will get you the 1st-half stuff. I think I identified the correct ID for Pinnacle.
import requests
import pandas as pd
from datetime import datetime, timezone, timedelta
from calendar import timegm
days = ['20160826',
        '20160827',
        '20160901',
        '20200912',
        ]
homeTeam_list = []
awayTeam_list = []
homeScore_list = []
awayScore_list = []
odds_list = []
url = 'https://www.sportsbookreview.com/ms-odds-v2/odds-v2-service'
header = {"user-agent" : "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"}
for date in days:
    print(date)

    # The API expects the date as epoch milliseconds (UTC midnight)
    date_time_obj = datetime.strptime(date, '%Y%m%d')
    year = date_time_obj.year
    month = date_time_obj.month
    day = date_time_obj.day
    dt = datetime(year, month, day, 0, 0)
    tz = timezone(timedelta(hours=0))
    epochtime = timegm(dt.replace(tzinfo=tz).utctimetuple()) * 1000

    # mtid 398 with lid 6 / spid 4 is the 1st-half totals market this answer targets
    queryStr_participants = 'query=%7B+eventsByDateByLeagueGroup(+es:+[%22in-progress%22,+%22scheduled%22,+%22complete%22,+%22suspended%22,+%22delayed%22,+%22postponed%22,+%22retired%22,+%22canceled%22],+leagueGroups:+[%7B+mtid:+398,+lid:+6,+spid:+4+%7D],+providerAcountOpener:+3,+hoursRange:+25,+showEmptyEvents:+false,+marketTypeLayout:+%22PARTICIPANTS%22,+ic:+false,+startDate:+' + str(int(epochtime)) + ',+timezoneOffset:+-4,+nof:+true,+hl:+true,+sort:+%7Bby:+[%22lid%22,+%22dt%22,+%22des%22],+order:+ASC%7D+)+%7B+events+%7B+eid+lid+spid+des+dt+es+rid+ic+ven+tvs+cit+cou+st+sta+hl+seid+writeingame+plays(pgid:+2,+limitLastSeq:+3,+pgidWhenFinished:+-1)+%7B+eid+sqid+siid+gid+nam+val+tim+%7D+scores+%7B+partid+val+eid+pn+sequence+%7D+participants+%7B+eid+partid+partbeid+psid+ih+rot+tr+sppil+sppic+startingPitcher+%7B+fn+lnam+%7D+source+%7B+...+on+Player+%7B+pid+fn+lnam+%7D+...+on+Team+%7B+tmid+lid+tmblid+nam+nn+sn+abbr+cit+senam+imageurl+%7D+...+on+ParticipantGroup+%7B+partgid+nam+lid+participants+%7B+eid+partid+psid+ih+rot+source+%7B+...+on+Player+%7B+pid+fn+lnam+%7D+...+on+Team+%7B+tmid+lid+nam+nn+sn+abbr+cit+%7D+%7D+%7D+%7D+%7D+%7D+marketTypes+%7B+mtid+spid+nam+des+settings+%7B+sitid+did+alias+format+template+sort+url+%7D+%7D++eventGroup+%7B+egid+nam+%7D+statistics(sgid:+3,+sgidWhenFinished:+4)+%7B+val+eid+nam+partid+pid+typ+siid+sequence+%7D+league+%7B+lid+nam+rid+spid+sn+settings+%7B+alias+rotation+ord+shortnamebreakpoint+matchupline+%7D+%7D+%7D+maxSequences+%7B+events:+eventsMaxSequence+scores:+scoresMaxSequence+currentLines:+linesMaxSequence+statistics:+statisticsMaxSequence+plays:+playsMaxSequence+consensus:+consensusMaxSequence+%7D+%7D+%7D'
    jsonData_participants = requests.get(url, headers=header, params=queryStr_participants).json()
    events = jsonData_participants['data']['eventsByDateByLeagueGroup']['events']

    for event in events:
        des = event['des'].split('@')
        # Skip games that don't have a final result yet
        if event['es'] != 'complete':
            print(' @ '.join(des), ' - ', event['es'])
            continue
        else:
            print(' @ '.join(des))

        eventId = str(event['eid'])
        queryStr_odds = 'query=%7B+currentLines(eid:+['+eventId+'],+mtid:+[398],+marketTypeLayout:+%22PARTICIPANTS%22,+catid:+133)+openingLines(eid:+['+eventId+'],+mtid:+[398],+marketTypeLayout:+%22PARTICIPANTS%22,+paid:+3)+bestLines(catid:+133,+eid:+['+eventId+'],+mtid:+[398])+consensus(eid:+['+eventId+'],+mtid:+[398])+%7B+eid+mtid+boid+partid+sbid+bb+paid+lineid+wag+perc+vol+tvol+sequence+tim+%7D+maxSequences+%7B+events:+eventsMaxSequence+scores:+scoresMaxSequence+currentLines:+linesMaxSequence+statistics:+statisticsMaxSequence+plays:+playsMaxSequence+consensus:+consensusMaxSequence+%7D+%7D'
        jsonData_odds = requests.get(url, headers=header, params=queryStr_odds).json()

        # paid 20 appears to be Pinnacle's provider id
        pinnacle_id = 20
        currentLines = jsonData_odds['data']['currentLines']
        for currentLine in currentLines:
            if currentLine['paid'] == pinnacle_id:
                odds = currentLine['adj']
                break
        else:
            odds = None  # no Pinnacle line found for this event

        # Build an id -> team-name lookup so the score rows can be labeled
        teamIds = {}
        for team in event['participants']:
            teamIds[team['source']['tmid']] = team['source']['nam']

        scores = pd.json_normalize(event['scores'])
        scores['team_name'] = scores['partid'].map(teamIds)

        awayTeam = event['participants'][0]['source']['nam']
        homeTeam = event['participants'][1]['source']['nam']

        # 1st-half score = sum of the first two periods (pn <= 2)
        awayScore = scores[(scores['team_name'] == awayTeam) & (scores['pn'] <= 2)]['val'].astype(int).sum()
        homeScore = scores[(scores['team_name'] == homeTeam) & (scores['pn'] <= 2)]['val'].astype(int).sum()

        homeTeam_list.append(homeTeam)
        awayTeam_list.append(awayTeam)
        homeScore_list.append(homeScore)
        awayScore_list.append(awayScore)
        odds_list.append(odds)
print(homeTeam_list)
print('*****')
print(awayTeam_list)
print('*****')
print(homeScore_list)
print('*****')
print(awayScore_list)
print('*****')
print(odds_list)
print('*****')
Output:
['California', 'Charleston Southern', 'Charlotte', 'Maine', 'Presbyterian', 'Tennessee-Martin', 'Wake Forest', 'Tennessee', 'Florida International', 'North Carolina State', 'Rice', 'Vanderbilt', 'Southern Utah', 'Weber State', 'Montana State', 'Oregon State', 'South Dakota', 'Jackson State']
*****
['Hawaii', 'North Dakota State', 'Louisville', 'Connecticut', 'Central Michigan', 'Cincinnati', 'Tulane', 'Appalachian State', 'Indiana', 'William & Mary', 'Western Kentucky', 'South Carolina', 'Utah', 'Utah State', 'Idaho', 'Minnesota', 'New Mexico', 'UNLV']
*****
[34, 3, 0, 7, 3, 7, 7, 3, 10, 28, 7, 10, 0, 6, 10, 14, 14, 10]
*****
[14, 3, 56, 7, 14, 6, 3, 13, 12, 7, 30, 0, 17, 21, 20, 17, 35, 42]
*****
[34, 34, 35, 35, 35, 35, 35, 31, 32, 32, 32, 21, 21, 21, 21, 28, 28, 28]
*****
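If you want one table instead of five parallel lists, they can be zipped into a DataFrame at the end. A small follow-on sketch (the column names are my own choice, not anything the site uses):

import pandas as pd

# Combine the parallel result lists into a single table
df = pd.DataFrame({
    'home_team': homeTeam_list,
    'away_team': awayTeam_list,
    'home_1h_score': homeScore_list,
    'away_1h_score': awayScore_list,
    'pinnacle_1h_total': odds_list,
})
print(df.head())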
Answered By - chitown88