Issue
I am trying to parse the table of predictions from https://theanalyst.com/na/2023/08/opta-football-predictions/.
However, I am struggling to parse the div-based table. Using find_all("tr") and extracting the row data doesn't work as I expect, even though each table looks sensible inside the parse_table function below:
from selenium import webdriver
from pyvirtualdisplay import Display
from bs4 import BeautifulSoup
def parse_table(table):
    """Extract the cell text from a single <table> element.

    Parameters
    ----------
    table : bs4.element.Tag
        A <table> Tag as returned by ``soup.find_all('table')``.

    Returns
    -------
    list[list[str]]
        One inner list per <tr>, holding the stripped text of each
        <th>/<td> cell in that row.  Rows with no cells are skipped.
    """
    rows = []
    for tr in table.find_all("tr"):
        # Collect both header and data cells so the header row is kept too.
        cells = [cell.get_text(strip=True) for cell in tr.find_all(["th", "td"])]
        if cells:
            rows.append(cells)
    return rows
def parse_tables(soup):
    """Parse every <table> element found in the document.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        The parsed page source.

    Returns
    -------
    list
        One entry per table (the value returned by ``parse_table``),
        or an empty list when an error occurs during parsing.
    """
    data = []
    try:
        # Find all tables on the page
        tables = soup.find_all('table')
        # Loop through each table and extract data.
        # Bug fix: the original appended to `table` (the Tag being
        # iterated) instead of the `data` accumulator, and never
        # returned the collected results.
        for table in tables:
            data.append(parse_table(table))
    except Exception as e:
        print(f"An error occurred during table parsing: {e}")
    return data
def parse_website():
    """Render the predictions page in headless Chrome and hand the
    resulting HTML to parse_tables()."""
    # Start a virtual display so Chrome can run without a real screen.
    virtual_display = Display(visible=0, size=(800, 600))
    virtual_display.start()
    browser = None
    try:
        # Point Selenium at the installed Chrome binary.
        options = webdriver.ChromeOptions()
        options.binary_location = '/usr/bin/google-chrome'
        browser = webdriver.Chrome(options=options)
        # Load the target page and give dynamic content time to appear.
        browser.get('https://theanalyst.com/na/2023/08/opta-football-predictions/')
        browser.implicitly_wait(10)
        # Parse the rendered page source and extract the tables.
        soup = BeautifulSoup(browser.page_source, 'html.parser')
        parse_tables(soup)
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Always release the browser and the virtual display.
        if browser is not None:
            browser.quit()
        virtual_display.stop()
# Call the function to parse the website
parse_website()
So I tried searching with find_all("tr") and iterating through the results, but I am not correctly identifying the fields from there.
Solution
Your table list is placed inside an iframe, so to get the frame content you should switch to its context first.
The needed frame is the last frame matching the selector iframe[src*=predictions].
Inside the frame you should filter for visible tables, because the tables for other dates are rendered with empty content.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Scrape the visible prediction tables from the page's "predictions" iframe.
driver = webdriver.Chrome()
url = 'https://theanalyst.com/na/2023/08/opta-football-predictions/'
driver.get(url)
delay = 10
# Wait until every iframe whose src contains "predictions" is present,
# then switch into the last one — that is where the tables live.
predictions = WebDriverWait(driver, delay).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'iframe[src*=predictions]')))
driver.switch_to.frame(predictions[-1])
# Tables for other dates are rendered with empty content, so keep only
# the tables that are actually displayed.
tables = driver.find_elements(By.CSS_SELECTOR, '[class*=match-card-body] table')
visible_tables = [table for table in tables if table.is_displayed()]
for table in visible_tables:
    rows = table.find_elements(By.CSS_SELECTOR, 'tr')
    for row in rows:
        print(row.text)
Answered By - Yaroslavm
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.