Issue
I don't get any errors from the script below, but it returns no data. I am trying to get all of the games for each week, which start in the fourth table in the HTML. When I enter the XPath expressions in the scrapy shell I get data, but once I put them in the parse method nothing is returned.
import scrapy


class NFLOddsSpider(scrapy.Spider):
    name = 'NFLOdds'
    allowed_domains = ['www.sportsoddshistory.com']
    start_urls = ['https://www.sportsoddshistory.com/nfl-game-season/?y=2022']

    def parse(self, response):
        for row in response.xpath('//table[@class="soh1"]//tbody/tr'):
            day = row.xpath('td[1]//text()').extract_first()
            date = row.xpath('td[2]//text()').extract_first()
            time = row.xpath('td[3]//text()').extract_first()
            AtFav = row.xpath('td[4]//text()').extract_first()
            favorite = row.xpath('td[5]//text()').extract_first()
            score = row.xpath('td[6]//text()').extract_first()
            spread = row.xpath('td[7]//text()').extract_first()
            AtDog = row.xpath('td[8]//text()').extract_first()
            underdog = row.xpath('td[9]//text()').extract_first()
            OvUn = row.xpath('td[10]//text()').extract_first()
            notes = row.xpath('td[11]//text()').extract_first()
            week = row.xpath('//*[@id="content"]/div/table[4]/tbody/tr/td/h3').extract_first()

            oddsTable = {
                'day': day,
                'date': date,
                'time': time,
                'AtFav': AtFav,
                'favorite': favorite,
                'score': score,
                'spread': spread,
                'AtDog': AtDog,
                'underdog': underdog,
                'OvUn': OvUn,
                'notes': notes,
                'week': week
            }

            yield oddsTable
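A quick way to compare what the shell sees against what the spider's selectors match is to count the nodes directly (a hypothetical session; the counts depend on the live page):

scrapy shell 'https://www.sportsoddshistory.com/nfl-game-season/?y=2022'
>>> # How many tables carry the class the spider looks for?
>>> len(response.xpath('//table[@class="soh1"]'))
>>> # How many rows does the spider's row selector actually match?
>>> len(response.xpath('//table[@class="soh1"]//tbody/tr'))
>>> # The headings that sit above each schedule table
>>> response.xpath('//h3/text()').getall()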
Solution
Updating the answer to include the Playoffs table; code below. To execute:

scrapy runspider NFLOddsSpider.py -O output.csv
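The core pattern in the spider below is: walk every h3 heading, pull the year and week out of the heading text with a regex (the := assignment expression requires Python 3.8+), then grab the first table that follows the heading with following-sibling::table[1]. Here is a minimal standalone sketch of that pattern, using parsel (the selector library underneath Scrapy) against a made-up HTML fragment rather than the real page:

import re
from parsel import Selector

# Tiny stand-in for the real page layout: an <h3> heading followed by its table.
html = """
<h3>2022 Regular Season - Week 1</h3>
<table><tbody>
  <tr><td>Thu</td><td>Sep 8, 2022</td></tr>
</tbody></table>
"""

sel = Selector(text=html)
for h3 in sel.xpath('//h3'):
    heading = h3.xpath('./text()').get().strip()
    if (m := re.match(r'(\d{4}) Regular Season - Week (\d{1,2})', heading)):
        year, week = m.group(1), m.group(2)
        # Only the table that immediately follows this particular heading
        table = h3.xpath('./following-sibling::table[1]')
        rows = table.xpath('./tbody/tr')
        print(year, week, len(rows))  # prints: 2022 1 1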
import re
import scrapy


class NFLOddsSpider(scrapy.Spider):
    name = 'NFLOdds'
    allowed_domains = ['www.sportsoddshistory.com']
    start_urls = ['https://www.sportsoddshistory.com/nfl-game-season/?y=2022']

    def parse(self, response):
        # Find all H3 tags
        for h3 in response.xpath('//h3'):
            h3_text = h3.xpath('./text()').get().strip()
            # Find headings for Week/Playoffs tables
            if (m := re.match(r'(\d{4}) Regular Season - Week (\d{1,2})', h3_text)):
                # Week
                year = m.group(1)
                week = m.group(2)
                # Pick out the one TABLE that immediately follows the H3 header
                for tab in h3.xpath('.//following-sibling::table[1]'):
                    # Pick out the rows from that table
                    for row in tab.xpath('./tbody[1]/tr'):
                        yield {
                            'year': year,
                            'week': week,
                            'round': None,
                            'day': row.xpath('td[1]//text()').get(),
                            'date': row.xpath('td[2]//text()').get(),
                            'time_et': row.xpath('td[3]//text()').get(),
                            'fav_at': row.xpath('td[4]//text()').get(),
                            'favorite': row.xpath('td[5]//text()').get(),
                            'fav_cover_spread': bool(row.xpath('td[5]//a/b/text()').get()),
                            'score': row.xpath('td[6]//text()').get(),
                            'spread': row.xpath('td[7]//text()').get(),
                            'und_at': row.xpath('td[8]//text()').get(),
                            'underdog': row.xpath('td[9]//text()').get(),
                            'und_cover_spread': bool(row.xpath('td[9]//a/b/text()').get()),
                            'over_under': row.xpath('td[10]//text()').get(),
                            'notes': row.xpath('td[11]//text()').get()
                        }
            elif (m := re.match(r'(\d{4}) Playoffs', h3_text)):
                # Playoffs
                year = m.group(1)
                week = None
                # Pick out the one TABLE that immediately follows the H3 header
                for tab in h3.xpath('.//following-sibling::table[1]'):
                    # Pick out the rows from that table
                    for row in tab.xpath('./tbody[1]/tr'):
                        yield {
                            'year': year,
                            'week': week,
                            'round': row.xpath('td[1]//text()').get(),
                            'day': row.xpath('td[2]//text()').get(),
                            'date': row.xpath('td[3]//text()').get(),
                            'time_et': row.xpath('td[4]//text()').get(),
                            'fav_at': row.xpath('td[5]//text()').get(),
                            'favorite': row.xpath('td[6]//text()').get(),
                            'fav_cover_spread': bool(row.xpath('td[6]//a/b/text()').get()),
                            'score': row.xpath('td[7]//text()').get(),
                            'spread': row.xpath('td[8]//text()').get(),
                            'und_at': row.xpath('td[9]//text()').get(),
                            'underdog': row.xpath('td[10]//text()').get(),
                            'und_cover_spread': bool(row.xpath('td[10]//a/b/text()').get()),
                            'over_under': row.xpath('td[11]//text()').get(),
                            'notes': None
                        }
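Once the run finishes, a quick way to sanity-check output.csv (just a sketch, assuming pandas is installed; the column names come from the dict keys yielded above):

import pandas as pd

df = pd.read_csv('output.csv')
# A few key columns from the regular-season and playoff rows
print(df[['year', 'week', 'round', 'favorite', 'underdog', 'spread', 'over_under']].head())
# Regular-season rows carry a week number; playoff rows leave week empty and fill round instead
print(df['week'].value_counts(dropna=False))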
Answered By - Aleksandr Krymskiy