Issue
I am scraping this website https://www.handbook.fca.org.uk/handbook/PRIN/3/?date=2030-12-01&timeline=True&view=chapter and I would like to parse the table data as well, when a table exists on the page. I have tried catching the table element conditionally, but that approach is not working. Here is my code:
import scrapy
from urllib.parse import urlencode
class HandBook(scrapy.Spider):
    """Scrape PRIN chapter pages of the FCA Handbook into per-section items.

    NOTE(review): this is the question's code, kept as posted. The bug being
    asked about is in ``parse_details``: table sections are collected in the
    ``if table_content:`` branch, but the ``yield`` lives inside the ``else``
    branch, so any section rendered as a table (e.g. PRIN 3.3.1) never
    produces an item.
    """

    name = "handbook_spider"
    custom_settings = {
        # Per-spider log file and export pipeline (pipeline lives elsewhere
        # in the project, not visible here).
        "LOG_FILE": "handbook_spider.log",
        "ITEM_PIPELINES": {
            "handbook_spider.pipelines.HandbookExcelPipeline": 300,
        },
    }
    # Browser-like headers copied from a real Chromium session — presumably
    # to make the request look like a normal browser visit; confirm whether
    # the site actually requires them.
    headers = {
        "authority": "www.handbook.fca.org.uk",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-language": "en,ru;q=0.9",
        "cache-control": "max-age=0",
        "sec-ch-ua": '"Chromium";v="106", "Yandex";v="22", "Not;A=Brand";v="99"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "cross-site",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 YaBrowser/22.11.3.832 (beta) Yowser/2.5 Safari/537.36",
    }
    # Query-string parameters; the far-future date plus timeline=True selects
    # the full chapter view the spider parses.
    params = {
        "date": "2030-12-01",
        "timeline": "True",
        "view": "chapter",
    }
    url = "https://www.handbook.fca.org.uk/handbook/PRIN/3/?"

    def start_requests(self):
        """Issue the single entry request for the PRIN/3 chapter page."""
        base_url = self.url + urlencode(self.params)
        yield scrapy.Request(
            url=base_url, headers=self.headers, callback=self.parse_details
        )

    def parse_details(self, response):
        """Walk each handbook chapter block and yield one item per section."""
        for content in response.css("div.handbook-content"):
            chapter_ref = content.xpath(
                "./header/h1/span[@class='extended']/text()"
            ).get()
            chapter = "".join(content.xpath("./header/h1/text()").getall()).strip()
            topic = None  # carries the most recent crosstitle forward
            for section in content.css("section"):
                header = section.css("header")
                table_content = section.css("div.section-content-table")
                if header:
                    topic = header.css("h2.crosstitle::text").get()
                if table_content:
                    # Table section: gather every cell's text.
                    # NOTE(review): rebinding ``content`` here shadows the
                    # outer loop variable of the same name.
                    topic = section.css("h3::text").get()
                    content = section.css("td ::text").getall()
                    clause_text = " ".join(list(content))
                else:
                    content = section.xpath(
                        ".//div[@class='section-content']//text()"
                    ).getall()
                    clause_text = " ".join(list(map(str.strip, content)))
                    # NOTE(review): the uid lookup and the yield sit inside
                    # this ``else`` branch, so table sections never reach the
                    # yield — this is the reported bug.
                    uid = section.xpath(".//span[@class='extended']/text()").get()
                    if section.css("span.section-type").get() is not None:
                        yield {
                            "Unique_ids": uid,
                            "Chapter_ref": chapter_ref,
                            "Chapter": chapter,
                            "Topic": topic,
                            "Clause": uid.split(".")[-2],
                            "Sub_Clause": uid.split(".")[-1],
                            "Type": section.css("span.section-type::text").get(),
                            "Date_applicable": section.xpath(
                                ".//time/span/text()"
                            ).get(),
                            "Text": clause_text,
                        }
The scraper is completely missing the PRIN 3.3.1
section. Can anyone help me figure out how to parse that table?
Solution
The reason it skips the section containing the table is that you parse the table inside the `if table_content:` block, but the method only ever yields an item from the `else` block. When `table_content` is truthy, the `else` block never executes, so no item is ever yielded for that section.
Here is an example of how you could make it work.
def parse_details(self, response):
    """Yield one item per handbook section, including table-based sections.

    Crosstitle sections only update the running ``topic``; every other
    section yields an item when it carries a ``span.section-type`` marker.
    """
    for chapter_node in response.css("div.handbook-content"):
        chapter_ref = chapter_node.xpath(
            "./header/h1/span[@class='extended']/text()"
        ).get()
        chapter = "".join(
            chapter_node.xpath("./header/h1/text()").getall()
        ).strip()
        topic = None
        for section in chapter_node.css("section"):
            crosstitle = section.css("h2.crosstitle::text")
            if crosstitle:
                # A crosstitle only names the topic for following sections.
                topic = crosstitle.get()
                continue
            # Text fragments come either from the section's table cells or
            # from its plain content div.
            table = section.css("div.section-content-table")
            if table:
                topic = section.xpath(".//header/h3/text()").get()
                fragments = table.xpath(".//table//text()").getall()
            else:
                fragments = section.xpath(
                    ".//div[@class='section-content']//text()"
                ).getall()
            clause_text = " ".join(map(str.strip, fragments))
            uid = section.xpath(".//span[@class='extended']/text()").get()
            if section.css("span.section-type").get() is None:
                # Sections without a type marker (e.g. pure headings) are
                # not emitted as items.
                continue
            yield {
                "Unique_ids": uid,
                "Chapter_ref": chapter_ref,
                "Chapter": chapter,
                "Topic": topic,
                "Clause": uid.split(".")[-2],
                "Sub_Clause": uid.split(".")[-1],
                "Type": section.css("span.section-type::text").get(),
                "Date_applicable": section.xpath(
                    ".//time/span/text()"
                ).get(),
                "Text": clause_text,
            }
Answered By - Alexander
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.