Issue
I am scraping this website https://www.handbook.fca.org.uk/handbook/PRIN/3/?date=2030-12-01&timeline=True&view=chapter and I would like to parse the table data as well, when a table exists on the page. I have tried catching the table element conditionally, but that approach is not working. Here is my code:
import scrapy
from urllib.parse import urlencode
class HandBook(scrapy.Spider):
    """Scrape PRIN chapter pages of the FCA Handbook into per-section items.

    NOTE(review): this is the question's code, kept as posted. The bug being
    asked about is in ``parse_details``: table sections are collected in the
    ``if table_content:`` branch, but the ``yield`` lives inside the ``else``
    branch, so any section rendered as a table (e.g. PRIN 3.3.1) never
    produces an item.
    """

    name = "handbook_spider"
    custom_settings = {
        # Per-spider log file and export pipeline (pipeline lives elsewhere
        # in the project, not visible here).
        "LOG_FILE": "handbook_spider.log",
        "ITEM_PIPELINES": {
            "handbook_spider.pipelines.HandbookExcelPipeline": 300,
        },
    }
    # Browser-like headers copied from a real Chromium session — presumably
    # to make the request look like a normal browser visit; confirm whether
    # the site actually requires them.
    headers = {
        "authority": "www.handbook.fca.org.uk",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "accept-language": "en,ru;q=0.9",
        "cache-control": "max-age=0",
        "sec-ch-ua": '"Chromium";v="106", "Yandex";v="22", "Not;A=Brand";v="99"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "cross-site",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 YaBrowser/22.11.3.832 (beta) Yowser/2.5 Safari/537.36",
    }
    # Query-string parameters; the far-future date plus timeline=True selects
    # the full chapter view the spider parses.
    params = {
        "date": "2030-12-01",
        "timeline": "True",
        "view": "chapter",
    }
    url = "https://www.handbook.fca.org.uk/handbook/PRIN/3/?"

    def start_requests(self):
        """Issue the single entry request for the PRIN/3 chapter page."""
        base_url = self.url + urlencode(self.params)
        yield scrapy.Request(
            url=base_url, headers=self.headers, callback=self.parse_details
        )

    def parse_details(self, response):
        """Walk each handbook chapter block and yield one item per section."""
        for content in response.css("div.handbook-content"):
            chapter_ref = content.xpath(
                "./header/h1/span[@class='extended']/text()"
            ).get()
            chapter = "".join(content.xpath("./header/h1/text()").getall()).strip()
            topic = None  # carries the most recent crosstitle forward
            for section in content.css("section"):
                header = section.css("header")
                table_content = section.css("div.section-content-table")
                if header:
                    topic = header.css("h2.crosstitle::text").get()
                if table_content:
                    # Table section: gather every cell's text.
                    # NOTE(review): rebinding ``content`` here shadows the
                    # outer loop variable of the same name.
                    topic = section.css("h3::text").get()
                    content = section.css("td ::text").getall()
                    clause_text = " ".join(list(content))
                else:
                    content = section.xpath(
                        ".//div[@class='section-content']//text()"
                    ).getall()
                    clause_text = " ".join(list(map(str.strip, content)))
                    # NOTE(review): the uid lookup and the yield sit inside
                    # this ``else`` branch, so table sections never reach the
                    # yield — this is the reported bug.
                    uid = section.xpath(".//span[@class='extended']/text()").get()
                    if section.css("span.section-type").get() is not None:
                        yield {
                            "Unique_ids": uid,
                            "Chapter_ref": chapter_ref,
                            "Chapter": chapter,
                            "Topic": topic,
                            "Clause": uid.split(".")[-2],
                            "Sub_Clause": uid.split(".")[-1],
                            "Type": section.css("span.section-type::text").get(),
                            "Date_applicable": section.xpath(
                                ".//time/span/text()"
                            ).get(),
                            "Text": clause_text,
                        }
The scraper is completely missing the PRIN 3.3.1
section. Can anyone help me figure out how to parse that table?
Solution
The reason it skips the section containing the table is that you parse the table inside the `if table_content:` block, but the method only ever yields an item from the `else` block. When `table_content` is truthy, the `else` block never executes, so no item is ever yielded for that section.
Here is an example of how you could make it work.
def parse_details(self, response):
    """Yield one item per handbook section, including table-based sections.

    Crosstitle sections only update the running ``topic``; every other
    section yields an item when it carries a ``span.section-type`` marker.
    """
    for chapter_node in response.css("div.handbook-content"):
        chapter_ref = chapter_node.xpath(
            "./header/h1/span[@class='extended']/text()"
        ).get()
        chapter = "".join(
            chapter_node.xpath("./header/h1/text()").getall()
        ).strip()
        topic = None
        for section in chapter_node.css("section"):
            crosstitle = section.css("h2.crosstitle::text")
            if crosstitle:
                # A crosstitle only names the topic for following sections.
                topic = crosstitle.get()
                continue
            # Text fragments come either from the section's table cells or
            # from its plain content div.
            table = section.css("div.section-content-table")
            if table:
                topic = section.xpath(".//header/h3/text()").get()
                fragments = table.xpath(".//table//text()").getall()
            else:
                fragments = section.xpath(
                    ".//div[@class='section-content']//text()"
                ).getall()
            clause_text = " ".join(map(str.strip, fragments))
            uid = section.xpath(".//span[@class='extended']/text()").get()
            if section.css("span.section-type").get() is None:
                # Sections without a type marker (e.g. pure headings) are
                # not emitted as items.
                continue
            yield {
                "Unique_ids": uid,
                "Chapter_ref": chapter_ref,
                "Chapter": chapter,
                "Topic": topic,
                "Clause": uid.split(".")[-2],
                "Sub_Clause": uid.split(".")[-1],
                "Type": section.css("span.section-type::text").get(),
                "Date_applicable": section.xpath(
                    ".//time/span/text()"
                ).get(),
                "Text": clause_text,
            }
Answered By - Alexander
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.