Issue
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Spider from the question: follow product links and yield the raw detail table.

    ``parse_book`` emits the table's serialized HTML as one string rather
    than its text content, which is the problem the question asks about.
    """

    name = 'book'
    start_urls = ['https://www.amazon.com/s?k=school+bags&rh=n%3A1069242&ref=nb_sb_noss']

    def parse(self, response):
        """Request every product page linked from an ``<h2>`` on the search page."""
        books = response.xpath("//h2/a/@href").extract()
        for book in books:
            # urljoin resolves the href against the URL of the current page.
            url = response.urljoin(book)
            yield Request(url, callback=self.parse_book)

    def parse_book(self, response):
        """Yield the product-details table as a single string under the key 't'."""
        # extract_first() returns the first matching <table> element
        # serialized as markup (or None when nothing matches), not its text.
        table = response.xpath("//table[@id='productDetails_detailBullets_sections1']").extract_first()
        yield {
            't': table
        }
I am trying to scrape the product-information table, but I do not know how
to extract the text from it; my spider only returns the table's raw HTML.
This is the page from which I extract the table:
https://www.amazon.com/Piel-Leather-Double-Flap-Over-Backpack/dp/B00GNEY85A/ref=sr_1_1_sspa?keywords=school%2Bbags&qid=1642846253&s=office-products&sr=1-1-spons&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUExMkdMT1hKSkI1UVFTJmVuY3J5cHRlZElkPUEwNTQxMDA5M0c1R0xRQVUwTVdKViZlbmNyeXB0ZWRBZElkPUEwNzc5Njc4MUdQR09VMVBGSTlGSSZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU&th=1
Solution
To scrape a table, iterate over its rows, use each header cell (`th`) as the key and the matching data cell (`td`) as the value, and then yield the full dictionary. See the sample below:
from scrapy import Spider
from scrapy.http import Request
class AuthorSpider(Spider):
    """Crawl a search-results page and scrape each product's details table.

    ``parse`` follows the product links; ``parse_book`` turns the rows of
    the product-details table(s) into one ``{label: value}`` dict per page.
    """

    name = 'book'
    start_urls = ['https://www.amazon.com/s?k=school+bags&rh=n%3A1069242&ref=nb_sb_noss']

    def parse(self, response):
        """Request every product page linked from an ``<h2>`` on the search page."""
        for href in response.xpath("//h2/a/@href").getall():
            # urljoin resolves the href against the URL of the current page.
            yield Request(response.urljoin(href), callback=self.parse_book)

    def parse_book(self, response):
        """Yield one dict mapping each detail-row label to its text value.

        Rows are the ``<tr>`` children of any table nested under an element
        whose ``id`` contains ``productDetails``. Nothing is yielded when no
        labelled row is found.
        """
        details = {}
        for row in response.xpath("//*[contains(@id,'productDetails')]//table/tr"):
            # normalize-space() always evaluates to a string, so .get()
            # returns '' (never None) when the row has no <th>/<td> text.
            key = row.xpath("normalize-space(./th/text())").get()
            if not key:
                # An unlabelled row would be stored under '' and silently
                # overwritten by the next unlabelled row.
                continue

            label = key.lower()
            if "best sellers rank" in label:
                # The ranks are spread over nested elements; join the
                # non-empty fragments with a space so words do not fuse.
                fragments = row.xpath("./td/descendant::*/text()").getall()
                value = " ".join(part.strip() for part in fragments if part.strip())
            elif "customer reviews" in label:
                # Only the <span> descendants are read for the review summary.
                fragments = row.xpath("./td/descendant::span/text()").getall()
                value = " ".join(part.strip() for part in fragments if part.strip())
            else:
                # Drop the U+200E left-to-right marks embedded in plain cells.
                value = row.xpath("normalize-space(./td/text())").get().replace("\u200e", "")

            details[key] = value

        if not details:
            self.logger.warning("No product details found on %s", response.url)
            return
        yield details
Answered By - msenior_
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.