Issue
I want to scrape multiple pages that belong to the same item, but every time I yield, Scrapy emits each sub-item separately instead of collecting all the sub-items into one item's list.
class GdSpider(scrapy.Spider):
    name = 'pcs'
    start_urls = [...]

    def parse(self, response):
        PC = dict()
        PC['Name'] = response.css('h2::text').get()
        components_urls = response.css('a::attr(href)').get()
        components = []
        for url in components_urls:
            req = yield scrapy.Request(response.urljoin(url), self.parse_component)
            components.append(parse_component(req))
        PC['components'] = components
        yield PC

    def parse_component(self, response):
        component_name = response.css('h1::text')
        component_tag = response.css('div[class="tag"]::text').get()
        yield {"component_name": component_name, "component_tag": component_tag}
My output should look like:
{"Name": "HP 15", "components": [.....]}
But it scrapes everything independently:
{"Name": "HP 15", "components": [<generator object GdSpider.parse_part_component at 0x000001B8A7405230>]
{component1}
{component2}
How can I return one item with all the components inside it, for example using the @inline_requests decorator?
Solution
First, why the original code fails: req = yield scrapy.Request(...) does not download the page inline. The request is handed to the scheduler and the yield expression evaluates to None, while parse_component(req) merely creates a generator object without running it, which is why each component is scraped as an independent item. Two ways to collect everything into one item:
Option 1: Use async/await
class GdSpider(scrapy.Spider):
    name = 'pcs'
    start_urls = [...]

    async def parse(self, response):
        PC = dict()
        PC['Name'] = response.css('h2::text').get()
        # getall(), not get(): we want the list of hrefs, not just the first one
        components_urls = response.css('a::attr(href)').getall()
        components = []
        for url in components_urls:
            # No callback needed; parse_component is called explicitly below
            req = scrapy.Request(response.urljoin(url))
            # Download the page directly, bypassing the scheduler
            # (on recent Scrapy versions the spider argument is deprecated,
            # so plain download(req) may be required)
            res = await self.crawler.engine.download(req, self)
            components.append(self.parse_component(res))
        PC['components'] = components
        yield PC

    def parse_component(self, response):
        component_name = response.css('h1::text').get()
        component_tag = response.css('div[class="tag"]::text').get()
        return {"component_name": component_name, "component_tag": component_tag}
Option 2: Use a class member variable (notice that CONCURRENT_REQUESTS is set to 1).
class GdSpider(scrapy.Spider):
    name = 'pcs'
    start_urls = [...]
    components = []  # shared accumulator for all component sub-items
    custom_settings = {'CONCURRENT_REQUESTS': 1}

    def parse(self, response):
        PC = dict()
        PC['Name'] = response.css('h2::text').get()
        components_urls = response.css('a::attr(href)').getall()
        for url in components_urls:
            yield scrapy.Request(response.urljoin(url), self.parse_component)
        # PC holds a reference to the shared list, which the callbacks fill in
        PC['components'] = self.components
        yield PC

    def parse_component(self, response):
        component_name = response.css('h1::text').get()
        component_tag = response.css('div[class="tag"]::text').get()
        self.components.append({"component_name": component_name, "component_tag": component_tag})
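One caveat with this approach: PC only holds a reference to the shared list, and the component requests have merely been scheduled (not yet downloaded) when PC is yielded, so an exporter that serializes items as they arrive may still see an empty components list.
Option 3: Use the @inline_requests decorator.
Since the question asks about it: a minimal sketch, assuming the third-party scrapy-inline-requests package is installed (pip install scrapy-inline-requests). Inside a method decorated with @inline_requests, yielding a request suspends the callback and hands back the downloaded response:

from inline_requests import inline_requests

class GdSpider(scrapy.Spider):
    name = 'pcs'
    start_urls = [...]

    @inline_requests
    def parse(self, response):
        PC = {'Name': response.css('h2::text').get()}
        components = []
        for url in response.css('a::attr(href)').getall():
            # yield evaluates to the response here; the request must not set a callback
            component_response = yield scrapy.Request(response.urljoin(url))
            components.append(self.parse_component(component_response))
        PC['components'] = components
        yield PC

    def parse_component(self, response):
        component_name = response.css('h1::text').get()
        component_tag = response.css('div[class="tag"]::text').get()
        return {"component_name": component_name, "component_tag": component_tag}

This behaves like Option 1 but works with a plain (non-async) callback; the package is a third-party add-on, so on modern Scrapy the async/await approach in Option 1 is the more idiomatic choice.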
Answered By - SuperUser