Issue
How do I pass items using meta in a spider that crawls three levels of pages?
What am I missing, and how do I create a new item for every iteration?
Here is the code:
import scrapy
class GsmSpider(scrapy.Spider):
    """Spider from the question: brand list -> device lists -> device detail pages.

    The brand name is forwarded from callback to callback through the
    ``meta`` dict of each request.
    """

    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']

    # LEVEL1 | all brands
    def parse(self, response):
        """Request the device-list page for each matched brand node."""
        # NOTE(review): this expression matches the <table> elements themselves,
        # and .get() below returns only the first <a> found inside each match.
        gsms = response.xpath('//div[@class="st-text"]/table')
        for gsm in gsms:
            allbranddevicesurl = gsm.xpath('.//a/@href').get()
            brandname = gsm.xpath('.//a/text()').get()
            yield response.follow(allbranddevicesurl, callback=self.parse_allbranddevicesurl,
                                  meta= {'brandname': brandname})

    # LEVEL2 | all devices
    def parse_allbranddevicesurl(self, response):
        """Request every device detail page on this list page, then the next list page."""
        brandname = response.meta['brandname']
        phones = response.xpath('//*[@id="review-body"]//li')
        for phone in phones:
            # NOTE(review): thumbnailurl is extracted but never passed on or yielded.
            thumbnailurl = phone.xpath('.//a/img/@src').get()
            detailpageurl = phone.xpath('.//a/@href').get()
            yield response.follow(detailpageurl,
                                  callback=self.parse_detailpage,
                                  meta= {'brandname': brandname})
        # Pagination: re-enter this callback with the same brand name.
        next_page = response.xpath('//a[@class="pages-next"]/@href').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse_allbranddevicesurl,
                                  meta= {'brandname': brandname})

    # LEVEL3 | detailpage
    def parse_detailpage(self, response):
        """Yield one dict with the brand name and phone name per matched info block."""
        brandname = response.meta['brandname']
        details = response.xpath('//div[@class="article-info"]')
        for detail in details:
            phonename = detail.xpath('.//h1/text()').get()
            yield {'brandname': brandname,
                   'phonename': phonename}
I would appreciate an illustrative solution based on my example.
Solution
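Your XPath for the brands was not correct: it selected the whole table instead of each table cell, so only the first link in the table was followed. To carry data through all three levels, create a new item for every device in the level-2 loop and pass it to the detail-page callback via meta. See the comments in the code.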
import scrapy
# You may want to move this class to "items.py" and import it from there.
class GsmItem(scrapy.Item):
    """Container for the data collected about one device."""

    # Brand link text taken from the brand listing page.
    brandname = scrapy.Field()
    # Value of the <img src> attribute in the device's list entry.
    thumbnailurl = scrapy.Field()
    # Value of the <a href> attribute pointing to the device's detail page.
    detailpageurl = scrapy.Field()
    # Text of the <h1> heading on the detail page.
    phonename = scrapy.Field()
    # Text read from the first list entry of the detail page's info block
    # (presumably the release date - confirm against the page markup).
    released = scrapy.Field()
class GsmSpider(scrapy.Spider):
    """Crawl gsmarena.com in three levels and yield one item per device.

    Level 1 (``parse``) follows every brand link, level 2
    (``parse_allbranddevicesurl``) creates one item per listed device and
    follows its detail link, level 3 (``parse_detailpage``) completes the
    item and yields it. Data travels between levels in ``Request.meta``.
    """

    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']
    custom_settings = {
        'CONCURRENT_REQUESTS': 4,
        'DOWNLOAD_DELAY': 0.5
    }

    # LEVEL 1 | all brands
    def parse(self, response):
        """Yield one request per brand cell, carrying the brand name in meta."""
        # The original XPath, '//div[@class="st-text"]/table', was wrong: it
        # selected the table itself rather than the cells inside it.
        # Selecting every <td> gives one node per cell.
        gsms = response.xpath('//div[@class="st-text"]/table//td')
        for gsm in gsms:
            allbranddevicesurl = gsm.xpath('.//a/@href').get()
            if allbranddevicesurl is None:
                # A cell without a link: response.follow(None) would raise and
                # abort this callback, dropping every remaining brand.
                continue
            brandname = gsm.xpath('.//a/text()').get()
            yield response.follow(allbranddevicesurl, callback=self.parse_allbranddevicesurl,
                                  meta={'brandname': brandname})

    # LEVEL 2 | all devices
    def parse_allbranddevicesurl(self, response):
        """Yield a detail-page request per device, then the next list page.

        Each detail-page request carries its own ``GsmItem`` in
        ``meta['item']``; the pagination request carries the brand name in
        ``meta['brandname']`` so this callback can run again.
        """
        brandname = response.meta['brandname']
        phones = response.xpath('//*[@id="review-body"]//li')
        for phone in phones:
            detailpageurl = phone.xpath('.//a/@href').get()
            if detailpageurl is None:
                # No detail link in this entry; skip it instead of letting
                # response.follow(None) raise and end the whole callback.
                continue
            # We should create a new item for every iteration so we won't
            # overwrite the one attached to a previous request.
            item = GsmItem()
            item['thumbnailurl'] = phone.xpath('.//a/img/@src').get()
            item['detailpageurl'] = detailpageurl
            item['brandname'] = brandname
            yield response.follow(detailpageurl,
                                  callback=self.parse_detailpage,
                                  meta={'item': item})
        next_page = response.xpath('//a[@class="pages-next"]/@href').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse_allbranddevicesurl,
                                  meta={'brandname': brandname})

    # LEVEL 3 | detailpage
    def parse_detailpage(self, response):
        """Complete the item received in ``meta['item']`` and yield it."""
        item = response.meta['item']
        details = response.xpath('//div[@class="article-info"]')
        for detail in details:
            # Work on a copy so that, if more than one block matches, each
            # yielded item is an independent object rather than the same
            # instance being overwritten and yielded again.
            detail_item = item.copy()
            detail_item['phonename'] = detail.xpath('.//h1/text()').get()
            detail_item['released'] = detail.xpath('.//ul/li[1]/span[1]/span/text()').get()
            yield detail_item
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.