Issue
How do I include the items in pagination using meta?
# Look up the href of the "pages-next" link and, when one exists,
# schedule it with the same LEVEL2 callback.
next_page = response.xpath('//a[@class="pages-next"]/@href').get()
if next_page is not None:
    yield response.follow(next_page, callback=self.parse_allbranddevicesurl)
I am pretty sure that I am passing the items incorrectly: in LEVEL2 I only manage to collect the data points from the first page of each brand, and the following pages are never scraped.
Here is the code:
class GsmSpider(scrapy.Spider):
    """Three-level spider: brand list -> device list per brand -> device detail page.

    ``brandname`` and ``devicecount`` are extracted in LEVEL1 and handed
    down to the later callbacks through ``meta``.
    """
    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']

    # LEVEL1 | all brands
    def parse(self, response):
        """Follow the link found in every table cell under ``div.st-text``.

        The link text (``brandname``) and the ``<span>`` text
        (``devicecount``) of the cell are attached to the request as meta.
        """
        gsms = response.xpath('//div[@class="st-text"]/table//td')
        for gsm in gsms:
            allbranddevicesurl = gsm.xpath('.//a/@href').get()
            brandname = gsm.xpath('.//a/text()').get()
            devicecount = gsm.xpath('.//span/text()').get()
            yield response.follow(allbranddevicesurl, callback=self.parse_allbranddevicesurl,
                                  meta= {'brandname': brandname,
                                         'devicecount': devicecount})

    # LEVEL2 | all devices
    def parse_allbranddevicesurl(self, response):
        """Follow every ``<li>`` link under ``#review-body``, then the next page.

        Reads ``brandname`` and ``devicecount`` from ``response.meta`` and
        forwards them to the detail-page requests.
        """
        brandname = response.meta['brandname']
        devicecount = response.meta['devicecount']
        phones = response.xpath('//*[@id="review-body"]//li')
        for phone in phones:
            detailpageurl = phone.xpath('.//a/@href').get()
            yield response.follow(detailpageurl,
                                  callback=self.parse_detailpage,
                                  meta= {'brandname': brandname,
                                         'devicecount': devicecount,})
        next_page = response.xpath('//a[@class="pages-next"]/@href').get()
        if next_page is not None:
            # NOTE: this request is issued without any meta, although the
            # callback it targets reads 'brandname' and 'devicecount' from
            # response.meta.
            yield response.follow(next_page, callback=self.parse_allbranddevicesurl)

    # LEVEL3 | detailpage
    def parse_detailpage(self, response):
        """Yield one item per ``div.article-info`` found on the page.

        Each item carries the ``brandname`` and ``devicecount`` taken from
        ``response.meta`` plus the ``<h1>`` text as ``phonename``.
        """
        brandname = response.meta['brandname']
        devicecount = response.meta['devicecount']
        details = response.xpath('//div[@class="article-info"]')
        for detail in details:
            phonename = detail.xpath('.//h1/text()').get()
            yield {'brandname': brandname,
                   'devicecount': devicecount,
                   'phonename': phonename}
Many thanks in advance for pointing out the error.
Solution
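You forgot to pass the metadata in the next-page request inside LEVEL2. Without it, the response for page 2 has no 'brandname' or 'devicecount' in response.meta, so parse_allbranddevicesurl fails with a KeyError on its first line and nothing beyond the first page is scraped. Pass the same meta again when following the pagination link: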
class GsmSpider(scrapy.Spider):
    """Three-level spider: brand list -> device list per brand -> device detail page.

    ``brandname`` and ``devicecount`` are extracted in LEVEL1 and handed
    down to every later request through ``meta``, including the
    pagination requests of LEVEL2.
    """
    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']

    # LEVEL1 | all brands
    def parse(self, response):
        """Follow the link found in every table cell under ``div.st-text``.

        The link text (``brandname``) and the ``<span>`` text
        (``devicecount``) of the cell are attached to the request as meta.
        Cells that contain no link are skipped.
        """
        gsms = response.xpath('//div[@class="st-text"]/table//td')
        for gsm in gsms:
            allbranddevicesurl = gsm.xpath('.//a/@href').get()
            if allbranddevicesurl is None:
                # response.follow() does not accept None as a URL.
                continue
            brandname = gsm.xpath('.//a/text()').get()
            devicecount = gsm.xpath('.//span/text()').get()
            yield response.follow(allbranddevicesurl,
                                  callback=self.parse_allbranddevicesurl,
                                  meta={'brandname': brandname,
                                        'devicecount': devicecount})

    # LEVEL2 | all devices
    def parse_allbranddevicesurl(self, response):
        """Follow every ``<li>`` link under ``#review-body``, then the next page.

        Reads ``brandname`` and ``devicecount`` from ``response.meta`` and
        forwards them both to the detail-page requests and to the
        pagination request, so that the following pages can read them too.
        List items that contain no link are skipped.
        """
        brandname = response.meta['brandname']
        devicecount = response.meta['devicecount']
        # Built once and reused for every request of this page, so the
        # pagination request cannot end up without it.
        meta = {'brandname': brandname,
                'devicecount': devicecount}
        phones = response.xpath('//*[@id="review-body"]//li')
        for phone in phones:
            detailpageurl = phone.xpath('.//a/@href').get()
            if detailpageurl is None:
                # response.follow() does not accept None as a URL.
                continue
            yield response.follow(detailpageurl,
                                  callback=self.parse_detailpage,
                                  meta=meta)
        next_page = response.xpath('//a[@class="pages-next"]/@href').get()
        if next_page is not None:
            # This is the fixed line: the next-page request now carries
            # the meta as well.
            yield response.follow(next_page,
                                  callback=self.parse_allbranddevicesurl,
                                  meta=meta)

    # LEVEL3 | detailpage
    def parse_detailpage(self, response):
        """Yield one item per ``div.article-info`` found on the page.

        Each item carries the ``brandname`` and ``devicecount`` taken from
        ``response.meta`` plus the ``<h1>`` text as ``phonename``.
        """
        brandname = response.meta['brandname']
        devicecount = response.meta['devicecount']
        details = response.xpath('//div[@class="article-info"]')
        for detail in details:
            phonename = detail.xpath('.//h1/text()').get()
            yield {'brandname': brandname,
                   'devicecount': devicecount,
                   'phonename': phonename}
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.