Issue
I have the following code:
import scrapy
from datetime import datetime, timedelta
class TigerOffer(scrapy.Item):
    """Fields collected for a single listing on the search-results page."""

    product_id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    city = scrapy.Field()
    # state and post_date are declared here but never assigned by the
    # spider's parse() method shown in this snippet.
    state = scrapy.Field()
    post_date = scrapy.Field()
    post_time = scrapy.Field()
    thumb_url = scrapy.Field()
    is_featured = scrapy.Field()
    list_position = scrapy.Field()
class TigerOfferSpider(scrapy.Spider):
    """Spider for the "tiger 800" motorcycle search results on sp.olx.com.br.

    This is the code as originally posted in the question (only the block
    structure has been restored); it is the version that fails to reach
    the next results page.
    """

    name = 'tigeroffs'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
        # Asks Scrapy to close the spider once 30 items have been scraped.
        'CLOSESPIDER_ITEMCOUNT': 30
    }
    # NOTE(review): this entry is a full URL (scheme included), not a bare
    # domain name.
    allowed_domains = ['https://sp.olx.com.br']
    start_urls = ['https://sp.olx.com.br/autos-e-pecas/motos?q=tiger%20800']

    def parse(self, response):
        """Yield one item per listing, then request the next results page.

        Args:
            response: the downloaded search-results page.

        Yields:
            The populated ``TigerOffer`` for each ``<li>`` in the results
            list, followed by a ``scrapy.Request`` for the next page when
            the pagination XPath matches.
        """
        # NOTE(review): a single item instance is created here and re-filled
        # on every loop iteration below.
        offerItem = TigerOffer()
        offerList = response.xpath("//ul[@class='sc-1fcmfeb-1 kntIvV']/li")
        for offer in offerList:
            offerItem["product_id"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@data-lurker_list_id").get()
            offerItem["url"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@href').get()
            offerItem["title"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@title").get()
            offerItem["price"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-ifAKCX eoKYee']/text()").get()
            offerItem["post_time"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//p[@class='sc-1iuc9a2-4 hDBjae sc-ifAKCX fWUyFm']/text()").get()
            offerItem["city"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-7l84qu-1 ciykCV sc-ifAKCX dpURtf']/@title").get()
            offerItem["thumb_url"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//div[@class='fnmrjs-1 gIEtsI']//img/@src").get()
            offerItem["is_featured"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_is_featured').get()
            offerItem["list_position"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_list_position').get()
            yield offerItem

        # Pagination link, located by a positional XPath into the page layout.
        next_page_url = response.xpath('//*[@id="listing-main-content-slot"]/div[9]/div/div/div[2]/div/div[1]/div/a/@href').extract_first()
        if next_page_url is not None:
            # No callback is given, so the response is handled by parse().
            yield scrapy.Request(response.urljoin(next_page_url))
The problem is, I'm not able to go to the next page.
The variable next_page_url exists and isn't None.
I've been looking for some answers, but I couldn't find anything.
Does anyone know how to figure it out?
I appreciate it.
Solution
Main Problem
The root cause of not being able to navigate to the next page is the 'CLOSESPIDER_ITEMCOUNT': 30
setting in your code.
Explanation
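Including 'CLOSESPIDER_ITEMCOUNT': 30 in your code
signals the spider to shut down once the number of scraped items reaches 30. Because Scrapy is asynchronous, it will finish scraping the items that are already in progress but will not process any further requests afterwards. Since the spider is already shutting down by then, the yield scrapy.Request(response.urljoin(next_page_url))
has no effect. Note that the modified code below also changes allowed_domains from 'https://sp.olx.com.br' to 'sp.olx.com.br': this attribute expects bare domain names, and an entry that includes the scheme can cause Scrapy's offsite filtering to drop the next-page request.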
Your Code after modification
import scrapy
from datetime import datetime, timedelta
class TigerOffer(scrapy.Item):
    """Fields collected for a single listing on the search-results page."""

    product_id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    city = scrapy.Field()
    # state and post_date are declared here but never assigned by the
    # spider's parse() method shown in this snippet.
    state = scrapy.Field()
    post_date = scrapy.Field()
    post_time = scrapy.Field()
    thumb_url = scrapy.Field()
    is_featured = scrapy.Field()
    list_position = scrapy.Field()
class TigerOfferSpider(scrapy.Spider):
    """Spider for the "tiger 800" motorcycle search results on sp.olx.com.br.

    Each results page yields one ``TigerOffer`` per listing; the spider then
    follows the pagination link, if one is found, and parses that page the
    same way.
    """

    name = 'tigeroffs'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
    }
    # Bare domain name, without a URL scheme.
    allowed_domains = ['sp.olx.com.br']
    start_urls = ['https://sp.olx.com.br/autos-e-pecas/motos?q=tiger%20800']

    def parse(self, response):
        """Yield one item per listing, then follow the next results page.

        Args:
            response: the downloaded search-results page.

        Yields:
            A new ``TigerOffer`` for each ``<li>`` in the results list,
            followed by a request for the next page (handled by this same
            method) when the pagination XPath matches.
        """
        offers = response.xpath("//ul[@class='sc-1fcmfeb-1 kntIvV']/li")
        for offer in offers:
            # Every field lives on, or underneath, the listing's anchor
            # element, so select it once and query relative to it.
            link = offer.xpath("a[@class='fnmrjs-0 fyjObc']")

            # Build a fresh item per listing: re-filling one shared instance
            # would hand the same mutable object to every downstream consumer.
            item = TigerOffer()
            item["product_id"] = link.xpath("@data-lurker_list_id").get()
            item["url"] = link.xpath("@href").get()
            item["title"] = link.xpath("@title").get()
            item["price"] = link.xpath(".//span[@class='sc-ifAKCX eoKYee']/text()").get()
            item["post_time"] = link.xpath(".//p[@class='sc-1iuc9a2-4 hDBjae sc-ifAKCX fWUyFm']/text()").get()
            item["city"] = link.xpath(".//span[@class='sc-7l84qu-1 ciykCV sc-ifAKCX dpURtf']/@title").get()
            item["thumb_url"] = link.xpath(".//div[@class='fnmrjs-1 gIEtsI']//img/@src").get()
            item["is_featured"] = link.xpath("@data-lurker_is_featured").get()
            item["list_position"] = link.xpath("@data-lurker_list_position").get()
            yield item

        # Pagination link, located by a positional XPath into the page layout.
        next_page_url = response.xpath('//*[@id="listing-main-content-slot"]/div[9]/div/div/div[2]/div/div[1]/div/a/@href').get()
        if next_page_url is not None:
            # response.follow resolves relative hrefs against the current
            # page URL, so both absolute and relative links work.
            yield response.follow(next_page_url, callback=self.parse)
Answered By - Shivam
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.