Sunday, January 2, 2022

[FIXED] Scrapy and next pages

January 02, 2022 python, scrapy, web-scraping No comments

Issue

I have the following code:

import scrapy
from datetime import datetime, timedelta

class TigerOffer(scrapy.Item):
    """Container for the data scraped from one offer in the listing page."""

    # Filled from the offer anchor's data-lurker_list_id attribute.
    product_id = scrapy.Field()
    # Filled from the offer anchor's href attribute.
    url = scrapy.Field()
    # Filled from the offer anchor's title attribute.
    title = scrapy.Field()
    # Text of a <span> nested inside the offer anchor.
    price = scrapy.Field()
    # title attribute of a <span> nested inside the offer anchor.
    city = scrapy.Field()
    # Declared but never assigned by the spider's parse() in this listing.
    state = scrapy.Field()
    # Declared but never assigned by the spider's parse() in this listing.
    post_date = scrapy.Field()
    # Text of a <p> nested inside the offer anchor.
    post_time = scrapy.Field()
    # src attribute of the <img> nested inside the offer anchor.
    thumb_url = scrapy.Field()
    # Filled from the offer anchor's data-lurker_is_featured attribute.
    is_featured = scrapy.Field()
    # Filled from the offer anchor's data-lurker_list_position attribute.
    list_position = scrapy.Field()

class TigerOfferSpider(scrapy.Spider):
    """Spider that scrapes offers from the OLX search page in start_urls.

    This is the listing as posted in the question; it does not manage to
    crawl past the first page.
    """

    name = 'tigeroffs'
    custom_settings = {
    'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',    
    # NOTE(review): the answer in this post names this setting as the reason
    # the next-page request is never processed.
    'CLOSESPIDER_ITEMCOUNT': 30  
    }

    # NOTE(review): this entry includes the URL scheme; the corrected listing
    # in this post uses the bare domain 'sp.olx.com.br' instead.
    allowed_domains = ['https://sp.olx.com.br']
    start_urls = ['https://sp.olx.com.br/autos-e-pecas/motos?q=tiger%20800']

    def parse(self, response):
        """Yield one item per <li> of the result list, then a next-page request.

        The next-page request is only yielded when the pagination XPath
        matches an href.
        """
        # NOTE(review): a single item instance is created here and then
        # re-filled and re-yielded on every loop iteration, so every yield
        # hands out the same object -- confirm this is intended.
        offerItem = TigerOffer()
        offerList = response.xpath("//ul[@class='sc-1fcmfeb-1 kntIvV']/li")
        
        for offer in offerList: 
            # Every field is read from (or from inside) the entry's
            # <a class="fnmrjs-0 fyjObc"> child; .get() returns the first match.
            offerItem["product_id"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@data-lurker_list_id").get()
            offerItem["url"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@href').get()
            offerItem["title"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']/@title").get()
            offerItem["price"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-ifAKCX eoKYee']/text()").get()
            offerItem["post_time"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//p[@class='sc-1iuc9a2-4 hDBjae sc-ifAKCX fWUyFm']/text()").get()
            offerItem["city"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//span[@class='sc-7l84qu-1 ciykCV sc-ifAKCX dpURtf']/@title").get()
            offerItem["thumb_url"] = offer.xpath("a[@class='fnmrjs-0 fyjObc']//div[@class='fnmrjs-1 gIEtsI']//img/@src").get()
            offerItem["is_featured"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_is_featured').get()
            offerItem["list_position"] = offer.xpath('a[@class="fnmrjs-0 fyjObc"]/@data-lurker_list_position').get()

            yield offerItem

        # Positional XPath to the pagination link's href.
        next_page_url = response.xpath('//*[@id="listing-main-content-slot"]/div[9]/div/div/div[2]/div/div[1]/div/a/@href').extract_first()
        if next_page_url is not None:
            # No callback is passed; presumably Scrapy routes the response
            # back to parse() by default -- verify against the Scrapy docs.
            yield scrapy.Request(response.urljoin(next_page_url))

The problem is that I'm not able to go to the next page. The variable next_page_url exists and isn't None. I've been looking for answers, but I couldn't find anything.

Does anyone know how to figure it out?

I appreciate it.

Solution

Main Problem

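The main reason the spider cannot navigate to the next page is the 'CLOSESPIDER_ITEMCOUNT': 30 setting in your code. Note that the corrected code below also changes allowed_domains from 'https://sp.olx.com.br' to 'sp.olx.com.br', because that attribute expects bare domain names rather than URLs.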

Explanation

Including 'CLOSESPIDER_ITEMCOUNT': 30 in your code signals the spider to shut down once the scraped item count reaches 30. Because of Scrapy's asynchronous nature, it will finish scraping the items already in progress but will not process any other requests afterwards. Since the spider has already been shut down by then, the yield scrapy.Request(response.urljoin(next_page_url)) has no effect.

Your Code after modification

import scrapy
from datetime import datetime, timedelta

class TigerOffer(scrapy.Item):
    """Container for the data scraped from one offer in the listing page."""

    # Filled from the offer anchor's data-lurker_list_id attribute.
    product_id = scrapy.Field()
    # Filled from the offer anchor's href attribute.
    url = scrapy.Field()
    # Filled from the offer anchor's title attribute.
    title = scrapy.Field()
    # Text of a <span> nested inside the offer anchor.
    price = scrapy.Field()
    # title attribute of a <span> nested inside the offer anchor.
    city = scrapy.Field()
    # Declared but never assigned by the spider's parse() in this listing.
    state = scrapy.Field()
    # Declared but never assigned by the spider's parse() in this listing.
    post_date = scrapy.Field()
    # Text of a <p> nested inside the offer anchor.
    post_time = scrapy.Field()
    # src attribute of the <img> nested inside the offer anchor.
    thumb_url = scrapy.Field()
    # Filled from the offer anchor's data-lurker_is_featured attribute.
    is_featured = scrapy.Field()
    # Filled from the offer anchor's data-lurker_list_position attribute.
    list_position = scrapy.Field()

class TigerOfferSpider(scrapy.Spider):
    """Spider that scrapes offers from the OLX search page and follows pagination."""

    name = 'tigeroffs'
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/71.0.3578.80 Chrome/71.0.3578.80 Safari/537.36',
    }

    # Bare domain name only (no scheme): follow-up requests are checked
    # against these entries.
    allowed_domains = ['sp.olx.com.br']
    start_urls = ['https://sp.olx.com.br/autos-e-pecas/motos?q=tiger%20800']

    def parse(self, response):
        """Extract every offer on the page, then request the next page.

        Yields:
            One ``TigerOffer`` per ``<li>`` of the result list, followed by a
            ``scrapy.Request`` for the next page (handled by this same
            method) when a pagination link is found.
        """
        offerList = response.xpath("//ul[@class='sc-1fcmfeb-1 kntIvV']/li")

        for offer in offerList:
            # Build a fresh item for every listing: re-yielding one shared
            # instance would make all yielded items alias the same object.
            offerItem = TigerOffer()

            # All fields live on, or inside, the entry's offer anchor, so
            # select it once and query relative to it.
            link = offer.xpath("a[@class='fnmrjs-0 fyjObc']")

            offerItem["product_id"] = link.xpath("@data-lurker_list_id").get()
            offerItem["url"] = link.xpath("@href").get()
            offerItem["title"] = link.xpath("@title").get()
            offerItem["price"] = link.xpath(".//span[@class='sc-ifAKCX eoKYee']/text()").get()
            offerItem["post_time"] = link.xpath(".//p[@class='sc-1iuc9a2-4 hDBjae sc-ifAKCX fWUyFm']/text()").get()
            offerItem["city"] = link.xpath(".//span[@class='sc-7l84qu-1 ciykCV sc-ifAKCX dpURtf']/@title").get()
            offerItem["thumb_url"] = link.xpath(".//div[@class='fnmrjs-1 gIEtsI']//img/@src").get()
            offerItem["is_featured"] = link.xpath("@data-lurker_is_featured").get()
            offerItem["list_position"] = link.xpath("@data-lurker_list_position").get()

            yield offerItem

        next_page_url = response.xpath('//*[@id="listing-main-content-slot"]/div[9]/div/div/div[2]/div/div[1]/div/a/@href').get()
        if next_page_url is not None:
            # urljoin resolves a relative href against the current page;
            # scrapy.Request itself only accepts absolute URLs.
            yield scrapy.Request(response.urljoin(next_page_url), callback=self.parse)

Answered By - Shivam

This answer was collected from Stack Overflow and tested by the PythonFixing community admins; it is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

Sunday, January 2, 2022

[FIXED] Scrapy and next pages

Issue

Solution

0 comments:

Post a Comment

Popular Posts

Labels