Issue
I'm trying to tell Scrapy to move to the next page and scrape the content but it stops at the first page.
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class CasaSpider(CrawlSpider):
    """Spider as posted in the question: yields title/price and tries to paginate by hand."""

    name = 'house'
    start_urls = ['https://www.casa.it/affitto/residenziale/napoli/montecalvario-avvocata-san-giuseppe-porto-pendino-mercato?sortType=date_desc']
    # Only rule: links matching /immobili/ (minus /immagine- links) are handed to
    # ``parse``; follow=False means those responses are not mined for more links.
    # NOTE(review): there is no rule for the results-page (pagination) links.
    rules = [
        (Rule(LinkExtractor(allow=(r'/immobili/.*'), deny=(r'/immagine-.*')),
        callback='parse', follow = False)),
    ]

    # NOTE(review): the Scrapy docs advise against using ``parse`` as a
    # CrawlSpider rule callback, because CrawlSpider relies on that method for
    # its own rule handling -- likely related to the behaviour described in the
    # question; confirm against the Scrapy version in use.
    def parse(self, response):
        # One item per response, read from fixed positions under #__next.
        yield {
            'title': response.xpath('//*[@id="__next"]/div[2]/div[2]/div[1]/div/h1/text()').get(),
            'price': response.xpath('//*[@id="__next"]/div[2]/div[2]/div[1]/div/ul/li[1]/text()').get()
        }
        # href of the "next" paginator link; .get() gives None when nothing matches.
        next_page = response.css('a.paginator__page.tp-a--c.b-r--100.is-block.c-bg--w.tp-w--m.paginator__nav.next::attr(href)').get()
        # NOTE(review): urljoin runs before the None check, so next_page is
        # presumably always a string by the time it is tested -- confirm.
        next_page = response.urljoin(next_page)
        if next_page is not None:
            # dont_filter=True bypasses the duplicate-request filter for this request.
            yield scrapy.Request(url=next_page, callback=self.parse, dont_filter=True)
Do you have any idea what I might be doing wrong? When I test the next_page selector
in the Scrapy shell, I get the correct result.
Thank you all for your help
Solution
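Add a second rule that follows the pagination links, so the CrawlSpider walks the result pages itself instead of you requesting the next page manually: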
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class CasaSpider(CrawlSpider):
    """Crawl casa.it rental results for one Naples area and scrape each listing.

    Pagination is handled entirely by the rules: one rule follows the
    result-page links, the other sends every listing detail page to
    ``parse_item``. No manual "next page" request is needed.
    """

    name = 'house'
    start_urls = ['https://www.casa.it/affitto/residenziale/napoli/montecalvario-avvocata-san-giuseppe-porto-pendino-mercato?sortType=date_desc']

    rules = (
        # Result pages: no callback, just keep following them so every page of
        # results is visited. (The trailing ``/*`` in the pattern matches zero
        # or more slashes.)
        Rule(LinkExtractor(allow=(r'/affitto/residenziale/napoli/montecalvario-avvocata-san-giuseppe-porto-pendino-mercato/*')), follow=True),
        # Listing detail pages (image links excluded): scrape them, but do not
        # extract further links from them.
        Rule(LinkExtractor(allow=(r'/immobili/.*'), deny=(r'/immagine-.*')), callback='parse_item', follow=False),
    )

    # The callback is deliberately NOT named ``parse``: the Scrapy docs warn
    # that CrawlSpider uses ``parse`` for its own rule handling, and that
    # overriding it can stop the rules from working.
    def parse_item(self, response):
        """Yield one item with the listing's title and price.

        Either value is ``None`` when its XPath matches nothing on the page.
        """
        yield {
            'title': response.xpath('//*[@id="__next"]/div[2]/div[2]/div[1]/div/h1/text()').get(),
            'price': response.xpath('//*[@id="__next"]/div[2]/div[2]/div[1]/div/ul/li[1]/text()').get()
        }
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.