Tuesday, December 19, 2023

[FIXED] Scrapy - Only first url in url list is scraped

December 19, 2023 html, python, scrapy, web-scraping, xpath No comments

Issue

I'm scraping reviews from restaurants in Rome, Milan and Bergamo. For each one of those cities there's one dedicated url containing 30 or more restaurants. The scraper starts crawling the Rome restaurants but never switches to the other cities. It correctly scrapes all the restaurants and reviews from Rome but then the spider is closed.

The Rome restaurants are scraped concurrently, and I would expect the same behaviour for the start URLs, but only the first one is taken into consideration.

class ReviewSpider2(scrapy.Spider):
    """Spider that walks TripAdvisor restaurant listing pages and yields one
    ReviewscraperItem per review found on each restaurant page.
    """

    name= 'reviews2'


    def start_requests(self):
        """Yield one request per entry in ``urls``, handled by parse_restaurants."""
        # NOTE(review): there are no commas between the string literals below,
        # so Python joins them into a single string and ``urls`` holds exactly
        # one element, not three.
        urls = [
        'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html'
        'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html'
        'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
        ]
        for url in urls:
            yield scrapy.Request(url, callback = self.parse_restaurants)
        
    def parse_restaurants(self, response):    
        """Follow every distinct '/Restaurant_Review' link found in the list
        items of a listing page, handing each response to parse_restaurant.
        """
        # set() removes duplicate hrefs before the links are followed.
        all_restaurants = list(set(response.xpath("//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href").extract()))
        for restaurant in all_restaurants:
            url = 'https://www.tripadvisor.it' + restaurant
            yield response.follow(url, callback = self.parse_restaurant)

    def parse_restaurant(self, response):
        """Yield one item per review container on a restaurant page, then
        request the next page of reviews with this same callback.
        """

        all_reviews_containers = response.xpath('//div[@class="rev_wrap ui_columns is-multiline"]/div[2]')
        # NOTE(review): xpath() presumably returns a (possibly empty) selector
        # list rather than None, so this check likely always passes - confirm.
        if all_reviews_containers is not None:
            for review_container in all_reviews_containers:
                items = ReviewscraperItem()
                # Restaurant name and address are read from the whole page
                # (response); the remaining fields come from this review only.
                items['restaurant_name'] = response.css('.HjBfq::text').extract_first()
                # Placeholder value; overwritten by the lookup a few lines below.
                items['rating'] = 0
                # Maps the rating span's full class attribute to a 1-5 score.
                rating_classes = {
                    'ui_bubble_rating bubble_50': 5,
                    'ui_bubble_rating bubble_40': 4,
                    'ui_bubble_rating bubble_30': 3,
                    'ui_bubble_rating bubble_20': 2,
                    'ui_bubble_rating bubble_10': 1
                }
                rating_class = review_container.css('span::attr(class)').extract_first()
                # dict.get() gives None when the class string is not in the map.
                items['rating'] = rating_classes.get(rating_class)
                items['quote'] = review_container.css('.noQuotes::text').extract_first()
                items['address'] = response.xpath("//span/span/a[@class='AYHFM']/text()").extract_first()
                items['review'] = review_container.css('.partial_entry::text').extract_first()
                yield items
            #check if the next page button is disabled (there are no pages left)
            if response.xpath('//a[@class = "nav next ui_button primary disabled"]').extract_first() is None:
                # NOTE(review): if the page has no enabled "next" link either,
                # extract_first() presumably returns None and this
                # concatenation would fail - confirm.
                next_page = 'https://www.tripadvisor.it' + response.xpath('//a[@class = "nav next ui_button primary"]/@href').extract_first()
                yield response.follow(url=next_page, callback = self.parse_restaurant)

Solution

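You're missing the commas between the URL strings. Python concatenates adjacent string literals, so your list contains one long, invalid URL instead of three separate ones; see the comments: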

import scrapy


class ReviewSpider2(scrapy.Spider):
    """Request the Rome, Milan and Bergamo restaurant listings and print the
    absolute URL of every restaurant link found on each of them.
    """

    name = 'reviews2'
    allowed_domains = ['tripadvisor.it']

    # Browser-like headers sent with each of the start requests.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "DNT": "1",
        "Host": "www.tripadvisor.it",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "TE": "trailers",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    }

    def start_requests(self):
        """Yield one request per city listing page, sent with ``headers``."""
        # The commas are the fix. The question's list had none between its
        # three URL strings, and Python joins adjacent string literals into
        # one, e.g. ['1' '2' '3'] == ['123'], so that list held a single
        # (invalid) URL.
        city_pages = (
            'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html',
            'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html',
            'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html',
        )
        for page in city_pages:
            yield scrapy.Request(
                page,
                callback=self.parse_restaurants,
                headers=self.headers,
            )

    def parse_restaurants(self, response):
        """Print the absolute URL of each restaurant link on a listing page."""
        site_root = 'https://www.tripadvisor.it'
        link_query = "//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href"

        # The question wrapped this result in list(set(...)); that is
        # unnecessary because Scrapy has a built-in duplicate filter.
        for href in response.xpath(link_query).getall():
            # Printing keeps the example short. In the full spider, follow the
            # link instead:
            #   yield response.follow(url, callback=self.parse_restaurant)
            print(site_root + href)

Answered By - SuperUser

This answer was collected from Stack Overflow and tested by PythonFixing community admins. It is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

Tuesday, December 19, 2023

[FIXED] Scrapy - Only first url in url list is scraped

Issue

Solution

0 comments:

Post a Comment

Popular Posts

Labels