Issue
I'm scraping reviews from restaurants in Rome, Milan and Bergamo. For each one of those cities there's one dedicated url containing 30 or more restaurants. The scraper starts crawling the Rome restaurants but never switches to the other cities. It correctly scrapes all the restaurants and reviews from Rome but then the spider is closed.
The Rome restaurants are scraped concurrently, so I would expect the same behaviour for the start URLs. Instead, only the first one is taken into consideration.
class ReviewSpider2(scrapy.Spider):
    """Crawl TripAdvisor restaurant listing pages and yield one item per review."""

    name = 'reviews2'

    def start_requests(self):
        """Yield one listing-page request per entry in the start URL list."""
        # NOTE(review): the literals below are adjacent with no separating
        # commas, so Python joins them into one string and this list holds a
        # single element. Kept exactly as posted, because this is the code
        # the question is asking about.
        urls = [
            'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html'
            'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html'
            'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
        ]
        for start_url in urls:
            yield scrapy.Request(start_url, callback=self.parse_restaurants)

    def parse_restaurants(self, response):
        """Follow every distinct restaurant link found on a listing page."""
        hrefs = response.xpath(
            "//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href"
        ).extract()
        # Going through a set drops hrefs that appear more than once on the page.
        for href in set(hrefs):
            restaurant_url = 'https://www.tripadvisor.it' + href
            yield response.follow(restaurant_url, callback=self.parse_restaurant)

    def parse_restaurant(self, response):
        """Yield one item per review on the page, then request the next page."""
        # Class attribute of the rating <span> -> numeric score. A class that
        # is not listed here leaves the rating as None.
        bubble_scores = {
            'ui_bubble_rating bubble_50': 5,
            'ui_bubble_rating bubble_40': 4,
            'ui_bubble_rating bubble_30': 3,
            'ui_bubble_rating bubble_20': 2,
            'ui_bubble_rating bubble_10': 1,
        }

        review_nodes = response.xpath('//div[@class="rev_wrap ui_columns is-multiline"]/div[2]')
        if review_nodes is not None:
            for node in review_nodes:
                item = ReviewscraperItem()
                # Name and address are page-level values, repeated on every item.
                item['restaurant_name'] = response.css('.HjBfq::text').extract_first()
                bubble_class = node.css('span::attr(class)').extract_first()
                item['rating'] = bubble_scores.get(bubble_class)
                item['quote'] = node.css('.noQuotes::text').extract_first()
                item['address'] = response.xpath("//span/span/a[@class='AYHFM']/text()").extract_first()
                item['review'] = node.css('.partial_entry::text').extract_first()
                yield item

        # A disabled "next" button means there are no pages left.
        disabled_next = response.xpath('//a[@class = "nav next ui_button primary disabled"]').extract_first()
        if disabled_next is None:
            next_href = response.xpath('//a[@class = "nav next ui_button primary"]/@href').extract_first()
            next_page = 'https://www.tripadvisor.it' + next_href
            yield response.follow(url=next_page, callback=self.parse_restaurant)
Solution
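You're missing the commas between the URLs in the `urls` list. Python concatenates adjacent string literals, so the list contains a single long string instead of three URLs, and only one request is made. See the comments: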
import scrapy
class ReviewSpider2(scrapy.Spider):
    """Request each city's restaurant listing page and print the restaurant URLs."""

    name = 'reviews2'
    allowed_domains = ['tripadvisor.it']

    # Browser-like headers, sent with every start request.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "DNT": "1",
        "Host": "www.tripadvisor.it",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "TE": "trailers",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    }

    def start_requests(self):
        """Yield one request per city listing page, sent with ``self.headers``."""
        # The list in the question had no commas between the literals:
        #
        #     urls = [
        #         'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html'
        #         'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html'
        #         'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html'
        #     ]
        #
        # Adjacent string literals are joined into one string, e.g.
        # ['1' '2' '3'] == ['123'], so that list held a single element.
        urls = [
            'https://www.tripadvisor.it/Restaurants-g187791-Rome_Lazio.html',
            'https://www.tripadvisor.it/Restaurants-g187849-Milan_Lombardy.html',
            'https://www.tripadvisor.it/Restaurants-g187830-Bergamo_Province_of_Bergamo_Lombardy.html',
        ]
        for city_url in urls:
            # Send the browser-like headers with each request.
            yield scrapy.Request(city_url, callback=self.parse_restaurants, headers=self.headers)

    def parse_restaurants(self, response):
        """Print the absolute URL of every restaurant link on a listing page."""
        # The question wrapped this in list(set(...)) to drop duplicates; that
        # is unnecessary because Scrapy has a built-in duplicate filter.
        restaurant_hrefs = response.xpath(
            "//div[contains(@data-test,'_list_item')]//div/div/div/span/a[starts-with(@href,'/Restaurant_Review')]/@href"
        ).getall()
        for href in restaurant_hrefs:
            restaurant_url = 'https://www.tripadvisor.it' + href
            print(restaurant_url)
            # Following each restaurant page is left disabled in this example:
            # yield response.follow(restaurant_url, callback=self.parse_restaurant)
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.