Issue
I am writting my first web scrapping project and I want to scrap from booking.com.
I'd like to scrap info about include breakfast in hotel.
The problem is - I want every value to be ["Brekafast included"] or empty value [""] if there is no info about it. If Im runnig my code (below) I only get few values ["Brekafast included"].
I don't know how to solve this, bc when breakfast is not included there is no class "e05969d63d" in property card in this hotel (this class is directed to info about breakfast if it is included)
So if Hotel1 and Hotel3 have "Breakfast included" and Hotel2 doesn't have breakfast included.
I would like to export sth like ["Breakfast included","","Breakfast included"]
But I get only : ["Breakfast included", "Breakfast included"]
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import scrapy
import logging
from scrapy.crawler import CrawlerProcess
from scrapy.exporters import CsvItemExporter
class CsvPipeline(object):
def __init__(self):
self.file = open ('hotel.tmp','wb')
self.exporter = CsvItemExporter(self.file,str)
self.exporter.start_exporting()
def close_spider(self,spider):
self.exporter.finish_exporting()
self.file.close()
def process_items(self,item,spider):
self.exporter.export_item(item)
return item
class hotelsNY(scrapy.Spider):
name = "hotelsNY"
start_urls =[]
#start_urls = ['https://www.booking.com/searchresults.pl.html?label=gen173nr-1BCAEoggI46AdIM1gEaLYBiAEBmAEeuAEXyAEM2AEB6AEBiAIBqAIDuALX3uicBsACAdICJGRlODkzYmJmLTIyZjQtNDYwNi04YzYwLWIxOWRlMGU0MmM0MdgCBeACAQ&sid=7ab6fb8585341629f1a790546e37a1c5&aid=304142&ss=Nowy+Jork&ssne=Nowy+Jork&ssne_untouched=Nowy+Jork&lang=pl&sb=1&src_elem=sb&src=index&dest_id=20088325&dest_type=city&checkin=2022-12-30&checkout=2023-01-03&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset=0']
for i in range (0, 10):
start_urls.append('https://www.booking.com/searchresults.pl.html?label=gen173nr-1BCAEoggI46AdIM1gEaLYBiAEBmAEeuAEXyAEM2AEB6AEBiAIBqAIDuALX3uicBsACAdICJGRlODkzYmJmLTIyZjQtNDYwNi04YzYwLWIxOWRlMGU0MmM0MdgCBeACAQ&sid=7ab6fb8585341629f1a790546e37a1c5&aid=304142&ss=Nowy+Jork&ssne=Nowy+Jork&ssne_untouched=Nowy+Jork&lang=pl&sb=1&src_elem=sb&src=index&dest_id=20088325&dest_type=city&checkin=2022-12-30&checkout=2023-01-03&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset=' + str(i*25))
custom_settings = {
'LOG_LEVEL': logging.WARNING,
'FEED_EXPORTERS': {'csv': 'scrapy.exporters.CsvItemExporter'},
'FEED_FORMAT': 'csv',
'FEED_URI': 'hotels_tmp1.csv'
}
def parse(self, response):
nexturl = 'https://www.booking.com/searchresults.pl.html?label=gen173nr-1BCAEoggI46AdIM1gEaLYBiAEBmAEeuAEXyAEM2AEB6AEBiAIBqAIDuALX3uicBsACAdICJGRlODkzYmJmLTIyZjQtNDYwNi04YzYwLWIxOWRlMGU0MmM0MdgCBeACAQ&sid=7ab6fb8585341629f1a790546e37a1c5&aid=304142&ss=Nowy+Jork&ssne=Nowy+Jork&ssne_untouched=Nowy+Jork&lang=pl&sb=1&src_elem=sb&src=index&dest_id=20088325&dest_type=city&checkin=2022-12-30&checkout=2023-01-03&group_adults=2&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset=0'
#all_names = response.xpath('//*[@data-testid="title"]')
alH = response.xpath('//*[@data-testid="property-card"]').getall()
for name in allH:
hotelName = response.xpath('//*[@data-testid="title"]/text()').extract(),
address = response.xpath('//*[@data-testid="address"]/text()').extract(),
price = response.xpath('//*[@data-testid="price-and-discounted-price"]/text()').extract(),
dist = response.xpath('//span[@data-testid="distance"]/text()').extract(),
breakfast = response.xpath('//span[@class="e05969d63d"]/text()').extract(),
yield {'hotelName': hotelName, 'address': address, 'price': price, 'dist': dist, 'breakfast': breakfast}
process = CrawlerProcess(
{
'USER_AGENT':'Mozilla/4.0 (comatible;MSIE 7.0;Window NT 5.1)'
})
process.crawl(hotelsNY)
process.start()
Solution
There are a few issues with your spider.
once you use
getall()
onallH
xpath, you are extracting the text of that xpath expression and you can no longer use it as a selector for which you can chain.Use relative xpath expressions with the chained selectors so that instead of extracting lists of matching elements, you are iterating through the page row by row which I think was your intention in the first place.
To ensure that "breakfast" becomes an empty string you can just test if it is None, and explicitly set it to the empty string if needed.
here is an example:
notice that there is a './/' in the xpath expressions in the for loop. these are relative xpath expressions. and also notice how I chain the selectors by calling i.xpath
instead of response.xpath
inside the for loop.
allH = response.xpath('//*[@data-testid="property-card"]')
for i in allH:
hotelName = i.xpath('.//*[@data-testid="title"]//text()').get()
address = i.xpath('.//*[@data-testid="address"]//text()').get()
price = i.xpath('.//*[@data-testid="price-and-discounted-price"]//text()').get()
dist = i.xpath('.//span[@data-testid="distance"]//text()').get()
breakfast = i.xpath('//span[@class="e05969d63d"]//text()').get()
if breakfast is None:
breakfast = ""
yield {'hotelName': hotelName, 'address': address, 'price': price,
'dist': dist, 'breakfast': breakfast}
Answered By - Alexander
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.