Issue
I am scraping reviews from a website and these reviews tend to duplicate. The issue I am facing is with the mitigation of duplicates; I think my XPath may be the problem, but I cannot solve this.
Here's what I have tried:
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, MapCompose, Join
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from scrapy.http import JsonRequest
import pandas as pd
class CruisesItems(scrapy.Item):
    """Item holding the fields extracted for one cruise review.

    The output processor on each field decides how the list of values
    collected by the item loader is reduced to the final value.
    """

    # TakeFirst keeps the first non-null, non-empty collected value.
    user_rating = Field(output_processor=TakeFirst())
    user = Field(output_processor=TakeFirst())
    location = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())

    # Join concatenates every collected text fragment (space-separated by default).
    reviews = Field(output_processor=Join())
class CruisesSpider(scrapy.Spider):
    """Spider that builds one ``CruisesItems`` per ``div`` matched on the review page."""

    name = 'cruises_reviews'
    start_urls = ['https://www.tripadvisor.co.uk/Cruise_Review-d15691240-Reviews-AmaWaterways_AmaSerena']

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, handled by ``parse``."""
        for start_url in self.start_urls:
            yield scrapy.Request(start_url, callback=self.parse)

    def parse(self, response):
        """Yield a loaded item for every node selected under the review column."""
        # The trailing `//div` matches every descendant div, nested ones
        # included, so the loop below runs many times per visible review.
        review_nodes = response.xpath("//div[@class='ui_column is-12-desktop is-12-tablet is-12-mobile cEMcR']/div[2]//div")

        # (field name, XPath) pairs, in the order they are added to the loader.
        # NOTE: expressions beginning with `//` or `(//` are evaluated against
        # the whole document even though the loader is bound to a single node;
        # only `title` (which starts with `.//`) is relative to that node.
        field_xpaths = (
            ('user_rating', "//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='emWez F1']/span/@class"),
            ('user', "(//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='xMxrO']//div[@class='bcaHz']//span//text())[position() mod 2=1]"),
            ('location', "(//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='xMxrO']//div[@class='BZmsN']//span//text())[position() mod 5=1]"),
            ('title', ".//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='fpMxB MC _S b S6 H5 _a']//text()"),
            ('reviews', "//div[@class='eVykL Gi z cPeBe MD cwpFC']//div[@class='pIRBV _T']//span//text()"),
        )

        for node in review_nodes:
            loader = ItemLoader(CruisesItems(), selector=node)
            for field_name, expression in field_xpaths:
                loader.add_xpath(field_name, expression)
            yield loader.load_item()
# Run the spider from this script and export the scraped items as JSON Lines.
process = CrawlerProcess(
    settings={
        'FEEDS': {
            # Output file path -> feed options.
            'cruise_reviews.jl': {
                'format': 'jsonlines',
            },
        },
    },
)
process.crawl(CruisesSpider)
process.start()
Solution
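You need to use relative XPath expressions: start each field's XPath with `.//` so that it is evaluated against the current review's selector instead of the whole document.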
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst, MapCompose, Join
from scrapy.loader import ItemLoader
from scrapy.crawler import CrawlerProcess
from scrapy.http import JsonRequest
import pandas as pd
class CruisesItems(scrapy.Item):
    """One scraped cruise review.

    Each field declares the output processor the item loader applies to
    the values collected for it.
    """

    # Single-valued fields: only the first non-empty collected value is kept.
    user_rating = Field(output_processor=TakeFirst())
    user = Field(output_processor=TakeFirst())
    location = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())

    # The review body arrives as several text nodes; Join merges them
    # into one string (space-separated by default).
    reviews = Field(output_processor=Join())
class CruisesSpider(scrapy.Spider):
    """Spider that yields one ``CruisesItems`` per review card on the page.

    The review card itself is selected once in ``parse``; every field is
    then extracted with an XPath relative to that card, so values from
    different reviews are never mixed.
    """

    name = 'cruises_reviews'
    start_urls = ['https://www.tripadvisor.co.uk/Cruise_Review-d15691240-Reviews-AmaWaterways_AmaSerena']
    custom_settings = {
        # Delay (in seconds) between consecutive requests.
        'DOWNLOAD_DELAY': 0.5
    }

    def start_requests(self):
        """Yield one request per entry in ``start_urls``, handled by ``parse``."""
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        """Extract every review card from ``response`` and yield it as an item.

        Args:
            response: The downloaded review page.

        Yields:
            A populated ``CruisesItems`` for each review card found.
        """
        # One selector per review card; all field XPaths below are relative to it.
        container = response.xpath('//div[@id="ship_reviews"]//div[@class="eVykL Gi z cPeBe MD cwpFC"]')
        for reviews in container:
            loader = ItemLoader(CruisesItems(), selector=reviews)
            # The rating is read from the class attribute of the span.
            loader.add_xpath('user_rating', ".//div[@class='emWez F1']/span/@class")
            loader.add_xpath('user', "(.//div[@class='xMxrO']//div[@class='bcaHz']//span//text())[position() mod 2=1]")
            # Must start with `.//`: a leading `//` would search the whole
            # document and give every item the first reviewer's location.
            loader.add_xpath('location', "(.//div[@class='xMxrO']//div[@class='BZmsN']//span//text())[position() mod 5=1]")
            loader.add_xpath('title', ".//div[@class='fpMxB MC _S b S6 H5 _a']//text()")
            loader.add_xpath('reviews', ".//div[@class='pIRBV _T']//span//text()")
            yield loader.load_item()
# Only start the crawl when this file is executed as a script; importing the
# module (for example via `scrapy runspider`) must not launch a second crawl.
if __name__ == "__main__":
    process = CrawlerProcess(
        settings={
            'FEEDS': {
                # Output file path -> feed options (JSON Lines export).
                'cruise_reviews.jl': {
                    'format': 'jsonlines'
                }
            }
        }
    )
    process.crawl(CruisesSpider)
    # Blocks until the crawl has finished.
    process.start()
Notice that I moved //div[@class="eVykL Gi z cPeBe MD cwpFC"]
to the container because it's in all the item's xpaths.
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.