Issue
I am trying to get scrapy-selenium to navigate a url while picking some data along the way. Problem is that it seems to be filtering out too much data. I am confident there is not that much data in there. My problem is I do not know where to apply dont_filter=True
.
This is my code
import scrapy
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from scrapy_selenium import SeleniumRequest
from shutil import which
class AsusSpider(scrapy.Spider):
name = 'asus'
allowed_domains = ['www.zandparts.com']
# start_urls = ['https://www.zandparts.com/en/gateway']
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_path = which("C:/Users/Hp/Downloads/chromedriver.exe")
#Starting request from just one url
def start_requests(self):
yield scrapy.Request('https://www.zandparts.com/en/gateway', self.parse)
#grabbing the categories and getting all the links to navigate
def parse(self,response):
cat_links = response.xpath("//div[@class='category-navigation']/a")
for link in cat_links:
category = link.xpath(".//@href").get()
cat_text = link.xpath(".//h2/text()").get().strip().replace('\r\n','')
#making the url absolute
abs = f"https://www.zandparts.com{category}"
yield scrapy.Request(url=abs, callback=self.parse_x, meta={'category':cat_text})
#grabbing the series and getting all the links as well
def parse_x(self, response):
ser_links = response.xpath("//div[@class='category-navigation']/a")
for link in ser_links:
series = link.xpath(".//@href").get()
ser_text = link.xpath(".//h2/text()").get().strip().replace('\r\n','')
abs2 = f"https://www.zandparts.com{series}"
cat1 = response.request.meta['category']
yield scrapy.Request(url=abs2, callback=self.parse_y, meta={'series':ser_text, 'category2':cat1})
#grabbing each model and navigating to the product page for all the data
def parse_y(self, response):
mod_links = response.xpath("//div[@class='category-navigation']/a")
for link in mod_links:
model = link.xpath(".//@href").get()
mod_text = link.xpath(".//h2/text()").get().strip().replace('\r\n','')
abs3 = f"https://www.zandparts.com{model}"
ser1 = response.request.meta['series']
cat2 = response.request.meta['category2']
yield scrapy.Request(url=abs3, callback=self.parse_z, meta={'model':mod_text, 'series2':ser1, 'category3':cat2})
#product page. Getting the data
def parse_z(self,response):
products = response.xpath("//div[@class='product__info']/a")
next_page = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
next_page_full = f"http://www.zandparts.com{next_page}"
# mod2 = response.request.meta['model']
# ser2 =response.request.meta['series2']
# cat3 = response.request.meta['category3']
for product in products:
link = product.xpath(".//@href").get()
absolute_url = f"http://www.zandparts.com{link}"
yield SeleniumRequest(
url = absolute_url,
callback = self.parse_m,
wait_time=10,
wait_until=EC.element_to_be_clickable((By.LINK_TEXT, 'Tillgängliga alternativ')),
meta={'links':absolute_url, 'model':response.request.meta['model'],'series':response.request.meta['series2'],'category':response.request.meta['category3']}
)
#navigating through each page to get the data on each page
if next_page:
yield scrapy.Request(url=next_page_full, callback=self.parse_z)
def parse_m(self, response):
alternate = response.selector.xpath("//div[@class='product__content']/div/a/span/text()").getall()
category = response.selector.xpath("//div[@class='product-detail']/ul/li[4]").get().strip().replace("\r\n","")
name = response.selector.xpath("//h1[@class='product-detail__name']/text()").get().strip().replace("\r\n","")
part = response.selector.xpath("//div[@class='product-detail']/ul/li").get().strip().replace("\r\n","")
desc = response.selector.xpath("//div[@class='product-detail__description']").get()
image = response.selector.xpath("//img[@class='product-detail__image--main']/@src").get()
absolute_image = f"http://www.zandparts.com{image}"
yield{
'product link':response.request.meta['links'],
'category':response.request.meta['category'],
'series':response.request.meta['series'],
'model':response.request.meta['model'],
'product category':category,
'product name':name,
'part number':part,
'description':desc,
'image link':absolute_image,
'alt':alternate
}
This is the result when i run the code:
{'downloader/request_bytes': 688983,
'downloader/request_count': 1987,
'downloader/request_method_count/GET': 1987,
'downloader/response_bytes': 22314989,
'downloader/response_count': 1987,
'downloader/response_status_count/200': 1063,
'downloader/response_status_count/301': 924,
'dupefilter/filtered': 10704,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2021, 9, 6, 1, 28, 46, 347956),
'httpcache/hit': 1987,
'item_scraped_count': 827,
'log_count/DEBUG': 2816,
'log_count/ERROR': 97,
'log_count/INFO': 10,
'log_count/WARNING': 1,
'request_depth_max': 4,
'response_received_count': 1063,
'robotstxt/request_count': 1,
'robotstxt/response_count': 1,
'robotstxt/response_status_count/200': 1,
'scheduler/dequeued': 1986,
'scheduler/dequeued/memory': 1986,
'scheduler/enqueued': 1986,
'scheduler/enqueued/memory': 1986,
'spider_exceptions/KeyError': 97,
'start_time': datetime.datetime(2021, 9, 6, 1, 28, 22, 201511)}
2021-09-06 01:28:46 [scrapy.core.engine] INFO: Spider closed (finished)
I seem to be getting something wrong as it is filtering out way too much, and it is not getting all the data. I also seem to be getting something wrong with the meta because I get a KeyError as soon as it goes to the next page.
Solution
I run your code on a clean, virtual environment and it is working as intended. It doesn't give me a KeyError either but has some problems on various xpath paths. I'm not quite sure what you mean by filtering out too much data but your code hands me this output:
You can fix the text errors (on product category
, part number
and description
) by changing xpath variables like this:
alternate = response.selector.xpath("//div[@class='product__content']/div/a/span/text()").getall()
category = response.selector.xpath("//div[@class='product-detail']/ul/li[4]/text()[2]").get().strip()
name = response.selector.xpath("//h1[@class='product-detail__name']/text()").get().strip()
part = response.selector.xpath("//div[@class='product-detail']/ul/li/text()[2]").get().strip()
desc = response.selector.xpath("//div[@class='product-detail__description']/text()").get().replace("\r\n","").strip()
image = response.selector.xpath("//img[@class='product-detail__image--main']/@src").get()
absolute_image = f"http://www.zandparts.com{image}"
So you can get a cleaner output:
EDIT:
How about changing the variables of the parse
functions and clearing out meta
information of the requests?
Noticed the global
variables? I have used them rather them meta
values. (I also added dont_filter=True
to the start_requests
function.) You can implement these global variables to the result yield
.
If these solutions don't satisfy your problem, we can discuss it further.
Answered By - Mert
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.