Issue
I got a message like `ERROR: Spider error processing` and `line 276, in aiter_errback: yield await it.__anext__()` in my terminal. My code is given below. Can anyone tell me where the problem is?
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class CandywareCrawlspiderSpider(CrawlSpider):
    """Crawl candywarehouse.com's wedding collection and scrape product data.

    The crawl starts from page 24 of the wedding collection, follows the
    pagination "Next" link on every page, and sends a custom browser
    User-Agent header with each request.
    """

    name = "candyware_crawlspider"
    allowed_domains = ["www.candywarehouse.com"]
    # start_urls = ["https://www.candywarehouse.com/collections/wedding?page=24"]
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'

    def start_requests(self):
        """Yield the initial request with the custom user-agent header set."""
        yield scrapy.Request(
            url='https://www.candywarehouse.com/collections/wedding?page=24',
            headers={'user-agent': self.user_agent},
        )

    # Crawl rules: follow only the pagination "Next" link, hand every
    # followed page to parse_item, and route each extracted request through
    # set_user_agent before it is scheduled.
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths=('//ul[@class="pagination-custom"]//li/a[@title="Next »"]')),
            callback='parse_item',
            follow=True,
            process_request='set_user_agent',
        ),
    )

    def set_user_agent(self, request, spider):
        """Stamp the spider's user agent onto *request* and return it.

        The second argument is unused.
        NOTE(review): recent Scrapy versions appear to pass the originating
        response here despite the parameter being named ``spider`` - confirm
        against the Scrapy version in use.
        """
        request.headers['User-Agent'] = self.user_agent
        return request

    def parse_item(self, response):
        """Yield one dict per product tile found on a listing page.

        Each dict carries the product name, price, review count and the
        User-Agent header of the request that produced *response*.

        NOTE: ``get()`` returns ``None`` when an XPath matches nothing, so
        the chained string calls below raise ``AttributeError`` for any
        product tile that lacks one of the three fields.
        """
        for product in response.xpath('//div[@class="js-grid"]/div'):
            product_name = product.xpath('.//p[@class="product__grid__title"]/text()').get().strip()
            price = product.xpath('.//span[@class="price"]/text()').get().strip()
            # Remove newlines and spaces so only the compact rating text remains.
            review_counts = product.xpath('.//span[@class="tt-product-block__rating"]/text()').get().replace('\n', '').replace(' ', '')
            yield {
                'product_name': product_name,
                'price': price,
                'review_counts': review_counts,
                'User-Agent': response.request.headers['User-Agent'],
            }
Solution
If you call string methods on the result of `get()`, you need to make sure you actually get a string.
`get()` returns `None` when the selector matches nothing, so pass a default value:
# default='' makes get() return an empty string instead of None when the
# XPath matches nothing, so the chained string methods never run on None.
product_name = product.xpath('.//p[@class="product__grid__title"]/text()').get(default='').strip()
price = product.xpath('.//span[@class="price"]/text()').get(default='').strip()
review_counts = product.xpath('.//span[@class="tt-product-block__rating"]/text()').get(default='').replace('\n', '').replace(' ', '')
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.