Issue
I am trying to crawl from the all-products page > category > series > product page. I am getting an error in the log showing that I am not retrieving the expected id, but I think it has to do with how I am iterating through the pages; my suspicion is that I am not traveling all the way down to the product page.
Start Requests
def start_requests(self):
    urls = [
        'https://www.moxa.com/en/products',
    ]
    for url in urls:
        yield scrapy.Request(url, callback=self.parse)
Initial Parse into Products Page
def parse(self, response):
    # iterate through each of the relative category urls
    for explore_products in response.css('li.alphabet-list--no-margin a.alphabet-list__link::attr(href)').getall():
        category_url = response.urljoin(explore_products)  # build the absolute url
        logging.info("Category_links: " + category_url)
        yield scrapy.Request(category_url, callback=self.parse_categories)
2nd Parse for Series
def parse_categories(self, response):
    for series_href in response.css('a.series-card__wrapper::attr(href)').getall():
        series_url = response.urljoin(series_href)
        logging.info("Series_links: " + series_url)
        yield scrapy.Request(series_url, callback=self.parse_series)
3rd Parse to reach the product page itself (I think this is where it is breaking). If possible, I would like to check whether the "target_id" appears in the series_url, so that only the passing results go into the product_links list. Example: with target_id TN-5916-WV-T, the product_url https://www.moxa.com/Products/INDUSTRIAL-NETWORK-INFRASTRUCTURE/Secure-Routers/EN-50155-Routers/TN-5900-Series/TN-5916-WV-T should pass as true and be added to the list, but the product_url https://www.moxa.com/en/products/quotation-list should not pass and should be left out. (A minimal sketch of this check follows the code below.)
def parse_series(self, response):
    for series_url in response.css('.model-table a::attr(href)').getall():
        target_list = response.xpath('//table[@class="model-table"]//a/@href').getall()
        target_id = response.css('table.model-table th::attr(data-id)').get()
        target_path = [p for p in target_list if target_id in p]
        product_url = response.urljoin(series_url)
        self.logger.info("target_id: " + target_id)
        self.logger.info("product_url: " + product_url)
        logging.info("Product_links: " + product_url)
        yield scrapy.Request(product_url, callback=self.parse_new_item)
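In other words, the check I want is just a substring test on each joined url. A minimal sketch with the example values from above (the candidate list is hard-coded here for illustration):

target_id = 'TN-5916-WV-T'

candidate_urls = [
    'https://www.moxa.com/Products/INDUSTRIAL-NETWORK-INFRASTRUCTURE/Secure-Routers/EN-50155-Routers/TN-5900-Series/TN-5916-WV-T',
    'https://www.moxa.com/en/products/quotation-list',
]

# keep only the urls that contain the target id
product_links = [url for url in candidate_urls if target_id in url]
print(product_links)  # the quotation-list url is filtered out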
Return the expected item results
def parse_new_item(self, response):
    for product in response.css('section.main-section'):
        items = MoxaItem()  # unique item for each iteration
        items['product_link'] = response.url  # get the product link from response
        name_dirty = product.css('h5.series-card__heading.series-card__heading--big::text').get()
        product_sku = name_dirty.strip()
        product_store_description = product.css('p.series-card__intro').get()
        product_sub_title = product_sku + ' ' + product_store_description
        summary = product.css('section.features h3 + ul').getall()
        datasheet = product.css('li.side-section__item a::attr(href)')
        description = product.css('.products .product-overview::text').getall()
        specification = product.css('div.series-card__table').getall()
        products_zoom_image = name_dirty.strip() + '.jpg'
        main_image = response.urljoin(product.css('div.selectors img::attr(src)').get())
        # weight = product.xpath('//div[@class="series-card__table"]//p[@class="title-list__heading"]/text()[contains(., "Weight")]following-sibiling::div//text()').get()
        response.xpath("//div[@class='grdcpnsmllnks']//li[i[contains(@class, 'fa-clock-o')]]/text()").re_first(r"Valid till\s+(\d+/\d+/\d+)")
        rel_links = product.xpath("//script/@src[contains(., '/app/site/hosting/scriptlet.nl')]").getall()
        items['product_sku'] = product_sku,
        items['product_sub_title'] = product_sub_title,
        items['summary'] = summary,
        items['description'] = description,
        items['specification'] = specification,
        items['products_zoom_image'] = products_zoom_image
        items['main_image'] = main_image,
        # items['weight'] = weight,
        # items['rel_links'] = rel_links,
        items['datasheet'] = datasheet,
        yield items
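Worth noting about the block above: the trailing commas on most of the items[...] assignments turn each value into a one-element tuple, which is probably not intended. A two-line illustration:

value = 'TN-5916-WV-T',
print(value)  # ('TN-5916-WV-T',) -- a tuple, not a string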
My log where the error is appearing
  File "/home/joel/Desktop/moxa/moxa/spiders/product_series.py", line 57, in parse_new_item
    logging.info("name_dirty: " + name_dirty)
TypeError: can only concatenate str (not "NoneType") to str
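The TypeError means name_dirty is None, i.e. the h5 selector matched nothing, so the request landed on a page that is not a product page (such as the quotation-list url above). A minimal guard, assuming such pages should simply be skipped:

name_dirty = product.css('h5.series-card__heading.series-card__heading--big::text').get()
if name_dirty is None:
    continue  # not a product page; skip it
logging.info("name_dirty: " + name_dirty)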
Solution
Try doing it this way... I think this is a better way to make sure you are grabbing the correct links, and it also eliminates the many duplicates that were being generated before.
def parse_series(self, response):
    for column in response.xpath("//table[@class='model-table']//th"):  # iterate over the columns
        data_id = column.xpath("./@data-id")  # grab the column's data-id
        if not data_id:  # the first column is for the labels and has no data-id,
            continue     # so skip it
        data_id = data_id.get()  # extract the data-id text
        for link in column.xpath("//a/@href").getall():  # walk the links
            if data_id not in link:  # keep only urls that contain this data-id
                continue
            url = response.urljoin(link)  # merge with the domain
            yield scrapy.Request(url, callback=self.parse_new_item)  # yield the request
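To see why the data-id filter does the work here, below is a self-contained sketch of the same technique run against made-up markup in the shape of the model table (the HTML and model ids are assumptions for illustration, not Moxa's actual page). It uses parsel, the selector library Scrapy is built on:

from parsel import Selector

# hypothetical markup mimicking the model-table layout
html = """
<table class="model-table">
  <tr>
    <th>Model</th>
    <th data-id="TN-5916-WV-T"><a href="/Products/TN-5900-Series/TN-5916-WV-T">TN-5916-WV-T</a></th>
    <th data-id="TN-5524-8-T"><a href="/Products/TN-5500-Series/TN-5524-8-T">TN-5524-8-T</a></th>
  </tr>
</table>
<a href="/en/products/quotation-list">Quotation list</a>
"""

sel = Selector(text=html)
for column in sel.xpath("//table[@class='model-table']//th"):
    data_id = column.xpath("./@data-id")
    if not data_id:  # label column, no data-id
        continue
    data_id = data_id.get()
    for link in column.xpath("//a/@href").getall():
        if data_id not in link:  # drops quotation-list and the other columns' links
            continue
        print(data_id, "->", link)

Each column only ever yields the url containing its own data-id, which is what removes both the quotation-list link and the duplicates.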
Answered By - Alexander