Issue
I am currently working on a Scrapy program that can download files from the pages it scrapes. The issue I am running into is that some of the pages have a datasheet, like this page - https://www.tyconsystems.com/rpms24-720-720 - while others do not, like this page - https://www.tyconsystems.com/tpdin-cable-232 -
What is the proper way of passing data when there is no file found on the page? As an additional question, is there any way to fix the issue of the csv file having multiple lines per item when the item data is too long? Example item - rpms24-720-720.
Below is the code that I am using.
productInfo.py
from copyreg import clear_extension_cache
import scrapy
from ..items import tyconItem


class ProductInfoSpider(scrapy.Spider):
    name = "productInfo"
    allowed_domains = ['tyconsystems.com']
    start_urls = [
        'https://www.tyconsystems.com/rpms24-720-720',
        'https://www.tyconsystems.com/tpdin-cable-232',
    ]

    def parse(self, response):
        for product in response.css('section#listing'):
            items = tyconItem()  # Unique item for each iteration
            name_dirty = product.css('div.product-id span#product_id::text').get()
            product_sku = name_dirty.strip()
            product_sub_title_dirty = product.css('div.product-details h1.page_headers::text').get()
            product_sub_title = product_sub_title_dirty.strip()
            #product_store_description = product.css('p.series-card__intro').get()
            if product.xpath('//p[contains(@class, "MsoNormal")]'):
                summary = product.css('div.item > div p.MsoNormal').getall()
            elif product.xpath('//div[contains(@class, "item")]/div'):
                summary = product.css('div.item > div').getall()
            else:
                summary = product.css('div.item').getall()
            category_list = product.xpath('//div[@class="container"]//ol//li//a/span//text()').getall()
            category = category_list[-2].strip()
            description = product.css('div.item > p.MsoNormal::text').getall()
            if product.css('div.extrafieldsBlock span.info a::attr(href)').get() == '':
                datasheet = 'no-file'
            else:
                datasheet = product.css('div.extrafieldsBlock span.info a::attr(href)').get()
            file_urls = datasheet
            specification = product.css('div#tab-6 div.info > table').getall()
            price = product.css('span#price::text').get()
            products_zoom_image = name_dirty.strip() + '.jpg'
            main_image = product.css('div#addl-images a::attr(href)').getall()
            image_urls = [response.urljoin(i) for i in main_image]

            items['category'] = category,
            items['datasheet'] = datasheet,
            items['description'] = description,
            items['main_image'] = main_image,
            items['price'] = price,
            items['product_link'] = response.url,  # get the product link from response
            items['product_sku'] = product_sku,
            items['product_sub_title'] = product_sub_title,
            items['products_zoom_image'] = products_zoom_image
            items['specification'] = specification,
            items['summary'] = summary,
            items['file_urls'] = [file_urls]
            items["name"] = product_sku
            items["image_urls"] = image_urls

            yield items
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Field, Item
class tyconItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    category = scrapy.Field()
    datasheet = scrapy.Field()
    description = scrapy.Field()
    file_urls = scrapy.Field()
    files = scrapy.Field()
    name = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    main_image = scrapy.Field()
    price = scrapy.Field()
    product_link = scrapy.Field()
    product_sku = scrapy.Field()
    product_sub_title = scrapy.Field()
    products_zoom_image = scrapy.Field()
    specification = scrapy.Field()
    summary = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
# from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
from io import BytesIO
from PIL import Image
class tyconPipeline:
    def process_item(self, item, spider):
        return item


class DownfilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        file_name: str = request.url.split("/")[-1]
        return file_name


class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *args, item=None):
        filename = request.meta["filename"].strip()
        number = request.meta["file_num"]
        return filename + "_" + str(number) + ".jpg"

    def thumb_path(self, request, thumb_id, response=None, info=None):
        filename = request.meta["filename"]
        number = request.meta["file_num"]
        return f'thumbs/{thumb_id}/{filename}_{str(number)}.jpg'

    def get_media_requests(self, item, info):
        name = item["name"]
        for i, url in enumerate(item["image_urls"]):
            meta = {"filename": name, "file_num": i}
            yield Request(url, meta=meta)

    def convert_image(self, image, size=None):
        if size is not None:
            # If the size is not None then it is a thumbnail,
            # so we resize it according to the parameter
            image = image.resize(size, Image.ANTIALIAS)
        else:
            # otherwise we give the image back to the superclass version of
            # this method for it to process
            return super().convert_image(image, size=size)
        buf = BytesIO()  # These next 3 lines are from the scrapy source code.
        image.save(buf, 'JPEG', quality=72)
        return image, buf
Scrapy Error in Log
Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 857, in _runCallbacks
    current.result = callback(  # type: ignore[misc]
  File "/usr/lib/python3/dist-packages/scrapy/utils/defer.py", line 162, in f
    return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/media.py", line 87, in process_item
    requests = arg_to_iter(self.get_media_requests(item, info))
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/files.py", line 492, in get_media_requests
    return [Request(u) for u in urls]
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/files.py", line 492, in <listcomp>
    return [Request(u) for u in urls]
  File "/usr/lib/python3/dist-packages/scrapy/http/request/__init__.py", line 60, in __init__
    self._set_url(url)
  File "/usr/lib/python3/dist-packages/scrapy/http/request/__init__.py", line 98, in _set_url
    raise TypeError(f"Request url must be str, got {type(url).__name__}")
TypeError: Request url must be str, got NoneType
Thanks everyone!
Solution
The traceback shows FilesPipeline calling Request(None): on pages without a datasheet, .get() returns None rather than an empty string, so your == '' check never matches and the item ends up with file_urls = [None]. Two approaches are possible:
1. Override get_media_requests
Override get_media_requests in your pipelines to check for the existence of the URLs as follows:
from itemadapter import ItemAdapter

class DownfilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        if not all(urls):
            return  # THIS - don't build Requests if there is no URL
        return [Request(u) for u in urls]
    # Rest of the code
class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        urls = item.get("image_urls", [])
        if not all(urls):
            return None  # THIS - don't build Requests if there is no URL
        name = item["name"]
        for i, url in enumerate(urls):
            meta = {"filename": name, "file_num": i}
            yield Request(url, meta=meta)
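For either pipeline to run at all, it also has to be enabled in settings.py. A minimal sketch, assuming the project package is named tycon (the dotted paths, priority numbers and storage folders below are placeholders to adapt to your project):

# settings.py - sketch; module path, priorities and folders are examples only
ITEM_PIPELINES = {
    "tycon.pipelines.DownfilesPipeline": 150,
    "tycon.pipelines.ImagePipeline": 160,
}
FILES_STORE = "downloads/datasheets"   # where FilesPipeline saves the datasheets
IMAGES_STORE = "downloads/images"      # where ImagesPipeline saves the images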
2. Return different items
You can have different item types returned from the Spider based on whether there is a file or image to download or not. For ease, I prefer using anonymous dictionaries, as follows:
def parse(self, response):
    item = {}
    item['category'] = category
    item['datasheet'] = datasheet
    ...
    if file_to_download:
        item['file_urls'] = [file_urls]
    if image_to_download:
        item['image_urls'] = image_urls
    item["name"] = product_sku
    yield item
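In your spider, that file_to_download check could look something like this sketch, reusing the selector from your parse method (the key point is that .get() returns None, not an empty string, when no datasheet link exists):

datasheet = product.css('div.extrafieldsBlock span.info a::attr(href)').get()
item['datasheet'] = datasheet or 'no-file'
if datasheet:  # only hand a URL to FilesPipeline when the page actually has one
    item['file_urls'] = [response.urljoin(datasheet)]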
Hope it helps!
Answered By - Upendra