Issue
I am currently working on a Scrapy program that can download files from the pages it scrapes. The issue I am running into is that some of the pages have a datasheet, like this page - https://www.tyconsystems.com/rpms24-720-720 - while others do not, like this page - https://www.tyconsystems.com/tpdin-cable-232 -
What is the proper way of passing data when there is no file found on the page? As an additional question, is there any way to fix the issue of the csv file having multiple lines per item when the item data is too long? Example item - rpms24-720-720.
Below is the code that I am using.
productInfo.py
from copyreg import clear_extension_cache
import scrapy
from ..items import tyconItem


class ProductInfoSpider(scrapy.Spider):
    name = "productInfo"
    allowed_domains = ['tyconsystems.com']
    start_urls = [
        'https://www.tyconsystems.com/rpms24-720-720',
        'https://www.tyconsystems.com/tpdin-cable-232',
    ]

    def parse(self, response):
        for product in response.css('section#listing'):
            items = tyconItem()  # Unique item for each iteration
            name_dirty = product.css('div.product-id span#product_id::text').get()
            product_sku = name_dirty.strip()
            product_sub_title_dirty = product.css('div.product-details h1.page_headers::text').get()
            product_sub_title = product_sub_title_dirty.strip()
            #product_store_description = product.css('p.series-card__intro').get()
            if product.xpath('//p[contains(@class, "MsoNormal")]'):
                summary = product.css('div.item > div p.MsoNormal').getall()
            elif product.xpath('//div[contains(@class, "item")]/div'):
                summary = product.css('div.item > div').getall()
            else:
                summary = product.css('div.item').getall()
            category_list = product.xpath('//div[@class="container"]//ol//li//a/span//text()').getall()
            category = category_list[-2].strip()
            description = product.css('div.item > p.MsoNormal::text').getall()
            if product.css('div.extrafieldsBlock span.info a::attr(href)').get() == '':
                datasheet = 'no-file'
            else:
                datasheet = product.css('div.extrafieldsBlock span.info a::attr(href)').get()
            file_urls = datasheet
            specification = product.css('div#tab-6 div.info > table').getall()
            price = product.css('span#price::text').get()
            products_zoom_image = name_dirty.strip() + '.jpg'
            main_image = product.css('div#addl-images a::attr(href)').getall()
            image_urls = [response.urljoin(i) for i in main_image]

            items['category'] = category,
            items['datasheet'] = datasheet,
            items['description'] = description,
            items['main_image'] = main_image,
            items['price'] = price,
            items['product_link'] = response.url,  # get the product link from response
            items['product_sku'] = product_sku,
            items['product_sub_title'] = product_sub_title,
            items['products_zoom_image'] = products_zoom_image
            items['specification'] = specification,
            items['summary'] = summary,
            items['file_urls'] = [file_urls]
            items["name"] = product_sku
            items["image_urls"] = image_urls

            yield items
items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
from scrapy import Field, Item
class tyconItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    category = scrapy.Field()
    datasheet = scrapy.Field()
    description = scrapy.Field()
    file_urls = scrapy.Field()
    files = scrapy.Field()
    name = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    main_image = scrapy.Field()
    price = scrapy.Field()
    product_link = scrapy.Field()
    product_sku = scrapy.Field()
    product_sub_title = scrapy.Field()
    products_zoom_image = scrapy.Field()
    specification = scrapy.Field()
    summary = scrapy.Field()
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
# from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.images import ImagesPipeline
from io import BytesIO
from PIL import Image
class tyconPipeline:
    def process_item(self, item, spider):
        return item


class DownfilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        file_name: str = request.url.split("/")[-1]
        return file_name


class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *args, item=None):
        filename = request.meta["filename"].strip()
        number = request.meta["file_num"]
        return filename + "_" + str(number) + ".jpg"

    def thumb_path(self, request, thumb_id, response=None, info=None):
        filename = request.meta["filename"]
        number = request.meta["file_num"]
        return f'thumbs/{thumb_id}/{filename}_{str(number)}.jpg'

    def get_media_requests(self, item, info):
        name = item["name"]
        for i, url in enumerate(item["image_urls"]):
            meta = {"filename": name, "file_num": i}
            yield Request(url, meta=meta)

    def convert_image(self, image, size=None):
        if size is not None:
            # If the size is not None then it is a thumbnail,
            # so we resize it according to the parameter
            image = image.resize(size, Image.ANTIALIAS)
        else:
            # otherwise we give the image back to the superclass version of
            # this method for it to process
            return super().convert_image(image, size=size)
        buf = BytesIO()  # These next 3 lines are from the scrapy source code.
        image.save(buf, 'JPEG', quality=72)
        return image, buf
Scrapy Error in Log
Traceback (most recent call last):
  File "/usr/lib/python3/dist-packages/twisted/internet/defer.py", line 857, in _runCallbacks
    current.result = callback(  # type: ignore[misc]
  File "/usr/lib/python3/dist-packages/scrapy/utils/defer.py", line 162, in f
    return deferred_from_coro(coro_f(*coro_args, **coro_kwargs))
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/media.py", line 87, in process_item
    requests = arg_to_iter(self.get_media_requests(item, info))
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/files.py", line 492, in get_media_requests
    return [Request(u) for u in urls]
  File "/usr/lib/python3/dist-packages/scrapy/pipelines/files.py", line 492, in <listcomp>
    return [Request(u) for u in urls]
  File "/usr/lib/python3/dist-packages/scrapy/http/request/__init__.py", line 60, in __init__
    self._set_url(url)
  File "/usr/lib/python3/dist-packages/scrapy/http/request/__init__.py", line 98, in _set_url
    raise TypeError(f"Request url must be str, got {type(url).__name__}")
TypeError: Request url must be str, got NoneType
Thanks everyone!
Solution
The traceback shows FilesPipeline calling Request(None): on pages without a datasheet, .get() returns None rather than an empty string, so your == '' check never matches and the item ends up with file_urls = [None]. Two approaches are possible:
1. Override get_media_requests
Override get_media_requests in your pipelines to check for the existence of the URLs as follows:
from itemadapter import ItemAdapter

class DownfilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        urls = ItemAdapter(item).get(self.files_urls_field, [])
        if not all(urls):
            return  # THIS - don't build Requests if there is no URL
        return [Request(u) for u in urls]
    # Rest of the code
class ImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        urls = item.get("image_urls", [])
        if not all(urls):
            return None  # THIS - don't build Requests if there is no URL
        name = item["name"]
        for i, url in enumerate(urls):
            meta = {"filename": name, "file_num": i}
            yield Request(url, meta=meta)
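For either pipeline to run at all, it also has to be enabled in settings.py. A minimal sketch, assuming the project package is named tycon (the dotted paths, priority numbers and storage folders below are placeholders to adapt to your project):

# settings.py - sketch; module path, priorities and folders are examples only
ITEM_PIPELINES = {
    "tycon.pipelines.DownfilesPipeline": 150,
    "tycon.pipelines.ImagePipeline": 160,
}
FILES_STORE = "downloads/datasheets"   # where FilesPipeline saves the datasheets
IMAGES_STORE = "downloads/images"      # where ImagesPipeline saves the images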
2. Return different items
You can have different item types returned from the Spider based on whether there is a file or image to download or not. For ease, I prefer using anonymous dictionaries, as follows:
def parse(self, response):
    item = {}
    item['category'] = category
    item['datasheet'] = datasheet
    ...
    if file_to_download:
        item['file_urls'] = [file_urls]
    if image_to_download:
        item['image_urls'] = image_urls
    item["name"] = product_sku
    yield item
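In your spider, that file_to_download check could look something like this sketch, reusing the selector from your parse method (the key point is that .get() returns None, not an empty string, when no datasheet link exists):

datasheet = product.css('div.extrafieldsBlock span.info a::attr(href)').get()
item['datasheet'] = datasheet or 'no-file'
if datasheet:  # only hand a URL to FilesPipeline when the page actually has one
    item['file_urls'] = [response.urljoin(datasheet)]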
Hope it helps!
Answered By - Upendra