Issue
Hello, I am trying to resize the images for the thumbnails in Scrapy. I have seen a few posts about resizing, but they seem to address past versions of Scrapy.
Requirement: for the thumbnails, images that are too small should still be resized (upscaled) to the required sizes stated in the settings file:
- 'zoro': (500, 500),
- 'small': (116, 90),
- 'large': (386, 300),
- 'zoom': (648, 504)
Here is my code:
Settings.py
# --- Project identity and spider discovery ---
BOT_NAME = 'project'

SPIDER_MODULES = ['project.spiders']
NEWSPIDER_MODULE = 'project.spiders'

...  # placeholder kept from the original post (other settings not shown)

# --- Logging ---
LOG_STDOUT = True
LOG_FILE = 'scrapy_log.log'

# --- Item pipelines (value is the pipeline's order) ---
ITEM_PIPELINES = {
    'project.pipelines.ImagePipeline': 1,
}

# --- Images pipeline: storage location and item field names ---
IMAGES_STORE = 'image_dir'
IMAGES_URLS_FIELD = 'image_urls'
IMAGES_RESULT_FIELD = 'images'

# Thumbnail name -> (width, height) requested for that thumbnail.
IMAGES_THUMBS = {
    'zoro': (500, 500),
    'small': (116, 90),
    'large': (386, 300),
    'zoom': (648, 504),
}
Spider.py
import scrapy
from ..items import ItemImage
class ImageDownload(scrapy.Spider):
    """Crawl a product category page and emit one image item per product.

    Each yielded ``ItemImage`` carries ``name`` (the first word of the
    product heading) and ``image_urls`` (absolute URLs of the product
    images).
    """

    name = 'ImageDownload'
    allowed_domains = ['antaira.com']

    def start_requests(self):
        """Yield the initial request for every category page."""
        urls = [
            'https://www.antaira.com/products/PCIe-RS232',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Follow every product link found on a category page."""
        # iterate through each of the relative urls
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)
            yield scrapy.Request(product_link, callback=self.parse_new_item, dont_filter=True)

    def parse_new_item(self, response):
        """Build an ``ItemImage`` from a product page.

        Pages without a usable product heading are skipped with a warning,
        because the heading is what the image files are named after.
        """
        name = response.xpath("//h1[@class='product-name']/text()").get()
        # ``.get()`` returns None when the heading is missing, and heading text
        # may start with whitespace; ``split()`` with no argument handles both
        # leading whitespace and repeated separators.
        tokens = name.split() if name else []
        if not tokens:
            self.logger.warning("No product name found on %s; skipping", response.url)
            return

        raw_image_urls = response.xpath('//div[@class="selectors"]/a/@href').getall()

        item = ItemImage()
        item["name"] = tokens[0]
        item["image_urls"] = [response.urljoin(i) for i in raw_image_urls]
        yield item
Pipelines.py
from io import BytesIO

import PIL
from PIL import Image
from scrapy.http import Request
from scrapy.pipelines.images import ImagesPipeline

try:
    from cStringIO import StringIO  # Python 2 only
except ImportError:
    # cStringIO does not exist on Python 3; BytesIO is the binary buffer there.
    from io import BytesIO as StringIO
class ImagePipeline(ImagesPipeline):
    """Images pipeline that names files after the item and resizes thumbnails.

    Files are stored as ``<name>_<index>.jpg`` and thumbnails as
    ``thumbs/<thumb_id>/<name>_<index>.jpg``, where ``name`` and ``index``
    travel in the request meta set by :meth:`get_media_requests`.
    """

    @staticmethod
    def _base_name(request):
        """Return ``<filename>_<file_num>`` built from the request meta.

        Shared by ``file_path`` and ``thumb_path`` so both always agree on
        the (stripped) file name.
        """
        filename = request.meta["filename"].strip()
        number = request.meta["file_num"]
        return f"{filename}_{number}"

    def file_path(self, request, response=None, info=None, *args, item=None):
        """Return the storage path of the full-size image for ``request``."""
        return self._base_name(request) + ".jpg"

    def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
        """Return the storage path of the ``thumb_id`` thumbnail for ``request``.

        ``item`` is accepted (and unused) so the signature matches what the
        calling pipeline passes.
        """
        return f'thumbs/{thumb_id}/{self._base_name(request)}.jpg'

    def get_media_requests(self, item, info):
        """Yield one download request per image URL, tagged with name and index."""
        name = item["name"]
        for i, url in enumerate(item["image_urls"]):
            meta = {"filename": name, "file_num": i}
            yield Request(url, meta=meta)

    def convert_image(self, image, size=None):
        """Convert ``image`` to RGB JPEG, resizing it when ``size`` is given.

        Args:
            image: The PIL image to convert.
            size: ``(width, height)`` from ``IMAGES_THUMBS``, or ``None`` for
                the full-size image. Only the width is used; the height is
                derived from it so the aspect ratio is preserved. Images
                narrower than ``width`` are upscaled.

        Returns:
            A ``(image, buf)`` tuple where ``buf`` is a ``BytesIO`` holding the
            JPEG-encoded data.
        """
        if image.format == 'PNG' and image.mode == 'RGBA':
            # Flatten transparency onto a white background before dropping alpha.
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode != 'RGB':
            image = image.convert('RGB')

        # Only thumbnails carry a size; the full image is called with size=None
        # and must not be indexed.
        if size is not None:
            target_width = size[0]
            scale = target_width / float(image.size[0])
            # Guard against a zero height for extremely wide images.
            target_height = max(1, int(image.size[1] * scale))
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
            image = image.resize((target_width, target_height), Image.LANCZOS)

        # JPEG data is binary, so it needs a bytes buffer rather than a text one.
        buf = BytesIO()
        image.save(buf, 'JPEG', quality=72)
        return image, buf
Hey @Alex, I ran the code you replied with and it returned the expected regular images, but I did not get any sub-folders with the expected thumbnails. This is what I got in the scrapy.log:
Scrapy.log
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/joel/.local/lib/python3.8/site-packages/scrapy/pipelines/files.py", line 465, in media_downloaded
checksum = self.file_downloaded(response, request, info, item=item)
File "/home/joel/.local/lib/python3.8/site-packages/scrapy/pipelines/media.py", line 140, in wrapper
return func(*args, **kwargs)
File "/home/joel/.local/lib/python3.8/site-packages/scrapy/pipelines/images.py", line 115, in file_downloaded
return self.image_downloaded(response, request, info, item=item)
File "/home/joel/.local/lib/python3.8/site-packages/scrapy/pipelines/media.py", line 140, in wrapper
return func(*args, **kwargs)
File "/home/joel/.local/lib/python3.8/site-packages/scrapy/pipelines/images.py", line 119, in image_downloaded
for path, image, buf in self.get_images(response, request, info, item=item):
File "/home/joel/.local/lib/python3.8/site-packages/scrapy/pipelines/images.py", line 145, in get_images
thumb_image, thumb_buf = self.convert_image(image, size)
File "/home/joel/Desktop/project/project/pipelines.py", line 29, in convert_image
image = image.resize(size, image.ANTIALIAS)
AttributeError: 'JpegImageFile' object has no attribute 'ANTIALIAS'
Solution
All you need to do is change the convert_image
method in your pipeline to the following code:
I added some inline notes...
from io import BytesIO
from PIL import Image
def convert_image(self, image, size=None):
    """Resize thumbnails to exactly ``size``; defer everything else to Scrapy.

    Args:
        image: The PIL image handed over by the images pipeline.
        size: ``(width, height)`` for a thumbnail, or ``None`` for the
            full-size image.

    Returns:
        A ``(image, buf)`` tuple where ``buf`` is a ``BytesIO`` holding the
        JPEG-encoded data.
    """
    if size is None:
        # Not a thumbnail: let the superclass version of this method
        # process the image.
        return super().convert_image(image, size=size)

    # A size means this is a thumbnail, so resize it to the requested
    # dimensions. Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is
    # the same filter under its current name.
    image = image.resize(size, Image.LANCZOS)

    buf = BytesIO()  # These next 3 lines are from the scrapy source code.
    image.save(buf, 'JPEG')
    return image, buf
You can remove all the unnecessary imports in the pipeline file as well, especially `from cStringIO import StringIO`,
since that would raise an error on its own.
Answered By - Alexander
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.