Issue
I use Scrapy to scrape items from a website. Each item carries a bunch of information, including image URLs. Can you help me figure out how to download the images from these URLs (data["image_urls"])?
I understand that I have to extend the media pipeline because Scrapy doesn't manage nested urls but I'm lost in the process.
import json
import scrapy
import re
import pkgutil
from scrapy.loader import ItemLoader
from auctions_results.items import AuctionItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from datetime import datetime
class Spider(scrapy.Spider):
    """Scrape auction results for every entry of a bundled JSON input file.

    Each input entry must provide a ``gm_url`` key. The page behind that URL
    is parsed into a list of ``AuctionItem`` objects which is stored under
    the entry's ``results`` key, and the enriched entry is yielded as the
    scraped item.
    """

    name = 'results'

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then load the input entries.

        The entries are read from ``json/input/scrape_demo_db.json`` inside
        the ``auctions_results`` package and kept in ``self.data``.
        """
        # The base initialiser must run so that scrapy.Spider can process
        # the positional/keyword arguments it is given.
        super().__init__(*args, **kwargs)
        data_file = pkgutil.get_data(
            "auctions_results", "json/input/scrape_demo_db.json")
        self.data = json.loads(data_file)

    def start_requests(self):
        """Yield one request per input entry, carrying the entry in ``meta``."""
        for item in self.data:
            request = scrapy.Request(item['gm_url'], callback=self.parse)
            request.meta['item'] = item
            yield request

    def parse(self, response):
        """Collect every car card of the page into ``item['results']``.

        Yields the input entry (taken from ``response.meta['item']``) once,
        after all ``div.car-item-border`` blocks have been processed.

        NOTE: a card that lacks one of the expected nodes, or whose text does
        not contain the expected separators, raises (``AttributeError`` /
        ``IndexError``) exactly as before; only the image URL tolerates a
        missing node.
        """
        item = response.meta['item']
        item['results'] = []
        for caritem in response.css("div.car-item-border"):
            data = AuctionItem()
            # Second space-separated token of the "make" text.
            data["marque"] = caritem.css(
                "div.make::text").extract_first().strip().split(" ", 2)[1]

            # Query the auction-house node once and derive both fields from
            # it: the part before the first "-" and the part after the last
            # ",".
            auction_house = caritem.css(
                "div.auctionHouse::text").extract_first()
            data["auction_house"] = auction_house.split("-", 1)[0].strip()
            data["auction_country"] = auction_house.rsplit(",", 1)[1].strip()

            data["auction_date"] = caritem.css(
                "div.date::text").extract_first().replace(",", "").strip()

            # Resolve the image source against the page URL so that a
            # relative ``src`` still yields a downloadable absolute URL;
            # absolute URLs pass through unchanged. A missing image keeps
            # the original ``None`` value.
            image_src = caritem.css(
                "div.view-auction a img::attr(src)").extract_first()
            data["image_urls"] = (
                response.urljoin(image_src) if image_src else image_src)

            item['results'].append(data)
        yield item
When I encode the results in JSON it looks like this:
[{
"gm_url": "url",
"results": [{
"marque": "ferrari",
"auction_house": "auction",
"auction_country": "japan",
"auction_date": "2019",
"image_urls": "imgurl"
},
{
"marque": "porsche",
"auction_house": "auction2",
"auction_country": "gb",
"auction_date": "2018",
"image_urls": "imgurl2"
}]
}, .... ]
I added the image fields to the item in items.py and enabled ITEM_PIPELINES and IMAGES_STORE in settings.py.
Solution
You can override the get_media_requests method of the ImagesPipeline for this:
class DownloadImagesPipeline(ImagesPipeline):
    """Images pipeline that downloads the images of an item's nested results.

    The image URLs are read from ``result['image_urls']`` for every entry of
    ``item['results']`` rather than from a top-level field of the item.
    """

    # Optional extra HTTP headers sent with every image request. ``None``
    # lets Scrapy use its default headers; override in a subclass if needed.
    image_request_headers = None

    def get_media_requests(self, item, info):
        """Yield one download request per image URL found in the results.

        ``image_urls`` may hold a single URL string or an iterable of URL
        strings. Results without an image URL are skipped, and an item
        without a ``results`` key yields no requests.
        """
        for result in item.get('results', []):
            image_urls = result.get('image_urls')
            if not image_urls:
                # Nothing to download for this result.
                continue
            if isinstance(image_urls, str):
                image_urls = [image_urls]
            for image_url in image_urls:
                # ``scrapy.Request`` is used because only the ``scrapy``
                # module itself is imported at the top of the file.
                yield scrapy.Request(url=image_url,
                                     headers=self.image_request_headers)
In your settings, deactivate the default ImagesPipeline and register this 'DownloadImagesPipeline' in ITEM_PIPELINES in its place.
Answered By - Wim Hermans
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.