Issue
I am new to Scrapy.
I use FilesPipeline
to download some .pdf files.
I found that if the value of file_urls
in a Scrapy Item is the same as before, the download will not start again.
What I need is to download the file again.
How can I solve this?
Thanks.
Solution
Add a media_to_download
method (which contains the _onsuccess callback) to your pipeline to override the inherited one. (Copy it from FilesPipeline
.)
It looks like this:
def media_to_download(self, request, info, *, item=None):
def _onsuccess(result):
if not result:
return # returning None force download
last_modified = result.get('last_modified', None)
if not last_modified:
return # returning None force download
age_seconds = time.time() - last_modified
age_days = age_seconds / 60 / 60 / 24
if age_days > self.expires:
return # returning None force download
referer = referer_str(request)
logger.debug(
'File (uptodate): Downloaded %(medianame)s from %(request)s '
'referred in <%(referer)s>',
{'medianame': self.MEDIA_NAME, 'request': request,
'referer': referer},
extra={'spider': info.spider}
)
(the rest of the original method body is omitted here)
Just add a return to skip the "uptodate" part and download anyway.
def media_to_download(self, request, info, *, item=None):
def _onsuccess(result):
if not result:
return # returning None force download
last_modified = result.get('last_modified', None)
if not last_modified:
return # returning None force download
age_seconds = time.time() - last_modified
age_days = age_seconds / 60 / 60 / 24
if age_days > self.expires:
return # returning None force download
return # force download
(You could also modify the function inside FilesPipeline
itself, but I don't recommend doing that.)
Also, remember to activate your custom pipeline and add other functions you need.
Your pipelines.py file should look like this now:
from itemadapter import ItemAdapter
from scrapy.pipelines.files import FilesPipeline
from scrapy.http import Request
import logging
import time
from twisted.internet import defer
from scrapy.utils.log import failure_to_exc_info
logger = logging.getLogger(__name__)
class TempPipeline():
def process_item(self, item, spider):
return item
class ProcessPipeline(FilesPipeline):
def get_media_requests(self, item, info):
urls = ItemAdapter(item).get(self.files_urls_field, [])
return [Request(u) for u in urls]
def media_to_download(self, request, info, *, item=None):
def _onsuccess(result):
if not result:
return # returning None force download
last_modified = result.get('last_modified', None)
if not last_modified:
return # returning None force download
age_seconds = time.time() - last_modified
age_days = age_seconds / 60 / 60 / 24
if age_days > self.expires:
return # returning None force download
return
path = self.file_path(request, info=info, item=item)
dfd = defer.maybeDeferred(self.store.stat_file, path, info)
dfd.addCallbacks(_onsuccess, lambda _: None)
dfd.addErrback(
lambda f:
logger.error(self.__class__.__name__ + '.store.stat_file',
exc_info=failure_to_exc_info(f),
extra={'spider': info.spider})
)
return dfd
Answered By - SuperUser
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.