Issue
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.pipelines.files import FilesPipeline
from urllib.parse import urlparse
import os

class DatasetItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()

class MyFilesPipeline(FilesPipeline):
    pass

class DatasetSpider(scrapy.Spider):
    name = 'Dataset_Scraper'
    url = 'https://kern.humdrum.org/cgi-bin/browse?l=essen/europa/deutschl/allerkbd'

    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    }

    custom_settings = {
        'FILES_STORE': 'Dataset',
        'ITEM_PIPELINES': {"/home/LaxmanMaharjan/dataset/MyFilesPipeline": 1}
    }

    def start_requests(self):
        yield scrapy.Request(
            url=self.url,
            headers=self.headers,
            callback=self.parse
        )

    def parse(self, response):
        item = DatasetItem()
        links = response.xpath('.//body/center[3]/center/table/tr[1]/td/table/tr/td/a[4]/@href').getall()
        for link in links:
            item['file_urls'] = [link]
            yield item
            break

if __name__ == "__main__":
    # Run the spider from a script
    process = CrawlerProcess()
    process.crawl(DatasetSpider)
    process.start()
Error: Error loading object 'home-LaxmanMaharjan-dataset-Pipeline': not a full path

The path is correct.

How do I use a custom file pipeline within this Python file? I am trying to add a custom file pipeline so that the downloaded files get proper names. I cannot just give the pipeline class name, because ITEM_PIPELINES requires a path, and when I enter the path, the error above appears.
Solution
ITEM_PIPELINES expects a dotted Python import path to the pipeline class (module.ClassName), not a filesystem path. If the pipeline code, spider code, and process launcher are all stored in the same file, that module is __main__, so you can reference the pipeline like this:
custom_settings = {
    'FILES_STORE': 'Dataset',
    'ITEM_PIPELINES': {"__main__.MyFilesPipeline": 1}
}
Answered By - Georgiy