Issue
I am currently building a scraper with scrapy which unfortunately fails and with the following error log. I tried running it with crawler runner and crawler process but both versions fail. I tried to figure out if i've used twisted incorrectly but i think i did it correctly.
2018-04-18 23:55:46 [twisted] CRITICAL:
Traceback (most recent call last):
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/twisted/internet/defer.py", line 1386, in _inlineCallbacks
result = g.send(result)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/scrapy/crawler.py", line 79, in crawl
self.spider = self._create_spider(*args, **kwargs)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/scrapy/crawler.py", line 102, in _create_spider
return self.spidercls.from_crawler(self, *args, **kwargs)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/scrapy/spiders/__init__.py", line 52, in from_crawler
spider._set_crawler(crawler)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/scrapy/spiders/__init__.py", line 67, in _set_crawler
crawler.signals.connect(self.close, signals.spider_closed)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/scrapy/signalmanager.py", line 26, in connect
return dispatcher.connect(receiver, signal, **kwargs)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/pydispatch/dispatcher.py", line 130, in connect
receiver = saferef.safeRef(receiver, onDelete=_removeReceiver)
File "/home/flo/PycharmProjects/Tensortest/Car_Analysis/lib/python3.6/site-packages/pydispatch/saferef.py", line 32, in safeRef
return weakref.ref(target, onDelete)
TypeError: cannot create weak reference to 'NoneType' object
My Code looks like this
import scrapy
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from classes import cars
LINKS = []
CARS = []
class AutoSpiderLinks(scrapy.Spider):
name = "Auto_get_links"
ROOT_URL = "https://www.somewebsite"
global LINKS
def geturls(self):
main_url = "https://www.somewebsite"
target_url = []
for x in range(1, 2):
target_url.append(main_url + "&page=" + str(x))
print(target_url.append(main_url + "&page=" + str(x)))
return target_url
def start_requests(self):
urls = AutoSpiderLinks.geturls(self)
for url in urls:
yield scrapy.Request(url=url, callback=self.parse)
def parse(self, response):
important_divs = response.css('div.cldt-summary-titles').extract()
AutoSpiderLinks.convert(self, important_divs)
def main():
configure_logging()
runner = CrawlerRunner()
runner.crawl(AutoSpiderLinks)
runner.crawl(DeepSpider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
if __name__ == '__main__'
main()
Thank you for your assistance
Solution
So i've figured it out. Apparently you can not configure the threading inside the main function. The fix for this issue was straight forward with reactor.
@defer.inlineCallbacks
def main():
configure_logging()
runner = CrawlerRunner()
yield runner.crawl(AutoSpiderLinks)
yield runner.crawl(DeepSpider)
reactor.stop()
if __name__ == '__main__':
main()
reactor.run()
Answered By - Fscir
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.