Issue
I'm quite new to web scraping. I'm trying to crawl a novel reader website to get the novel info and chapter content, so the way I do it is by creating two spiders: one to fetch the novel information and another to fetch the content of the chapters.
import scrapy


class BookSpider(scrapy.Spider):
    name = "book"

    def __init__(self, books=[], **kwargs):
        if isinstance(books, str):
            books = [books]
        self.start_urls = [f'https://daonovel.com/novel/{book}/' for book in sorted(books)]
        super().__init__(**kwargs)

    def parse(self, response):
        # self.remove_content(response.css("div.post-title h1 span"))
        fullurl = response.url
        url = fullurl.split("/")[-2]
        title = response.css("div.post-title h1::text").extract()
        title = title[len(title)-1].strip()
        authors = response.css('div.author-content a::text').getall()
        genres = response.css('div.genres-content a::text').getall()
        release = response.css('div.post-status div.post-content_item:nth-child(1) div.summary-content::text').get().strip()
        status = response.css('div.post-status div.post-content_item:nth-child(2) div.summary-content::text').get().strip()
        summary = response.css('div.summary__content p').getall()
        chapters = response.css('ul.version-chap li a::attr(href)').extract()
        chapters.reverse()
        return {
            'fullurl': fullurl,
            'url': url,
            'title': title,
            'authors': authors,
            'genres': genres,
            'status': status,
            'release': release,
            'summary': summary,
            'chapters': chapters
        }
class ChapterSpider(scrapy.Spider):
    name = "chapter"

    def __init__(self, book="", chapters=[], **kwargs):
        if isinstance(chapters, str):
            chapters = [chapters]
        self.book = book
        self.start_urls = [f'https://daonovel.com/novel/{book}/{chapter}/' for chapter in chapters]
        super().__init__(**kwargs)

    def parse(self, response):
        title = response.css("ol.breadcrumb li.active::text").get().strip()
        container = response.css("div.cha-words p").getall() if response.css("div.cha-words p").getall() else response.css("div.text-left p").getall()
        content = []
        for p in container:
            content.append(str(p))
        return {
            'title': title,
            'content': content,
            'book_url': self.book,
            'url': response.url.split("/")[-2]
        }
After that, I created a collector to collect and process all of the data from the spiders.
from scrapy import signals


class Collector():
    def __init__(self, process, books=[]):
        self.process = process
        if isinstance(books, str):
            books = [books]
        self.books = books
        self.books_data = []

    def create_crawler(self, spider, function, **kwargs):
        # we need a Crawler instance to access signals
        crawler = self.process.create_crawler(spider)
        crawler.signals.connect(function, signal=signals.item_scraped)
        x = self.process.crawl(crawler, **kwargs)
        return x

    def process_book_data(self, item, response, spider):
        item['authors'] = [author.strip() for author in item['authors']]
        item['genres'] = [genre.strip() for genre in item['genres']]
        summary = [line for line in item['summary'] if not any(word in line.lower() for word in ("wuxiaworld", "disclaimer"))]
        item['summary'] = str("\n").join(summary)
        item['chapters'] = [chapter.replace(item['fullurl'], '').replace('/', '') for chapter in item['chapters']]
        self.books_data.append(item)

    def process_chapter_data(self, item, response, spider):
        item['content'] = str("\n").join(item['content'])
        for book in self.books_data:
            if book['url'] == item['book_url']:
                book['chapters'][book['chapters'].index(item['url'])] = item

    def crawl_books(self):
        return self.create_crawler(BookSpider, self.process_book_data, books=self.books)

    def crawl_chapters(self, book, chapters):
        return self.create_crawler(ChapterSpider, self.process_chapter_data, book=book, chapters=chapters)
If I put the chapters in manually before process.start(), it works:
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()
collector = Collector(process, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
collector.crawl_books()
collector.crawl_chapters("a-stay-at-home-dads-restaurant-in-an-alternate-world", ['chapter-1', 'chapter-2', 'chapter-3', 'chapter-4', 'chapter-5']) # put chapter manually
process.start()
for book in collector.books_data:
    for k, v in book.items():
        print(k, v)
It works, but this isn't the purpose of the script.
My question is: how do I make the chapter spider run after the book spider has finished collecting its data? Here is my attempt, which didn't work:
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()
collector = Collector(process, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
collector.crawl_books()
process.start()
print(collector.books_data)  # this works

for book in collector.books_data:
    collector.crawl_chapters(book['url'], book['chapters'])  # this doesn't work
print("Chapters ==>", collector.books_data)
If I add another process.start() before print("Chapters ==>", collector.books_data), it raises twisted.internet.error.ReactorNotRestartable.
I've read the SO question Scrapy - Reactor not Restartable, but I don't know how to implement it in my code.
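For reference, the pattern in that linked question replaces CrawlerProcess with CrawlerRunner and runs the Twisted reactor manually, which allows crawls to be chained with inlineCallbacks. Below is a rough sketch of how it could be combined with the Collector above; it assumes the Collector is reused unchanged (CrawlerRunner provides the same create_crawler/crawl methods the Collector relies on), so treat it as one possible wiring rather than a verified fix.

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()
collector = Collector(runner, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")

@defer.inlineCallbacks
def crawl():
    # wait for the book spider to finish; books_data is filled via the item_scraped signal
    yield collector.crawl_books()
    for book in collector.books_data:
        # the chapter slugs are now known, so the chapter spider can be chained
        yield collector.crawl_chapters(book['url'], book['chapters'])
    reactor.stop()

crawl()
reactor.run()  # the script blocks here until all crawls are finished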
Solution
I'd suggest changing the spider architecture, since Scrapy isn't meant to chain spiders (it's possible, of course, but it's bad practice in general); it's meant to chain requests within the same spider.
Your problem is caused by the fact that Scrapy is designed to produce a flat list of items, while you need a nested one like book = {'title': ..., 'chapters': [{some chapter data}, ...]}.
I'd suggest the following architecture for your spider:
def parse(self, response):
    # parse book data here
    book_item = {
        'fullurl': fullurl,
        'url': url,
        'title': title,
        'authors': authors,
        'genres': genres,
        'status': status,
        'release': release,
        'summary': summary,
        'chapters': []
    }
    chapter_urls = ...  # list of book chapter urls here
    chapter_url = chapter_urls.pop()
    yield Request(
        url=chapter_url,
        callback=self.parse_chapter,
        meta={'book': book_item, 'chapter_urls': chapter_urls}
    )
def parse_chapter(self, response):
    book = response.meta['book']
    chapter_urls = response.meta['chapter_urls']
    # parse chapter data here
    chapter = {
        'title': title,
        'content': content,
        'book_url': book['url'],
        'url': response.url.split("/")[-2]
    }
    book['chapters'].append(chapter)
    if not chapter_urls:
        yield book
    else:
        chapter_url = chapter_urls.pop()
        yield Request(
            url=chapter_url,
            callback=self.parse_chapter,
            meta={'book': book, 'chapter_urls': chapter_urls}
        )
This will produce book entities with nested chapters inside.
Hope it helps even though it's not quite an exact answer to your question. Good luck (:
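For completeness, here is a minimal sketch of how a combined spider built along these lines could be run from a script and its nested items collected, reusing the item_scraped signal approach from the Collector in the question. NovelSpider is a placeholder name for that combined spider, and it is assumed to accept the same books argument as the original BookSpider:

from scrapy import signals
from scrapy.crawler import CrawlerProcess

books_data = []

def collect_book(item, response, spider):
    # each scraped item is already a complete book with its chapters nested
    books_data.append(item)

process = CrawlerProcess()
crawler = process.create_crawler(NovelSpider)  # NovelSpider: hypothetical combined spider
crawler.signals.connect(collect_book, signal=signals.item_scraped)
process.crawl(crawler, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
process.start()

for book in books_data:
    print(book['title'], len(book['chapters']), "chapters")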
Second edit:
class YourSpider(scrapy.Spider):
    books = {}
    ...

    def parse(self, response):
        # Get book info here.
        book_item = {
            'fullurl': fullurl,
            'url': url,
            'title': title,
            'authors': authors,
            'genres': genres,
            'status': status,
            'release': release,
            'summary': summary,
            'chapters': []
        }
        self.books[book_item['title']] = book_item
        chapter_urls = [...]  # list of chapter urls
        # This will trigger multiple requests asynchronously
        for chapter_url in chapter_urls:
            yield scrapy.Request(
                url=chapter_url,
                callback=self.parse_chapter,
                meta={'book_title': book_item['title']}
            )

    def parse_chapter(self, response):
        book_title = response.meta['book_title']
        # parse chapter data here
        chapter = {
            'title': title,
            'content': content,
            'book_url': self.books[book_title]['url'],
            'url': response.url.split("/")[-2]
        }
        self.books[book_title]['chapters'].append(chapter)
        # note: this yields the book again each time a new chapter is added
        yield self.books[book_title]
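One caveat with this second sketch: because the book dict is yielded after every parsed chapter, the same book comes out multiple times with a growing chapters list. A minimal way to deal with that on the collecting side (an assumption, not part of the original answer) is to key items by title so later, more complete versions overwrite earlier ones:

collected_books = {}

def collect_book(item, response, spider):
    # later yields carry more chapters, so they simply overwrite earlier, partial ones
    collected_books[item['title']] = item

Connected to the item_scraped signal exactly as in the Collector from the question, collected_books ends up holding one complete book per title once the crawl finishes.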
Answered By - Michael Savchenko