Issue
I'm quite new to web scraping. I'm trying to crawl a novel reader website to get the novel info and chapter content, so the way I do it is by creating two spiders: one to fetch the novel information and another to fetch the content of the chapters.
import scrapy


class BookSpider(scrapy.Spider):
    name = "book"

    def __init__(self, books=[], **kwargs):
        if isinstance(books, str):
            books = [books]
        self.start_urls = [f'https://daonovel.com/novel/{book}/' for book in sorted(books)]
        super().__init__(**kwargs)

    def parse(self, response):
        # self.remove_content(response.css("div.post-title h1 span"))
        fullurl = response.url
        url = fullurl.split("/")[-2]
        title = response.css("div.post-title h1::text").extract()
        title = title[len(title)-1].strip()
        authors = response.css('div.author-content a::text').getall()
        genres = response.css('div.genres-content a::text').getall()
        release = response.css('div.post-status div.post-content_item:nth-child(1) div.summary-content::text').get().strip()
        status = response.css('div.post-status div.post-content_item:nth-child(2) div.summary-content::text').get().strip()
        summary = response.css('div.summary__content p').getall()
        chapters = response.css('ul.version-chap li a::attr(href)').extract()
        chapters.reverse()
        return {
            'fullurl': fullurl,
            'url': url,
            'title': title,
            'authors': authors,
            'genres': genres,
            'status': status,
            'release': release,
            'summary': summary,
            'chapters': chapters
        }
class ChapterSpider(scrapy.Spider):
    name = "chapter"

    def __init__(self, book="", chapters=[], **kwargs):
        if isinstance(chapters, str):
            chapters = [chapters]
        self.book = book
        self.start_urls = [f'https://daonovel.com/novel/{book}/{chapter}/' for chapter in chapters]
        super().__init__(**kwargs)

    def parse(self, response):
        title = response.css("ol.breadcrumb li.active::text").get().strip()
        container = response.css("div.cha-words p").getall() if response.css("div.cha-words p").getall() else response.css("div.text-left p").getall()
        content = []
        for p in container:
            content.append(str(p))
        return {
            'title': title,
            'content': content,
            'book_url': self.book,
            'url': response.url.split("/")[-2]
        }
After that, I created a collector to collect and process all of the data from the spiders.
from scrapy import signals


class Collector():
    def __init__(self, process, books=[]):
        self.process = process
        if isinstance(books, str):
            books = [books]
        self.books = books
        self.books_data = []

    def create_crawler(self, spider, function, **kwargs):
        # we need a Crawler instance to access signals
        crawler = self.process.create_crawler(spider)
        crawler.signals.connect(function, signal=signals.item_scraped)
        x = self.process.crawl(crawler, **kwargs)
        return x

    def process_book_data(self, item, response, spider):
        item['authors'] = [author.strip() for author in item['authors']]
        item['genres'] = [genre.strip() for genre in item['genres']]
        summary = [line for line in item['summary'] if not any(word in line.lower() for word in ("wuxiaworld", "disclaimer"))]
        item['summary'] = str("\n").join(summary)
        item['chapters'] = [chapter.replace(item['fullurl'], '').replace('/', '') for chapter in item['chapters']]
        self.books_data.append(item)

    def process_chapter_data(self, item, response, spider):
        item['content'] = str("\n").join(item['content'])
        for book in self.books_data:
            if book['url'] == item['book_url']:
                book['chapters'][book['chapters'].index(item['url'])] = item

    def crawl_books(self):
        return self.create_crawler(BookSpider, self.process_book_data, books=self.books)

    def crawl_chapters(self, book, chapters):
        return self.create_crawler(ChapterSpider, self.process_chapter_data, book=book, chapters=chapters)
If I put the chapters in manually before process.start(), it works:
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()
collector = Collector(process, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
collector.crawl_books()
collector.crawl_chapters("a-stay-at-home-dads-restaurant-in-an-alternate-world", ['chapter-1', 'chapter-2', 'chapter-3', 'chapter-4', 'chapter-5']) # put chapter manually
process.start()
for book in collector.books_data:
    for k, v in book.items():
        print(k, v)
It works, but this isn't the purpose of the script.
My question is: how do I make the chapter spider run after the book spider has finished collecting its data? Here is my attempt, which didn't work:
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()
collector = Collector(process, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
collector.crawl_books()
process.start()
print(collector.books_data)  # this works

for book in collector.books_data:
    collector.crawl_chapters(book['url'], book['chapters'])  # this doesn't work
print("Chapters ==>", collector.books_data)
If I add another process.start() before print("Chapters ==>", collector.books_data), it raises twisted.internet.error.ReactorNotRestartable.
I've read the SO question Scrapy - Reactor not Restartable, but I don't know how to implement it in my code.
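For reference, the pattern in that linked question replaces CrawlerProcess with CrawlerRunner and runs the Twisted reactor manually, which allows crawls to be chained with inlineCallbacks. Below is a rough sketch of how it could be combined with the Collector above; it assumes the Collector is reused unchanged (CrawlerRunner provides the same create_crawler/crawl methods the Collector relies on), so treat it as one possible wiring rather than a verified fix.

from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()
collector = Collector(runner, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")

@defer.inlineCallbacks
def crawl():
    # wait for the book spider to finish; books_data is filled via the item_scraped signal
    yield collector.crawl_books()
    for book in collector.books_data:
        # the chapter slugs are now known, so the chapter spider can be chained
        yield collector.crawl_chapters(book['url'], book['chapters'])
    reactor.stop()

crawl()
reactor.run()  # the script blocks here until all crawls are finished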
Solution
I'd suggest changing the spider architecture, since Scrapy isn't meant to chain spiders (it's possible, of course, but it's bad practice in general); it's meant to chain requests within the same spider.
Your problem is caused by the fact that Scrapy is designed to produce a flat list of items, while you need a nested one like book = {'title': ..., 'chapters': [{some chapter data}, ...]}.
I'd suggest the following architecture for your spider:
def parse(self, response):
    # parse book data here
    book_item = {
        'fullurl': fullurl,
        'url': url,
        'title': title,
        'authors': authors,
        'genres': genres,
        'status': status,
        'release': release,
        'summary': summary,
        'chapters': []
    }
    chapter_urls = ...  # list of book chapter urls here
    chapter_url = chapter_urls.pop()
    yield Request(
        url=chapter_url,
        callback=self.parse_chapter,
        meta={'book': book_item, 'chapter_urls': chapter_urls}
    )
def parse_chapter(self, response):
    book = response.meta['book']
    chapter_urls = response.meta['chapter_urls']
    # parse chapter data here
    chapter = {
        'title': title,
        'content': content,
        'book_url': book['url'],
        'url': response.url.split("/")[-2]
    }
    book['chapters'].append(chapter)
    if not chapter_urls:
        yield book
    else:
        chapter_url = chapter_urls.pop()
        yield Request(
            url=chapter_url,
            callback=self.parse_chapter,
            meta={'book': book, 'chapter_urls': chapter_urls}
        )
This will produce book entities with nested chapters inside.
Hope it helps even though it's not quite an exact answer to your question. Good luck (:
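For completeness, here is a minimal sketch of how a combined spider built along these lines could be run from a script and its nested items collected, reusing the item_scraped signal approach from the Collector in the question. NovelSpider is a placeholder name for that combined spider, and it is assumed to accept the same books argument as the original BookSpider:

from scrapy import signals
from scrapy.crawler import CrawlerProcess

books_data = []

def collect_book(item, response, spider):
    # each scraped item is already a complete book with its chapters nested
    books_data.append(item)

process = CrawlerProcess()
crawler = process.create_crawler(NovelSpider)  # NovelSpider: hypothetical combined spider
crawler.signals.connect(collect_book, signal=signals.item_scraped)
process.crawl(crawler, books="a-stay-at-home-dads-restaurant-in-an-alternate-world")
process.start()

for book in books_data:
    print(book['title'], len(book['chapters']), "chapters")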
Second edit:
class YourSpider(scrapy.Spider):
    books = {}
    ...

    def parse(self, response):
        # Get book info here.
        book_item = {
            'fullurl': fullurl,
            'url': url,
            'title': title,
            'authors': authors,
            'genres': genres,
            'status': status,
            'release': release,
            'summary': summary,
            'chapters': []
        }
        self.books[book_item['title']] = book_item
        chapter_urls = [...]  # list of chapter urls
        # This will trigger multiple requests asynchronously
        for chapter_url in chapter_urls:
            yield scrapy.Request(
                url=chapter_url,
                callback=self.parse_chapter,
                meta={'book_title': book_item['title']}
            )

    def parse_chapter(self, response):
        book_title = response.meta['book_title']
        # parse chapter data here
        chapter = {
            'title': title,
            'content': content,
            'book_url': self.books[book_title]['url'],
            'url': response.url.split("/")[-2]
        }
        self.books[book_title]['chapters'].append(chapter)
        # note: this yields the book again each time a new chapter is added
        yield self.books[book_title]
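One caveat with this second sketch: because the book dict is yielded after every parsed chapter, the same book comes out multiple times with a growing chapters list. A minimal way to deal with that on the collecting side (an assumption, not part of the original answer) is to key items by title so later, more complete versions overwrite earlier ones:

collected_books = {}

def collect_book(item, response, spider):
    # later yields carry more chapters, so they simply overwrite earlier, partial ones
    collected_books[item['title']] = item

Connected to the item_scraped signal exactly as in the Collector from the question, collected_books ends up holding one complete book per title once the crawl finishes.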
Answered By - Michael Savchenko