Sunday, January 2, 2022

[FIXED] Python Scrapy Spider Not Following Correct Link

January 02, 2022 python, scrapy No comments

Issue

I am trying to scrape the data from this post. However, I am having an issue scraping the comments. The pagination of the comments is determined by the "page=1" parameter at the end of the URL. I noticed that if "page=0" is used, all the comments load on one page, which is really convenient. However, my Scrapy script only scrapes the comments from the first page, no matter what. Even if I change the link to "page=2", it still only scrapes the comments from the first page. I cannot figure out why this is happening.

import scrapy

from scrapy.crawler import CrawlerProcess


class IdeaSpider(scrapy.Spider):
    """Scrape one Games2Gether idea page: the post, its status and comments.

    Every extracted value is appended to the module-level ``temp_list`` in
    this order: post author, categories, date, title, body, status,
    developer name, developer comment, then each comment author followed by
    that comment's text when one is found.  Post fields that are absent from
    the response are recorded as ``None`` (``"no status"`` for the status)
    so the leading positions in ``temp_list`` stay stable.

    NOTE: Scrapy does not execute JavaScript, so only markup present in the
    raw HTML response can be matched by the XPath expressions below.
    """

    name = "IdeaSpider"

    # XPath for every text node of the comment held by the N-th
    # <g2g-comments-item>; ``{position}`` is the 1-based item index.
    _COMMENT_BODY_XPATH = (
        '//div[@class = "post-list-comments"]/g2g-comments-item[{position}]'
        '/article[@class = "post-list-item clearfix two-columns ng-star-inserted"]'
        '/div/div//div[@class ="post-list-item-message-content post-content '
        'ng-star-inserted"]//text()'
    )

    def start_requests(self):
        """Yield the single request for the idea page, handled by ``parse_idea``."""
        yield scrapy.Request(
            url="https://www.games2gether.com/amplitude-studios/endless-space-2/ideas/1850-force-infinite-actions-to"
                "-the-bottom-of-the-queue?page=0",
            callback=self.parse_idea,
        )

    @staticmethod
    def _join_or_none(fragments, separator):
        """Return *fragments* joined with *separator*, or ``None`` if empty."""
        return separator.join(fragments) if fragments else None

    def _comment_text(self, response, position):
        """Return the text of the comment item at 1-based *position*.

        Text nodes are joined with single spaces; ``None`` is returned when
        the XPath matches nothing for that position.
        """
        fragments = response.xpath(
            self._COMMENT_BODY_XPATH.format(position=position)
        ).extract()
        return self._join_or_none(fragments, " ")

    # parses title, post, status, author, date
    def parse_idea(self, response):
        """Extract the post fields and comments from *response* into ``temp_list``.

        Args:
            response: The Scrapy response for the idea page.
        """
        post_author = response.xpath('//span[@class = "username-content"]/text()')
        temp_list.append(post_author.extract_first())

        # Categories become one comma-separated string (None when absent,
        # instead of raising IndexError on an empty match).
        post_categories = response.xpath(
            '//a[@class = "list-tags-item ng-star-inserted"]/text()'
        ).extract()
        temp_list.append(self._join_or_none(post_categories, ", "))

        post_date = response.xpath('//div[@class = "time-date"]/text()')
        temp_list.append(post_date.extract_first())

        # extract_first() yields None for a missing title rather than crashing.
        post_title = response.xpath('//h1[@class = "title"]/text()')
        temp_list.append(post_title.extract_first())

        post_body = response.xpath(
            '//article[@class = "post-list-item clearfix ng-star-inserted"]//div[@class = '
            '"post-list-item-message-content post-content ng-star-inserted"]//text()'
        ).extract()
        temp_list.append(self._join_or_none(post_body, " "))

        post_status = response.xpath('//p[@class = "status-title"][1]/text()')
        temp_list.append(post_status.extract_first(default="no status"))

        dev_name = response.xpath(
            '//div[@class = "ideas-details-status-comment user-role u-bdcolor-2 dev"]//p[@class '
            '= "username user-role-username"]/text()'
        )
        temp_list.append(dev_name.extract_first())
        dev_comment = response.xpath('//div[@class = "message post-content ng-star-inserted"]/p/text()')
        temp_list.append(dev_comment.extract_first())

        comment_authors = response.xpath(
            '//article[@class = "post-list-item clearfix two-columns '
            'ng-star-inserted"]//span[@class = "username-content"]/text()'
        )
        # 1-based index of the <g2g-comments-item> expected to hold the body
        # that belongs to the next author.
        position = 1
        for author in comment_authors:
            temp_list.append(author.extract())

            comment_text = self._comment_text(response, position)
            if comment_text is None:
                # That item produced no matching text: skip it and try the
                # following item once before giving up on this author.
                position += 1
                comment_text = self._comment_text(response, position)
            if comment_text is not None:
                temp_list.append(comment_text)
            position += 1


# Module-level accumulators; IdeaSpider.parse_idea appends into temp_list.
temp_list, all_post_data = [], []

# Run the spider in-process, then show everything that was collected.
process = CrawlerProcess()
process.crawl(IdeaSpider)
process.start()

print(temp_list)

Solution

This happens because the comment pages are loaded using JavaScript, and Scrapy does not render JavaScript. You could use Splash to render the page before scraping it.

Answered By - mrhaanraadts

This answer was collected from Stack Overflow, tested by PythonFixing community admins, and is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

Sunday, January 2, 2022

[FIXED] Python Scrapy Spider Not Following Correct Link

Issue

Solution

0 comments:

Post a Comment

Popular Posts

Labels