Issue
I am trying to scrape the data from this post, but I am having an issue scraping the comments. The pagination of the comments is determined by the "page=1" parameter at the end of the URL. I noticed that if "page=0" is used, all the comments load on one page, which is really nice. However, my Scrapy script only scrapes the comments from the first page, no matter what. Even if I change the link to "page=2", it still only scrapes the comments from the first page. I cannot figure out why this is occurring.
import scrapy
from scrapy.crawler import CrawlerProcess
class IdeaSpider(scrapy.Spider):
    """Scrape one Games2Gether idea page: the post, its status and its comments.

    Every extracted value is appended to the module-level ``temp_list`` in a
    fixed order: post author, categories, date, title, body, status, developer
    name, developer comment, and then an author/body pair for each comment.
    Fields whose XPath matches nothing are stored as ``None`` (``"no status"``
    for the status) so that the positions of the leading fields stay stable.
    """

    name = "IdeaSpider"

    # Text nodes of the comment at the 1-based ``{position}`` inside the
    # comments container. A named placeholder is used so that only the
    # positional predicate is ever substituted.
    _COMMENT_BODY_XPATH = (
        '//div[@class = "post-list-comments"]/g2g-comments-item[{position}]/article[@class = '
        '"post-list-item clearfix two-columns ng-star-inserted"]/div/div//div[@class '
        '="post-list-item-message-content post-content ng-star-inserted"]//text() '
    )

    def start_requests(self):
        """Yield the single request for the idea page, handled by ``parse_idea``."""
        # NOTE(review): "page=0" is meant to load every comment on one page;
        # if the comments are rendered client-side this parameter may have no
        # effect on the HTML Scrapy receives -- confirm against the raw response.
        yield scrapy.Request(
            url="https://www.games2gether.com/amplitude-studios/endless-space-2/ideas/1850-force-infinite-actions-to"
                "-the-bottom-of-the-queue?page=0",
            callback=self.parse_idea,
        )

    @staticmethod
    def _joined_text(selection, separator):
        """Return the extracted strings of *selection* joined by *separator*, or None if empty."""
        texts = selection.extract()
        return separator.join(texts) if texts else None

    def _comment_texts(self, response, index):
        """Return the list of text nodes of the comment at 0-based *index* (may be empty)."""
        xpath = self._COMMENT_BODY_XPATH.format(position=index + 1)
        return response.xpath(xpath).extract()

    # parses title, post, status, author, date
    def parse_idea(self, response):
        """Extract the post fields and all visible comments into ``temp_list``.

        Args:
            response: the Scrapy response for the idea page.
        """
        post_author = response.xpath('//span[@class = "username-content"]/text()')
        temp_list.append(post_author.extract_first())

        # Categories are stored as one comma-separated string (None if absent).
        post_categories = response.xpath('//a[@class = "list-tags-item ng-star-inserted"]/text()')
        temp_list.append(self._joined_text(post_categories, ", "))

        post_date = response.xpath('//div[@class = "time-date"]/text()')
        temp_list.append(post_date.extract_first())

        # extract_first() yields None instead of raising when the title is missing.
        post_title = response.xpath('//h1[@class = "title"]/text()')
        temp_list.append(post_title.extract_first())

        # The body may be split across several text nodes; store them as one string.
        post_body = response.xpath('//article[@class = "post-list-item clearfix ng-star-inserted"]//div[@class = '
                                   '"post-list-item-message-content post-content ng-star-inserted"]//text()')
        temp_list.append(self._joined_text(post_body, " "))

        post_status = response.xpath('//p[@class = "status-title"][1]/text()')
        temp_list.append(post_status.extract_first(default="no status"))

        dev_name = response.xpath('//div[@class = "ideas-details-status-comment user-role u-bdcolor-2 dev"]//p[@class '
                                  '= "username user-role-username"]/text()')
        temp_list.append(dev_name.extract_first())

        dev_comment = response.xpath('//div[@class = "message post-content ng-star-inserted"]/p/text()')
        temp_list.append(dev_comment.extract_first())

        comment_authors = response.xpath('//article[@class = "post-list-item clearfix two-columns '
                                         'ng-star-inserted"]//span[@class = "username-content"]/text()')
        body_index = 0
        for author in comment_authors:
            temp_list.append(author.extract())

            texts = self._comment_texts(response, body_index)
            if not texts:
                # The item at this position has no matching body; skip it once
                # and try the next position for the same author.
                body_index += 1
                texts = self._comment_texts(response, body_index)
                if not texts:
                    continue
            temp_list.append(" ".join(texts))
            body_index += 1
# Module-level accumulator that IdeaSpider.parse_idea appends each scraped value to.
temp_list = []
# Not referenced anywhere else in the visible code.
all_post_data = []

# Register the spider, run the crawl, then dump everything that was collected.
process = CrawlerProcess()
process.crawl(IdeaSpider)
process.start()
print(temp_list)
Solution
This is because the comment pages are loaded using JavaScript, and Scrapy does not render JavaScript. You could use Splash to render the page before scraping it.
Answered By - mrhaanraadts
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.