Issue
I have created a spider using Scrapy to scrape comments on video game mods. For each comment, the spider follows the link to the profile of the user who left it and collects some additional data from that profile. When I run the spider, it returns the same item multiple times, except that the user's information does not match the comment they left, and the same user's information is returned multiple times.
Here is my code:
import scrapy
from scrapy import Request
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
class Workshop_Item(Item):
    """Fields collected for a single workshop comment.

    The spider's ``parse`` callback fills in every field declared here from
    the comment-thread page.
    """

    # Page-level context, read from the workshop page itself.
    game = Field()
    mod = Field()

    # Per-comment data, read from each comment element.
    user = Field()
    comment = Field()
    # Left undeclared: ``user_level`` is emitted by ``parse_user_info`` in a
    # plain dict rather than being stored on this item.
    #user_level = Field()
    date_posted = Field()
class Workshop_Comment_Spider(scrapy.Spider):
    """Scrape workshop comments and then each commenter's profile page.

    ``parse`` reads every comment on a start page and follows the author's
    profile link; ``parse_user_info`` combines the profile's level with the
    comment data handed over through ``meta`` and yields the result.
    """

    name = "comments"

    # The start URLs are loaded from a text file (one URL per line) while the
    # class body is being executed.
    with open("output/workshop_comment_links.txt") as f:
        urls = [line.rstrip("\n") for line in f]
    start_urls = urls

    def parse(self, response):
        """Yield one profile request per comment that has an author link.

        The comment data gathered here travels to ``parse_user_info`` in the
        request's ``meta`` under the ``'item'`` key.
        """
        # NOTE: this single instance is created once, before the loop, and is
        # then reused and overwritten for every comment. Every request yielded
        # below therefore carries a reference to the same object -- this is
        # the behaviour the question is about.
        item = Workshop_Item()

        for comment in response.css(".commentthread_comment"):
            item['game'] = response.css(".apphub_AppName::text").get()
            item['mod'] = response.css(".workshopItemTitle::text").get()
            item['user'] = comment.css("bdi::text").get()

            # Join the comment's text nodes with commas, then flatten
            # newlines and carriage returns to spaces and drop tabs.
            text_nodes = comment.css(".commentthread_comment_text::text").getall()
            flattened = ",".join(text_nodes)
            flattened = flattened.replace('\n', ' ').replace('\t', '').replace('\r', ' ')
            item['comment'] = flattened

            item['date_posted'] = comment.css(".commentthread_comment_timestamp::attr(title)").get()

            user_profile = comment.css(".commentthread_author_link::attr(href)").get()
            if user_profile is None:
                # No author link on this comment, so there is nothing to follow.
                continue

            yield response.follow(user_profile, callback=self.parse_user_info, meta={'item': item})

    def parse_user_info(self, response):
        """Yield a dict with the profile's level plus the forwarded comment data."""
        item = response.meta['item']

        record = {'user_level': response.css(".friendPlayerLevelNum::text").get()}
        # Copy the comment fields across in the same order as the output keys.
        for key in ('game', 'mod', 'user', 'comment', 'date_posted'):
            record[key] = item[key]

        yield record
How do I fix this?
Solution
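Create the item inside the for loop, i.e. move "item = Workshop_Item()" so that it is the first statement of the loop body.
In the code above, a single Workshop_Item is created before the loop, so every request yielded by parse carries a reference to the same object in its meta. Each iteration overwrites that object's fields, so when parse_user_info eventually runs it sees the values of whichever comment was written last, not the comment that produced the request. Creating a new item for each comment gives every request its own data:
for comment in response.css(".commentthread_comment"):
    item = Workshop_Item()
    item['game'] = response.css(".apphub_AppName::text").get()
    # ... fill in the remaining fields and yield response.follow(...) as before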
Answered By - mrhaanraadts
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.