Issue
Goal: I want to retrieve order performance data published on a specific e-commerce site. Because the data for each order are spread across multiple pages, I would like to extract the information from each page and finally combine it into a single item (record).
I have looked through the official documentation and similar Q&As, and from them I got the idea that this might be possible with cb_kwargs. However, I could not figure out what is wrong with the following code.
- python - Interpreting callbacks and cb_kwargs with scrapy - Stack Overflow
- [python - Multiple pages per item in Scrapy - Stack Overflow](https://stackoverflow.com/questions/22201876/multiple-pages-per-item-in-scrapy?noredirect=1&lq=1)
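For reference, the cb_kwargs pattern those answers describe looks roughly like this; a minimal sketch, with a placeholder URL, placeholder selectors, and illustrative field names rather than the real site's:

import scrapy

class CbKwargsExampleSpider(scrapy.Spider):
    """Minimal sketch: pass data from one callback to the next via cb_kwargs."""
    name = 'cb_kwargs_example'
    start_urls = ['https://example.com/list.html']  # placeholder URL

    def parse(self, response):
        # Grab a value on the listing page, then hand it to the detail
        # callback through cb_kwargs instead of yielding it here.
        listing_value = response.css('title::text').get()
        for href in response.css('a.item::attr(href)').getall():  # placeholder selector
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.parse_detail,
                cb_kwargs={'listing_value': listing_value},
            )

    def parse_detail(self, response, listing_value):
        # Whatever was put into cb_kwargs arrives as a keyword argument.
        yield {
            'listing_value': listing_value,
            'detail_url': response.url,
        }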
The program runs, but the resulting CSV is empty.
Each order results page contains information on 30 items. I would like to first retrieve the sign-up dates for all of them, which are listed only on the first page, then move from there to each product page to retrieve the details, and then store that information one item at a time.
I am a beginner who started writing Python three months ago, so I may be missing some basics about classes and the like; I would appreciate it if you could point that out along the way. The official Scrapy documentation is not very beginner-friendly, and I am having a hard time with it.
def parse_firstpage_item(self, response):
    request = scrapy.Request(
        url=response.url,
        callback=self.parse_productpage_item,
        cb_kwargs=dict(product_URL='//*[@id="buyeritemtable"]/div/ul/li[2]/p[1]/a'))

    loader = ItemLoader(item=BuymaResearchtoolItem(), response=response)
    loader.add_xpath("Conversion_date", '//*[@id="buyeritemtable"]/div/ul/li[2]/p[3]/text()')
    yield loader.load_item()

def parse_productpage_item(self, response, product_URL):
    loader = ItemLoader(item=BuymaResearchtoolItem(), response=response)
    loader.add_xpath("brand_name", 'normalize-space(//*[@id="s_brand"]/dd/a/text())')
    loader.add_value("page_URL", response.url)
    loader.add_xpath("inquire", '//*[@id="tabmenu_inqcnt"]/text()')
    yield loader.load_item()
class MyLinkExtractor(LinkExtractor):

    def extract_links(self, response):
        base_url = get_base_url(response)
        if self.restrict_xpaths:
            docs = [
                subdoc
                for x in self.restrict_xpaths
                for subdoc in response.xpath(x)
            ]
        else:
            docs = [response.selector]
        all_links = []
        for doc in docs:
            links = self._extract_links(doc, response.url, response.encoding, base_url)
            all_links.extend(self._process_links(links))
        logging.info('=' * 100)
        logging.info(all_links)
        logging.info(f'total links len: {len(all_links)}')
        logging.info('=' * 100)
        return all_links
class AllSaledataSpider(CrawlSpider):
    name = 'all_salesdata'
    allowed_domains = ['www.buyma.com']
    # start_urls = ['https://www.buyma.com/buyer/9887867/sales_1.html']

    rules = (
        Rule(MyLinkExtractor(restrict_xpaths='//*[@class="buyeritem_name"]/a'),
             callback='parse_firstpage_item', follow=False),
        Rule(LinkExtractor(restrict_xpaths='//DIV[@class="pager"]/DIV/A[contains(text(),"次")]'),
             follow=False),
    )

    def _requests_to_follow(self, response):
        if not isinstance(response, HtmlResponse):
            return
        seen = set()
        for rule_index, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)]
            # if lnk not in seen]
            for link in rule.process_links(links):
                seen.add(link)
                request = self._build_request(rule_index, link)
                yield rule.process_request(request, response)

    def start_requests(self):
        with open('/Users/morni/buyma_researchtool/buyma_researchtool/AllshoppersURL.csv', 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            header = next(reader)
            for row in reader:
                yield scrapy.Request(url=str(row[2])[:-5] + '/sales_1.html')

        for row in self.reader:
            for n in range(1, 300):
                url = f'{self.base_page}{row}/sales_{n}.html'
                yield scrapy.Request(
                    url=url,
                    callback=self.parse_firstpage_item,
                    errback=self.errback_httpbin,
                    dont_filter=True
                )

    def parse_firstpage_item(self, response):
        loader = ItemLoader(item=BuymaResearchtoolItem(), response=response)
        loader.add_xpath("Conversion_date", '//*[@id="buyeritemtable"]/div/ul/li[2]/p[3]/text()')
        loader.add_xpath("product_name", '//*[@id="buyeritemtable"]/div/ul/li[2]/p[1]/a/text()')
        loader.add_value("product_URL", '//*[@id="buyeritemtable"]/div/ul/li[2]/p[1]/a/@href')
        item = loader.load_item()
        yield scrapy.Request(
            url=response.urljoin(item['product_URL']),
            callback=self.parse_productpage_item,
            cb_kwargs={'item': item},
        )

    def parse_productpage_item(self, response, item):
        loader = ItemLoader(item=item, response=response)
        loader.add_xpath("brand_name", 'normalize-space(//*[@id="s_brand"]/dd/a/text())')
        〜
        yield loader.load_item()
Solution
You need to request each page in turn and pass your current item along to the callback:
def parse_first_page(self, response):
    loader = ItemLoader(item=BuymaResearchtoolItem(), response=response)
    loader.add_xpath("brand_name", 'normalize-space(//*[@id="s_brand"]/dd/a/text())')
    loader.add_value("page_URL", response.url)
    loader.add_xpath("inquire", '//*[@id="tabmenu_inqcnt"]/text()')
    item = loader.load_item()
    yield scrapy.Request(
        url=second_page_url,
        callback=self.parse_second_page,
        cb_kwargs={'item': item},
    )

def parse_second_page(self, response, item):
    loader = ItemLoader(item=item, response=response)
    loader.add_xpath("Conversion_date", '//*[@id="buyeritemtable"]/div/ul/li[2]/p[3]/text()')
    item = loader.load_item()
    yield scrapy.Request(
        url=third_page_url,
        callback=self.parse_third_page,
        cb_kwargs={'item': item},
    )

def parse_third_page(self, response, item):
    loader = ItemLoader(item=item, response=response)
    loader.add_value('ThirdUrl', response.url)
    yield loader.load_item()
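Applied to the spider in the question, that means iterating over the 30 rows on the sales page, building one item per row, and yielding the item only from the product-page callback. Below is a rough sketch using the XPaths from the question, assuming each row is a direct <div> child of #buyeritemtable; the relative paths are my guess and are untested against the live site:

def parse_firstpage_item(self, response):
    # One row per order on the listing page: build a separate item
    # for each row instead of loading all 30 rows into one item.
    for row in response.xpath('//*[@id="buyeritemtable"]/div'):
        loader = ItemLoader(item=BuymaResearchtoolItem(), selector=row)
        loader.add_xpath('Conversion_date', './ul/li[2]/p[3]/text()')
        loader.add_xpath('product_name', './ul/li[2]/p[1]/a/text()')
        product_url = row.xpath('./ul/li[2]/p[1]/a/@href').get()
        if not product_url:
            continue  # skip rows without a product link
        loader.add_value('product_URL', product_url)
        yield scrapy.Request(
            url=response.urljoin(product_url),
            callback=self.parse_productpage_item,
            cb_kwargs={'item': loader.load_item()},
        )

def parse_productpage_item(self, response, item):
    # Keep filling the same item with product-page fields,
    # then yield the completed record.
    loader = ItemLoader(item=item, response=response)
    loader.add_xpath('brand_name', 'normalize-space(//*[@id="s_brand"]/dd/a/text())')
    loader.add_xpath('inquire', '//*[@id="tabmenu_inqcnt"]/text()')
    loader.add_value('page_URL', response.url)
    yield loader.load_item()

The key point is that only the last callback in the chain yields the item; every earlier callback yields a Request that carries the partially filled item in cb_kwargs.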
Answered By - gangabass