Issue
I am attempting to use a Scrapy CrawlSpider to follow links on a website with infinite scroll, scrape information from the URLs it follows, and then continue to follow links and scrape information. I have found advice on this for Scrapy in general, but not much for crawl spiders. Here is what I have tried so far:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import re


class ItsySpider(CrawlSpider):
    name = 'test'
    allowed_domains = ['citizen.digital']
    start_urls = ['https://www.citizen.digital/search?query=the']

    rules = (
        Rule(follow="True"),
    )

    def parse(self, response):
        base = 'http://cms.citizen.digital/api/v2/search?page={}'
        data = response.json
        current_page = data["current_page"]

        for page in range(2, 10):
            next_page_url = base.format(current_page + page)
            yield scrapy.Request(next_page_url, callback=self.parse_next)

    def parse_next(self, response):
        yield {
            'url': response.url,
            'date': response.xpath('//script[@type="application/ld+json"]/text()').re(r'(?i)(?<=datepublished": ")..........'),
        }
As you can see, I want to load 10 pages on the infinite-scroll site and follow links on those pages. Then I want to extract the url and date from each page it follows, and continue following links and extracting info.
I do not have experience with JSON, so I wonder if I have made a mistake there. Here is an example response for loading the second page on the infinite-scroll site:
{
    "data": [
        {
            "id": 186903,
            "slug": "there-are-plans-to-harm-me-but-i-will-not-be-intimidated-a-defiant-nyoro-says-275851",
            "thumbnail": "https:\/\/images.citizen.digital\/wp-content\/uploads\/2019\/09\/ndindi-nyoro-main-e1568106330665.jpg",
            "description": " ",
            "type": "news",
            "title": "\u2018There are plans to harm me but I will not be intimidated,\u2019 a defiant Nyoro says",
            "date": "12.05pm, September 10, 2019(EAT)",
            "menu": {
                "id": 14,
                "slug": "news"
            },
            "author": "Wangui Ngechu"
        },
        {
            "id": 106999,
            "slug": "mwalala-lashes-out-at-intimidated-referees-after-leopards-defeat-243224",
            "thumbnail": null,
            "description": " ",
            "type": "news",
            "title": "Mwalala lashes out at \u2018intimidated referees\u2019 after Leopards defeat",
            "date": "12.20pm, April 29, 2019(EAT)",
            "menu": {
                "id": 7,
                "slug": "sports"
            },
            "author": "Geoffrey Mwamburi"
        },
        {
            "id": 271435,
            "slug": "why-men-are-intimidated-by-successful-women-133180",
            "thumbnail": "http:\/\/images.citizen.digital\/wp-content\/uploads\/2018\/08\/Men.jpg",
            "description": " ",
            "type": "news",
            "title": "Why men are intimidated by successful women",
            "date": "05.11pm, August 29, 2018(EAT)",
            "menu": {
                "id": 4,
                "slug": "entertainment"
            },
            "author": "Sheila Jerotich"
        },
        {
            "id": 271671,
            "slug": "besides-my-wife-these-are-the-only-people-who-can-intimidate-me-duale-132744",
            "thumbnail": null,
            "description": " ",
            "type": "news",
            "title": "Besides my wife, these are the only people who can intimidate me \u2013 Duale",
            "date": "05.13pm, August 02, 2018(EAT)",
            "menu": {
                "id": 4,
                "slug": "entertainment"
            },
            "author": "eDaily Reporter"
        },
        {
            "id": 209728,
            "slug": "nys-boss-richard-ndubai-will-intimidate-witnesses-if-freed-dpp-203602",
            "thumbnail": "https:\/\/images.citizen.digital\/wp-content\/uploads\/2018\/06\/ndubai.png",
            "description": " ",
            "type": "news",
            "title": "NYS boss Richard Ndubai will intimidate witnesses if freed: DPP",
            "date": "06.15pm, June 11, 2018(EAT)",
            "menu": {
                "id": 14,
                "slug": "news"
            },
            "author": "Dzuya Walter"
        }
    ],
    "meta": {
        "pagination": {
            "total": 15,
            "count": 5,
            "per_page": 5,
            "current_page": 2,
            "total_pages": 3,
            "links": {
                "previous": "http:\/\/cms.citizen.digital\/api\/v2\/search?page=1",
                "next": "http:\/\/cms.citizen.digital\/api\/v2\/search?page=3"
            }
        }
    }
}
When I run it using scrapy crawl test -O test.csv, it returns an empty CSV file.
Solution
The original spider fails for a few reasons: CrawlSpider implements its rules through the built-in parse method, so overriding parse disables the link following entirely; response.json is a method, so data = response.json stores the method object rather than the parsed body; and in the API response, current_page sits under meta.pagination, not at the top level. The cleanest fix is to drop the CrawlSpider and query the API directly: first scrape the HTML page for the API key and the API base URL (optionally, you can also just hard-code them), then add the API key to the headers and start scraping the API.
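Before the full spider, here is a minimal sketch of reading the pagination fields, with the key names taken from the example response above:

data = response.json()  # note the parentheses: response.json by itself is the method object
pagination = data['meta']['pagination']
current_page = pagination['current_page']    # 2 in the example above
next_page_url = pagination['links']['next']  # http://cms.citizen.digital/api/v2/search?page=3

The full spider: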
import scrapy
import logging
import codecs


class ItsySpider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['citizen.digital']
    start_urls = ['https://www.citizen.digital/search?query=the']
    custom_settings = {'DOWNLOAD_DELAY': 0.4}

    headers = {
        "Accept": "application/json",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.5",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Content-Type": "application/json",
        "DNT": "1",
        "Host": "cms.citizen.digital",
        "Origin": "https://www.citizen.digital",
        "Pragma": "no-cache",
        "Referer": "https://www.citizen.digital/",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-site",
        "Sec-GPC": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    }

    def parse(self, response):
        # pull the api key and base url out of the inline javascript on the page
        apiKey = response.xpath('//script/text()').re(r'apiKey:\"(.*?)\",')
        apiBaseURL = response.xpath('//script/text()').re(r'apiBaseURL:\"(.*?)\",')

        if not apiKey:
            logging.error('Could not retrieve the api key')
            return
        if not apiBaseURL:
            logging.error('Could not retrieve the api base url')
            return

        apiKey = apiKey[0]
        # the url appears in the script with escaped slashes, so unescape it
        apiBaseURL = codecs.decode(apiBaseURL[0], 'unicode-escape')
        apiBaseURL += '/search?term=the&limit=5&text_limit=300&platform=web'

        self.headers['Api-Key'] = apiKey
        yield scrapy.Request(url=apiBaseURL + '&page=1',
                             headers=self.headers,
                             cb_kwargs={'page_number': 1, 'apiBaseURL': apiBaseURL},
                             callback=self.parse_api)

    def parse_api(self, response, page_number, apiBaseURL):
        json_data = response.json()
        for data in json_data['data']:
            yield data

        # since you want to scrape 10 pages, stop at page 10; to scrape every
        # page instead, compare against json_data['meta']['pagination']['total_pages']
        if json_data['meta']['pagination']['current_page'] == 10:
            logging.info('Finished scraping')
            return

        # go to the next page
        page_number += 1
        yield scrapy.Request(url=apiBaseURL + f'&page={page_number}',
                             headers=self.headers,
                             cb_kwargs={'page_number': page_number, 'apiBaseURL': apiBaseURL},
                             callback=self.parse_api)

    def parse_next(self, response):
        # kept from the original spider for scraping individual article pages;
        # nothing above calls it, so attach it to a Request if you need it
        yield {
            'url': response.url,
            'date': response.xpath('//script[@type="application/ld+json"]/text()').re(r'(?i)(?<=datepublished": ")..........'),
        }
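Run it the same way as before, scrapy crawl test -O test.csv, and the CSV should now be populated with one row per result, using the fields from the API response (id, slug, title, date, author, and so on).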
Answered By - SuperUser