Issue
I am trying to crawl all URLs from a website using Scrapy, but some pages on the site use infinite scrolling, so the data crawled is incomplete. The code I'm using is:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from w3lib.url import url_query_cleaner


def process_links(links):
    for link in links:
        link.url = url_query_cleaner(link.url)
        yield link


class myCrawler(CrawlSpider):
    name = 'symphony'
    allowed_domains = ['theshell.org']
    start_urls = ['https://www.theshell.org/']
    base_url = 'https://www.theshell.org/'

    custom_settings = {
        # in order to reduce the risk of getting blocked
        'DOWNLOADER_MIDDLEWARES': {'sitescrapper.middlewares.RotateUserAgentMiddleware': 400},
        'COOKIES_ENABLED': False,
        'CONCURRENT_REQUESTS': 6,
        'DOWNLOAD_DELAY': 1,
        # Duplicates pipeline
        'ITEM_PIPELINES': {'sitescrapper.pipelines.DuplicatesPipeline': 300},
        # In order to create a CSV file:
        'FEEDS': {'csv_file.csv': {'format': 'csv'}},
    }

    rules = (
        Rule(
            LinkExtractor(
                allow_domains='theshell.org',
                deny=[r'calendar'],
            ),
            process_links=process_links,
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        yield {
            'url': response.url,
            'html_data': response.text,
        }
This page has the infinite-scrolling mechanism. How can this infinite scrolling be detected and crawled using Scrapy?
Solution
Infinite-scroll / "load more" content is most of the time fetched via an AJAX request, so you can hit the underlying API URL directly. You can find that URL in your browser's developer tools (Network/XHR tab) by scrolling the page and watching which request fires. Here I use the API URL and Scrapy's default spider template instead of CrawlSpider.
import scrapy
from scrapy.crawler import CrawlerProcess


class myCrawler(scrapy.Spider):
    name = 'symphony'
    # custom_settings must be a class attribute; defining it inside a
    # method (as in the original) has no effect
    custom_settings = {'DOWNLOAD_DELAY': 5}

    def start_requests(self):
        headers = {
            "accept": "*/*",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en-US,en;q=0.9",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36",
            "x-requested-with": "XMLHttpRequest"
        }
        urls = [f'https://www.theshell.org/performances/list/?page={x}' for x in range(1, 5)]
        for url in urls:
            yield scrapy.Request(
                url=url,
                callback=self.parse,
                method="GET",
                headers=headers,
            )

    def parse(self, response):
        # each listing page links out to individual performance pages
        for link in response.xpath('//*[@class="content"]/h2/a/@href').getall():
            yield response.follow(link, callback=self.parse_item)

    def parse_item(self, response):
        yield {
            'title': response.xpath('//*[@class="info"]/h1/text()').get(),
            'url': response.url,
        }


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(myCrawler)
    process.start()
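
The range(1, 5) above hard-codes four pages. If you don't know in advance how many pages the endpoint serves, one option is open-ended pagination: keep requesting the next page until a page comes back with no result links. Here is a minimal sketch of that idea, assuming the same listing XPath and ?page= query parameter as above; the spider name and the empty-page stop condition are illustrative, not part of the original answer:

import scrapy


class PagedCrawler(scrapy.Spider):
    # hypothetical spider for illustration only
    name = 'symphony_paged'
    start_urls = ['https://www.theshell.org/performances/list/?page=1']

    def parse(self, response):
        links = response.xpath('//*[@class="content"]/h2/a/@href').getall()
        if not links:
            # empty page: assume we have run past the last page and stop
            return
        for link in links:
            yield response.follow(link, callback=self.parse_item)
        # assume the page number is the last query value in the URL,
        # then request the next page
        current = int(response.url.rsplit('=', 1)[-1])
        yield response.follow(
            f'https://www.theshell.org/performances/list/?page={current + 1}',
            callback=self.parse,
        )

    def parse_item(self, response):
        yield {
            'title': response.xpath('//*[@class="info"]/h1/text()').get(),
            'url': response.url,
        }

This avoids guessing the page count, at the cost of one extra request for the first empty page.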
Answered By - F.Hoque