Issue
I'm currently attempting to scrape articles from the Nepali Times website. The challenge is that the site uses a "Load More" button, which has to be clicked to load additional articles. My scraper successfully retrieves the initial page with the first six articles, but it never manages to click the "Load More" button, so I can't scrape anything beyond those first six articles.
Furthermore, during the scraping process it keeps fetching URLs, but instead of the desired content it returns "oops" pages, which points to a problem with the Selenium button-clicking part.
If someone could explain how I can handle this, I would be really grateful!
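For reference, here is a minimal sketch of the kind of click loop I have been trying with Selenium (the button selector is based on the XPath in my spider code below, so treat it as an assumption about the page markup):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

driver = webdriver.Chrome()
driver.get("https://www.nepalitimes.com/news")

# Keep clicking "load more" until the button no longer becomes clickable
while True:
    try:
        button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.XPATH, "//button[contains(@class, 'btn--load')]")
            )
        )
        button.click()
    except TimeoutException:
        break   # no more button, or it never became clickable

driver.quit()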
import scrapy
import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http.request import Request
class NepaliSpider(CrawlSpider):
    name = "nepalitimes"
    allowed_domains = ["nepalitimes.com"]
    # Start URL for the spider
    start_urls = ['https://www.nepalitimes.com/news']
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'nepali_times.csv'
    }
    # Rule to follow links to individual article pages
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    # Handling the load button using Selenium --- work in progress <3
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)
    def parse(self, response, **kwargs):
        # Parse the articles from the initial page
        for result in response.xpath(".//div[contains(@class,'main--left')]/a"):
            relative_url = result.xpath("@href").extract_first()
            absolute_url = response.urljoin(relative_url)
            yield scrapy.Request(url=absolute_url, callback=self.parse_item)

        # Check if there is a "Load More" button
        load_more_button = response.xpath(".//button[contains(@class, 'btn btn--load center') and contains(., 'load more')]")
        if load_more_button:
            print("Load more button detected")
            tenant_code = "epz639"
            routeId = 8
            limit = 10
            offset = 10

            # Prepare the data payload for the POST request
            data = {
                "query": "query getMoreArticles($tenant_code: String, $routeId: Int, $limit: Int, $offset: Int) { articles: getPublicContent(tenant_code: $tenant_code, routeId: $routeId, limit: $limit, offset: $offset) { id } }",
                "variables": {
                    "tenant_code": tenant_code,
                    "routeId": routeId,
                    "limit": limit,
                    "offset": offset
                }
            }

            # Send a POST request to the endpoint using scrapy.FormRequest
            yield scrapy.FormRequest(url="https://nepalitimes-hasura.superdesk.org/v1/graphql",
                                     formdata={"query": json.dumps(data["query"]), "variables": json.dumps(data["variables"])},
                                     headers={"Content-Type": "application/json"},
                                     callback=self.parse_ajax_response)
            print("Post request sent")
    def parse_ajax_response(self, response):
        json_response = json.loads(response.text)
        if 'data' in json_response and 'articles' in json_response['data']:
            articles = json_response['data']['articles']
            print("Articles:", articles)
            for article in articles:
                # Assuming there's a 'slug' field in the response representing the article slug
                article_slug = article['slug']
                article_url = f"https://www.nepalitimes.com/news/{article_slug}"  # Adjust this based on the actual URL structure
                yield scrapy.Request(url=article_url, callback=self.parse_item)
    def parse_item(self, response):
        # This function should extract the article information from the provided response
        # and yield the scraped data as a dictionary

        # Extract article information using XPath selectors
        title = response.xpath('.//article[contains(@class,"article__full")]/h1/text()').get()
        subtitle = response.xpath('.//span[contains(@class,"article__subhead")]/text()').get()
        date = response.xpath(".//div/time[contains(@class,'article__time')]/text()").get()
        author = response.xpath('.//div/span[contains(@class,"article__author")]/span/text()').get()
        category = response.xpath(".//a[contains(@class,'active')]/text()").get()
        url = response.xpath(".//meta[contains(@property, 'og:url')]/@content").get()

        # Parse the HTML content
        content_elements = response.xpath('.//div[contains(@class,"article__text")]/p')
        text_content = [element.xpath("string(.)").get().strip() for element in content_elements]
        cleaned_content = ' '.join(text_content)

        yield {
            'title': title,
            'subtitle': subtitle,
            'author': author,
            'date': date,
            'content': cleaned_content,
            'category': category,
            'URL': url
        }
Okay, so I tried what @Leandro suggested, that is to say, using Chrome DevTools instead of Selenium, but it doesn't seem to call the parse_ajax_response function. It still runs, but it doesn't give the results I want (only 9 items were scraped). I need some help.
Here is what I get when I click the "Load More" button:
Here is the edited code:
import scrapy
import json
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http.request import Request
class NepaliSpider(CrawlSpider):
    name = "nepalitimes"
    allowed_domains = ["nepalitimes.com"]
    # Start URL for the spider
    start_urls = ['https://www.nepalitimes.com/news']
    custom_settings = {
        'FEED_FORMAT': 'csv',
        'FEED_URI': 'nepali_times.csv'
    }
    # Rule to follow links to individual article pages
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    # Handling the load button using Selenium --- work in progress <3
    def parse(self, response, **kwargs):
        # Parse the articles from the initial page
        for result in response.xpath(".//div[contains(@class,'main--left')]/a"):
            relative_url = result.xpath("@href").extract_first()
            absolute_url = response.urljoin(relative_url)
            yield scrapy.Request(url=absolute_url, callback=self.parse_item)

        # Fetch additional articles using GraphQL API with different offset values
        tenant_code = "epz639"
        routeId = 8
        limit = 10
        offset = 10

        while True:
            data = {
                "query": "query getMoreArticles($tenant_code: String, $routeId: Int, $limit: Int, $offset: Int) { articles: getPublicContent(tenant_code: $tenant_code, routeId: $routeId, limit: $limit, offset: $offset) { id } }",
                "variables": {
                    "tenant_code": tenant_code,
                    "routeId": routeId,
                    "limit": limit,
                    "offset": offset
                }
            }
            yield scrapy.Request(url="https://nepalitimes-hasura.superdesk.org/v1/graphql",
                                 method='POST',
                                 body=json.dumps(data),
                                 headers={'Content-Type': 'application/json'},
                                 callback=self.parse_ajax_response)
            offset += limit
    def parse_ajax_response(self, response):
        json_response = json.loads(response.text)
        if 'data' in json_response and 'items' in json_response['data']:
            articles = json_response['data']['items']
            print("Data found", articles)
            for article in articles:
                article_id = article['id']
                article_url = f"https://www.nepalitimes.com/news/{article_id}"
                yield scrapy.Request(url=article_url, callback=self.parse_item)
    def parse_item(self, response):
        # This function should extract the article information from the provided response
        # and yield the scraped data as a dictionary

        # Extract article information using XPath selectors
        title = response.xpath('.//article[contains(@class,"article__full")]/h1/text()').get()
        subtitle = response.xpath('.//span[contains(@class,"article__subhead")]/text()').get()
        date = response.xpath(".//div/time[contains(@class,'article__time')]/text()").get()
        author = response.xpath('.//div/span[contains(@class,"article__author")]/span/text()').get()
        category = response.xpath(".//a[contains(@class,'active')]/text()").get()
        url = response.xpath(".//meta[contains(@property, 'og:url')]/@content").get()

        # Parse the HTML content
        content_elements = response.xpath('.//div[contains(@class,"article__text")]/p')
        text_content = [element.xpath("string(.)").get().strip() for element in content_elements]
        cleaned_content = ' '.join(text_content)

        yield {
            'title': title,
            'subtitle': subtitle,
            'author': author,
            'date': date,
            'content': cleaned_content,
            'category': category,
            'URL': url
        }
It does load other pages (not only the news pages), and it doesn't seem to call the parse_ajax_response() function at all. Furthermore, it tries to scrape the https://archive.nepalitimes.com/news structure, which I don't want the script to do.
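On those follow-up problems: the CrawlSpider rule uses an unrestricted LinkExtractor, and allowed_domains = ["nepalitimes.com"] also matches the archive.nepalitimes.com subdomain; in addition, the GraphQL endpoint lives on a different domain (superdesk.org), so Scrapy's offsite filtering may be dropping those POST requests before parse_ajax_response ever runs. A sketch of tighter settings follows (the /news/ URL pattern is an assumption about the site's structure):

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class NepaliSpider(CrawlSpider):
    name = "nepalitimes"
    # Include the API host so requests to it are not filtered as offsite
    allowed_domains = ["nepalitimes.com", "nepalitimes-hasura.superdesk.org"]
    start_urls = ["https://www.nepalitimes.com/news"]

    rules = (
        Rule(
            LinkExtractor(
                allow=r"/news/",                           # assumed article URL pattern
                deny_domains=["archive.nepalitimes.com"],  # skip the archive site
            ),
            callback="parse_item",
            follow=True,
        ),
    )

    def parse_item(self, response):
        # ... same extraction logic as in the question ...
        pass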
Solution
I think the best approach is to check what request is being issued when you click the "Load More" button. For instance, this can be done using the Network tab in Chrome Dev Tools. Then, you can schedule this request in Scrapy after loading the first page. Probably, this request will return some JSON-like structure, which you can handle in a different method (see the callback argument in the Request object).
This way, you can get rid of Selenium, making your scraper lighter. I hope this helps :)
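As a minimal sketch of that pattern (the endpoint and payload below are placeholders; copy the real ones from the Network tab):

import json
import scrapy

class LoadMoreSketchSpider(scrapy.Spider):
    name = "load_more_sketch"
    start_urls = ["https://www.nepalitimes.com/news"]

    def parse(self, response):
        # ... scrape the first page here ...
        payload = {"copied": "from the Payload tab"}   # placeholder body
        yield scrapy.Request(
            "https://example.com/graphql",             # placeholder endpoint
            method="POST",
            body=json.dumps(payload),
            headers={"Content-Type": "application/json"},
            callback=self.parse_api,                   # JSON handled in a separate method
        )

    def parse_api(self, response):
        data = json.loads(response.text)               # the JSON-like structure
        self.logger.info("Got %d top-level keys", len(data))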
In your case, the site uses a GraphQL API to query more objects. The request may look a bit scary, but it simply states what data should be returned from the server.
If you take a look at the Response tab in Dev Tools, you'll see the JSON structure that comes back.
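Since the original screenshots aren't reproduced here, the response body looks roughly like this (reconstructed from the fields the example spider below reads; treat the exact field names as assumptions rather than the verified API contract):

example_response = {
    "data": {
        "metadata": {"aggregate": {"totalCount": 1321}},
        "items": [
            {
                "title": "An article title",
                "slug": "an-article-title",
                "swp_route": {"staticprefix": "/news"},
            },
            # ... one entry per article, up to the requested limit
        ],
    }
}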
So, you should add to your scraper a yield Request(...) in the parse method that mimics the request shown in Dev Tools. Your request would have a body with an attribute named query containing the string you can see in Chrome Dev Tools, and also a variables param, which is a JSON object with bindings to the query parameters. (You can check the Payload tab and click "view source" for the actual string being sent.)
You'll probably have to do this (yield Request(...)) as many times as there are pages you want to crawl, accounting for the limit and offset parameters. You can also check what happens when you hit the last page.
A tip: you can have a parse_first_load_more method for the first request. The response comes with a "totalCount": 1321, which you can use to calculate how many requests you have to issue. Then, the following requests can have a different callback, or you can use a meta parameter in the request to indicate that it is not the first one.
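For instance, a rough calculation of how many follow-up requests are needed, assuming the first page already delivered 10 articles:

import math

total_count = 1321      # "totalCount" reported by the first GraphQL response
limit = 10              # articles per request
already_loaded = 10     # articles delivered with the first batch

# Number of additional paginated requests needed to cover everything
extra_requests = math.ceil((total_count - already_loaded) / limit)   # -> 132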
The final result would be something like this (note that this is just example code):
import json
import scrapy
GRAPHQL_QUERY = """
query getArticles($tenant_code: String = \"\", $routeId: Int, $limit: Int = 10, $offset: Int = 0) {
  metadata: swp_article_aggregate(where: {tenant_code: {_eq: $tenant_code}, route_id: {_eq: $routeId}}) {
    aggregate {
      totalCount: count
    }
  }
  items: swp_article(limit: $limit, offset: $offset, order_by: {published_at: desc}, where: {tenant_code: {_eq: $tenant_code}
  ...
"""
class NepalTimesScraper(scrapy.Spider):
    name = "nepaltimes"
    start_urls = ["https://www.nepalitimes.com/news"]

    def parse(self, response):
        articles = response.xpath("//article[@class='list']/..")
        for article in articles:
            title = article.css("h3::text").get()
            link = article.attrib["href"]
            yield {"title": title, "link": link}

        # Now, load more
        graphql_req = {
            "query": GRAPHQL_QUERY,
            "variables": {
                "tenant_code": "epz639",
                "routeId": 8,
                "limit": 10,
                "offset": 10,
            },
        }
        yield scrapy.Request(
            "https://nepalitimes-hasura.superdesk.org/v1/graphql",
            method="POST",
            body=json.dumps(graphql_req),
            meta={"current_offset": 10},
            callback=self.parse_more,
        )
    def parse_more(self, response):
        json_response = json.loads(response.text)
        total_number_of_articles = json_response["data"]["metadata"]["aggregate"]["totalCount"]
        current_offset = response.meta["current_offset"]

        for article in json_response["data"]["items"]:
            yield {
                "title": article["title"],
                "link": f"{article['swp_route']['staticprefix']}/{article['slug']}",
            }

        if current_offset + 10 < total_number_of_articles:
            current_offset = current_offset + 10
            graphql_req = {
                "query": GRAPHQL_QUERY,
                "variables": {
                    "tenant_code": "epz639",
                    "routeId": 8,
                    "limit": 10,
                    "offset": current_offset,
                },
            }
            yield scrapy.Request(
                "https://nepalitimes-hasura.superdesk.org/v1/graphql",
                method="POST",
                body=json.dumps(graphql_req),
                meta={"current_offset": current_offset},
                callback=self.parse_more,
            )
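To get the CSV output the question's custom_settings aimed for, one option is to add a FEEDS setting to the example spider (a sketch; FEEDS is the newer replacement for FEED_FORMAT/FEED_URI in recent Scrapy versions):

class NepalTimesScraper(scrapy.Spider):
    name = "nepaltimes"
    start_urls = ["https://www.nepalitimes.com/news"]
    custom_settings = {
        "FEEDS": {"nepali_times.csv": {"format": "csv"}},
    }
    # ... parse / parse_more as above ...

Saved as a single file, the spider can then be run with scrapy runspider nepaltimes.py.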
Hope this helps
Answered By - Leandro Rodrigues de Souza