Issue
I want to scrape the article from this site using Scrapy in Python, including the feature video URL or feature image URL together with all paragraphs and headings in their original order, but excluding text that is not part of the article even though it is wrapped in the main article div. In short, I have failed to get the URL of the feature video or feature image, and I have problems extracting the text of the article.
from urllib.parse import urljoin
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from datetime import datetime
import pandas as pd
class NewsSpider(scrapy.Spider):
    """Scrape one travelandleisure.com article and append it to a CSV file.

    The article URL is read interactively from stdin when the crawl starts.
    Fields that cannot be found on the page are stored as ``MISSING``.
    """

    name = "travelandleisure"

    # Placeholder stored for any field that could not be extracted.
    MISSING = "NULL"
    # CSV file that accumulates one row per scraped article.
    CSV_PATH = 'C:/Users/Public/pagedata.csv'

    def start_requests(self):
        """Prompt for the article URL and schedule it for download."""
        url = input("Enter the article url: ")
        yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Extract the article fields, yield them as an item and store them.

        Args:
            response: the downloaded article page.

        Yields:
            dict: one item holding the scraped fields.
        """
        author_names = [
            text.strip()
            for text in response.xpath('//a[@class="mntl-attribution__item-name"]/text()').getall()
        ]
        # dict.fromkeys removes duplicates but keeps page order; a set would
        # return the names in an arbitrary order on every run.
        author = ', '.join(dict.fromkeys(author_names)) or self.MISSING
        url = response.url
        # .get() returns None (it never raises IndexError) when nothing
        # matches, so the fallback has to be passed as the default.
        category = response.xpath('//*[@id="mntl-text-link_1-0"]/span/text()').get(default=self.MISSING)
        headlines = self._first_text(response.xpath('//*[@id="article-heading_1-0"]/text()'))
        source = self._first_text(response.xpath('//*[@id="mntl-text-block_1-0"]/text()'))
        published_date = self._parse_date(
            response.css('div.mntl-attribution__item-date::text').get()
        )

        # TODO: extract the feature image / video URL.
        feature_image = self.MISSING
        # TODO: extract the article text (paragraphs and headings, in page
        # order), skipping unrelated text wrapped in
        # <div class="loc article-content">.
        content = self.MISSING

        # The previous 'Skift Take' entry referenced an undefined variable
        # (NameError on every page) and had no CSV column, so it is dropped.
        yield {
            'Category': category,
            'Headlines': headlines,
            'Author': author,
            'Source': source,
            'Publication Date': published_date,
            'Feature_Image': feature_image,
            'Article Content': content,
        }

        self._append_row(
            [category, headlines, author, source, published_date, feature_image, content, url]
        )

    def _first_text(self, selection):
        """Return the first matched text without newlines, or MISSING if none."""
        return selection.get(default=self.MISSING).replace("\n", "")

    def _parse_date(self, raw):
        """Convert text such as "Updated on June 8, 2022" into a date.

        Returns MISSING when the text is absent or not in that format.
        """
        if not raw:
            return self.MISSING
        _, marker, tail = raw.partition("on ")
        if not marker:
            return self.MISSING
        try:
            return datetime.strptime(tail.replace(",", "").strip(), "%B %d %Y").date()
        except ValueError:
            return self.MISSING

    def _append_row(self, row):
        """Append one article row to the CSV file, creating it if needed."""
        cols = ['Category', 'Headlines', 'Author', 'Source', 'Published_Date', 'Feature_Image', 'Content', 'URL']
        new_rows = pd.DataFrame([row], columns=cols)
        try:
            existing = pd.read_csv(self.CSV_PATH)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # First run, or an empty file: start a fresh table. Any other
            # read error propagates instead of silently overwriting the data.
            combined = new_rows
        else:
            combined = pd.concat([existing, new_rows])
        combined.to_csv(self.CSV_PATH, index=False)
if __name__ == '__main__':
    # Run NewsSpider in-process, configured from the project's Scrapy settings.
    process = CrawlerProcess(get_project_settings())
    process.crawl(NewsSpider)
    process.start()
Here is the website URL https://www.travelandleisure.com/travel-news/where-can-americans-travel-right-now-a-country-by-country-guide
Solution
Below is one possible solution.
Note that the video link / video image URL is generated entirely by JavaScript, and Scrapy cannot render JavaScript, so it is not possible to grab the video image URL using Scrapy alone.
import scrapy
import datetime
import pandas as pd
class NewsSpider(scrapy.Spider):
    """Scrape one travelandleisure.com article and write it to ``out.csv``.

    The article URL is read interactively from stdin when the crawl starts.
    """

    name = "articles"

    def start_requests(self):
        """Prompt for the article URL and schedule it for download."""
        # Example:
        # https://www.travelandleisure.com/travel-news/where-can-americans-travel-right-now-a-country-by-country-guide
        url = input("Enter the article url: ")
        yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Extract the article fields, write them to out.csv and yield the item.

        Args:
            response: the downloaded article page.

        Yields:
            dict: one item holding the scraped fields.
        """
        author_names = [
            text.strip()
            for text in response.xpath('//a[@class="mntl-attribution__item-name"]/text()').getall()
        ]
        # dict.fromkeys removes duplicates but keeps page order; a set would
        # return the names in an arbitrary order on every run.
        author = ', '.join(dict.fromkeys(author_names))
        category = response.xpath('//*[@id="mntl-text-link_1-0"]/span/text()').get()
        # default="" keeps a missing node from crashing the .replace() call.
        headlines = response.xpath('//*[@id="article-heading_1-0"]/text()').get(default="").replace("\n", "")
        source = response.xpath('//*[@id="mntl-text-block_1-0"]/text()').get(default="").replace("\n", "")
        published_date = self._parse_date(
            response.css('div.mntl-attribution__item-date::text').get()
        )

        # The video link / video image depends entirely on JavaScript, which
        # Scrapy does not render, so only the <noscript> image URLs are read.
        image_urls = response.xpath('//*[@class="img--noscript universal-image__image"]/@src').getall()
        # The last match is dropped, as before. The URLs are now separated by
        # ", " so several of them no longer fuse into one unusable string.
        feature_images = ', '.join(image_urls[:-1])

        content = ''.join(response.xpath('//*[@id="mntl-sc-page_1-0"]//text()').getall()).strip()

        item = {
            'Category': category,
            'Headlines': headlines,
            'Author': author,
            'Source': source,
            'Publication Date': published_date,
            'Feature_Image': feature_images,
            'Article Content': content,
        }
        # to_csv returns None when given a path, so its result is not kept.
        pd.DataFrame([item]).to_csv('out.csv', index=False)
        # Also hand the item to Scrapy so feed exports and pipelines see it.
        yield item

    @staticmethod
    def _parse_date(raw):
        """Convert text such as "Updated on June 8, 2022" into a date.

        Returns None when the text is absent or not in that format.
        """
        if not raw:
            return None
        _, marker, tail = raw.partition("on ")
        if not marker:
            return None
        try:
            return datetime.datetime.strptime(tail.replace(",", "").strip(), "%B %d %Y").date()
        except ValueError:
            return None
#print(df)
Answered By - Md. Fazlul Hoque
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.