Issue
I am attempting to extract info about articles from this site. I am a Scrapy newbie, and a bit stuck as to why I am not getting any output, although I am able to get all the correct URLs outputted. I am unable to figure out what I am missing or need to change. Any help towards this end will be highly appreciated!
Thanks!!
I have the following code so far:
Here is my spider:
import scrapy
from scrapy.http import Request
class ArticlesSpider(scrapy.Spider):
    """Spider that walks the artofmanliness.com article listing pages and
    scrapes title/category/date from every linked article page."""
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 200  # number of listing pages to request

    def start_requests(self):
        # NOTE(review): range(self.max_pages) starts at 0, so the first
        # request hits /articles/page/0/ — presumably a nonexistent page;
        # listing pages usually start at 1. Verify.
        for i in range(self.max_pages):
            yield scrapy.Request('http://artofmanliness.com/articles/page/%d/' % i, callback=self.parse)

    def parse(self, response):
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            # .extract() returns a (possibly empty) list of href strings.
            url = article.xpath('.//a/@href').extract()
            print(url)
            if url:
                # Follow only the first link of each article teaser.
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        # NOTE(review): these XPaths match on @id with values that look like
        # CSS classes ("post-title entry-title") — likely the reason nothing
        # is extracted; see the corrected selectors in the accepted answer.
        title = response.xpath('//*[@id="post-title entry-title"]/header/h1//text()').extract()
        category = response.xpath('//*[@id="in-category"]/header/p[1]//text()').extract()
        date = response.xpath('//*[@id="single-date"]/header/p[2]/span[2]//text()').extract()
        # Yield one item dict per article; empty lists here mean the
        # selectors found nothing on the page.
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }
Here is settings.py:
# Scrapy project settings (settings.py).
BOT_NAME = 'aom'
SPIDER_MODULES = ['aom.spiders']
NEWSPIDER_MODULE = 'aom.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Pass every HTTP status code through to the spider instead of
# filtering non-200 responses.
HTTPERROR_ALLOW_ALL = True
Solution
I checked HTML and there is no title
'//*[@id="post-title entry-title"]/header/h1//text()'
but
'//h1[@class="post-title entry-title"]/text()'
or even simpler
'//h1[@itemprop="headline"]/text()'
And probably you have the same problem with other elements
EDIT:
There is no category
'//*[@id="in-category"]/header/p[1]//text()'
but
'//p[@class="in-category"]//a/text()'
There is no date
'//*[@id="single-date"]/header/p[2]/span[2]//text()'
but
'//p[@class="single-date"]//span[2]/text()'
or even simpler
'//span[@itemprop="datePublished"]/text()'
Minimal working code with CrawlerProcess()
.
Everyone can paste all code in one file script.py
and run it as python script.py
without creating project.
I use max_pages = 2
to test only a few articles.
import scrapy
from scrapy.http import Request
class ArticlesSpider(scrapy.Spider):
    """Scrape article Title, Category, Date and URL from the
    artofmanliness.com article listing pages."""
    name = 'articles'
    allowed_domains = ['artofmanliness.com']
    max_pages = 2  # 200 in production; kept small to test only a few articles

    def start_requests(self):
        """Request listing pages /articles/page/1/ .. /articles/page/max_pages/.

        Fix: the original ``range(self.max_pages)`` started at 0 and wasted
        the first request on the nonexistent /articles/page/0/ — listing
        pages are 1-based.
        """
        for page in range(1, self.max_pages + 1):
            yield scrapy.Request(
                'http://artofmanliness.com/articles/page/%d/' % page,
                callback=self.parse,
            )

    def parse(self, response):
        """Follow the link of every article teaser on a listing page."""
        # AOM has a list of all articles in pages of about 189
        for article in response.xpath('//article[contains(@class, "aom-article-simple")]'):
            url = article.xpath('.//a/@href').extract()
            print('article url:', url)
            if url:  # skip teasers that carry no link
                yield Request(url=url[0], callback=self.parse_article)

    def parse_article(self, response):
        """Yield one item dict with Title/Category/Date/URL for an article page."""
        # Selectors match on itemprop/class attributes, which is what the
        # page's HTML actually uses (the question's @id selectors matched nothing).
        #title = response.xpath('//h1[@class="post-title entry-title"]/text()').extract()
        title = response.xpath('//h1[@itemprop="headline"]/text()').extract()
        category = response.xpath('//p[@class="in-category"]//a/text()').extract()
        #date = response.xpath('//p[@class="single-date"]//span[2]/text()').extract()
        date = response.xpath('//span[@itemprop="datePublished"]/text()').extract()
        yield {
            'Title': title,
            'Category': category,
            'Date': date,
            'URL': response.url
        }
# Standalone runner: lets `python script.py` crawl without a Scrapy project.
from scrapy.crawler import CrawlerProcess

crawler_settings = {
    'USER_AGENT': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36",
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
}

process = CrawlerProcess(crawler_settings)
process.crawl(ArticlesSpider)
process.start()
Answered By - furas
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.