I'm trying to write a parser to crawl a site, but something is going wrong. Can you help me find what's wrong? I linked the spider with my item class as follows:
import scrapy
from dyplom.items import DyplomtwoItem
class Dyplom(scrapy.Spider):
    """Spider that visits the pages linked from listing cards and scrapes each one.

    This is the code as posted in the question: the selectors are left
    exactly as the asker wrote them; only the layout has been restored.
    """

    name = "dyplom"
    # NOTE(review): the site URLs were stripped when this post was extracted.
    # The empty strings below are placeholders for the listing URL and must
    # be restored before the spider can run.
    start_urls = ['']
    # Presumably queues pages 2-5 of the listing next to the first page;
    # the call around the surviving fragment was lost — TODO confirm.
    for i in range(2, 6):
        start_urls.append("" + "&direction=&field=&page=" + str(i) + "")

    def parse(self, response):
        """Yield one request per card link found on the listing page."""
        for href in response.xpath("//article[contains(@class, 'card')]/a//@href"):
            # add the scheme, eg http://
            url = "" + href.extract()
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Fill a DyplomtwoItem from a detail page and yield it."""
        item = DyplomtwoItem()
        # [0] raises IndexError when the selector matches nothing.
        item['id'] = response.xpath(
            "//div[contains(@class, 'button button_print')]"
            "//a[contains(@class, 'drop-down_item')]/@href").extract()[0]
        # NOTE(review): the tail of this expression was lost in the post;
        # `.extract()` is assumed, by analogy with the other fields.
        item['title'] = response.xpath(
            "//h1[contains(@class, 'recipe-header_name')]").extract()
        item['image'] = response.xpath(
            "//div[contains(@class, 'content-media')]/img//@src").extract()
        item['recipe'] = response.xpath(
            "//div[contains(@class, 'content-box_content')]/div[contains"
            "(@class, 'plain-text recipe_step_text')]/descendant::text()").extract()
        yield item
I included the class from your items in the scraper so that I could dissect what you did. It's all essentially the same as yours.
It turned out that you had a few issues with your selectors, and you weren't selecting all the text. You needed getall()
appended to recipe instead of extract().
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from itemloaders import ItemLoader
#from dyplom.items import DyplomtwoItem
class DyplomItem(scrapy.Item):
    """Item holding the fields scraped from a single recipe page."""

    # These three fields use TakeFirst as their output processor, so the
    # loaded item carries one value for each instead of a list.
    id = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    image = Field(output_processor=TakeFirst())
    # No output processor here: every collected text fragment is kept.
    recipe = Field()
class Dyplom(scrapy.Spider):
    """Spider that follows card links from listing pages and loads one item per page."""

    name = "dyplom"
    # NOTE(review): the site URLs were stripped when this post was extracted.
    # The empty strings below are placeholders for the listing URL and must
    # be restored before the spider can run.
    start_urls = ['']
    # Presumably queues pages 2-5 of the listing next to the first page;
    # the call around the surviving fragment was lost — TODO confirm.
    for i in range(2, 6):
        start_urls.append("" + "&direction=&field=&page=" + str(i) + "")

    def parse(self, response):
        """Yield one request per card link found on the listing page."""
        for href in response.xpath("//article[contains(@class, 'card')]/a//@href"):
            # response.follow resolves relative hrefs against the current
            # page URL, so no scheme/host prefix has to be glued on by hand.
            yield response.follow(href, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Load a DyplomItem from a detail page and yield it once."""
        loaders = ItemLoader(DyplomItem())
        loaders.add_value(
            'id',
            response.xpath(
                "((//div[contains(@class, 'button button_print')])[1]//a)[1]/@href"
            ).get(),
        )
        loaders.add_value(
            'title',
            response.xpath("//div[@class='content-box']//h1//text()").get(),
        )
        loaders.add_value(
            'image',
            response.xpath(
                "(//div[contains(@class, 'content-media')]//img/@src)[1]"
            ).get(),
        )
        recipe_texts = response.xpath(
            "//div[contains(@class, 'plain-text recipe_step_text')]/descendant::text()"
        ).getall()
        for text_stuff in recipe_texts:
            # Collect every text fragment of the recipe steps; this line was
            # missing from the posted loop, leaving 'recipe' unset.
            loaders.add_value('recipe', text_stuff)
        # Yield after the loop so each page produces exactly one item.
        yield loaders.load_item()
{'id': '/retsepty/146847-skrembl-s-bekonom/print?wi=true',
'image': '',
'recipe': ['Бекон нарежьте кубиком. Можно взять и сырокопченый, и '
'варенокопченый, и свежий бекон.',
'В сковороде растопите сливочное масло.'],
'title': 'Скрэмбл с беконом'}
Answered By - Working dollar
Post a Comment
Note: Only a member of this blog may post a comment.