Issue
I'm trying to write a parser to crawl a site, but something is going wrong. Can you help me find what's wrong? I linked the spider with items.py.
import scrapy
from dyplom.items import DyplomtwoItem
class Dyplom(scrapy.Spider):
    """Spider that visits edimdoma.ru listing pages and scrapes each linked page.

    ``parse`` handles the listing pages in ``start_urls`` and schedules one
    request per card link; ``parse_dir_contents`` turns each fetched page
    into a ``DyplomtwoItem``.
    """

    name = "dyplom"

    # Seed listing page; its query string carries a URL-encoded
    # recipe_cuisine tag value.
    start_urls = ['https://www.edimdoma.ru/retsepty?tags%5Brecipe_cuisine%5D%5B%5D=%D0%B0%D0%BC%D0%B5%D1%80%D0%B8%D0%BA%D0%B0%D0%BD%D1%81%D0%BA%D0%B0%D1%8F+%D0%BA%D1%83%D1%85%D0%BD%D1%8F&with_ingredient=&with_ingredient_condition=and&without_ingredient=&user_ids=&field=&direction=&query=']

    # Listing pages 2 through 5 are appended to the seed list.
    for i in range(2, 6):
        start_urls.append(
            "https://www.edimdoma.ru/retsepty?_=1529256600422"
            "&direction=&field=&page={}"
            "&query=&tags%5Brecipe_cuisine%5D%5B%5D=&user"
            "_ids=&with_ingredient=&without_ingredient=".format(i)
        )

    def parse(self, response):
        """Yield a request for every card link found on a listing page."""
        card_links = response.xpath("//article[contains(@class, 'card')]/a//@href")
        for link in card_links:
            # The extracted href is prefixed with the site origin to build
            # the URL that is requested.
            yield scrapy.Request(
                "https://www.edimdoma.ru" + link.extract(),
                callback=self.parse_dir_contents,
            )

    def parse_dir_contents(self, response):
        """Fill a ``DyplomtwoItem`` from the fetched page and yield it."""
        item = DyplomtwoItem()

        print_hrefs = response.xpath(
            "//div[contains(@class, 'button button_print')]"
            "//a[contains(@class, 'drop-down_item')]/@href"
        ).extract()
        # Indexing raises IndexError when the selector matches nothing.
        item['id'] = print_hrefs[0]

        title_xpath = (
            "//h1[contains(@class, 'recipe-header_name')]"
            "/descendant::text()"
        )
        item['title'] = response.xpath(title_xpath).extract()

        image_xpath = "//div[contains(@class, 'content-media')]/img//@src"
        item['image'] = response.xpath(image_xpath).extract()

        recipe_xpath = (
            "//div[contains(@class, 'content-box_content')]"
            "/div[contains(@class, 'plain-text recipe_step_text')]"
            "/descendant::text()"
        )
        item['recipe'] = response.xpath(recipe_xpath).extract()

        yield item
Solution
I included the class from your items in the scraper so that I could dissect what you did. It's all essentially the same as your items.py.
It turned out that you had a few issues with your selectors, and you weren't selecting all the text. You needed getall() appended to recipe instead of extract().
import scrapy
from scrapy.item import Field
from itemloaders.processors import TakeFirst
from itemloaders import ItemLoader
#from dyplom.items import DyplomtwoItem
class DyplomItem(scrapy.Item):
    """Item holding the fields scraped from a single recipe page."""

    # These three fields declare TakeFirst as their output processor, so the
    # loaded item carries a single value for each of them.
    id = Field(output_processor=TakeFirst())
    title = Field(output_processor=TakeFirst())
    image = Field(output_processor=TakeFirst())

    # No output processor is declared here; the loaded item keeps every
    # collected text fragment as a list.
    recipe = Field()
class Dyplom(scrapy.Spider):
    """Spider that visits edimdoma.ru listing pages and scrapes each linked page.

    ``parse`` handles the listing pages in ``start_urls`` and schedules one
    request per card link; ``parse_dir_contents`` loads a ``DyplomItem``
    from each fetched page through an ``ItemLoader``.
    """

    name = "dyplom"

    # Seed listing page; its query string carries a URL-encoded
    # recipe_cuisine tag value.
    start_urls = ['https://www.edimdoma.ru/retsepty?tags%5Brecipe_cuisine%5D%5B%5D=%D0%B0%D0%BC%D0%B5%D1%80%D0%B8%D0%BA%D0%B0%D0%BD%D1%81%D0%BA%D0%B0%D1%8F+%D0%BA%D1%83%D1%85%D0%BD%D1%8F&with_ingredient=&with_ingredient_condition=and&without_ingredient=&user_ids=&field=&direction=&query=']

    # Listing pages 2 through 5 are appended to the seed list.
    # NOTE(review): these URLs leave the recipe_cuisine tag empty, unlike the
    # first URL -- confirm that dropping the filter is intended.
    for i in range(2, 6):
        start_urls.append("https://www.edimdoma.ru/retsepty?_=1529256600422"
                          "&direction=&field=&page=" + str(i) +
                          "&query=&tags%5Brecipe_cuisine%5D%5B%5D=&user"
                          "_ids=&with_ingredient=&without_ingredient=")

    def parse(self, response):
        """Yield a request for every card link found on a listing page.

        Args:
            response: the downloaded listing page.

        Yields:
            ``scrapy.Request`` objects whose callback is
            ``parse_dir_contents``.
        """
        hrefs = response.xpath("//article[contains(@class, 'card')]/a//@href").getall()
        for href in hrefs:
            # urljoin resolves the href against the page URL, so relative
            # and absolute links both produce a valid request URL (plain
            # string concatenation would corrupt an absolute href).
            yield scrapy.Request(response.urljoin(href), callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        """Load a ``DyplomItem`` from the fetched page and yield it.

        Args:
            response: the downloaded page reached from a listing card.

        Yields:
            The item produced by the loader.
        """
        loader = ItemLoader(DyplomItem())
        loader.add_value('id', response.xpath("((//div[contains(@class, 'button button_print')])[1]//a)[1]/@href").get())
        loader.add_value('title', response.xpath("//div[@class='content-box']//h1//text()").get())
        loader.add_value('image', response.xpath("(//div[contains(@class, 'content-media')]//img/@src)[1]").get())
        # add_value accepts an iterable, so every text fragment is collected
        # in one call instead of one call per fragment.
        loader.add_value(
            'recipe',
            response.xpath("//div[contains(@class, 'plain-text recipe_step_text')]/descendant::text()").getall(),
        )
        yield loader.load_item()
Output:
{'id': '/retsepty/146847-skrembl-s-bekonom/print?wi=true',
'image': 'https://e3.edimdoma.ru/data/recipes/0014/6847/146847-ed4_wide.jpg?1631992625',
'recipe': ['Бекон нарежьте кубиком. Можно взять и сырокопченый, и '
'варенокопченый, и свежий бекон.',
'В сковороде растопите сливочное масло.'],
'title': 'Скрэмбл с беконом'}
Answered By - Working dollar
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.