Issue
I am trying to combine the item with its similarIdeas field, which is a list. Right now I am using requests to fetch that data, but I need to yield those requests by chaining them instead and yield one single item, and I am not sure how to do it correctly. Here is my code using the requests module:
import scrapy
from scrapy.selector import Selector
import json
import math
import requests
class HouzzScraper(scrapy.Spider):
name = "houzz"
# custom settings
custom_settings = {
"LOG_FILE": "houzz_spider.log",
"IMAGES_STORE": "houzz_images",
"FEEDS": {
"houzz.json": {
"format": "json",
}
},
"ITEM_PIPELINES": {
"houzz_crawler.pipelines.HouzzImagePipeline": 1,
},
}
headers = {
"authority": "www.houzz.com",
"accept": "*/*",
"accept-language": "en,ru;q=0.9",
"content-type": "application/x-www-form-urlencoded; charset=UTF-8",
"origin": "https://www.houzz.com",
"referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
"rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
"sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Linux"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
"x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
"x-hz-request": "true",
"x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
"x-ol-exp-name": "Photo - View",
"x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
"x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-ol-product": "Houzz",
"x-ol-product-variant": "Houzz US",
"x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-requested-with": "XMLHttpRequest",
}
cookies = {
"v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
"vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
"_gcl_au": "1.1.17413922.1683311086",
"crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
"_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
"_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
"g_state": '{"i_p":1684144918349,"i_l":3}',
"browseResultSetGridWidth": "554",
"_gid": "GA1.2.1176067560.1683652076",
"ln_or": "eyIzODE1NzE2IjoiZCJ9",
"_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
"jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
"documentWidth": "1318",
"_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
"_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
"_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
"IR_gbd": "houzz.com",
"IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
"_ga": "GA1.2.1658927820.1683311086",
"_dc_gtm_UA-3519678-1": "1",
"_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
"hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
}
base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"
similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"
def start_requests(self):
yield scrapy.Request(
url=self.base_url, headers=self.headers, callback=self.parse_ideas
)
def parse_ideas(self, response):
ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()
total_photos = int(
response.css("span.hz-top-pagination__text ::text")
.extract()[4]
.replace(",", "")
)
photos_per_page = int(
response.css("span.hz-top-pagination__text ::text").extract()[2]
)
for idea in ideas:
yield scrapy.Request(
url=idea, headers=self.headers, callback=self.parse_project_url
)
def parse_project_url(self, response):
data = response.css('script[id="hz-ctx"] ::text').get()
json_data = json.loads(data)
space_id = json_data["data"]["pageContentData"]["spaceId"]
space = json_data["data"]["stores"]["data"]["SpaceStore"]["data"][space_id]
project_id = space["projectId"]
space_url = space["url"]
raw_project_url = (
space_url.split("~")[0].replace("phvw", "pj").replace("vp", "vj")
)
project_url = raw_project_url + "~" + str(project_id)
yield scrapy.Request(
url=project_url, headers=self.headers, callback=self.parse_project_idea
)
def parse_project_idea(self, response):
idea_board = response.css(
"div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
).extract()
for idea_link in idea_board:
yield scrapy.Request(
url=idea_link,
dont_filter=True,
headers=self.headers,
callback=self.parse_idea_details,
)
def parse_idea_details(self, response):
item = {}
item["ideaUrl"] = response.url
item["Title"] = response.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get()
subtitle = response.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get()
item["subTitle"] = subtitle
item["spaceDescription"] = response.css(
"div.hz-view-photo__space-info__description.text-m ::text"
).get()
item["uploadedBy"] = response.css("div.vph-owner-info__details ::text").get()
item["Tags"] = [
{"tag": t}
for t in response.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
]
item["starRating"] = len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
)
item["numberOfReviews"] = response.css(
"span.hz-star-rate__review-string::text"
).get()
item["image_urls"] = response.css(
"div.view-photo-image-pane > img::attr(src)"
).extract()
item["similarIdeas"] = []
spaceId = response.url.split("~")[-1]
data = {
"spaceId": spaceId,
"fromItem": "0",
"itemsPerPage": "10",
"contentDescriptor": '{"t":1,"et":12,"id":6258114}',
}
resp = requests.post(
self.similar_ideas_api_url,
cookies=self.cookies,
headers=self.headers,
data=data,
)
data = resp.json()["spaceData"]["spaces"]
space_keys = list(data.keys())
space_urls = [data[key]["url"] for key in space_keys]
for s_url in space_urls:
space_response = requests.get(url=s_url, headers=self.headers)
similar_space = Selector(text=space_response.text)
item["similarIdeas"].append(
{
"ideaUrl": space_response.url,
"Title": similar_space.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get(),
"SubTitle": similar_space.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get(),
"uploadedBy": similar_space.css(
"div.vph-owner-info__details ::text"
).get(),
"Tags": [
{"tag": t}
for t in similar_space.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
],
"starRating": len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
),
"numberOfReviews": response.css(
"span.hz-star-rate__review-string::text"
).get(),
"image_urls": similar_space.css(
"div.view-photo-image-pane > img::attr(src)"
).extract(),
}
)
yield item
I tried to join the requests by adding the item to meta in the requests, but it didn't work as expected: it collects only one entry in similarIdeas when there should be at least 24 or 25, and it reports a lot of duplicates despite the fact that I haven't set dont_filter=True. Here is my code that tries chaining the requests:
import scrapy
from scrapy.selector import Selector
import json
import math
import requests
class HouzzSimilar(scrapy.Spider):
name = "houzz_s"
# custom settings
custom_settings = {
"LOG_FILE": "houzz_spider.log",
"IMAGES_STORE": "houzz_images",
"FEEDS": {
"houzz.json": {
"format": "json",
}
},
"ITEM_PIPELINES": {
"houzz_crawler.pipelines.HouzzImagePipeline": 1,
},
}
headers = {
"authority": "www.houzz.com",
"accept": "*/*",
"accept-language": "en,ru;q=0.9",
"content-type": "application/x-www-form-urlencoded; charset=UTF-8",
"origin": "https://www.houzz.com",
"referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
"rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
"sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Linux"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
"x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
"x-hz-request": "true",
"x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
"x-ol-exp-name": "Photo - View",
"x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
"x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-ol-product": "Houzz",
"x-ol-product-variant": "Houzz US",
"x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-requested-with": "XMLHttpRequest",
}
cookies = {
"v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
"vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
"_gcl_au": "1.1.17413922.1683311086",
"crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
"_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
"_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
"g_state": '{"i_p":1684144918349,"i_l":3}',
"browseResultSetGridWidth": "554",
"_gid": "GA1.2.1176067560.1683652076",
"ln_or": "eyIzODE1NzE2IjoiZCJ9",
"_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
"jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
"documentWidth": "1318",
"_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
"_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
"_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
"IR_gbd": "houzz.com",
"IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
"_ga": "GA1.2.1658927820.1683311086",
"_dc_gtm_UA-3519678-1": "1",
"_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
"hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
}
base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"
similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"
def start_requests(self):
yield scrapy.Request(
url=self.base_url, headers=self.headers, callback=self.parse_ideas
)
def parse_ideas(self, response):
ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()
total_photos = int(
response.css("span.hz-top-pagination__text ::text")
.extract()[4]
.replace(",", "")
)
photos_per_page = int(
response.css("span.hz-top-pagination__text ::text").extract()[2]
)
for idea in ideas:
yield scrapy.Request(
url=idea, headers=self.headers, callback=self.parse_project_url
)
def parse_project_url(self, response):
data = response.css('script[id="hz-ctx"] ::text').get()
json_data = json.loads(data)
space_id = json_data["data"]["pageContentData"]["spaceId"]
space = json_data["data"]["stores"]["data"]["SpaceStore"]["data"][space_id]
project_id = space["projectId"]
space_url = space["url"]
raw_project_url = (
space_url.split("~")[0].replace("phvw", "pj").replace("vp", "vj")
)
project_url = raw_project_url + "~" + str(project_id)
yield scrapy.Request(
url=project_url, headers=self.headers, callback=self.parse_project_idea
)
def parse_project_idea(self, response):
idea_board = response.css(
"div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
).extract()
for idea_link in idea_board:
yield scrapy.Request(
url=idea_link,
headers=self.headers,
callback=self.parse_idea_details,
)
def parse_idea_details(self, response):
item = {}
item["ideaUrl"] = response.url
item["Title"] = response.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get()
subtitle = response.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get()
item["subTitle"] = subtitle
item["spaceDescription"] = response.css(
"div.hz-view-photo__space-info__description.text-m ::text"
).get()
item["uploadedBy"] = response.css("div.vph-owner-info__details ::text").get()
item["Tags"] = [
{"tag": t}
for t in response.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
]
item["starRating"] = len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
)
item["numberOfReviews"] = response.css(
"span.hz-star-rate__review-string::text"
).get()
item["image_urls"] = response.css(
"div.view-photo-image-pane > img::attr(src)"
).extract()
item["similarIdeas"] = []
spaceId = response.url.split("~")[-1]
body = f"spaceId={spaceId}&fromItem=0&itemsPerPage=10&contentDescriptor=%7B%22t%22%3A1%2C%22et%22%3A3%2C%22id%22%3A160668148%7D"
yield scrapy.Request(
url=self.similar_ideas_api_url,
method="POST",
cookies=self.cookies,
headers=self.headers,
body=body,
meta={"item": item},
callback=self.get_similar_ideas_urls,
)
def get_similar_ideas_urls(self, response):
data = response.json()["spaceData"]["spaces"]
space_keys = list(data.keys())
space_urls = [data[key]["url"] for key in space_keys]
item = response.meta.get("item")
yield scrapy.Request(
url=space_urls[0],
headers=self.headers,
meta={"item": item, "space_urls": space_urls[1:]},
callback=self.parse_similar_ideas,
)
def parse_similar_ideas(self, response):
item = response.meta.get("item")
space_urls = response.meta.get("space_urls")
item["similarIdeas"].append(
{
"ideaUrl": response.url,
"Title": response.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get(),
"subTitle": response.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get(),
"spaceDescription": response.css(
"div.hz-view-photo__space-info__description.text-m ::text"
).get(),
"uploadedBy": response.css("div.vph-owner-info__details ::text").get(),
"Tags": [
{"tag": t}
for t in response.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
],
"starRating": len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
),
"numberOfReviews": response.css(
"span.hz-star-rate__review-string::text"
).get(),
"image_urls": response.css(
"div.view-photo-image-pane > img::attr(src)"
).extract(),
}
)
if len(space_urls) > 0:
yield scrapy.Request(
url=space_urls.pop(0),
headers=self.headers,
meta={"item": item, "space_urls": space_urls[1:]},
callback=self.parse_similar_ideas,
)
yield item
My expected output: https://jsoneditoronline.org/#left=cloud.6a9b829e90014b55975756556c3d0f2d
Solution
Your second example is really close. There are just a couple of things I would recommend and one thing that is missing:

You should use cb_kwargs to pass data between callback methods instead of using the meta dict. Both will work, but cb_kwargs is what Scrapy recommends in this situation, and I believe it makes the code more readable and requires fewer lines.

When running your second example I ran into quite a few situations where the duplicates filter was triggered. When chaining requests like this, having a single request filtered will likely mean the item is never yielded. To avoid this, make your space_urls variable a set instead of a list so you know each url is unique, and also add the dont_filter argument to the requests in your parse_similar_ideas method.

The last and most important point is that you are yielding an item at the end of every call to parse_similar_ideas, which means you yield the same item once for every single url in the space_urls list, and the only thing that changes is the number of entries in the similarIdeas field. What you actually want is to yield the item only once there are no more urls left in space_urls, so the item is yielded exactly once at the very end of the chain.
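To make the chaining pattern itself easier to see in isolation, here is a minimal sketch (the spider name, the example.com URLs, and the trimmed-down item fields are placeholders, not Houzz-specific code): the partially built item travels through cb_kwargs, each callback consumes one url from the set, and the item is yielded only once the set is empty.

import scrapy


class ChainedItemSketch(scrapy.Spider):
    # Placeholder spider: the URLs and item fields here are illustrative only.
    name = "chained_item_sketch"
    start_urls = ["https://example.com/"]

    def parse(self, response):
        # Build the main item, then start the chain of follow-up requests.
        item = {"ideaUrl": response.url, "similarIdeas": []}
        # A set guarantees each url is unique before the chain starts.
        related_urls = {"https://example.com/a", "https://example.com/b"}
        yield scrapy.Request(
            url=related_urls.pop(),
            cb_kwargs={"item": item, "related_urls": related_urls},
            dont_filter=True,
            callback=self.parse_related,
        )

    def parse_related(self, response, item=None, related_urls=None):
        # Append this page's data to the single shared item.
        item["similarIdeas"].append({"ideaUrl": response.url})
        if related_urls:
            # More urls left: keep the chain going and carry the item forward.
            yield scrapy.Request(
                url=related_urls.pop(),
                cb_kwargs={"item": item, "related_urls": related_urls},
                dont_filter=True,
                callback=self.parse_related,
            )
        else:
            # Chain exhausted: yield the completed item exactly once.
            yield item

The dont_filter=True flag mirrors the second point above, so the duplicates filter cannot silently break the chain and swallow the item.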
The full spider below implements the above points and produces the output you are expecting. You will want to add your custom_settings back to it, though.
import scrapy
import json
class HouzzSimilar(scrapy.Spider):
name = "houzz"
headers = {
"authority": "www.houzz.com",
"accept": "*/*",
"accept-language": "en,ru;q=0.9",
"content-type": "application/x-www-form-urlencoded; charset=UTF-8",
"origin": "https://www.houzz.com",
"referer": "https://www.houzz.com/photos/columbus-ave-residence-contemporary-bathroom-new-york-phvw-vp~160668148",
"rrid": "70402547-c900-47f7-a913-8e1cbc9aa0c3",
"sec-ch-ua": '"Chromium";v="110", "Not A(Brand";v="24", "YaBrowser";v="23"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Linux"',
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 YaBrowser/23.3.1.906 (beta) Yowser/2.5 Safari/537.36",
"x-csrf-token": "i8B5ykgX-eprPj5yAHSxOng08Pa4qAr2Z0TQ",
"x-hz-request": "true",
"x-ol-exp-id": "clhhdi4wu00003y71rnvty395",
"x-ol-exp-name": "Photo - View",
"x-ol-ext-device-id": "23a3cfb8-7a04-4462-af71-d98689271533",
"x-ol-ext-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-ol-product": "Houzz",
"x-ol-product-variant": "Houzz US",
"x-ol-session-id": "782c0a90-8925-409f-90c1-f47798e0426e",
"x-requested-with": "XMLHttpRequest",
}
cookies = {
"v": "1683311076_f9d9a715-f45b-42dc-bc6d-7da75774a57f_9bda9dd500ca1e5119bbecaba51e53f0",
"vct": "en-US-vxnkSVVkSBzkSVVkCR%2FkSVVk8B%2FkSVVk4R3kSVVk4h3kSVVk",
"_gcl_au": "1.1.17413922.1683311086",
"crossdevicetracking": "915374c0-439c-46a1-bbf2-3a2aaa487e69",
"_pin_unauth": "dWlkPU16Y3dNbVF6T0dNdE1tWTBOaTAwWTJSa0xUazVZakV0TXprek5XWm1ZV014WWprMw",
"_sp_id.c905": "5af74097-a6bb-46e7-8d14-35ff6d738f39.1683317411.2.1683359810.1683317411.13ad94c9-5560-4fbf-963f-b63e32f2124d",
"g_state": '{"i_p":1684144918349,"i_l":3}',
"browseResultSetGridWidth": "554",
"_gid": "GA1.2.1176067560.1683652076",
"ln_or": "eyIzODE1NzE2IjoiZCJ9",
"_csrf": "G_nV-Kaa7rlqgTwnueAXkJtj",
"jdv": "t7WOzUb2vHLZtWVVHSk%2BXJEWN7ua9zR%2FUkXpY9RYDUW00hxMyur5c%2Bzn6M%2BqQADtWOInJpmlQA37Gxp0L267jdj74Iwe",
"documentWidth": "1318",
"_uetsid": "0bf41840ee8c11edac06995ca98afa3c",
"_uetvid": "1e07d960eb7211ed880b7db3cdc86191",
"_derived_epik": "dj0yJnU9NFBDc3RuOExta3NiM2xfaV9WS0RYbVVLRS1lRVpycDEmbj1tVE1RRUtOUjYwYU1Kalp0el9mNTBBJm09OCZ0PUFBQUFBR1JiUmprJnJtPTgmcnQ9QUFBQUFHUmJSamsmc3A9NQ",
"IR_gbd": "houzz.com",
"IR_5454": "1683703358356%7C0%7C1683703358356%7C%7C",
"_ga": "GA1.2.1658927820.1683311086",
"_dc_gtm_UA-3519678-1": "1",
"_ga_PB0RC2CT7B": "GS1.1.1683703353.11.1.1683704001.59.0.0",
"hzd": "70402547-c900-47f7-a913-8e1cbc9aa0c3%3A%3A%3A%3A%3ASeeMoreIdeas",
}
base_url = "https://www.houzz.com/photos/home-design-ideas-phbr0-bp~"
similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"
def start_requests(self):
yield scrapy.Request(
url=self.base_url, headers=self.headers, callback=self.parse_ideas
)
def parse_ideas(self, response):
ideas = response.css("a.hz-photo-card__ratio-box::attr(href)").extract()
total_photos = int(
response.css("span.hz-top-pagination__text ::text")
.extract()[4]
.replace(",", "")
)
photos_per_page = int(
response.css("span.hz-top-pagination__text ::text").extract()[2]
)
for idea in ideas:
yield scrapy.Request(
url=idea, headers=self.headers, callback=self.parse_project_url
)
def parse_project_url(self, response):
data = response.css('script[id="hz-ctx"] ::text').get()
json_data = json.loads(data)
space_id = json_data["data"]["pageContentData"]["spaceId"]
space = json_data["data"]["stores"]["data"]["SpaceStore"]["data"][space_id]
project_id = space["projectId"]
space_url = space["url"]
raw_project_url = (
space_url.split("~")[0].replace("phvw", "pj").replace("vp", "vj")
)
project_url = raw_project_url + "~" + str(project_id)
yield scrapy.Request(
url=project_url, headers=self.headers, callback=self.parse_project_idea
)
def parse_project_idea(self, response):
idea_board = response.css(
"div.hz-prj-container.hz-prj-container__photos.clearfix ::attr(href)"
).extract()
for idea_link in idea_board:
yield scrapy.Request(
url=idea_link,
headers=self.headers,
callback=self.parse_idea_details,
)
def parse_idea_details(self, response):
item = {}
item["ideaUrl"] = response.url
item["Title"] = response.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get()
subtitle = response.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get()
item["subTitle"] = subtitle
item["spaceDescription"] = response.css(
"div.hz-view-photo__space-info__description.text-m ::text"
).get()
item["uploadedBy"] = response.css("div.vph-owner-info__details ::text").get()
item["Tags"] = [
{"tag": t}
for t in response.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
]
item["starRating"] = len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
)
item["numberOfReviews"] = response.css(
"span.hz-star-rate__review-string::text"
).get()
item["image_urls"] = response.css(
"div.view-photo-image-pane > img::attr(src)"
).extract()
item["similarIdeas"] = []
spaceId = response.url.split("~")[-1]
body = f"spaceId={spaceId}&fromItem=0&itemsPerPage=10&contentDescriptor=%7B%22t%22%3A1%2C%22et%22%3A3%2C%22id%22%3A160668148%7D"
yield scrapy.Request(
url=self.similar_ideas_api_url,
method="POST",
cookies=self.cookies,
headers=self.headers,
body=body,
cb_kwargs={"item": item}, # <-- cb_kwargs
callback=self.get_similar_ideas_urls,
)
def get_similar_ideas_urls(self, response, item=None):
data = response.json()["spaceData"]["spaces"]
space_keys = list(data.keys())
space_urls = set([data[key]["url"] for key in space_keys]) # <- set
yield scrapy.Request(
url=space_urls.pop(),
headers=self.headers,
cb_kwargs={"item": item, "space_urls": space_urls},
callback=self.parse_similar_ideas,
)
def parse_similar_ideas(self, response, item=None, space_urls=None):
item["similarIdeas"].append(
{
"ideaUrl": response.url,
"Title": response.css(
"h1.hz-view-photo__space-info__title.text-bold::text"
).get(),
"subTitle": response.css(
"h1.hz-view-photo__space-info__subtitle.text-m::text"
).get(),
"spaceDescription": response.css(
"div.hz-view-photo__space-info__description.text-m ::text"
).get(),
"uploadedBy": response.css("div.vph-owner-info__details ::text").get(),
"Tags": [
{"tag": t}
for t in response.css(
"ul.hz-view-photo__breadcrumb.hz-track-me ::text"
).extract()
],
"starRating": len(
response.css(
"span.icon-font.icon-star.hz-star-rate.hz-star-rate--highlighted.star-icon"
)
),
"numberOfReviews": response.css(
"span.hz-star-rate__review-string::text"
).get(),
"image_urls": response.css(
"div.view-photo-image-pane > img::attr(src)"
).extract(),
}
)
if len(space_urls) > 0:
yield scrapy.Request(
url=space_urls.pop(),
headers=self.headers,
cb_kwargs={"item": item, "space_urls": space_urls},
dont_filter=True, # <--- add this
callback=self.parse_similar_ideas,
)
else: # <--- this was the piece you were missing
yield item
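One optional variation, offered only as a sketch: since the headers already declare application/x-www-form-urlencoded, scrapy.FormRequest can build that POST body from a dict (the same fields your requests-based version sends) instead of hand-writing the encoded body string. The literal spaceId and contentDescriptor values below are copied from the question and would normally come from the page being parsed; this trimmed-down spider also omits the headers, cookies, and cb_kwargs you would keep in the real one.

import scrapy


class SimilarSpacesPostSketch(scrapy.Spider):
    # Minimal sketch of the POST request only; not a drop-in replacement spider.
    name = "similar_spaces_post_sketch"
    similar_ideas_api_url = "https://www.houzz.com/j/getSimilarSpaces"

    def start_requests(self):
        # FormRequest switches to POST and url-encodes formdata automatically.
        yield scrapy.FormRequest(
            url=self.similar_ideas_api_url,
            formdata={
                "spaceId": "160668148",  # placeholder; the real spider takes it from response.url
                "fromItem": "0",
                "itemsPerPage": "10",
                "contentDescriptor": '{"t":1,"et":12,"id":6258114}',
            },
            callback=self.parse_similar_spaces,
        )

    def parse_similar_spaces(self, response):
        # The endpoint returns JSON in the question's code, so parse it the same way.
        spaces = response.json().get("spaceData", {}).get("spaces", {})
        yield {"similarSpaceUrls": [space["url"] for space in spaces.values()]}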
Answered By - Alexander