Issue
I am trying to make a CSV file to upload to a Shopify store. According to Shopify, you must do the following to add multiple images when importing:
Insert new rows (one per picture).
Copy + paste the "handle".
Copy + paste the image URLs.
Thus, the first image goes in the first row, and all subsequent images go in rows below. The example CSV is located here: https://help.shopify.com/csv/product_template.csv
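For reference, a minimal sketch of that layout using Python's csv module might look like the following (the handle, title, image URLs, and trimmed-down column set are made up purely for illustration):

import csv

# Purely illustrative handle, title and image URLs.
handle = "malabar-sofa"
images = [
    "https://example.com/img-1.jpg",
    "https://example.com/img-2.jpg",
    "https://example.com/img-3.jpg",
]

with open("shopify_example.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Handle", "Title", "Image Src"])
    # The first image shares a row with the product data...
    writer.writerow([handle, "Malabar Sofa", images[0]])
    # ...and every additional image goes in its own row, repeating the handle.
    for img in images[1:]:
        writer.writerow([handle, None, img])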
I would like to program something that loops through an array (like the one built in the code below, except significantly longer) and converts it to a CSV, putting every photo except the first into its own new row. Here is my attempted code:
import scrapy
from scrapy.crawler import CrawlerProcess
import csv
class SweetPeaAndWillowSpider(scrapy.Spider):
    name = "sweetpea_and_willow"
    custom_settings = {
        # "FEED_FORMAT": "csv",
        # "FEED_URI": "malabar_furniture.csv",
        "LOG_FILE": "malabar_furniture_shopify.log",
    }
    data = []
    headers = {
        "authority": "www.sweetpeaandwillow.com",
        "cache-control": "max-age=0",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Yandex";v="22"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.141 YaBrowser/22.3.3.886 (beta) Yowser/2.5 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "sec-fetch-dest": "document",
        "accept-language": "en,ru;q=0.9",
    }
    cookies = {
        "amzn-checkout-session": "%7B%7D",
        "_fbp": "fb.1.1652394481944.1343184112",
        "_pin_unauth": "dWlkPU56VmhNak5rTUdVdE1EVmhaQzAwTkdabExXRm1PREF0TnpOak9XRXdOek5rTjJFeg",
        "_ga": "GA1.2.752968178.1652394485",
        "SPSI": "4eea709914a47dc1f5575f79dc373b51",
        "SPSE": "oc1iOVbm463lrWtCnix8S1Zlf9aGvPeKg7TG7d/WQXvAZjkksosjO/BSl80SLUWb/O8aqo3+lQSH9B1gMRWVdQ==",
        "PHPSESSID": "n6mfpugp82troila6hfib78q3k",
        "UTGv2": "h483379466221b95c6e78e9eb01940db0f64",
        "_hjSessionUser_2692700": "eyJpZCI6ImQ0MDU3M2YzLWM0YjItNTJjMS04YzNiLTM4NzcyMWI5MGY0MyIsImNyZWF0ZWQiOjE2NTIzOTQ0ODI4MTAsImV4aXN0aW5nIjp0cnVlfQ==",
        "_hjIncludedInSessionSample": "0",
        "_hjSession_2692700": "eyJpZCI6ImExOWI0YjI5LTcxODYtNGU5Ny05Y2UwLTVjYmFmODQ0MWZjYiIsImNyZWF0ZWQiOjE2NTI1OTk3NDU3MTAsImluU2FtcGxlIjpmYWxzZX0=",
        "_hjAbsoluteSessionInProgress": "0",
        "form_key": "LCm4cy48SHYhBX3C",
        "_gid": "GA1.2.1948251329.1652599747",
        "_gat": "1",
        "mage-cache-storage": "%7B%7D",
        "mage-cache-storage-section-invalidation": "%7B%7D",
        "mage-cache-sessid": "true",
        "recently_viewed_product": "%7B%7D",
        "recently_viewed_product_previous": "%7B%7D",
        "recently_compared_product": "%7B%7D",
        "recently_compared_product_previous": "%7B%7D",
        "product_data_storage": "%7B%7D",
        "section_data_ids": "%7B%22cart%22%3A1652599747%7D",
        "newsletter-popup-form": "declined",
        "spcsrf": "ef84c17476941fe30a45db5a0a4b8686",
        "sp_lit": "JHxME1OUKp+83P5XseqYpg==",
        "PRLST": "AH",
        "adOtr": "7ae049U19a4",
    }
    def start_requests(self):
        yield scrapy.Request(
            "https://www.sweetpeaandwillow.com/brands/emotional-brands/malabar?p=1",
            headers=self.headers,
            cookies=self.cookies,
            callback=self.parse_urls,
        )

    def parse_urls(self, response):
        url_list = response.css("div.item.product-item")
        for link in url_list:
            url = link.css("a::attr(href)").get()
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_details,
            )

    def parse_details(self, response):
        table = response.css("table.data.table.additional-attributes")
        for tr in table.css("tbody"):
            row = tr.css("tr")
            color = row[0].css("td::text").get()
            dimension = row[1].css("td::text").get()
            material = row[2].css("td::text").get()
            self.data.append(
                {
                    "Handle": response.css("h1.page-title ::text").get().lower(),
                    "Title": response.css("h1.page-title ::text").get(),
                    "Description": response.css(
                        "div#description_product_show > p::text"
                    ).get(),
                    "Price": response.css("div.original-pricing-wrapper")
                    .css("span.price ::text")
                    .getall()[28],
                    "Delivery": response.css("p.availability-message > span::text").get(),
                    "Color": color,
                    "Dimensions": dimension,
                    "Material": material,
                    "Image_Src": response.css("div.MagicSlideshow")
                    .css("a img::attr(src)")
                    .getall(),
                }
            )
        # print(self.data)
        f = csv.writer(open("malabar_furniture_shopify.csv", "w", newline=""))
        f.writerow(
            [
                "Handle",
                "Title",
                "Description",
                "Price",
                "Delivery",
                "Color",
                "Dimensions",
                "Material",
                "Image_Src",
            ]
        )
        for d in self.data:
            images = d["Image_Src"]
            f.writerow(
                [
                    d["Handle"],
                    d["Title"],
                    d["Description"],
                    d["Price"],
                    d["Delivery"],
                    d["Color"],
                    d["Dimensions"],
                    d["Material"],
                    images.pop(0) if images else None,
                ]
            )
            while images:
                f.writerow(
                    [None, None, None, None, None, None, None, None, images.pop(0)]
                )


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(SweetPeaAndWillowSpider)
    process.start()
Update: I tried opening the file at the start and defining the headers there as well, but it made no difference. I also tried appending to the file, but that creates duplicate entries with duplicate headers.
I am getting Image_Src links for only one product, the last one. Does anyone know how to fix this? Thanks.
Solution
You are creating and writing "malabar_furniture_shopify.csv" for each response. The result is that you will only ever see the final entry, as all other entries will be overwritten.
One possible workaround would be to append your results:
with open("malabar_furniture_shopify.csv", "a", newline="") as csvfile:
You would then need a flag to ensure the header is only written for your first entry. newline=""
is used to ensure you don't see extra blank rows in the output.
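A rough sketch of that workaround might look like the following, assuming a hypothetical append_rows helper and a module-level header_written flag (neither exists in the original spider):

import csv

HEADER = [
    "Handle", "Title", "Description", "Price", "Delivery",
    "Color", "Dimensions", "Material", "Image_Src",
]
header_written = False  # hypothetical flag: has the header row been emitted yet?

def append_rows(rows):
    """Append rows for one product, writing the header only on the first call."""
    global header_written
    with open("malabar_furniture_shopify.csv", "a", newline="") as csvfile:
        writer = csv.writer(csvfile)
        if not header_written:
            writer.writerow(HEADER)
            header_written = True
        writer.writerows(rows)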
A better approach would be to open the file at the start and write the header once. Then use the same file handle to write each row. At the end, ensure the file is closed.
Try the following:
import scrapy
from scrapy.crawler import CrawlerProcess
import csv
class SweetPeaAndWillowSpider(scrapy.Spider):
    name = "sweetpea_and_willow"
    custom_settings = {
        # "FEED_FORMAT": "csv",
        # "FEED_URI": "malabar_furniture.csv",
        "LOG_FILE": "malabar_furniture_shopify.log",
    }
    headers = {
        "authority": "www.sweetpeaandwillow.com",
        "cache-control": "max-age=0",
        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Yandex";v="22"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Linux"',
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.141 YaBrowser/22.3.3.886 (beta) Yowser/2.5 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "sec-fetch-dest": "document",
        "accept-language": "en,ru;q=0.9",
    }
    cookies = {
        "amzn-checkout-session": "%7B%7D",
        "_fbp": "fb.1.1652394481944.1343184112",
        "_pin_unauth": "dWlkPU56VmhNak5rTUdVdE1EVmhaQzAwTkdabExXRm1PREF0TnpOak9XRXdOek5rTjJFeg",
        "_ga": "GA1.2.752968178.1652394485",
        "SPSI": "4eea709914a47dc1f5575f79dc373b51",
        "SPSE": "oc1iOVbm463lrWtCnix8S1Zlf9aGvPeKg7TG7d/WQXvAZjkksosjO/BSl80SLUWb/O8aqo3+lQSH9B1gMRWVdQ==",
        "PHPSESSID": "n6mfpugp82troila6hfib78q3k",
        "UTGv2": "h483379466221b95c6e78e9eb01940db0f64",
        "_hjSessionUser_2692700": "eyJpZCI6ImQ0MDU3M2YzLWM0YjItNTJjMS04YzNiLTM4NzcyMWI5MGY0MyIsImNyZWF0ZWQiOjE2NTIzOTQ0ODI4MTAsImV4aXN0aW5nIjp0cnVlfQ==",
        "_hjIncludedInSessionSample": "0",
        "_hjSession_2692700": "eyJpZCI6ImExOWI0YjI5LTcxODYtNGU5Ny05Y2UwLTVjYmFmODQ0MWZjYiIsImNyZWF0ZWQiOjE2NTI1OTk3NDU3MTAsImluU2FtcGxlIjpmYWxzZX0=",
        "_hjAbsoluteSessionInProgress": "0",
        "form_key": "LCm4cy48SHYhBX3C",
        "_gid": "GA1.2.1948251329.1652599747",
        "_gat": "1",
        "mage-cache-storage": "%7B%7D",
        "mage-cache-storage-section-invalidation": "%7B%7D",
        "mage-cache-sessid": "true",
        "recently_viewed_product": "%7B%7D",
        "recently_viewed_product_previous": "%7B%7D",
        "recently_compared_product": "%7B%7D",
        "recently_compared_product_previous": "%7B%7D",
        "product_data_storage": "%7B%7D",
        "section_data_ids": "%7B%22cart%22%3A1652599747%7D",
        "newsletter-popup-form": "declined",
        "spcsrf": "ef84c17476941fe30a45db5a0a4b8686",
        "sp_lit": "JHxME1OUKp+83P5XseqYpg==",
        "PRLST": "AH",
        "adOtr": "7ae049U19a4",
    }
    def start_requests(self):
        # Open the output file once and write the header a single time.
        self.f_output = open("malabar_furniture_shopify.csv", "w", newline="")
        self.csv_output = csv.writer(self.f_output)
        self.csv_output.writerow(
            [
                "Handle",
                "Title",
                "Description",
                "Price",
                "Delivery",
                "Color",
                "Dimensions",
                "Material",
                "Image_Src",
            ]
        )
        yield scrapy.Request(
            "https://www.sweetpeaandwillow.com/brands/emotional-brands/malabar?p=1",
            headers=self.headers,
            cookies=self.cookies,
            callback=self.parse_urls,
        )

    def parse_urls(self, response):
        url_list = response.css("div.item.product-item")
        for link in url_list:
            url = link.css("a::attr(href)").get()
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                cookies=self.cookies,
                callback=self.parse_details,
            )
    def parse_details(self, response):
        # Collect the rows for this product only (a local list, not a
        # class-level list shared across responses).
        data = []
        table = response.css("table.data.table.additional-attributes")
        for tr in table.css("tbody"):
            row = tr.css("tr")
            color = row[0].css("td::text").get()
            dimension = row[1].css("td::text").get()
            material = row[2].css("td::text").get()
            data.append(
                {
                    "Handle": response.css("h1.page-title ::text").get().lower(),
                    "Title": response.css("h1.page-title ::text").get(),
                    "Description": response.css(
                        "div#description_product_show > p::text"
                    ).get(),
                    "Price": response.css("div.original-pricing-wrapper")
                    .css("span.price ::text")
                    .getall()[28],
                    "Delivery": response.css("p.availability-message > span::text").get(),
                    "Color": color,
                    "Dimensions": dimension,
                    "Material": material,
                    "Image_Src": response.css("div.MagicSlideshow")
                    .css("a img::attr(src)")
                    .getall(),
                }
            )
        for d in data:
            images = d["Image_Src"]
            # The first image shares a row with the product details; every
            # remaining image gets a row of its own below it.
            self.csv_output.writerow(
                [
                    d["Handle"],
                    d["Title"],
                    d["Description"],
                    d["Price"],
                    d["Delivery"],
                    d["Color"],
                    d["Dimensions"],
                    d["Material"],
                    images.pop(0) if images else None,
                ]
            )
            while images:
                self.csv_output.writerow(
                    [None, None, None, None, None, None, None, None, images.pop(0)]
                )

    def closed(self, spider):
        # Close the file once the spider has finished crawling.
        self.f_output.close()


if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(SweetPeaAndWillowSpider)
    process.start()
The reason for the duplicates is that you were always appending to a global data list.
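A stripped-down illustration of that effect, outside Scrapy and with made-up product names, shows how re-writing a shared list on every call duplicates the earlier entries:

import csv

data = []  # module-level list, analogous to the spider's class-level `data`

def handle_response(product, path="demo.csv"):
    """Mimic the original parse_details: accumulate into a shared list,
    then dump the whole list to the CSV on every call."""
    data.append(product)
    with open(path, "a", newline="") as f:
        csv.writer(f).writerows([[p] for p in data])

handle_response("sofa")      # file now holds: sofa
handle_response("armchair")  # file now holds: sofa, sofa, armchair -> "sofa" duplicated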
Answered By - Martin Evans