Issue
I'm trying to scrape product info from Home Depot, such as price, product details, specifications, images, and so on. I was able to scrape all of this information, but I don't know how to scrape the price when a different combination of options is selected on the product, since the price changes based on these options. Is there any way to scrape the price and image for each possible combination of the product's options?
For a clearer explanation of my problem, see that product URL.
You can see that there are some options to the right of the image. Each parameter has multiple options, and selecting a different combination of these options changes the image and the price. If possible, how can I scrape this info?
Note: I'm using selenium and BeautifulSoup
Update:
Here's my code so far for scraping the options part in the product page
def scrape_price(self):
    """Read the product price from ``self.soup`` and store it in ``self.data``.

    Three price layouts are tried in order; the first one present wins:

    1. ``price-format__large price-format__main-price``: the text of the first
       two spans is concatenated; when a third span with non-empty text
       exists it is appended after a ``"."``.
    2. ``price-detailed__left-price-wrapper price-detailed__left-price-row``:
       the text of the span nested inside the ``price-detailed__unit-price``
       span is used as the price.
    3. ``pricingReg``: currency + dollars + ``"."`` + cents, and
       ``self.data['Availability']`` is set to ``"Available"``.

    When none of the three is present, ``self.data['Availability']`` is set to
    ``"Not Available"``.

    Independently of the above, ``"Detailed Price"`` (cleaned text of the
    detailed-price wrapper) and ``"Case Unit Cover"`` are stored when their
    elements exist.
    """
    soup = self.soup

    # Each lookup is done once and reused, instead of once for the test and
    # once more for the value.
    main_div = soup.find(
        "div", attrs={"class": "price-format__large price-format__main-price"}
    )
    detailed_div = soup.find(
        "div",
        attrs={"class": "price-detailed__left-price-wrapper price-detailed__left-price-row"},
    )

    if main_div:
        spans = main_div.find_all("span")
        price = spans[0].text + spans[1].text
        # The third span (cents) is optional and may be empty.
        if len(spans) > 2 and spans[2].text != "":
            price += "." + spans[2].text
        self.data['price'] = price
    elif detailed_div:
        unit_price = detailed_div.find(
            "span", attrs={"class": "price-detailed__unit-price"}
        )
        self.data['price'] = unit_price.find("span").text
    else:
        reg_div = soup.find("div", attrs={"class": "pricingReg"})
        if reg_div:
            curr = reg_div.find("span", attrs={"class": "price__currency"}).text
            dollars = reg_div.find("span", attrs={"class": "price__dollars"}).text
            cents = reg_div.find("span", attrs={"class": "price__cents"}).text
            self.data['price'] = curr + dollars + "." + cents
            # NOTE(review): only this layout records "Available"; the first two
            # layouts leave the key untouched. Kept as in the original --
            # confirm that this is intended.
            self.data['Availability'] = "Available"
        else:
            self.data['Availability'] = "Not Available"

    if detailed_div:
        self.data["Detailed Price"] = cleanhtml(detailed_div.text)

    unit_cover = soup.find("div", attrs={"class": "price-detailed__unit-cover"})
    if unit_cover:
        self.data["Case Unit Cover"] = unit_cover.text
def scrape_images(self):
    """Collect product image URLs from ``self.soup`` into ``self.data["images"]``.

    The media-gallery buttons (``mediagallery__imgblock``) are preferred. When
    the page has none, the thumbnail list
    (``styles__ThumbnailList-sc-10zajq9-5 gyXsdF``) is used instead. If neither
    is present, ``self.data`` is left untouched.

    Elements that do not contain an img tag are skipped in both layouts; the
    thumbnail branch already did this, and the gallery branch used to raise
    ``AttributeError`` on such a button.
    """
    gallery_buttons = self.soup.find_all(
        "button", attrs={'class': "mediagallery__imgblock"}
    )
    if gallery_buttons:
        images = []
        for button in gallery_buttons:
            img = button.find("img")
            if img:
                images.append(img.get("src"))
        self.data["images"] = images
        return

    thumb_list = self.soup.find(
        "div", attrs={"class": "styles__ThumbnailList-sc-10zajq9-5 gyXsdF"}
    )
    if thumb_list:
        images = []
        for thumb in thumb_list.find_all(
            "div", attrs={"class": "styles__ThumbnailInner-sc-10zajq9-1 icLycq"}
        ):
            img = thumb.find("img")
            if img:
                images.append(img.get("src"))
        self.data["images"] = images
def scrape_options(self):
    """Collect the selectable product options from ``self.soup`` into ``self.data``.

    Two page layouts are handled:

    * ``super-sku``: every ``super-sku__inline-attribute`` block becomes a
      ``{"Label", "Value", "Options"}`` dict and the list is stored under
      ``self.data["Parameters"]``. The label text is split on ``":"``; a label
      without a colon yields an empty ``"Value"`` instead of raising.
    * ``buybox__super-sku`` (used only when ``super-sku`` is absent): every
      direct child div becomes a dict with optional ``"Label"``, ``"Value"``
      and ``"choices"`` keys, stored under ``self.data["options"]``.

    When neither container exists, ``self.data`` is left untouched.
    """

    def image_choices(container, tag_name, css_class, label_attr):
        """Return ``{"img", "value"}`` dicts for matching elements that hold an img tag."""
        found = []
        for node in container.find_all(tag_name, attrs={"class": css_class}):
            img = node.find("img")
            if img:
                found.append({"img": img.get("src"), "value": img.get(label_attr)})
        return found

    soup = self.soup

    super_sku = soup.find("div", attrs={"class": "super-sku"})
    if super_sku:
        parameters = []
        for param in super_sku.find_all(
            "div", attrs={"class": "super-sku__inline-attribute"}
        ):
            label_text = cleanhtml(param.find("div", attrs={"class": "label"}).text)
            parts = label_text.split(':')
            label = parts[0]
            # Guard against labels that carry no ":" separator.
            val = parts[1] if len(parts) > 1 else ""

            tiles = param.find_all(
                "div", attrs={"class": "super-sku__inline-tile--space"}
            )
            if not tiles:
                tiles = param.find_all(
                    "button", attrs={"class": "super-sku__inline-swatch"}
                )

            options = []
            for tile in tiles:
                img = tile.find("img")
                if img:
                    options.append({"img": img.get("src"), "label": img.get("title")})
                    continue
                # In the swatch fallback the tile is itself the button, so there
                # is no nested button to read; fall back to the tile's own text.
                button = tile.find("button")
                options.append((button if button is not None else tile).text)

            parameters.append({
                "Label": label,
                "Value": val,
                "Options": options
            })
        self.data["Parameters"] = parameters
        return

    buybox = soup.find("div", attrs={"class": "buybox__super-sku"})
    if not buybox:
        return

    options = []
    for option_div in buybox.find_all("div", recursive=False):
        option = {}

        # Two header variants exist; the first one found is used.
        header = option_div.find("div", attrs={"class": "styles__HeaderRow-fb29x6-1"})
        if not header:
            header = option_div.find("div", attrs={"class": "styles__Header-sc-1gql1zk-0"})
        if header:
            label_span = header.find("span", attrs={"class": "styles__Label-sc-1gql1zk-1"})
            if label_span:
                option["Label"] = label_span.text
            value_span = header.find("span", attrs={"class": "styles__Value-sc-1gql1zk-2"})
            if value_span:
                option["Value"] = value_span.text

        image_wrapper = option_div.find(
            "div", attrs={"class": "DefaultTemplate__FixedSizeChoiceImageWrapper-rpf825-0"}
        )
        tile_wrapper = option_div.find(
            "div", attrs={"class": "styles__TileSelectWrapper-jw86q8-1"}
        )
        list_boxes = option_div.find(
            "div", attrs={"class": "product_sku_Overlay_ListBoxes"}
        )
        swatch_holder = option_div.find(
            "div", attrs={"class": "product_sku_Overlay_ColorSwtHolder"}
        )

        # Same precedence as before: image wrapper, list boxes, swatches, tiles.
        if image_wrapper:
            option["choices"] = image_choices(
                image_wrapper, "div", "styles__ChoiceImage-kykx13-4", "alt"
            )
        elif list_boxes:
            choices = []
            for entry in list_boxes.find_all(
                "span", attrs={"class": "drop-down__hover-effect"}
            ):
                link = entry.find("a")
                if link:
                    choices.append(link.text)
            option["choices"] = choices
        elif swatch_holder:
            option["choices"] = image_choices(
                swatch_holder, "li", "styles__SwatchRoot-sc-1kr5yl9-1", "title"
            )
        elif tile_wrapper:
            option["choices"] = [
                tile.text
                for tile in tile_wrapper.find_all(
                    "div", attrs={"class": "styles__TileDiv-jw86q8-0"}
                )
            ]

        options.append(option)
    self.data["options"] = options
Now I want to know how I can scrape the price for each combination of these options.
Solution
An important facet of the target page is that whenever an item is toggled (clicked or selected), additional pricing options can emerge. This solution recursively traverses the feature listing, clicking on each one, and continues the process on the rest of price listings once they appear:
from selenium import webdriver
import time, re
# Module-level Chrome driver; get_combos() below reads it as the global `d`.
# '/path/to/chromedriver' is a placeholder for a local chromedriver executable.
# NOTE(review): recent Selenium 4 releases appear to expect the driver path via a
# Service object rather than a positional string -- confirm against the installed version.
d = webdriver.Chrome('/path/to/chromedriver')
# Load the configurable product page whose option combinations are enumerated below.
d.get('https://custom.homedepot.com/custom-doors/p/Steves-Sons-Regency-Modern-Customizable-Fiberglass-Door/314599913/45272-Pre-Hung/57533-Single-w-Two-Sidelites-Transom/57526-36-x-93/40069-12/57523-64-1-2-x-95-1-4/55572-Autumn-Wheat/45147-Left-Hand-Inswing/55578-Glass-Panels/45143-Black-Bronze/35733-4-9-16')
def get_combos(_seen, delay=1):
    """Recursively click through every option combination on the loaded page.

    Uses the module-level driver ``d``. On each call the first child of
    ``.buybox__super-sku`` (index 0 is skipped) whose index is not yet in
    ``_seen`` is taken; each of its choices is clicked in turn and the
    function recurses with that choice recorded. Once every child index is in
    ``_seen``, the currently displayed price and thumbnail are yielded.

    Args:
        _seen: list of ``[child_index, [label_text, value_text]]`` entries for
            the option groups already fixed on this path (``[]`` to start).
        delay: seconds to wait after each click before reading the page
            (defaults to the previously hard-coded 1 second).

    Yields:
        dict with keys ``'price'``, ``'img'`` and ``'combo'`` (the ``_seen``
        list that led to that price).
    """
    choice_selectors = [
        '.styles__BoxChoice-kykx13-3',
        '.styles__TileSelectWrapper-jw86q8-1',
        '.styles__SwatchRoot-sc-1kr5yl9-1',
        '.drop-down__hover-effect a',
    ]

    def read_text(index, css_selector):
        # querySelector() returns null (not undefined) when nothing matches, so
        # `== null` is required for the 'n/a' fallback to ever be taken.
        return d.execute_script(f"""return (x => x == null ? 'n/a' : x.textContent)(document.querySelector('.buybox__super-sku').children[{index}].querySelector('{css_selector}'))""")

    handled = {index for index, _ in _seen}
    flag = False
    groups = d.execute_script("""return document.querySelector('.buybox__super-sku').children""")
    for i in range(len(groups)):
        if not i or i in handled:
            continue
        flag = True
        for _s in choice_selectors:
            p = f'.buybox__super-sku > div:nth-child({i+1}) {_s}'
            op1 = d.execute_script(f"""return document.querySelectorAll('{p}')""")
            if not op1:
                continue
            for j in range(len(op1)):
                try:
                    d.execute_script(f"""document.querySelectorAll('{p}')[{j}].click()""")
                    time.sleep(delay)
                    label = read_text(i, '.styles__Label-sc-1gql1zk-1')
                    value = read_text(i, '.styles__Value-sc-1gql1zk-2')
                    yield from get_combos([*_seen, [i, [label, value]]], delay)
                except Exception:
                    # Best effort: a choice that cannot be clicked or read is
                    # skipped. `Exception` (not a bare except) lets
                    # GeneratorExit and KeyboardInterrupt propagate.
                    continue
            # Only the first selector that matches this group is used.
            break
        # Only the first unhandled group is expanded per call; recursion
        # covers the remaining groups.
        break
    if not flag:
        yield {'price':d.execute_script("""return document.querySelector('span:nth-of-type(1).pReg').textContent"""),
               'img':d.execute_script("""return document.querySelector('.styles__ThumbnailInner-sc-10zajq9-1.icLycq img').getAttribute('src')"""),
               'combo':_seen}
# Walk every option combination on the loaded page (one click plus a pause per choice).
result = list(get_combos([]))
# Flatten each raw record into one dict: formatted price, image URL, then one
# key per option label (trailing ":" and whitespace stripped) mapped to its value.
# NOTE(review): the [:6] slice assumes the dollar part is exactly six characters
# (e.g. "$1,423"); other price widths would be cut incorrectly -- confirm against
# the page's actual price text before reusing this elsewhere.
final_result = [
    {
        'price': f'{i["price"][:6]}.{i["price"][-2:]}',
        'image': i['img'],
        **{re.sub(r':\s*$', '', a): b for _, [a, b] in i['combo']},
    }
    for i in result
]
Output:
[{'price': '$1,423.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009218?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Inswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': '$1,506.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009266?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Inswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}, {'price': '$1,264.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009242?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Inswing', 'Panel Type': 'V-Groove Panel', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': '$1,346.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009290?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Inswing', 'Panel Type': 'V-Groove Panel', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}, {'price': '$1,423.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009219?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door 
Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Right-Hand Inswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': '$1,506.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009267?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Right-Hand Inswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}, {'price': '$1,264.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009243?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Right-Hand Inswing', 'Panel Type': 'V-Groove Panel', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': '$1,346.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009291?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Right-Hand Inswing', 'Panel Type': 'V-Groove Panel', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}, {'price': '$1,423.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009220?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Outswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill 
Color': 'Black-Bronze', 'Jamb Depth (in.)': '4 9/16'}, {'price': '$1,506.00', 'image': 'https://custom.homedepot.com/is/image/blinds/SIP0000009268?fmt=jpeg&fit=constrain,1&hei=100', 'Door Type': 'Pre-Hung', 'Door Configuration NEW': 'Single Door', 'Door Size (WxH) in.': '36 x 80', 'Total Door Frame Size (WxH) in.': '37 1/2 x 81 1/2', 'Door Color': 'Autumn Wheat', 'Door Handing': 'Left-Hand Outswing', 'Panel Type': 'Glass Panels', 'Hinge and Sill Color': 'Black-Bronze', 'Jamb Depth (in.)': '6 9/16'}]
Regarding a concurrent version of the above solution for use on many thousands of input links, there are a couple immediate particulars that should be addressed:
- First, the hosting of your `selenium` instances. You can either spin up a number of `selenium` driver instances on your own machine or use a service like Browserless to do this for you. `selenium` is very resource intensive, so a hosting service that can automatically handle many separate `selenium` instances is probably your best approach to take.
- Second, the method by which to interact with the target pages. If you use a service like Browserless, you could pass a Javascript function to `driver.execute_script` with a specific timeout that will offload the page interactions to the service itself.
Below is a solution that maintains a pool of `selenium` driver instances with an `async` version of `get_combos`. These drivers can either point to a remote `selenium` instance (like Browserless) or local instances on your own machine.
First, the `async` implementation of `get_combos`:
import asyncio, functools
from selenium import webdriver
async def get_page_combos(d, link, delay=1):
    """Load ``link`` in driver ``d`` and return every option combination found.

    Every driver call is pushed to the event loop's default executor so that
    several pages can be processed concurrently with ``asyncio.gather``.

    Args:
        d: driver object exposing ``get(url)`` and ``execute_script(js)``.
        link: URL of the product page to load.
        delay: seconds to wait after each click before reading the page
            (defaults to the previously hard-coded 1 second).

    Returns:
        list of dicts with keys ``'price'``, ``'img'`` and ``'combo'``, where
        ``'combo'`` is a list of ``[child_index, [label_text, value_text]]``.
    """
    loop = asyncio.get_running_loop()

    async def run_js(script):
        # The driver call blocks, so it runs in the default executor.
        return await loop.run_in_executor(None, functools.partial(d.execute_script, script))

    async def read_text(index, css_selector):
        # querySelector() returns null (not undefined) when nothing matches, so
        # `== null` is required for the 'n/a' fallback to ever be taken.
        return await run_js(f"""return (x => x == null ? 'n/a' : x.textContent)(document.querySelector('.buybox__super-sku').children[{index}].querySelector('{css_selector}'))""")

    # Page load is blocking too; keep it off the event loop so other pages progress.
    await loop.run_in_executor(None, d.get, link)

    choice_selectors = [
        '.styles__BoxChoice-kykx13-3',
        '.styles__TileSelectWrapper-jw86q8-1',
        '.styles__SwatchRoot-sc-1kr5yl9-1',
        '.drop-down__hover-effect a',
    ]

    async def get_combos(_seen):
        handled = {index for index, _ in _seen}
        flag = False
        first_vals = await run_js("""return document.querySelector('.buybox__super-sku').children""")
        for i in range(len(first_vals)):
            if not i or i in handled:
                continue
            flag = True
            for _s in choice_selectors:
                p = f'.buybox__super-sku > div:nth-child({i+1}) {_s}'
                op1 = await run_js(f"""return document.querySelectorAll('{p}')""")
                if not op1:
                    continue
                for j in range(len(op1)):
                    try:
                        await run_js(f"""document.querySelectorAll('{p}')[{j}].click()""")
                        await asyncio.sleep(delay)
                        new_vals = [
                            await read_text(i, '.styles__Label-sc-1gql1zk-1'),
                            await read_text(i, '.styles__Value-sc-1gql1zk-2'),
                        ]
                        async for pl in get_combos([*_seen, [i, new_vals]]):
                            yield pl
                    except Exception:
                        # Best effort: a choice that cannot be clicked or read
                        # is skipped. `Exception` (not a bare except) lets
                        # GeneratorExit and cancellation propagate.
                        continue
                # Only the first selector that matches this group is used.
                break
            # Only the first unhandled group is expanded per call; recursion
            # covers the remaining groups.
            break
        if not flag:
            yield {'price': await run_js("""return document.querySelector('span:nth-of-type(1).pReg').textContent"""),
                   'img': await run_js("""return document.querySelector('.styles__ThumbnailInner-sc-10zajq9-1.icLycq img').getAttribute('src')"""),
                   'combo': _seen}

    return [combo async for combo in get_combos([])]
Second, putting it all together:
async def main(links, instance_num = 10): #you can adjust the number of instances depending on your needs
    """Crawl ``links`` with a pool of drivers and return the per-page results.

    Links are processed in batches of at most ``instance_num``; within a batch
    each link gets its own driver and the pages are handled concurrently.

    Args:
        links: iterable of page URLs. Falsy entries are ignored. The iterable
            is copied, so the caller's list is no longer emptied.
        instance_num: maximum number of driver instances to start.

    Returns:
        list with one entry per crawled link, each being the list returned by
        ``get_page_combos`` for that link, in input order.

    Raises:
        ValueError: if ``instance_num`` is smaller than 1 (this used to loop forever).
    """
    if instance_num < 1:
        raise ValueError("instance_num must be at least 1")

    pending = [link for link in links if link]
    if not pending:
        return []

    # Never start more drivers than there are links to crawl.
    drivers = [webdriver.Chrome('<path>') for _ in range(min(instance_num, len(pending)))] #<path> can be substituted for a path to a local chromedriver executable or a url to a remote instance
    final_results = []
    try:
        for start in range(0, len(pending), len(drivers)):
            batch = pending[start:start + len(drivers)]
            vals = await asyncio.gather(*[get_page_combos(drv, link) for drv, link in zip(drivers, batch)])
            final_results.extend(vals)
    finally:
        # Always release the browser processes, even when a page fails.
        for drv in drivers:
            drv.quit()
    return final_results
# Placeholder input: replace the literal [...] with the list of product page URLs.
links = [...] #all your homedepot links to be crawled
# Run main() to completion on a new event loop and keep its return value.
all_page_vals = asyncio.run(main(links))
Answered By - Ajax1234
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.