Issue
I am rather new to programming. I've done a few small projects on my own and have started getting into making web scrapers with Scrapy. I'm trying to make a scraper for Home Depot and have run into issues. The problem I'm trying to solve is that the Home Depot page only loads its JavaScript-generated content as you scroll down, so I added some code I found that scrolls down the page to reveal all the products, so the spider can grab the title, review count, and price from each product tile.
Before adding this code, it was scraping product info correctly; after adding it, I first had issues with it only scraping the last page of results, so I moved some things around. Being new, I think I just don't understand something about how Scrapy passes information around, particularly the HTML I'm trying to get parse_product to return values from. So far the spider opens the page and moves on to the next page, but it's not scraping any products anymore. Where am I going wrong? I've been struggling with this for hours. I'm taking a class in web scraping, and while I've had some success, it seems like anything slightly off course turns into a massive struggle.
import scrapy
import logging
from scrapy.utils.markup import remove_tags
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from scrapy.selector import Selector
from time import sleep
from datetime import datetime


class HdSpider(scrapy.Spider):
    name = 'hd'
    allowed_domains = ['www.homedepot.com']
    start_urls = ['https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao=']  # Add %%Nao= to end of URL you got from search or category

    def parse(self, response):
        options = Options()
        chrome_path = which("chromedriver")
        driver = webdriver.Chrome(executable_path=chrome_path)  # , chrome_options=options)

        p = 0  # The Home Depot URLs end in =24, =48 etc. - products are grouped 24 to a page, so this is my way of getting the next page
        start_url = 'https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao='

        while p < 25:
            driver.get(start_url + str(p))
            driver.set_window_size(1920, 1080)
            #sleep(2)
            scroll_pause_time = 1
            screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web
            i = 1
            while True:  # this is the infinite scroll thing which reveals all javascript-generated product tiles
                driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
                i += 1
                sleep(scroll_pause_time)
                scroll_height = driver.execute_script("return document.body.scrollHeight;")
                if (screen_height) * i > scroll_height:
                    break
            self.html = driver.page_source
            p = p + 24

    def parse_product(self, response):
        resp = Selector(text=self.html)
        for products in resp.xpath("//div[@class='product-pod--padding']"):
            date = datetime.now().strftime("%m-%d-%y")
            brand = products.xpath("normalize-space(.//span[@class='product-pod__title__brand--bold']/text())").get()
            title = products.xpath("normalize-space(.//span[@class='product-pod__title__product']/text())").get()
            link = products.xpath(".//div//a//@href").get()
            model = products.xpath("normalize-space(.//div[@class='product-pod__model'][2]/text())").get()
            review_count = products.xpath("normalize-space(.//span[@class='product-pod__ratings-count']/text())").get()
            price = products.xpath("normalize-space(.//div[@class='price-format__main-price']//span[2]/text())").get()

            yield {
                'Date scraped': date,
                'Brand': brand,
                'Title': title,
                'Product Link': "https://www.homedepot.com" + remove_tags(link),
                'Price': "$" + price,
                'Model #': model,
                'Review Count': review_count,
            }
Solution
I don't see where you ever run parse_product. Scrapy will not execute it automatically for you. Besides, a method like your parse_product that takes a response is normally used as a callback in something like yield Request(subpage_url, callback=self.parse_product), to parse data from a subpage rather than from the page you already have in parse (see the short sketch after the code below). You should instead move the code from parse_product into parse, like this:
def parse(self, response):
    options = Options()
    chrome_path = which("chromedriver")
    driver = webdriver.Chrome(executable_path=chrome_path)  # , chrome_options=options)
    driver.set_window_size(1920, 1080)

    p = 0  # The Home Depot URLs end in =24, =48 etc. - products are grouped 24 to a page, so this is my way of getting the next page
    start_url = 'https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao='

    scroll_pause_time = 1
    screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web

    while p < 25:
        driver.get(start_url + str(p))
        #sleep(2)
        i = 1

        # scrolling
        while True:  # this is the infinite scroll thing which reveals all javascript-generated product tiles
            driver.execute_script("window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i))
            i += 1
            sleep(scroll_pause_time)
            scroll_height = driver.execute_script("return document.body.scrollHeight;")
            if (screen_height) * i > scroll_height:
                break

        # after scrolling
        self.html = driver.page_source
        p = p + 24

        resp = Selector(text=self.html)
        for products in resp.xpath("//div[@class='product-pod--padding']"):
            date = datetime.now().strftime("%m-%d-%y")
            brand = products.xpath("normalize-space(.//span[@class='product-pod__title__brand--bold']/text())").get()
            title = products.xpath("normalize-space(.//span[@class='product-pod__title__product']/text())").get()
            link = products.xpath(".//div//a//@href").get()
            model = products.xpath("normalize-space(.//div[@class='product-pod__model'][2]/text())").get()
            review_count = products.xpath("normalize-space(.//span[@class='product-pod__ratings-count']/text())").get()
            price = products.xpath("normalize-space(.//div[@class='price-format__main-price']//span[2]/text())").get()

            yield {
                'Date scraped': date,
                'Brand': brand,
                'Title': title,
                'Product Link': "https://www.homedepot.com" + remove_tags(link),
                'Price': "$" + price,
                'Model #': model,
                'Review Count': review_count,
            }
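For reference, the usual place for a method like parse_product is as a callback on a Request for a product's detail page. A minimal sketch of that pattern is below; the detail-page XPath is only a placeholder, not something tested against Home Depot's markup:

def parse(self, response):
    # collect links to product detail pages from the listing page
    for href in response.xpath("//div[@class='product-pod--padding']//a/@href").getall():
        # Scrapy will call parse_product with the detail page's response
        yield scrapy.Request(response.urljoin(href), callback=self.parse_product)

def parse_product(self, response):
    # here response is the product detail page, so no self.html is needed
    yield {
        'Title': response.xpath("normalize-space(//h1/text())").get(),  # placeholder XPath
    }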
But I would make other changes: you use p = p + 24, but when I check the page in a browser I see it needs p = p + 48 to get all the products. Instead of p = p + ... I would rather use Selenium to click the > button to get the next page.
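If you go the click route, a slightly more robust variant is to wait until the button is actually clickable instead of clicking it immediately. A sketch, assuming the same //a[@aria-label="Next"] locator used in the code below:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 seconds for the "Next" arrow to become clickable, then click it
next_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//a[@aria-label="Next"]'))
)
next_button.click()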
EDIT:
My version with some other changes. Anyone can run it without creating a project.
#!/usr/bin/env python3

import scrapy
from scrapy.utils.markup import remove_tags
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from shutil import which
from scrapy.selector import Selector
from time import sleep
from datetime import datetime


class HdSpider(scrapy.Spider):
    name = 'hd'
    allowed_domains = ['www.homedepot.com']
    start_urls = ['https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao=']  # Add %%Nao= to end of URL you got from search or category

    def parse(self, response):
        options = Options()
        chrome_path = which("chromedriver")
        driver = webdriver.Chrome(executable_path=chrome_path)  # , chrome_options=options)
        #driver.set_window_size(1920, 1080)
        print(dir(driver))  # debug leftover - can be removed
        driver.maximize_window()

        scroll_pause_time = 1

        # loading first page
        start_url = 'https://www.homedepot.com/b/Home-Decor-Artificial-Greenery-Artificial-Flowers/N-5yc1vZcf9y?Nao=0'
        driver.get(start_url)

        screen_height = driver.execute_script("return window.screen.height;")  # get the screen height of the web

        #while True:  # all pages
        for _ in range(5):  # only 5 pages
            #sleep(scroll_pause_time)

            # scrolling page
            i = 1
            while True:  # this is the infinite scroll thing which reveals all javascript-generated product tiles
                driver.execute_script(f"window.scrollBy(0, {screen_height});")
                sleep(scroll_pause_time)
                i += 1
                scroll_height = driver.execute_script("return document.body.scrollHeight;")
                if screen_height * i > scroll_height:
                    break

            # after scrolling
            resp = Selector(text=driver.page_source)
            for products in resp.xpath("//div[@class='product-pod--padding']"):
                date = datetime.now().strftime("%m-%d-%y")
                brand = products.xpath("normalize-space(.//span[@class='product-pod__title__brand--bold']/text())").get()
                title = products.xpath("normalize-space(.//span[@class='product-pod__title__product']/text())").get()
                link = products.xpath(".//div//a//@href").get()
                model = products.xpath("normalize-space(.//div[@class='product-pod__model'][2]/text())").get()
                review_count = products.xpath("normalize-space(.//span[@class='product-pod__ratings-count']/text())").get()
                price = products.xpath("normalize-space(.//div[@class='price-format__main-price']//span[2]/text())").get()

                yield {
                    'Date scraped': date,
                    'Brand': brand,
                    'Title': title,
                    'Product Link': "https://www.homedepot.com" + remove_tags(link),
                    'Price': "$" + price,
                    'Model #': model,
                    'Review Count': review_count,
                }

            # click button `>` to load next page
            try:
                driver.find_element_by_xpath('//a[@aria-label="Next"]').click()
            except:
                break


# --- run without project and save in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file CSV, JSON or XML
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in 2.1
})
c.crawl(HdSpider)
c.start()
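Saved as a single file (for example hd_spider.py - the filename is just an example), this can be run directly with python3 hd_spider.py. Assuming chromedriver is on your PATH, the scraped items should end up in output.csv next to the script.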
Answered By - furas