Issue
I have problems trying to scrape a website with multiple pages using Spyder: the site has pages 1 to 6 plus a Next button, and each of the six pages shows 30 results. I've tried two solutions without success.
This is the first one:
#SOLUTION 1#
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://store.unionlosangeles.com/collections/outerwear?sort_by=creation_date&page_num=1')

#Imports the HTML of the webpage into python
soup = BeautifulSoup(driver.page_source, 'lxml')
postings = soup.find_all('div', class_ = 'isp_grid_product')

#Creates data frame
df = pd.DataFrame({'Link':[''], 'Vendor':[''],'Title':[''], 'Price':['']})

#Scrape the data
for i in range(1, 7): #I've also tried with range(1, 6), but it gives 5 pages instead of 6.
    url = "https://store.unionlosangeles.com/collections/outerwear?sort_by=creation_date&page_num="+str(i)
    postings = soup.find_all('li', class_ = 'isp_grid_product')
    for post in postings:
        link = post.find('a', class_ = 'isp_product_image_href').get('href')
        link_full = 'https://store.unionlosangeles.com'+link
        vendor = post.find('div', class_ = 'isp_product_vendor').text.strip()
        title = post.find('div', class_ = 'isp_product_title').text.strip()
        price = post.find('div', class_ = 'isp_product_price_wrapper').text.strip()
        df = df.append({'Link':link_full, 'Vendor':vendor,'Title':title, 'Price':price}, ignore_index = True)
The output of this code is a data frame with 180 rows (30 x 6), but it repeats the results of the first page: rows 1-30 are the 30 results of the first page, rows 31-60 are those same 30 results again, and so on.
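(For reference, a minimal sketch of what this loop was meant to do, reusing the driver, imports, and selectors from above: navigate to each page with driver.get and re-parse the HTML before scraping, instead of reusing the soup built from page 1. An explicit wait may still be needed for the JavaScript-rendered grid.)

# Sketch only: reuses `driver`, BeautifulSoup and pd imported above.
rows = []
for i in range(1, 7):
    url = 'https://store.unionlosangeles.com/collections/outerwear?sort_by=creation_date&page_num=' + str(i)
    driver.get(url)                                       # actually navigate to page i
    # The grid is built client-side, so a wait (e.g. selenium's WebDriverWait) may be needed here.
    soup = BeautifulSoup(driver.page_source, 'lxml')      # re-parse the freshly loaded HTML
    for post in soup.find_all('li', class_='isp_grid_product'):
        rows.append({
            'Link': 'https://store.unionlosangeles.com' + post.find('a', class_='isp_product_image_href').get('href'),
            'Vendor': post.find('div', class_='isp_product_vendor').text.strip(),
            'Title': post.find('div', class_='isp_product_title').text.strip(),
            'Price': post.find('div', class_='isp_product_price_wrapper').text.strip(),
        })
df = pd.DataFrame(rows)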
Here is the second solution I tried:
### SOLUTION 2 ###
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://store.unionlosangeles.com/collections/outerwear?sort_by=creation_date&page_num=1')

#Imports the HTML of the webpage into python
soup = BeautifulSoup(driver.page_source, 'lxml')

#Create data frame
df = pd.DataFrame({'Link':[''], 'Vendor':[''],'Title':[''], 'Price':['']})

#Scrape data
i = 0
while i < 6:
    postings = soup.find_all('li', class_ = 'isp_grid_product')
    for post in postings:
        link = post.find('a', class_ = 'isp_product_image_href').get('href')
        link_full = 'https://store.unionlosangeles.com'+link
        vendor = post.find('div', class_ = 'isp_product_vendor').text.strip()
        title = post.find('div', class_ = 'isp_product_title').text.strip()
        price = post.find('div', class_ = 'isp_product_price_wrapper').text.strip()
        df = df.append({'Link':link_full, 'Vendor':vendor,'Title':title, 'Price':price}, ignore_index = True)
    #Imports the next page's HTML into python
    next_page = 'https://store.unionlosangeles.com'+soup.find('div', class_ = 'page-item next').get('href')
    page = requests.get(next_page)
    soup = BeautifulSoup(page.text, 'lxml')
    i += 1
The problem with this second solution is that the program cannot recognize the attribute "get" in next_page, for reasons I cannot grasp (I haven't had this problem on other sites with pagination). Thus, I get only the first page and not the others.
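(A quick diagnostic, assuming the same 'page-item next' markup: check whether find() actually matched anything before calling .get() on the result.)

# Reuses `soup` from above. If this prints None, the "next" element simply
# isn't present in the HTML that requests downloaded.
next_el = soup.find('div', class_ = 'page-item next')
print(next_el)
if next_el is not None:
    print(next_el.get('href'))   # may also be None if the href sits on a child <a> tag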
How can I fix the code to properly scrape all 180 elements?
Solution
The data you see is loaded from an external URL via JavaScript. You can simulate these calls with the requests module. For example:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs

url = "https://store.unionlosangeles.com/collections/outerwear?sort_by=creation_date&page_num=1"
api_url = "https://cdn-gae-ssl-premium.akamaized.net/categories_navigation"

soup = BeautifulSoup(requests.get(url).content, "html.parser")

params = {
    "page_num": 1,
    "store_id": "",
    "UUID": "",
    "sort_by": "creation_date",
    "facets_required": "0",
    "callback": "",
    "related_search": "1",
    "category_url": "/collections/outerwear",
}

# The script tag next to #isp_search_result_page carries store_id and UUID
# in its src query string; pull them out and put them in the API params.
q = parse_qs(
    urlparse(soup.select_one("#isp_search_result_page ~ script")["src"]).query
)
params["store_id"] = q["store_id"][0]
params["UUID"] = q["UUID"][0]

all_data = []
for params["page_num"] in range(1, 7):  # update the page_num param on each iteration
    data = requests.get(api_url, params=params).json()
    for i in data["items"]:
        link = i["u"]
        vendor = i["v"]
        title = i["l"]
        price = i["p"]
        all_data.append([link, vendor, title, price])

df = pd.DataFrame(all_data, columns=["link", "vendor", "title", "price"])
print(df.head(10).to_markdown(index=False))
print("Total items =", len(df))
Prints:
| link | vendor | title | price |
|---|---|---|---|
| /products/barn-jacket | Essentials | BARN JACKET | 250 |
| /products/work-vest-2 | Essentials | WORK VEST | 120 |
| /products/tailored-track-jacket | Martine Rose | TAILORED TRACK JACKET | 1206 |
| /products/work-vest-1 | Essentials | WORK VEST | 120 |
| /products/60-40-cloth-bug-anorak-1tone | Kapital | 60/40 Cloth BUG Anorak (1Tone) | 747 |
| /products/smooth-jersey-stand-man-woman-track-jkt | Kapital | Smooth Jersey STAND MAN & WOMAN Track JKT | 423 |
| /products/supersized-sports-jacket | Martine Rose | SUPERSIZED SPORTS JACKET | 1695 |
| /products/pullover-vest | Nicholas Daley | PULLOVER VEST | 267 |
| /products/flannel-polkadot-x-bandana-reversible-1st-jkt-1 | Kapital | FLANNEL POLKADOT X BANDANA REVERSIBLE 1ST JKT | 645 |
| /products/60-40-cloth-bug-anorak-1tone-1 | Kapital | 60/40 Cloth BUG Anorak (1Tone) | 747 |
Total items = 175
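As an optional follow-up sketch (assuming the link column holds relative paths, as in the output above), full product URLs can be built the same way the original code did, by prefixing the store domain:

# Turn relative product paths into absolute URLs
df["link"] = "https://store.unionlosangeles.com" + df["link"]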
Answered By - Andrej Kesely