Issue
I am trying to scrape some data to understand a trend. Below is the scraping code. But I'm getting the output as an empty dataframe. I'm not sure where I am going wrong in the approach.
Output:
Empty DataFrame
Columns: [Title, Price, Discount]
Index: []
Code:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# Specify the full path to the ChromeDriver executable
# NOTE(review): this is the original (broken) script from the question, kept
# as-is on purpose -- it fails because find_element() runs immediately after
# get(), before the JavaScript-rendered product grid exists, which is why the
# resulting DataFrame is empty.
driver=webdriver.Chrome()
# NOTE(review): "§ion" looks like a scrape-mangled "&section" (HTML &sect;
# entity) -- confirm the intended URL ends with "&section=WOMAN".
driver.get('https://www.zara.com/us/en/search?searchTerm=women%20jackets§ion=WOMAN')
driver.maximize_window()
# Locate the product grid and its tiles. No WebDriverWait here (see note
# above), so on a JS-rendered page these lookups run too early.
container=driver.find_element(by='xpath', value='.//ul[contains(@class, "product-grid__product-list")]')
products=container.find_elements(by='xpath', value='.//li[contains(@class, "product-grid-product")]')
# One accumulator per output CSV column.
product_title=[]
product_price=[]
discount=[]
# Pull title / price / discount text out of each product tile.
for product in products:
    product_title.append(product.find_element(by='xpath',value='.//a[@class="product-link _item product-grid-product-info__name link"]').text)
    product_price.append(product.find_element(by='xpath', value='.//span[@class="money-amount__main"]').text)
    discount.append(product.find_element(by='xpath',value='.//span[@class="price-current__discount-percentage"]').text)
driver.quit()
# Persist and show the (empty -- see note above) results.
df=pd.DataFrame({'Title':product_title,'Price':product_price, 'Discount':discount})
df.to_csv('Zara_Jackets.csv',index=False)
print(df)
Solution
You are getting an empty DataFrame because the script does not wait for the elements to be rendered before querying them.
However, to render more elements you need to scroll down until the end, so you should implement continuous scroll and then get all elements data.
Alternatively, you can get all of the data by calling the API query that is responsible for rendering it — a more complicated, but more stable, approach.
The query URL contains the part search/store, and a session id is needed to execute it.
To make request to contain sessionId param, 2 things are needed:
- You should be logged in
- Resource shouldn't detect you as automation browser. (https://pypi.org/project/undetected-chromedriver/)
If either of these two conditions is not met, sessionId will be empty and the API request will return 403 without it.
So you log in, start collecting the network logs, and look for a request whose URL contains search/store and a session param.
After you found url, you need to open it and parse body as json. (You can use request lib, but I was unable to get correct response with status 200 even with session param, probably some extra headers needed)
From the JSON object you read totalResults, which represents the number of products, then loop through the URLs, increasing the offset and limit params.
Limit I set up to 20. API allows limit 200, however, on last iteration in this case it returns not all products (158 from 172). When I set limit to 20 for each iteration, last iteration result seemed to be correct.
It looks a bit complicated, but once you understand how it works, parsing data from most sites won't be a big deal.
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import json
from urllib.parse import urlparse, parse_qs, urlunparse, urlencode
import undetected_chromedriver as uc
# --- Browser setup and login -------------------------------------------------
# undetected-chromedriver is used so the site does not flag the browser as
# automation; otherwise the API's "session" query param stays empty.
options = webdriver.ChromeOptions()
# Enable CDP performance logging so network requests can be inspected later.
options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})
driver = uc.Chrome(options=options)

driver.get("https://www.zara.com/us/en/logon")
wait = WebDriverWait(driver, 20)
time.sleep(2)

# Dismiss the geo-consent dialog if it appears.
geo_consent = driver.find_elements(By.CSS_SELECTOR, '[data-qa-action=stay-in-store]')
if geo_consent:
    geo_consent[0].click()
    wait.until(EC.staleness_of(geo_consent[0]))

# Log in (required for the API to return a session id).
# BUG FIX: both attribute selectors were missing the closing ']'
# ('[name=logonId'), which is invalid CSS and makes querySelector raise.
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '[name=logonId]'))).send_keys('your mail')
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '[name=password]'))).send_keys('yourpass')
driver.find_element(By.CSS_SELECTOR, '[data-qa-action=logon-form-submit]').click()

# BUG FIX: "§ion" was a scrape-mangled "&section" (HTML &sect; entity).
driver.get('https://www.zara.com/us/en/search?searchTerm=women%20jackets&section=WOMAN')
driver.maximize_window()

# Wait until product tiles are rendered so the performance log contains the
# search/store API request, then snapshot the log.
products = wait.until(
    EC.presence_of_all_elements_located((By.XPATH, './/li[contains(@class, "product-grid-product")]')))
logs = driver.get_log("performance")
driver.find_element(By.XPATH, "//button[text()='Woman']").click()
target_url = "search/store"
time.sleep(2)
def get_store_endpoint(log_entries=None, url_part=None):
    """Find the search/store API URL in the browser's CDP performance log.

    Scans ``Network.responseReceived`` events and returns the first response
    URL that contains *url_part* and carries a ``session`` query parameter
    (the param is only present when logged in and not flagged as a bot).

    Args:
        log_entries: CDP performance log entries (dicts with a JSON
            ``"message"`` field). Defaults to the module-level ``logs``.
        url_part: substring the response URL must contain. Defaults to the
            module-level ``target_url`` ("search/store").

    Returns:
        The matching URL as a string, or ``None`` if no such response exists.
    """
    if log_entries is None:
        log_entries = logs
    if url_part is None:
        url_part = target_url
    for entry in log_entries:
        message = entry["message"]
        # Cheap substring pre-filter before paying for json.loads().
        if "Network.responseReceived" not in message:
            continue
        params = json.loads(message)["message"].get("params")
        if params is None:
            continue
        response = params.get("response")
        if response is None or url_part not in response["url"]:
            continue
        query_params = parse_qs(urlparse(response["url"]).query)
        if 'session' in query_params:
            return response["url"]
    return None
# --- Paginated API scrape ----------------------------------------------------
# One accumulator per output CSV column.
product_title = []
product_price = []
discount = []

endpoint = get_store_endpoint()
# ROBUSTNESS FIX: the original crashed inside urlparse(None) with a confusing
# error (and leaked the driver) when no matching request was logged.
if endpoint is None:
    driver.quit()
    raise RuntimeError("No search/store request with a 'session' param found "
                       "in the performance log -- are you logged in?")

# First request only to learn the total number of products.
driver.get(endpoint)
response = driver.find_element(By.TAG_NAME, 'body').text
total_count = json.loads(response)['totalResults']

# Page through the API. limit=20 because larger limits were observed to drop
# products on the last page (see explanation above).
limit_per_request = 20
offset = 0
while offset < total_count:
    parsed_endpoint = urlparse(endpoint)
    query_params = parse_qs(parsed_endpoint.query)
    # Shrink the final page so offset + limit never exceeds total_count.
    if total_count - offset <= limit_per_request:
        limit_per_request = total_count - offset
    query_params['offset'] = [f"{offset}"]
    query_params['limit'] = [f"{limit_per_request}"]
    endpoint = urlunparse(parsed_endpoint._replace(query=urlencode(query_params, doseq=True)))
    offset += limit_per_request

    driver.get(endpoint)
    response = driver.find_element(By.TAG_NAME, 'body').text
    results = json.loads(response)['results']
    for product in results:
        content = product['content']
        product_title.append(content['name'])
        # Prices are integer cents; discount is the old/new price difference.
        price = 'No price'
        product_discount = 'No discount'
        if 'price' in content:
            price = content['price'] / 100
        if 'oldPrice' in content:
            product_discount = (content['oldPrice'] - content['price']) / 100
        product_price.append(price)
        discount.append(product_discount)

df = pd.DataFrame({'Title': product_title, 'Price': product_price, 'Discount': discount})
df.to_csv('Zara_Jackets.csv', index=False)
print(df)
driver.quit()
Answered By - Yaroslavm
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.