Issue
I am trying to scrape shopee.co.id using BeautifulSoup and Selenium. There are 60 products on a single search results page. At the end of the code, I checked the extracted data using len(), and it shows that I extracted only 42 of them. How should I fix the code to obtain all the search results?
Here is the code that I've been trying:
import imp
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options # to customize chrome display
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from collections import Counter
import threading
import time
import pandas as pd
import numpy as np
from numpy import nan
import re
import concurrent.futures
import csv
# Link product search result
from turtle import delay
# Flat script: open a Shopee search-results page in headless Chrome, scroll it,
# parse the rendered DOM with BeautifulSoup and collect the elements matching
# three CSS class selectors (stored as product_name, product_price, product_sold).
url = 'https://shopee.co.id/search?keyword=obat%20kanker'
# Filesystem location of the chromedriver binary handed to webdriver.Chrome below.
path = '/Applications/chromedriver'
# create object for chrome options
chrome_options = Options()
# Customize chrome display
chrome_options.add_argument('start-maximized')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--headless')
chrome_options.add_argument('disable-notifications')
# To disable the message, "Chrome is being controlled by automated test software"
chrome_options.add_argument('--disable-infobars')
# create webdriver object
driver = webdriver.Chrome(executable_path=path, options=chrome_options)
# First page load; the same search is loaded again just below with an explicit
# page=0 query parameter.
driver.get(url)
# Load the first results page (page index 0)
main_link = 'https://shopee.co.id/search?keyword=obat%20kanker&page=0'
driver.get(main_link)
# Wait up to 5 seconds until elements carrying the result-item class are present
# in the DOM (presence only).
WebDriverWait(driver, 5).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "shopee-search-item-result__item")))
# Scroll down in 10 steps, 500 ms apart. Each step is one tenth of
# document.body.scrollHeight, which is read once before the first step.
# NOTE(review): if the page grows while items load lazily, ten steps of the
# initial height may stop short of the bottom - possibly related to fewer
# products being found than are listed; confirm against the live page.
driver.execute_script("""
var scroll = document.body.scrollHeight / 10;
var i = 0;
function scrollit(i) {
window.scrollBy({top: scroll, left: 0, behavior: 'smooth'});
i++;
if (i < 10) {
setTimeout(scrollit, 500, i);
}
}
scrollit(i);
""")
# Presumably gives the timer-driven scrolling time to finish: the nine 500 ms
# delays between the ten steps add up to 4.5 s.
sleep(5)
# Take the current DOM of the <html> element as a string and parse it.
html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
soup = BeautifulSoup(html, "html.parser")
# Scrape product name
# NOTE(review): the class names used below look auto-generated and may change
# between site releases - verify them against the current markup.
product_name = soup.find_all('div', class_="ie3A+n bM+7UW Cve6sh")
# Bare expression: the text of the first match is only displayed in an
# interactive session; indexing [0] fails if nothing matched.
product_name[0].get_text()
# Elements matched by the price selector.
product_price = soup.find_all('span', {'class': 'ZEgDH9'})
product_price[0].get_text()
# Elements matched by the third selector (named "sold" by the author).
product_sold = soup.find_all('div', {'class':"r6HknA uEPGHT"})
product_sold[0].get_text()
# Number of name elements found in the parsed DOM (also a bare expression).
len(product_name)
Solution
This is one way you can get those product details (selenium setup is chrome/linux, you can adapt the code to your own setup, just see the imports and the code after defining the browser):
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t
import json
# Read product data from the JSON-LD <script data-rh="true"> blocks embedded in
# the search-results page instead of scraping the lazily rendered product cards.
chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('disable-notifications')
chrome_options.add_argument("window-size=1280,720")
webdriver_service = Service("chromedriver/chromedriver") ## path to where you saved chromedriver binary
browser = webdriver.Chrome(service=webdriver_service, options=chrome_options)
url = 'https://shopee.co.id/search?keyword=obat%20kanker&page=0'
try:
    browser.get(url)
    # Wait up to 20 seconds for the structured-data script tags to be present.
    items = WebDriverWait(browser, 20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'script[data-rh="true"]')))
    print(len(items))
    for script_tag in items:
        json_obj = json.loads(script_tag.get_attribute('innerHTML'))
        # Not every block has to be a Product object: skip arrays and objects
        # without an '@type' key instead of raising on them.
        if isinstance(json_obj, dict) and json_obj.get('@type') == 'Product':
            print(json_obj['name'], json_obj['offers'])
            print('_____________')
finally:
    # Always shut the browser and the chromedriver process down, even when the
    # wait times out or a block fails to parse.
    browser.quit()
This will print out in the terminal:
61
OBAT KANKER TUMOR MIOM KISTA KELENJAR POLIP LIPOM BENJOLAN SEMBUH TOTAL TANPA OPERASI {'@type': 'Offer', 'price': '184000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
_____________
GRAVIDA BHARATA OBAT KANKER PAYUDARA AMPUH |KANKER GANAS HERBAL TERDAFTAR DBPOM MUI WARYANTO076 {'@type': 'Offer', 'price': '275000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
_____________
Walatra Zedoril 7 Asli Obat Herbal Kanker Tumor Dan Segala Jenis Benjolan Aman Tanpa Efek Samping {'@type': 'Offer', 'price': '255000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
_____________
PROMO PAKET SEMBUH OBAT TUMOR KANKER KISTA MIOM & KELENJAR TERLARIS, TERPERCAYA TERBUKTI &GARANSI {'@type': 'Offer', 'price': '349600.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
_____________
Obat Herbal Kanker Payudara, Serviks, Hati, Usus, Prostat, Leukimia dan Paru Paru ORIGINAL 100% ASLI {'@type': 'Offer', 'price': '525000.00', 'priceCurrency': 'IDR', 'availability': 'http://schema.org/InStock'}
[...]
You can dissect those json objects further, to extract the data you need.
Answered By - platipus_on_fire
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.