Issue
I have the following code:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
import time
def _clean_text(tag):
    """Return the tag's text, trimmed and with tabs, newlines and CRs removed."""
    return tag.text.strip().replace('\t', '').replace('\n', '').replace('\r', '')


def _first_text(item, class_names, default):
    """Return the cleaned text of the first <div> in *item* matching a class.

    The class strings in *class_names* are tried in order; *default* is
    returned when none of them matches.  This replaces a ``try`` with two
    ``except AttributeError`` clauses: only the first clause of such a
    statement can ever run, so a failing fallback lookup raised instead of
    yielding the default.
    """
    for class_name in class_names:
        tag = item.find('div', class_=class_name)
        if tag is not None:
            return _clean_text(tag)
    return default


url_list = [
    'https://www.coolmod.com/componentes-pc-procesadores?f=375::No',
    # 'https://www.coolmod.com/componentes-pc-placas-base?f=55::ATX||prices::3-300',
]

# The request headers are the same for every URL, so build them once.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'es-ES, es;q=0.5',
}

df_list = []
for url in url_list:
    print(url)
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=30)
    print(r.status_code)
    soup = BeautifulSoup(r.content, 'html.parser')
    items = soup.find_all('div', class_='col-xs-12 col-sm-6 col-sm-6 col-md-6 col-lg-3 col-product col-custom-width')
    store = 'Coolmod'
    extraction_date = datetime.datetime.today().replace(microsecond=0)
    for item in items:
        # A product card without a name is unexpected; let it raise as before.
        product_name = _clean_text(item.find('div', class_='product-name'))
        price = _first_text(
            item,
            [
                'margin-top-20 mod-product-price text-big',
                'mod-product-price text-big',
            ],
            "No price",
        )
        # Availability is now trimmed like the name and price fields.
        availability = _first_text(
            item,
            ['product-availability cat-product-availability'],
            "No info",
        )
        product_info = {
            'product_name': product_name,
            'price': price,
            'availability': availability,
            'store': store,
            'date_extraction': extraction_date,
        }
        df_list.append(product_info)
    # NOTE(review): the original indentation was lost; the pause is assumed
    # to run once per URL (between page requests), not once per product.
    time.sleep(3)

df = pd.DataFrame(df_list)
print(df)
It works fine and returns a dataframe with the expected results. The problem is that it only retrieves the first twenty records; after that there is a "Show more" button that loads the next twenty products, and so on.
I have inspected the web page's source code, but I can't find a way to interact with the button.
Any idea or suggestion would be much appreciated.
Regards.
Solution
I finally got it working:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
import datetime
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
import random
import pandas as pd
# `By` locators work on Selenium 3 and 4; the `find_element_by_*` helpers
# used previously were removed in Selenium 4.3.
from selenium.webdriver.common.by import By


def _clean_text(tag):
    """Return the tag's text, trimmed and with tabs, newlines and CRs removed."""
    return tag.text.strip().replace('\t', '').replace('\n', '').replace('\r', '')


def _first_text(item, class_names, default):
    """Return the cleaned text of the first <div> in *item* matching a class.

    The class strings in *class_names* are tried in order; *default* is
    returned when none of them matches.  This replaces a ``try`` with two
    ``except AttributeError`` clauses: only the first clause of such a
    statement can ever run, so a failing fallback lookup raised instead of
    yielding the default.
    """
    for class_name in class_names:
        tag = item.find('div', class_=class_name)
        if tag is not None:
            return _clean_text(tag)
    return default


options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
#options.add_argument('--headless')
# NOTE(review): `executable_path` is not accepted by Selenium 4.10+; on those
# versions pass a `selenium.webdriver.chrome.service.Service` object instead.
driver = webdriver.Chrome(executable_path=r"C:\\chromedriver.exe", options=options)
url = 'https://www.coolmod.com/componentes-pc-procesadores?f=375::No'
try:
    driver.get(url)
    sleep(random.uniform(5.0, 7.5))

    # Click the element with class 'confirm' if it is present (presumably a
    # consent/confirmation pop-up -- verify against the live page).
    try:
        driver.find_element(By.CLASS_NAME, 'confirm').click()
    except NoSuchElementException:
        pass

    # Keep clicking the "load more" button until it is no longer in the DOM,
    # pausing a random interval between clicks.
    while True:
        sleep(random.uniform(3.5, 6.5))
        try:
            ver_mas = driver.find_element(By.CLASS_NAME, 'button-load-more')
        except NoSuchElementException:
            break
        actions = ActionChains(driver)
        actions.move_to_element(ver_mas).perform()
        # Click through JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].click();", ver_mas)

    page_source = driver.page_source
finally:
    # Always close the browser, even if navigation or clicking failed.
    driver.quit()

soup = BeautifulSoup(page_source, 'lxml')
items = soup.find_all('div', class_='col-lg-12 col-md-12 col-sm-8 col-xs-9 cat-container-text')

df_list = []
store = 'Coolmod'
extraction_date = datetime.datetime.today().replace(microsecond=0)
for item in items:
    # A product card without a name is unexpected; let it raise as before.
    product_name = _clean_text(item.find('div', class_='product-name'))
    price = _first_text(
        item,
        [
            'margin-top-20 mod-product-price text-big',
            'mod-product-price text-big',
        ],
        "No price",
    )
    availability = _first_text(
        item,
        [
            'product-availability cat-product-availability',
            'product-availability cat-product-availability local-available',
        ],
        "No info",
    )
    product_info = {
        'product_name': product_name,
        'price': price,
        'availability': availability,
        'store': store,
        'date_extraction': extraction_date,
    }
    df_list.append(product_info)

df = pd.DataFrame(df_list)
print(df)
Thanks @Alin Stelian for the help
Answered By - CMonte2
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.