Issue
I'm trying to scrape the list of products on the following page: https://www.beermerchants.com/browse/brewery/cantillon , but I only want to print products that are in stock. I've been able to scrape the full list of products with the following code; how can I modify it so that only in-stock products are returned?
import random
import requests
from bs4 import BeautifulSoup
from urllib3.exceptions import InsecureRequestWarning

# Suppress only the single warning from urllib3 needed.
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]

for i in range(1, 4):
    # Pick a random user agent
    user_agent = random.choice(user_agent_list)
    # Set the headers
    headers = {'User-Agent': user_agent}
    url = 'https://www.beermerchants.com/browse/brewery/cantillon'
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    link = []
    for product in soup.find_all('a', href=True, class_="product-item-link"):
        link.append(product['href'])
    print(link)
Thanks in advance!!!
Solution
I've adapted your code to use XPath, which makes it easier to express more complex selection logic.
The filter keeps only products whose listing contains an add-to-cart form (meaning they are in stock).
import random
import requests
from bs4 import BeautifulSoup
from lxml import etree
from urllib3.exceptions import InsecureRequestWarning

# Suppress only the single warning from urllib3 needed.
requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

user_agent_list = [
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]

for _ in range(1, 4):
    # Pick a random user agent
    user_agent = random.choice(user_agent_list)
    # Set the headers
    headers = {'User-Agent': user_agent}
    url = 'https://www.beermerchants.com/browse/brewery/cantillon'
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, features="html.parser")
    # Convert the parsed HTML to an lxml tree so we can run XPath queries on it
    dom = etree.HTML(str(soup))
    # Keep only product links whose product card also contains an add-to-cart
    # form, i.e. products that are actually in stock
    for href in dom.xpath('//div[contains(@class, "product-item-info") and .//form[@data-role="tocart-form"]]//a[@class="product-item-link"]/@href'):
        print(href)
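If you'd rather stay entirely in BeautifulSoup instead of pulling in lxml for XPath, the same in-stock filter can be written with find_all. This is just a minimal sketch, assuming the page keeps the markup used above (each product card is a div with class "product-item-info", and only in-stock products contain a form with data-role="tocart-form"):

import requests
from bs4 import BeautifulSoup

# Sketch of the same in-stock filter using only BeautifulSoup.
# Assumes the markup described above; adjust the class/attribute names
# if the site changes its layout.
url = 'https://www.beermerchants.com/browse/brewery/cantillon'
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(response.text, features="html.parser")

in_stock_links = []
for card in soup.find_all('div', class_='product-item-info'):
    # Skip cards without an add-to-cart form (i.e. out-of-stock products)
    if card.find('form', attrs={'data-role': 'tocart-form'}) is None:
        continue
    link = card.find('a', class_='product-item-link', href=True)
    if link is not None:
        in_stock_links.append(link['href'])

print(in_stock_links)

Either way, the key idea is the same: instead of selecting the product links directly, select each product card first, check it for an add-to-cart form, and only then read the link inside it.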
Answered By - SimonR