Issue
I'm writing a Python script that parses HTML (a classifieds website) and sends me email notifications on specific products and price points. Everything works here except for the "listing_url" capture, which I want displayed in the email so I can click on the url to visit the product page. I tried scraping the corresponding tag with "class=listing-card__inner" as inspected on the website but this isn't working.
This is my full code:
import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText
# Email details
# Module-level settings read by send_email().
# NOTE(review): the address values below look redacted/obfuscated in this
# copy of the script — replace them with real addresses before running.
sender_email = "[email protected]"  # used as the From header and envelope sender
receiver_email = "[email protected]"  # used as the To header and envelope recipient
smtp_server = "smtp.gmail.com"  # host passed to smtplib.SMTP
smtp_port = 587  # port passed to smtplib.SMTP; send_email() upgrades the connection with starttls()
smtp_username = "[email protected]"  # login name passed to server.login()
# Placeholder value. NOTE(review): avoid keeping a real password in source.
smtp_password = "xxxxxxxx"
# Send email function
def send_email(subject, body):
    """Send a plain-text email through the configured SMTP server.

    Args:
        subject: Text placed in the Subject header.
        body: Plain-text content of the message.

    The sender, recipient, server, port and credentials come from the
    module-level email settings.
    """
    msg = MIMEText(body)
    # Headers are set in the same order as before: Subject, From, To.
    for header, value in (
        ('Subject', subject),
        ('From', sender_email),
        ('To', receiver_email),
    ):
        msg[header] = value

    with smtplib.SMTP(smtp_server, smtp_port) as smtp:
        # Upgrade the connection to TLS before sending credentials.
        smtp.starttls()
        smtp.login(smtp_username, smtp_password)
        smtp.sendmail(sender_email, receiver_email, msg.as_string())
# Scrape listings function
def scrape_listings(url):
    """Scrape one search-results page and return listings priced below 80000.

    Args:
        url: Address of the results page to download.

    Returns:
        A list of (title, date, price, listing_url) tuples, one per listing
        whose integer price is below 80000.
    """
    def text_of(node, tag, css_class):
        # Stripped text of the first matching descendant of ``node``.
        return node.find(tag, class_=css_class).text.strip()

    # Download the page and parse its HTML.
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    qualifying_listings = []
    for listing in soup.find_all('div', class_='listing-card__content'):
        title = text_of(listing, 'div', 'listing-card__header__title')
        date = text_of(listing, 'div', 'listing-card__header__date')
        raw_price = text_of(listing, 'span', 'listing-card__price__value 1')
        # Drop the "Br" currency marker and thousands separators before
        # converting the price to an integer.
        price = int(raw_price.replace('Br', '').replace(',', '').strip())
        # find() returns None when no matching <a> exists inside this div,
        # in which case the subscript raises TypeError.
        listing_url = listing.find('a', class_='listing-card__inner')['href']
        if price >= 80000:
            continue
        qualifying_listings.append((title, date, price, listing_url))
    return qualifying_listings
# Main function
def main():
    """Scrape the search results and email a summary of qualifying listings.

    The first results page is requested without a page parameter, then
    pages 2 through 9 are requested with ``&page=N`` appended.
    """
    base_url = "https://www.qefira.com/classifieds?q=gaming%20laptops&sort=latest"

    # First page, followed by pages 2..9.
    page_urls = [base_url] + [base_url + f"&page={page}" for page in range(2, 10)]

    qualifying_listings = []
    for page_url in page_urls:
        qualifying_listings.extend(scrape_listings(page_url))

    # Prepare email subject and body.
    subject = "Gaming Laptops under 80,000"
    parts = [f"Total Qualifying Listings: {len(qualifying_listings)}\n\n"]
    for title, date, price, listing_url in qualifying_listings:
        parts.append(
            f"Title: {title}\n"
            f"Date Posted: {date}\n"
            f"Price: {price}\n"
            f"URL: {listing_url}\n"
            "\n"
        )

    # Send the email.
    send_email(subject, "".join(parts))


# Run the script
if __name__ == '__main__':
    main()
The following is a shortened sample of the tag on the website from which I am trying to scrape the URL; the URL is the value of the href attribute at the very start of the tag:
<a href="https://www.qefira.com/listings/hpelitebook-core-i5-4th-laptops-5366113" class="listing-card__inner" id="listing-5366113" data-t-listing="" data-t-listing_context="search" data-t-listing_id="5366113" data-t-listing_title="HpElitebook core i5 4th Laptops" data-t-listing_type="classified" data-t-listing_category_title="Laptops" data-t-listing_category_slug="laptops" data-t-listing_slug="hpelitebook-core-i5-4th-laptops" data-t-listing_price="19500.00" data-t-listing_currency="ETB" data-t-listing_location_title="Bole" data-t-listing_source="qe_et" data-t-listing_product_slugs="listing">...</a>
Any guidance on how I can successfully scrape the URL so it can appear as a clickable link inside the email body?
Tried different iterations of the .find method such as:
listing_url_element = listing.find('a', class_='listing-card__inner')
listing_url = listing_url_element['href'] if listing_url_element else "N/A"
or
listing_url = listing.find('a', class_='listing-card__title')['href']
or
listing_url_element = listing.find('a', class_='listing-card__inner')
listing_url = listing_url_element['href'] if listing_url_element and 'href' in listing_url_element.attrs else "N/A"
or even:
import re
listing_url_element = listing.find('a', class_='listing-card__inner')
listing_url_match = re.search(r'href="(.*?)"', str(listing_url_element))
listing_url = listing_url_match.group(1) if listing_url_match else "N/A"
Still nothing (or the "N/A" is displayed rather than the actual URL). The common error is something like:
listing_url = listing.find('a', class_='listing-card__inner')['href']
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^
TypeError: 'NoneType' object is not subscriptable
Solution
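I advise you to select the listing containers with the class 'listing-card--has-content' instead of 'listing-card__content'. The <a class="listing-card__inner"> link is evidently not a descendant of the 'listing-card__content' div, which is why listing.find('a', class_='listing-card__inner') returned None and subscripting it raised the TypeError. I also added a check so that a listing without a link no longer stops the script.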
Here is the full code:
import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText
# Email details
# Module-level settings read by send_email().
# NOTE(review): the address values below look redacted/obfuscated in this
# copy of the script — replace them with real addresses before running.
sender_email = "[email protected]"  # used as the From header and envelope sender
receiver_email = "[email protected]"  # used as the To header and envelope recipient
smtp_server = "smtp.gmail.com"  # host passed to smtplib.SMTP
smtp_port = 587  # port passed to smtplib.SMTP; send_email() upgrades the connection with starttls()
smtp_username = "[email protected]"  # login name passed to server.login()
# Placeholder value. NOTE(review): avoid keeping a real password in source.
smtp_password = "xxxxxxxx"
# Send email function
def send_email(subject, body):
    """Email ``body`` as plain text under the given ``subject``.

    Args:
        subject: Value of the Subject header.
        body: Plain-text message content.

    Addresses, SMTP host/port and login credentials are taken from the
    module-level email settings.
    """
    email = MIMEText(body)
    # Targets are assigned left to right, so the header order stays
    # Subject, From, To.
    email['Subject'], email['From'], email['To'] = (
        subject,
        sender_email,
        receiver_email,
    )

    connection = smtplib.SMTP(smtp_server, smtp_port)
    with connection:
        # Switch to TLS before the credentials go over the wire.
        connection.starttls()
        connection.login(smtp_username, smtp_password)
        connection.sendmail(sender_email, receiver_email, email.as_string())
# Scrape listings function
def scrape_listings(url):
    """Scrape one search-results page and return listings priced below 80000.

    Args:
        url: Address of the results page to download.

    Returns:
        A list of (title, date, price, listing_url) tuples, one per listing
        whose integer price is below 80000. ``listing_url`` is the string
        "N/A" when the card has no ``listing-card__inner`` link or the link
        has no ``href``.

    Cards that lack a title, date or price element, or whose price text is
    not a whole number, are skipped instead of aborting the whole scrape.
    """
    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Iterate over the outer 'listing-card--has-content' containers: searching
    # inside the 'listing-card__content' divs did not find the
    # 'listing-card__inner' link (find() returned None there).
    listings = soup.find_all('div', class_='listing-card--has-content')

    qualifying_listings = []
    for listing in listings:
        title_tag = listing.find('div', class_='listing-card__header__title')
        date_tag = listing.find('div', class_='listing-card__header__date')
        price_tag = listing.find('span', class_='listing-card__price__value 1')
        if title_tag is None or date_tag is None or price_tag is None:
            # Incomplete card: nothing reliable to report.
            continue

        # Drop the "Br" currency marker and thousands separators.
        price_text = price_tag.text.strip().replace('Br', '').replace(',', '').strip()
        try:
            price = int(price_text)
        except ValueError:
            # Price is not a plain integer (e.g. empty or free-form text).
            continue
        if price >= 80000:
            continue

        # Bind listing_url on every iteration so a card without a link can
        # neither raise nor silently reuse the previous card's URL.
        link = listing.find('a', class_='listing-card__inner')
        listing_url = link.get('href', 'N/A') if link is not None else 'N/A'

        qualifying_listings.append(
            (title_tag.text.strip(), date_tag.text.strip(), price, listing_url)
        )
    return qualifying_listings
# Main function
def main():
    """Collect qualifying listings from the results pages and email them.

    The first page is fetched from the bare search URL; pages 2 through 9
    are fetched with ``&page=N`` appended to it.
    """
    base_url = "https://www.qefira.com/classifieds?q=gaming%20laptops&sort=latest"

    # The bare URL is the first page; the remaining pages are 2..9.
    page_urls = [base_url]
    page_urls.extend(f"{base_url}&page={number}" for number in range(2, 10))

    qualifying_listings = []
    for page_url in page_urls:
        qualifying_listings += scrape_listings(page_url)

    # Prepare email subject and body.
    subject = "Gaming Laptops under 80,000"
    header = f"Total Qualifying Listings: {len(qualifying_listings)}\n\n"
    entries = (
        f"Title: {title}\nDate Posted: {date}\nPrice: {price}\nURL: {listing_url}\n\n"
        for title, date, price, listing_url in qualifying_listings
    )
    body = header + "".join(entries)

    # Send the email.
    send_email(subject, body)


# Run the script
if __name__ == '__main__':
    main()
Answered By - Spawin
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.