Thursday, January 11, 2024

[FIXED] How can I scrape a specific URL from a webpage using BeautifulSoup?

January 11, 2024 beautifulsoup, html-parsing, python, scripting, smtp No comments

Issue

I'm writing a Python script that parses HTML (a classifieds website) and sends me email notifications on specific products and price points. Everything works here except for the "listing_url" capture, which I want displayed in the email so I can click on the url to visit the product page. I tried scraping the corresponding tag with "class=listing-card__inner" as inspected on the website but this isn't working.

This is my full code:

import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText


# Email details: module-level settings read by send_email().
# NOTE(review): the "[email protected]" values look like placeholders left
# behind by address obfuscation - replace them with real addresses before use.
sender_email = "[email protected]"
receiver_email = "[email protected]"
smtp_server = "smtp.gmail.com"
# Port handed to smtplib.SMTP(); send_email() upgrades the connection with starttls().
smtp_port = 587
smtp_username = "[email protected]"  # account passed to server.login()
smtp_password = "xxxxxxxx"  # placeholder; NOTE(review): avoid committing a real password here

# Send email function
def send_email(subject, body):
    """Send ``body`` as an email with the given ``subject``.

    The sender, recipient, server and credentials come from the module-level
    email settings.

    Args:
        subject: Text placed in the Subject header.
        body: Text used as the message content.
    """
    msg = MIMEText(body)
    headers = (
        ('Subject', subject),
        ('From', sender_email),
        ('To', receiver_email),
    )
    for header_name, header_value in headers:
        msg[header_name] = header_value

    with smtplib.SMTP(smtp_server, smtp_port) as smtp:
        # Switch the connection to TLS before the credentials are sent.
        smtp.starttls()
        smtp.login(smtp_username, smtp_password)
        smtp.sendmail(sender_email, receiver_email, msg.as_string())

# Scrape listings function
def scrape_listings(url):
    """Download one search-results page and collect listings priced under 80000.

    Args:
        url: Address of the results page to fetch.

    Returns:
        A list of ``(title, date, price, listing_url)`` tuples; ``price`` is
        an ``int`` parsed after 'Br' and commas are removed from the text.

    Raises:
        TypeError: When the ``listing-card__inner`` lookup finds nothing
            inside a card and its ``None`` result is subscripted - the
            failure this question is about.
    """
    # Make a GET request to the website
    response = requests.get(url)

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all the laptop listings
    listings = soup.find_all('div', class_='listing-card__content')

    # List to store qualifying listings
    qualifying_listings = []

    # Iterate through listings and check conditions
    for listing in listings:
        title = listing.find('div', class_='listing-card__header__title').text.strip()
        date = listing.find('div', class_='listing-card__header__date').text.strip()
        price = listing.find('span', class_='listing-card__price__value 1').text.strip()
        # Turn text such as "Br 19,500" into the integer 19500.
        price = price.replace('Br', '').replace(',', '').strip()
        price = int(price)
        # find() only searches inside this 'listing-card__content' div; it
        # returns None when no matching <a> is found there, and subscripting
        # None is what raises the reported TypeError.
        listing_url = listing.find('a', class_='listing-card__inner')['href']
        if price < 80000:
            qualifying_listings.append((title, date, price, listing_url))

    return qualifying_listings

# Main function
def main():
    """Scrape the search results, build a summary and email it.

    The bare search URL is scraped first, followed by pages 2 through 9, and
    every qualifying listing is written into a single email body.
    """
    base_url = "https://www.qefira.com/classifieds?q=gaming%20laptops&sort=latest"

    # The first page has no page parameter; the remaining eight carry "&page=N".
    page_urls = [base_url]
    page_urls.extend(base_url + f"&page={number}" for number in range(2, 10))

    matches = []
    for page_url in page_urls:
        matches.extend(scrape_listings(page_url))

    # Assemble the email: a count header followed by one paragraph per listing.
    subject = "Gaming Laptops under 80,000"
    sections = [f"Total Qualifying Listings: {len(matches)}\n\n"]
    for title, date, price, listing_url in matches:
        sections.append(
            f"Title: {title}\n"
            f"Date Posted: {date}\n"
            f"Price: {price}\n"
            f"URL: {listing_url}\n"
            "\n"
        )

    send_email(subject, "".join(sections))

# Run the script
if __name__ == '__main__':
    main()

The following is a shortened sample of the tag on the website from which I am trying to scrape the URL in the href object on the very first line:

<a href="https://www.qefira.com/listings/hpelitebook-core-i5-4th-laptops-5366113" class="listing-card__inner" id="listing-5366113" data-t-listing="" data-t-listing_context="search" data-t-listing_id="5366113" data-t-listing_title="HpElitebook core i5 4th Laptops" data-t-listing_type="classified" data-t-listing_category_title="Laptops" data-t-listing_category_slug="laptops" data-t-listing_slug="hpelitebook-core-i5-4th-laptops" data-t-listing_price="19500.00" data-t-listing_currency="ETB" data-t-listing_location_title="Bole" data-t-listing_source="qe_et" data-t-listing_product_slugs="listing">...</a>

Any guidance on how I can successfully scrape the URL so it can appear as a clickable link inside the email body?

Tried different iterations of the .find method such as:

 listing_url_element = listing.find('a', class_='listing-card__inner')
 listing_url = listing_url_element['href'] if listing_url_element else "N/A"

 listing_url = listing.find('a', class_='listing-card__title')['href']

listing_url_element = listing.find('a', class_='listing-card__inner')
listing_url = listing_url_element['href'] if listing_url_element and 'href' in listing_url_element.attrs else "N/A"

or even:

import re

listing_url_element = listing.find('a', class_='listing-card__inner')
listing_url_match = re.search(r'href="(.*?)"', str(listing_url_element))
listing_url = listing_url_match.group(1) if listing_url_match else "N/A"

Still nothing (or the "N/A" is displayed rather than the actual URL). The common error is something like:

listing_url = listing.find('a', class_='listing-card__inner')['href']
                  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^
TypeError: 'NoneType' object is not subscriptable

Solution

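I advise you to search within the class 'listing-card--has-content' instead of 'listing-card__content'. The `<a class="listing-card__inner">` link does not appear to sit inside the 'listing-card__content' div, which is why `find` returned `None` there. I also added a check so that a listing without a link is skipped instead of stopping the script.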

Here is the full code:

import requests
from bs4 import BeautifulSoup
import smtplib
from email.mime.text import MIMEText


# Email details: module-level settings read by send_email().
# NOTE(review): the "[email protected]" values look like placeholders left
# behind by address obfuscation - replace them with real addresses before use.
sender_email = "[email protected]"
receiver_email = "[email protected]"
smtp_server = "smtp.gmail.com"
# Port handed to smtplib.SMTP(); send_email() upgrades the connection with starttls().
smtp_port = 587
smtp_username = "[email protected]"  # account passed to server.login()
smtp_password = "xxxxxxxx"  # placeholder; NOTE(review): avoid committing a real password here

# Send email function
def send_email(subject, body):
    """Send ``body`` as an email with the given ``subject``.

    The sender, recipient, server and credentials come from the module-level
    email settings.

    Args:
        subject: Text placed in the Subject header.
        body: Text used as the message content.
    """
    msg = MIMEText(body)
    headers = (
        ('Subject', subject),
        ('From', sender_email),
        ('To', receiver_email),
    )
    for header_name, header_value in headers:
        msg[header_name] = header_value

    with smtplib.SMTP(smtp_server, smtp_port) as smtp:
        # Switch the connection to TLS before the credentials are sent.
        smtp.starttls()
        smtp.login(smtp_username, smtp_password)
        smtp.sendmail(sender_email, receiver_email, msg.as_string())

# Scrape listings function
def scrape_listings(url, max_price=80000, timeout=30):
    """Return the listings on one results page priced below ``max_price``.

    Args:
        url: Address of the search-results page to download.
        max_price: Exclusive upper price bound; defaults to the 80000 that
            was previously hard-coded.
        timeout: Seconds to wait for the server before ``requests`` raises
            instead of blocking indefinitely.

    Returns:
        A list of ``(title, date, price, listing_url)`` tuples, where
        ``price`` is an ``int``. Cards that lack a title, date, price or
        link, or whose price text is not a whole number, are skipped.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Search from the outer card container so that the
    # <a class="listing-card__inner"> link is within reach of find().
    cards = soup.find_all('div', class_='listing-card--has-content')

    qualifying_listings = []
    for card in cards:
        title_tag = card.find('div', class_='listing-card__header__title')
        date_tag = card.find('div', class_='listing-card__header__date')
        price_tag = card.find('span', class_='listing-card__price__value 1')
        link_tag = card.find('a', class_='listing-card__inner')

        # find() returns None for a missing element; skip incomplete cards
        # rather than failing on None.
        if any(tag is None for tag in (title_tag, date_tag, price_tag, link_tag)):
            continue

        listing_url = link_tag.get('href')
        if listing_url is None:
            continue

        # Turn text such as "Br 19,500" into the integer 19500.
        price_text = price_tag.text.replace('Br', '').replace(',', '').strip()
        try:
            price = int(price_text)
        except ValueError:
            # The price is not a plain whole number, so it cannot be compared.
            continue

        if price < max_price:
            qualifying_listings.append(
                (title_tag.text.strip(), date_tag.text.strip(), price, listing_url)
            )

    return qualifying_listings

# Main function
def main():
    """Scrape the search results, build a summary and email it.

    The bare search URL is scraped first, followed by pages 2 through 9, and
    every qualifying listing is written into a single email body.
    """
    base_url = "https://www.qefira.com/classifieds?q=gaming%20laptops&sort=latest"

    # The first page has no page parameter; the remaining eight carry "&page=N".
    page_urls = [base_url]
    page_urls.extend(base_url + f"&page={number}" for number in range(2, 10))

    matches = []
    for page_url in page_urls:
        matches.extend(scrape_listings(page_url))

    # Assemble the email: a count header followed by one paragraph per listing.
    subject = "Gaming Laptops under 80,000"
    sections = [f"Total Qualifying Listings: {len(matches)}\n\n"]
    for title, date, price, listing_url in matches:
        sections.append(
            f"Title: {title}\n"
            f"Date Posted: {date}\n"
            f"Price: {price}\n"
            f"URL: {listing_url}\n"
            "\n"
        )

    send_email(subject, "".join(sections))

# Run the script
if __name__ == '__main__':
    main()

Answered By - Spawin

This answer was collected from Stack Overflow and tested by PythonFixing community admins; it is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

Thursday, January 11, 2024

[FIXED] How can I scrape a specific URL from a webpage using BeautifulSoup?

Issue

Solution

0 comments:

Post a Comment

Popular Posts

Labels