Sunday, April 10, 2022

[FIXED] How do I extract the Discount (% off ) value?

April 10, 2022 beautifulsoup, html, python, request, web-scraping No comments

Issue

I am getting the desired results but I'm not sure how to extract the percentage value from the listing as it doesn't have a class.

from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
import urllib

# Rows collected by getdata(), one dict per search-result listing.
data = []


def getdata(url):
    """Download one Amazon search-results page and record its listings.

    Every ``div[data-component-type="s-search-result"]`` element on the page
    contributes one dict to the module-level ``data`` list.  A field whose
    element is missing from a listing is stored as ``None``.

    Args:
        url: Address of the search-results page to fetch.

    Returns:
        The parsed ``BeautifulSoup`` document, so the caller can look for
        the "next page" link in it.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        amazon_html = response.read()
    a_soup = soup(amazon_html, 'html.parser')

    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        # find()/select_one() return None when nothing matches, so reading
        # ``.text`` raises AttributeError; that is the only failure that is
        # treated as "field absent".
        try:
            title = e.find('h2').text
        except AttributeError:
            title = None

        try:
            # NOTE(review): assumes the struck-through list price sits in
            # span.a-price.a-text-price > span.a-offscreen -- confirm against
            # the live page markup.  Without this assignment the append
            # below fails with NameError on the first listing.
            list_price = e.select_one('span.a-price.a-text-price span.a-offscreen').text
        except AttributeError:
            list_price = None

        try:
            sponsored = e.find('span', {'class': 'a-color-secondary'}).text
        except AttributeError:
            sponsored = None

        try:
            limited_deal = (
                e.find('span', {'class': 'a-badge-label-inner a-text-ellipsis'})
                .find('span', {'class': 'a-badge-text'})
                .text
            )
        except AttributeError:
            limited_deal = None

        data.append({
            'title': title,
            'list_price': list_price,
            'sponsored': sponsored,
            'limited_deal': limited_deal,
        })

    return a_soup

def getnextpage(a_soup):
    """Return the absolute URL of the next results page, or None.

    Args:
        a_soup: Parsed results page, as returned by ``getdata``.

    Returns:
        ``'https://www.amazon.in'`` followed by the ``href`` of the "next"
        pagination link, or ``None`` when no such link can be read.
    """
    next_class = 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'
    try:
        page = a_soup.find('a', attrs={"class": next_class})['href']
    except (AttributeError, TypeError, KeyError):
        # No document, no matching <a>, or an <a> without an href.
        return None
    # Use https, the same scheme as the start URL, rather than plain http.
    return 'https://www.amazon.in' + str(page)


import urllib.parse  # quote_plus() is used to escape each search keyword

keywords = ['earphones']

for k in keywords:
    # Escape the keyword so multi-word or non-ASCII searches still form a
    # valid URL; a plain word such as 'earphones' is left unchanged.
    url = 'https://www.amazon.in/s?k=' + urllib.parse.quote_plus(k)
    # Walk the result pages until getnextpage() finds no "next" link.
    while url:
        page_soup = getdata(url)
        url = getnextpage(page_soup)
        if url:
            print(url)

How do I get the discount (% off)? I have not written any code for that yet; the rest of the results are showing up correctly.

Solution

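You can get the discount text from the span that immediately follows the element with class "a-letter-space" (CSS selector ".a-letter-space + span"); the discounted price itself is in span class="a-price-whole".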

from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
import urllib

# Rows collected by getdata(), one dict per search-result listing.
data = []


def getdata(url):
    """Download one Amazon search-results page and record its listings.

    Every ``div[data-component-type="s-search-result"]`` element on the page
    contributes one dict to the module-level ``data`` list.  A field whose
    element is missing from a listing is stored as ``None``.  Each discount
    that is found (text such as ``(70% off)``) is also printed.

    Args:
        url: Address of the search-results page to fetch.

    Returns:
        The parsed ``BeautifulSoup`` document, so the caller can look for
        the "next page" link in it.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        amazon_html = response.read()
    a_soup = soup(amazon_html, 'html.parser')

    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        # find()/select_one() return None when nothing matches, so reading
        # ``.text`` raises AttributeError; that is the only failure that is
        # treated as "field absent".
        try:
            title = e.find('h2').text
        except AttributeError:
            title = None

        try:
            sponsored = e.find('span', {'class': 'a-color-secondary'}).text
        except AttributeError:
            sponsored = None

        try:
            limited_deal = (
                e.find('span', {'class': 'a-badge-label-inner a-text-ellipsis'})
                .find('span', {'class': 'a-badge-text'})
                .text
            )
        except AttributeError:
            limited_deal = None

        try:
            # The span right after the ".a-letter-space" element carries the
            # discount text.
            discount = e.select_one('.a-letter-space +span').text
        except AttributeError:
            # A listing without a discount must not wipe out limited_deal.
            discount = None
        else:
            print(discount)

        data.append({
            'title': title,
            'discount': discount,
            'sponsored': sponsored,
            'limited_deal': limited_deal,
        })

    return a_soup

def getnextpage(a_soup):
    """Return the absolute URL of the next results page, or None.

    Args:
        a_soup: Parsed results page, as returned by ``getdata``.

    Returns:
        ``'https://www.amazon.in'`` followed by the ``href`` of the "next"
        pagination link, or ``None`` when no such link can be read.
    """
    next_class = 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'
    try:
        page = a_soup.find('a', attrs={"class": next_class})['href']
    except (AttributeError, TypeError, KeyError):
        # No document, no matching <a>, or an <a> without an href.
        return None
    # Use https, the same scheme as the start URL, rather than plain http.
    return 'https://www.amazon.in' + str(page)


import urllib.parse  # quote_plus() is used to escape each search keyword

keywords = ['earphones']

for k in keywords:
    # Escape the keyword so multi-word or non-ASCII searches still form a
    # valid URL; a plain word such as 'earphones' is left unchanged.
    url = 'https://www.amazon.in/s?k=' + urllib.parse.quote_plus(k)
    # Walk the result pages until getnextpage() finds no "next" link.
    while url:
        page_soup = getdata(url)
        url = getnextpage(page_soup)

Output:

(70% off)
(56% off)
(70% off)
(70% off)
(63% off)
(25% off)
(53% off)
(50% off)
(63% off)
(43% off)
(57% off)
(62% off)
(50% off)
(60% off)
(69% off)
(50% off)
(41% off)
(60% off)
(70% off)

... so on

If you need only the digits:

from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
import urllib

# Rows collected by getdata(), one dict per search-result listing.
data = []


def getdata(url):
    """Download one Amazon search-results page and record its listings.

    Every ``div[data-component-type="s-search-result"]`` element on the page
    contributes one dict to the module-level ``data`` list.  A field whose
    element is missing from a listing is stored as ``None``.  The discount
    is reduced to the text before the ``%`` sign with the opening
    parenthesis removed (``(70% off)`` becomes ``70``), and each one that is
    found is also printed.

    Args:
        url: Address of the search-results page to fetch.

    Returns:
        The parsed ``BeautifulSoup`` document, so the caller can look for
        the "next page" link in it.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        amazon_html = response.read()
    a_soup = soup(amazon_html, 'html.parser')

    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        # find()/select_one() return None when nothing matches, so reading
        # ``.text`` raises AttributeError; that is the only failure that is
        # treated as "field absent".
        try:
            title = e.find('h2').text
        except AttributeError:
            title = None

        try:
            sponsored = e.find('span', {'class': 'a-color-secondary'}).text
        except AttributeError:
            sponsored = None

        try:
            limited_deal = (
                e.find('span', {'class': 'a-badge-label-inner a-text-ellipsis'})
                .find('span', {'class': 'a-badge-text'})
                .text
            )
        except AttributeError:
            limited_deal = None

        try:
            # The span right after the ".a-letter-space" element carries the
            # discount text.
            discount_text = e.select_one('.a-letter-space +span').text
        except AttributeError:
            # A listing without a discount must not wipe out limited_deal.
            discount = None
        else:
            # Keep what precedes '%' and drop the opening parenthesis.
            discount = discount_text.split('%')[0].replace('(', '')
            print(discount)

        data.append({
            'title': title,
            'discount': discount,
            'sponsored': sponsored,
            'limited_deal': limited_deal,
        })

    return a_soup

def getnextpage(a_soup):
    """Return the absolute URL of the next results page, or None.

    Args:
        a_soup: Parsed results page, as returned by ``getdata``.

    Returns:
        ``'https://www.amazon.in'`` followed by the ``href`` of the "next"
        pagination link, or ``None`` when no such link can be read.
    """
    next_class = 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'
    try:
        page = a_soup.find('a', attrs={"class": next_class})['href']
    except (AttributeError, TypeError, KeyError):
        # No document, no matching <a>, or an <a> without an href.
        return None
    # Use https, the same scheme as the start URL, rather than plain http.
    return 'https://www.amazon.in' + str(page)


import urllib.parse  # quote_plus() is used to escape each search keyword

keywords = ['earphones']

for k in keywords:
    # Escape the keyword so multi-word or non-ASCII searches still form a
    # valid URL; a plain word such as 'earphones' is left unchanged.
    url = 'https://www.amazon.in/s?k=' + urllib.parse.quote_plus(k)
    # Walk the result pages until getnextpage() finds no "next" link.
    while url:
        page_soup = getdata(url)
        url = getnextpage(page_soup)

Output:

Answered By - F.Hoque

This answer was collected from Stack Overflow, tested by PythonFixing community admins, and is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

Sunday, April 10, 2022

[FIXED] How do I extract the Discount (% off ) value?

Issue

Solution

0 comments:

Post a Comment

Popular Posts

Labels