Issue
I am still new to Python and am trying out some web-scraping code, but I am stuck on this error. The script worked fine for the first page, until I tried to scrape the following pages as well.
from bs4 import BeautifulSoup
import requests, datetime, time
# Search-results page the scraper starts from, and the HTTP headers that are
# sent with every request made by get_data() (adjust the link as needed; in
# the headers only the User-Agent should need changing).
url = "https://www.amazon.eg/s?k=mechinical+keyboard&language=en_AE&crid=1UBHQ81JZMC5G&sprefix=mechinical+keyboard%2Caps%2C206&ref=nb_sb_noss_1"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}
# Download the page at the module-level `url`, parse it, and collect the
# product titles and prices found on it into two local lists.
# NOTE(review): the parameter is named `soup`, but the caller passes a URL and
# the value is overwritten by the parsed document below, so the argument is
# never actually used.
# NOTE(review): there is no `return` statement, so the caller receives None.
def get_data(soup):
page = requests.get(url, headers= headers)
soup = BeautifulSoup(page.content , "html.parser")
# select the title spans and the price spans from the parsed page
title = soup.find_all("span", "a-size-base-plus a-color-base a-text-normal")
price = soup.find_all("span", {"class": "a-price-whole"})
# lists the extracted values are saved in (local only; they are not returned)
title_list=[]
price_list=[]
# loops that extract the text of every title and price element
for titlex in title:
titles= titlex.text
title_list.append(titles)
for pricex in price:
prices=pricex.text.strip()
# drops the last character of the stripped text
# (presumably a trailing decimal separator - TODO confirm)
prices = prices[:-1]
price_list.append(prices)
##
# Build the URL of the next results page from the pagination strip in `soup`.
# Returns the URL string, or None when the lookup for the disabled "next"
# control yields something truthy.
def get_nextpage(soup):
# NOTE(review): this searches for an <a> tag, while the traceback quoted in
# the question shows a 'span' lookup - confirm which tag carries this class.
# NOTE(review): nothing guards against `soup` or the result of find() being
# None; either case makes the following attribute access raise AttributeError.
pages = soup.find('a', {'class': "s-pagination-strip"})
# NOTE(review): the dict is passed as find()'s first positional argument
# here and below - presumably it was meant as the attribute filter; verify
# against the BeautifulSoup documentation.
if not pages.find({'class':'s-pagination-item s-pagination-next s-pagination-disabled'}):
# prefix the href of the "next" link with the site address
url = 'https://www.amazon.eg/-' + str(pages.find({'class': 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'}).find('a')['href'])
return url
else:
return
# Keep fetching pages and following the "next" link until get_nextpage()
# returns nothing, printing every next-page URL that is found.
while True:
# NOTE(review): get_data() in this listing has no return statement, so `data`
# is None when it is handed to get_nextpage().
data = get_data(url)
url = get_nextpage(data)
if not url:
break
print(url)
This is the error I keep getting. I think I am writing the find() call incorrectly, but I can't figure out how to write it properly for the "a" tag.
AttributeError Traceback (most recent call last)
Cell In[85], line 43
41 while True:
42 data = get_data(url)
---> 43 url = get_nextpage(data)
44 if not url:
45 break
Cell In[85], line 33, in get_nextpage(soup)
32 def get_nextpage(soup):
---> 33 pages = soup.find('span', {'class': "s-pagination-strip"})
34 if not pages.find({'class':'s-pagination-item s-pagination-next s-pagination-disabled'}):
35 url = 'https://www.amazon.eg/-' + str(pages.find({'class': 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'}).find('a')['href'])
AttributeError: 'NoneType' object has no attribute 'find'
Solution
Here is the solution:
from bs4 import BeautifulSoup
import requests, datetime, time
# Search-results page the crawl starts from; the loop further down rebinds
# `url` to each next-page link it finds.
url = "https://www.amazon.eg/s?k=mechinical+keyboard&language=en_AE&crid=1UBHQ81JZMC5G&sprefix=mechinical+keyboard%2Caps%2C206&ref=nb_sb_noss_1"
# HTTP headers sent with every request made by get_data() (only the
# User-Agent should need changing).
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
"Accept-Encoding": "gzip, deflate", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"DNT": "1", "Connection": "close", "Upgrade-Insecure-Requests": "1"}
def get_data(url, results=None):
    """Download one search-results page and return it as a parsed document.

    Args:
        url: Address of the results page to fetch.
        results: Optional list. When given, one dict per page is appended to
            it with the keys ``"url"``, ``"titles"`` and ``"prices"``, so the
            scraped values are kept instead of being thrown away. When
            omitted, nothing is extracted and only the parsed page is
            returned, exactly as before.

    Returns:
        The ``BeautifulSoup`` document of the page; ``get_nextpage`` uses it
        to locate the link to the following page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    page = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were
    # a page of search results.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    if results is not None:
        title_tags = soup.find_all("span", "a-size-base-plus a-color-base a-text-normal")
        price_tags = soup.find_all("span", {"class": "a-price-whole"})
        results.append({
            "url": url,
            "titles": [tag.text for tag in title_tags],
            # Strip only a trailing "." rather than blindly cutting the last
            # character, so a price without the separator keeps all digits.
            # NOTE(review): assumes the unwanted trailing character is the
            # decimal point - confirm against the live markup.
            "prices": [tag.text.strip().rstrip(".") for tag in price_tags],
        })
    return soup
def get_nextpage(soup):
    """Return the URL of the next search-results page, or ``None``.

    Args:
        soup: Parsed results page, as returned by ``get_data``.

    Returns:
        The URL of the following page as a string, or ``None`` when the page
        has no pagination strip, the strip holds no "next" link, or that
        link carries no ``href``.
    """
    pages = soup.find('span', {'class': "s-pagination-strip"})
    if pages is None:
        # No pagination strip on this page: report "no next page" instead of
        # failing with AttributeError on None.
        return None

    # An explicit None check replaces the former try/except AttributeError,
    # which was the only thing that ended the crawl on the last page.
    next_link = pages.find(
        'a',
        {'class': 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'},
    )
    if next_link is None:
        return None

    href = next_link.get('href')
    if not href:
        # Without this guard the result would be the bogus URL '...-None'.
        return None

    # Prefix kept exactly as before so the generated URLs do not change.
    return 'https://www.amazon.eg/-' + str(href)
# Crawl forward through the result pages: fetch the current page, look up its
# "next" link, and stop as soon as no further link is found.
url = get_nextpage(get_data(url))
while url:
    # Collapse the duplicated language segment ('en/en') that appears in the
    # links from the second page onwards and would make the URL invalid.
    url = url.replace('en/en', 'en')
    print(url)
    url = get_nextpage(get_data(url))
output:
https://www.amazon.eg/-/-/en/s?k=mechanical+keyboard&page=2&language=en_AE&crid=1UBHQ81JZMC5G&qid=1682303405&sprefix=mechinical+keyboard%2Caps%2C206&ref=sr_pg_1
https://www.amazon.eg/-/-/en/s?k=mechanical+keyboard&page=3&language=en_AE&crid=1UBHQ81JZMC5G&qid=1682303407&sprefix=mechinical+keyboard%2Caps%2C206&ref=sr_pg_2
https://www.amazon.eg/-/-/en/s?k=mechanical+keyboard&page=4&language=en_AE&crid=1UBHQ81JZMC5G&qid=1682303409&sprefix=mechinical+keyboard%2Caps%2C206&ref=sr_pg_3
https://www.amazon.eg/-/-/en/s?k=mechanical+keyboard&page=5&language=en_AE&crid=1UBHQ81JZMC5G&qid=1682303411&sprefix=mechinical+keyboard%2Caps%2C206&ref=sr_pg_4
https://www.amazon.eg/-/-/en/s?k=mechanical+keyboard&page=6&language=en_AE&crid=1UBHQ81JZMC5G&qid=1682303413&sprefix=mechinical+keyboard%2Caps%2C206&ref=sr_pg_5
https://www.amazon.eg/-/-/en/s?k=mechanical+keyboard&page=7&language=en_AE&crid=1UBHQ81JZMC5G&qid=1682303415&sprefix=mechinical+keyboard%2Caps%2C206&ref=sr_pg_6
Things to Note:
- You need to pass `url` as the parameter of the function, i.e. `get_data(url)` instead of `get_data(soup)`.
- Handle the case where the class `s-pagination-item s-pagination-next s-pagination-button s-pagination-separator` is not found (otherwise an `AttributeError` is raised).
- Handle the URL from the 2nd page onwards, as it starts getting an extra `en` in it, which makes it an invalid URL: https://www.amazon.eg/-/-/en/en/s?k=mechanical+keyboard&page=3&language=en_AE&crid=1UBHQ81JZMC5G&qid=1682303529&sprefix=mechinical+keyboard%2Caps%2C206&ref=sr_pg_2. So remove/replace that extra `en` to make it a valid one.
Answered By - Ajeet Verma
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.