Issue
I've written some code to scrape a website: https://books.toscrape.com/catalogue/page-1.html
but I'm getting an error:
'NoneType' object has no attribute 'text'
I haven't been able to find a solution, so how can I fix this error?
import requests
from bs4 import BeautifulSoup
import pandas as pd

all_books=[]
url='https://books.toscrape.com/catalogue/page-1.html'
headers=('https://developers.whatismybrowser.com/useragents/parse/22526098chrome-windows-blink')

def get_page(url):
    page=requests.get(url,headers)
    status=page.status_code
    soup=BeautifulSoup(page.text,'html.parser')
    return [soup,status]
#get all books links
def get_links(soup):
    links=[]
    listings=soup.find_all(class_='product_pod')
    for listing in listings:
        bk_link=listing.find("h3").a.get("href")
        base_url='https://books.toscrape.com/catalogue/page-1.html'
        cmplt_link=base_url+bk_link
        links.append(cmplt_link)
    return links
#extract info from each link
def extract_info(links):
    for link in links:
        r=requests.get(link).text
        book_soup=BeautifulSoup(r,'html.parser')
        name=book_soup.find(class_='col-sm-6 product_main').text.strip()
        price=book_soup.find(class_='col-sm-6 product_main').text.strip()
        desc=book_soup.find(class_='sub-header').text.strip()
        cat=book_soup.find('"../category/books/poetry_23/index.html">Poetry').text.strip()
        book={'name':name,'price':price,'desc':desc,'cat':cat}
        all_books.append(book)
pg=48
while True:
    url=f'https://books.toscrape.com/catalogue/page-{pg}.html'
    soup_status=get_page(url)
    if soup_status[1]==200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg+=1
    else:
        print("The End")
        break

df=pd.DataFrame(all_books)
print(df)
Solution
Note: First of all, always take a look at your soup - therein lies the truth. The contents can differ, slightly to extremely, from what you see in the browser's dev tools.
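For example, a quick sketch of how to inspect what the request actually returned (printing only the first part keeps the output readable):

import requests
from bs4 import BeautifulSoup

# Fetch one listing page and look at the parsed markup directly,
# instead of relying on the dev tools view.
page = requests.get('https://books.toscrape.com/catalogue/page-1.html')
soup = BeautifulSoup(page.text, 'html.parser')
print(soup.prettify()[:1000])  # dump only the start of the document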
What happens?
There are several issues you should keep in mind:

base_url='https://books.toscrape.com/catalogue/page-1.html'

leads to 404 errors and is the first cause of your "'NoneType' object has no attribute 'text'".

You try to find the category like this:

cat=book_soup.find('"../category/books/poetry_23/index.html">Poetry').text.strip()

which won't work either and leads to the same error.

There are some more selections that will not give the expected result; take a look at my example below, where I edited them to give you a clue how to reach the goal.
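To see why the first point matters: the hrefs in each product_pod are relative (something like a-light-in-the-attic_1000/index.html), so gluing them onto .../page-1.html produces a URL that does not exist. A minimal sketch of a more robust alternative to hard-coding the base URL, using urllib.parse.urljoin (my own suggestion, not part of the original answer; the href below is just an example value):

from urllib.parse import urljoin

page_url = 'https://books.toscrape.com/catalogue/page-1.html'
bk_link = 'a-light-in-the-attic_1000/index.html'  # example relative href from a product_pod

# Wrong: concatenation yields .../page-1.htmla-light-in-the-attic_1000/index.html -> 404
broken = page_url + bk_link

# Better: urljoin resolves the href relative to the page it was found on
fixed = urljoin(page_url, bk_link)
print(fixed)  # https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html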
How to fix?
Change

base_url='https://books.toscrape.com/catalogue/page-1.html'

to

base_url='https://books.toscrape.com/catalogue/'

Select the category more specifically; it is the last <a> in the breadcrumb:

cat=book_soup.select('.breadcrumb a')[-1].text.strip()
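On a book's detail page the breadcrumb reads Home > Books > category > title, and only the first three entries are links, so the last <a> is the category. A small self-contained check (the HTML below is a simplified stand-in for the real page, not copied from it):

from bs4 import BeautifulSoup

# Simplified stand-in for the breadcrumb markup on a detail page
html = '''
<ul class="breadcrumb">
  <li><a href="../../index.html">Home</a></li>
  <li><a href="../category/books_1/index.html">Books</a></li>
  <li><a href="../category/books/poetry_23/index.html">Poetry</a></li>
  <li class="active">A Light in the Attic</li>
</ul>
'''
book_soup = BeautifulSoup(html, 'html.parser')
print(book_soup.select('.breadcrumb a')[-1].text.strip())  # Poetry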
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd

all_books=[]
url='https://books.toscrape.com/catalogue/page-1.html'
headers=('https://developers.whatismybrowser.com/useragents/parse/22526098chrome-windows-blink')

def get_page(url):
    page=requests.get(url,headers)
    status=page.status_code
    soup=BeautifulSoup(page.text,'html.parser')
    return [soup,status]

#get all books links
def get_links(soup):
    links=[]
    listings=soup.find_all(class_='product_pod')
    for listing in listings:
        bk_link=listing.find("h3").a.get("href")
        base_url='https://books.toscrape.com/catalogue/'
        cmplt_link=base_url+bk_link
        links.append(cmplt_link)
    return links

#extract info from each link
def extract_info(links):
    for link in links:
        r=requests.get(link).text
        book_soup=BeautifulSoup(r,'html.parser')
        name = name.text.strip() if (name := book_soup.h1) else None
        price = price.text.strip() if (price := book_soup.select_one('h1 + p')) else None
        desc = desc.text.strip() if (desc := book_soup.select_one('#product_description + p')) else None
        cat = cat.text.strip() if (cat := book_soup.select('.breadcrumb a')[-1]) else None
        book={'name':name,'price':price,'desc':desc,'cat':cat}
        all_books.append(book)

pg=48
while True:
    url=f'https://books.toscrape.com/catalogue/page-{pg}.html'
    soup_status=get_page(url)
    if soup_status[1]==200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg+=1
    else:
        print("The End")
        break

all_books
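If you want the tabular output from the question, the collected list can still be turned into a DataFrame exactly as in the original code:

df=pd.DataFrame(all_books)
print(df)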
Answered By - HedgeHog