Issue
def getAllBooksPagesURLs(last_page=50):
    """Return the URLs of the catalogue listing pages on books.toscrape.com.

    The site root comes first, followed by the numbered
    ``catalogue/page-N.html`` pages for N from 2 through ``last_page``.

    Args:
        last_page: Number of the last listing page to include. Defaults
            to 50. Values below 2 yield only the site root.

    Returns:
        A list of URL strings in page order.
    """
    base_url = "http://books.toscrape.com/"
    numbered_pages = [
        f"{base_url}catalogue/page-{page}.html"
        for page in range(2, last_page + 1)
    ]
    return [base_url, *numbered_pages]
def getAndParseURL(url, timeout=10):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree of the response text, built with the
        ``html.parser`` backend.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")
def getBooksURLs(url, z):
    """Return the URLs of the books linked from one listing page.

    Args:
        url: Address of the listing page to fetch and parse.
        z: Base URL that each book's ``href`` is resolved against.

    Returns:
        A list of URL strings, one per ``<div class="image_container">``
        found on the page, in document order.
    """
    # Imported locally so this function does not depend on the file header.
    from urllib.parse import urljoin

    soup = getAndParseURL(url)
    # urljoin resolves a relative href against the base the way a browser
    # would. Plain ``z + href`` is only correct when z is a directory URL
    # ending in "/"; for a base such as ".../catalogue/page-2.html" it would
    # glue the href onto the file name.
    return [
        urljoin(z, container.a.get("href"))
        for container in soup.find_all("div", class_="image_container")
    ]
# Module-level accumulators. Only books_url, list_of_resultitle and
# list_of_review_num are filled in below; the others are declared but unused.
books_url = []
title_list = []
main_page_list = []
list_of_rewiew_num = []
list_of_bookpage = []
list_of_resultitle = []
books_done_page = []
list_of_review_num = []

# The slice keeps just the first listing URL, so a single page is scraped.
for page_url in getAllBooksPagesURLs()[:1]:
    main_page = getAndParseURL(page_url)

    # Collect the "title" attribute of the link inside each <h3>.
    result_of_title = main_page.findAll("h3")
    list_of_resultitle.extend(
        heading.find("a").get("title") for heading in result_of_title
    )

    # The listing URL doubles as the base that book hrefs are appended to.
    books_url = getBooksURLs(page_url, page_url)
    for book_url in books_url:
        print(book_url)
        books_page = getAndParseURL(book_url)

        # NOTE: find() returns only the FIRST <td> on the page, so the value
        # stored here is whatever that first cell contains (the post's output
        # shows hex-like strings), not necessarily the review count.
        first_cell = books_page.find("td")
        if first_cell is not None:
            review_num = first_cell.contents[0]
            list_of_review_num.append(review_num)
        else:
            list_of_review_num.append(0)

# Bare expressions, presumably left for notebook/REPL display of the results.
books_url
list_of_resultitle
list_of_review_num
Above is my code. The result is:
['a897fe39b1053632', '90fa61229261140a', '6957f44c3847a760', 'e00eb4fd7b871a48', '4165285e1663650f', 'f77dbf2323deb740', '2597b5a345f45e1b', 'e72a5dfc7e9267b2', 'e10e1e165dc8be4a', '1dfe412b8ac00530', '0312262ecafa5a40', '30a7f60cd76ca58c', 'ce6396b0f23f6ecc', '3b1c02bac2a429e6', 'a34ba96d4081e6a4', 'deda3e61b9514b83', 'feb7cc7701ecf901', 'e30f54cea9b38190', 'a18a4f574854aced', 'a22124811bfa8350']
The garbled codes look like 'a22124811bfa8350'. Is this caused by dynamic HTML? I don't know. My desired output for list_of_review_num should be
[0,1,2,3]
How do I get the correct output? Could you please help me? Thank you in advance.
Solution
Your code produces this output because it uses .find(), which returns only the first occurrence of the td tag. The page you are working with contains many td tags, and the number of reviews is in the last one, so you should do something like this:
# Collect every <td> once instead of searching the tree twice.
td_cells = books_page.find_all("td")
if not td_cells:  # there are no td tags at all
    list_of_review_num.append(0)
else:
    # Use the LAST td (find() would only return the first one) and convert
    # its text to int so the list holds numbers such as [0, 1, 2, 3].
    review_num = int(td_cells[-1].get_text(strip=True))
    list_of_review_num.append(review_num)
Answered By - Andrew Ryan
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.