Issue
This script is crawling through every page of the MAL site, and logging each anime into a csv. Each page contains a table, and I am navigating to each row item's attached link, then getting the info from said linked page.
- every time I run it, it gets to a different page, or different row item.
- every page is structured the same(at least for the key items I am using), so it shouldn't be any different.
- The try/catch was added to try and fix this, but nothing is working.
The IndexError is happening on the 'title' portion of the findInfo() method.
from bs4 import BeautifulSoup
import csv
import pip._vendor.requests as request
import time
starter = 0
def findInfo(url):
result = []
response = request.get(url)
doc = BeautifulSoup(response.text, "html.parser")
title = doc.find_all("strong")
try:
clean_title = title[1].text.encode("utf-8")
except(IndexError):
clean_title = title[1].text.encode("utf-8")
result.append(clean_title)
list = doc.find_all("span", {"class":"dark_text"})
for i in list:
if(i.text == "Aired:"):
arr = i.parent.text.split(':')
string = arr[1].split('\n')
result.append(string[1].encode("utf-8"))
if(i.text == "Studios:"):
arr = i.parent.text.split(':')
string = arr[1].split('\n')
result.append(string[1].encode("utf-8"))
if(i.text == "Score:"):
arr = i.parent.text.split(':')
string1 = arr[1].split('\n')
string = string1[1].split(' ')
result.append(string[0])
if(i.text == "Type:"):
arr = i.parent.text.split(':')
string = arr[1].split('\n')
result.append(string[1].encode("utf-8"))
if(i.text == "Source:"):
arr = i.parent.text.split(':')
string = arr[1].split('\n')
result.append(string[1].encode('utf-8'))
if(i.text == "Episodes:"):
arr = i.parent.text.split(':')
string = arr[1].split('\n')
result.append(string[1])
if(i.text == "Genres:"):
arr = i.parent.text.split(':')
string = arr[1].split('\n')
for x in range(len(string)-1):
string[x] = string[x].encode("utf-8")
result.append(string)
if(i.text == "Genre:"):
arr = i.parent.text.split(':')
string = arr[1].split('\n')
result.append(string[1].encode("utf-8"))
result.append(doc.find("p", {"itemprop":"description"}).text.encode("utf-8"))
if len(result)<9:
d = result[-1]
result[-1] = "N/A"
result.append(d)
return result
with open("log.csv", 'w', newline='') as f:
names = ['Title', 'Type', 'Episodes', 'Aired', 'Studio', 'Source', 'Genres', 'Score', 'Summary']
writer = csv.DictWriter(f, fieldnames=names)
writer.writeheader()
while(starter < 20700):
url = "https://myanimelist.net/topanime.php?type=bypopularity&limit="+str(starter)
response = request.get(url)
doc = BeautifulSoup(response.text, "html.parser")
rows = doc.find_all("tr", {"class": "ranking-list"})
row_num = 0
for n in rows:
head = n.find("h3", {"class":"hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3"})
title = head.find("a")
new_url = str(title['href'])
try:
results = findInfo(new_url)
except(IndexError):
results = findInfo(new_url)
writer.writerow({'Title':results[0], 'Type':results[1], 'Episodes':results[2], 'Aired':results[3], 'Studio':results[4], 'Source':results[5], 'Genres':results[6], 'Score':results[7], 'Summary':results[8]})
print("row " + str(row_num) + " done!")
row_num+=1
starter+=50
f.close()
Error trace:
soup.py", line 12, in findInfo
clean_title = title[1].text.encode("utf-8")
IndexError: list index out of range
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "c:\Users\christian\Documents\practice\mal_soup\soup.py", line 78, in <module>
results = findInfo(new_url)
File "c:\Users\christian\Documents\practice\mal_soup\soup.py", line 14, in findInfo
clean_title = title[1].text.encode("utf-8")
IndexError: list index out of range
During handling of the above exception, another exception occurred:
File "c:\Users\christian\Documents\practice\mal_soup\soup.py", line 12, in findInfo
clean_title = title[1].text.encode("utf-8")
IndexError: list index out of range
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "c:\Users\christian\Documents\practice\mal_soup\soup.py", line 80, in <module>
results = findInfo(new_url)
File "c:\Users\christian\Documents\practice\mal_soup\soup.py", line 14, in findInfo
clean_title = title[1].text.encode("utf-8")
IndexError: list index out of range
Solution
I found out why this is happening.
After so many urls fetched, the site decides you are a bot (which you are!) and restricts your access.
In this case, the returned text looks like this:
<div class="caption">
Checking your request. Please wait a moment...<br />
Thank you for your cooperation!<br />
</div>
</div>
<div class="display-submit">
<div class="caption">
We are temporarily restricting site connections due to heavy access.<br />
Please click "Submit" to verify that you are not a bot.<br />
<div class="error">
Some error occured. please try again.
</div>
</div>
<form id="reform">
<button type="submit" class="g-recaptcha" data-sitekey="6Ld_1aIZAAAAAF6bNdR67ICKIaeXLKlbhE7t2Qz4" data-callback='onSubmit' data-action='submit'>Submit</button>
</form>
</div>
Answered By - John Gordon
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.