Issue
I wrote a script that checks whether the links on a selected website are broken.
Here is the code I tried to convert to Python 3:
"""
This script allows to test if a link is broken.
It will test all the links, external or internal, from the website.
Also, it will give you the number of broken links.
"""
class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
import requests
# from urllib.parse import urljoin
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import sys
# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()
# number of urls visited so far will be stored here
total_urls_visited = 0
total_broken_link = set()
output = 'output.txt'
def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
"""
Almost any value is evaluated to True if it has some sort of content.
Every Url should follow a specific format: <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
Example: http://www.example.com/index?search=src
Here, www.example.com is your netloc, while index is the path,
search is the query parameter, and src is the value being passed along the parameter search.
This will make sure that a proper scheme (protocol, e.g http or https) and domain name exists in the URL.
"""
def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`; we use a Python set() because we don't want redundant links
    urls = set()
    # domain name of the URL without the protocol, to check if a link is internal or external
    domain_name = urlparse(url).netloc
    # BeautifulSoup is a Python library for pulling data out of HTML or XML files
    soup = BeautifulSoup(requests.get(url).content, "html.parser", from_encoding="iso-8859-1")
    # print(soup.prettify())  # test if the html of the page is correctly displayed
    # print(soup.find_all('a'))  # collect all the anchor tags
    for a_tag in soup.findAll("a"):
        href = a_tag.get("href")
        if href == "" or href is None:
            # empty href attribute
            continue
        href = urljoin(url, href)  # resolve relative urls against the page url
        # print(internal_urls)
        # print('href:' + href)
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                # print("External link:" + href)
                # print((requests.get(href)).status_code)
                is_broken_link(href, url)
                external_urls.add(href)
            continue
        # print("Internal link:" + href)
        # print((requests.get(href)).status_code)
        is_broken_link(href, url)
        urls.add(href)  # because it is not an external link
        internal_urls.add(href)  # because it is not an external link
    return urls
def is_broken_link(url, origin_url):
    if ((requests.get(url)).status_code) != 200:
        # print("This link is broken")
        print(('|' + url.encode('utf-8').center(60) + '|' + origin_url.encode('utf-8').center(60) + '|' + '\n'))
        total_broken_link.add(url)
        return True
    else:
        # print("This link works well")
        return False
def crawl(url, max_urls=80):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max urls to crawl.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)
if __name__ == "__main__":
print(('\n' + '|' + color.BOLD + color.RED + "Broken links".center(60) + color.END + '|' + color.BOLD + color.YELLOW + "Origin of the link".center(60) + color.END + '|'))
print(('+' + '-'.center(60,'-') + '+' + '-'.center(60,'-') + '+'))
if len(sys.argv) <= 1:
crawl('http://localhost:1313/')
else:
crawl(sys.argv[1])
print(('Total External links:' + str(len(external_urls))))
print(('Total Internal links:' + str(len(internal_urls))))
print(('Total:' + str(len(external_urls) + len(internal_urls))))
print(('Be careful: ' + color.BOLD + color.PURPLE + str(len(total_broken_link)) + color.END + ' broken links found !'))
Also, I am running this inside Docker, and the command I use in the image to install the dependencies is:
RUN python -m pip install requests beautifulsoup4
When I run my code, I get this traceback:
Traceback (most recent call last):
  File "/home/camille/workspace/test-link/test-link.py", line 124, in <module>
    crawl(sys.argv[1])
  File "/home/camille/workspace/test-link/test-link.py", line 115, in crawl
    crawl(link, max_urls=max_urls)
  File "/home/camille/workspace/test-link/test-link.py", line 111, in crawl
    links = get_all_website_links(url)
  File "/home/camille/workspace/test-link/test-link.py", line 86, in get_all_website_links
    is_broken_link(href, url)
  File "/home/camille/workspace/test-link/test-link.py", line 94, in is_broken_link
    print(('|' + url.encode('utf-8').center(60) + '|' + origin_url.encode('utf-8').center(60) + '|' + '\n'))
TypeError: can only concatenate str (not "bytes") to str
I think I changed everything that needed to be changed to move to Python 3, and I am very confused by this "bytes" concatenation error. In fact, if I remember well, I had the same issue with Python 2 but solved it. I don't know why the issue is back in Python 3.
Thank you for helping.
Solution
url.encode('utf-8') returns bytes, not a string. In Python 2, ordinary strings like 'hello' or '|' were bytes and could therefore be concatenated with other bytes. Now you're trying to concatenate bytes (url.encode('utf-8')) with strings ('|'). To fix your problem, all you need to do is remove the encode method:
print ('|' + url.center(60) + '|' + origin_url.center(60) + '|' + '\n')
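For illustration, here is a minimal, standalone sketch of the same behaviour (the URL values are made up, not taken from your crawl):

broken = "http://example.com/missing-page"   # hypothetical broken link
origin = "http://example.com/"               # hypothetical page it was found on

# In Python 3, str and bytes are distinct types, so mixing them raises the error from your traceback:
# '|' + broken.encode('utf-8')   # TypeError: can only concatenate str (not "bytes") to str

# Keeping everything as str works; str.center() pads the text for the table layout:
print('|' + broken.center(60) + '|' + origin.center(60) + '|' + '\n')

If you ever do need bytes (for example when writing to a file opened in binary mode), build the whole line as a str first and encode it once at the end, rather than encoding the individual pieces.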
Answered By - Ted Klein Bergman