Issue
I have scraped the content of a web page (CSS, JS and images).
Now I want to edit the downloaded HTML file so that the images, JS and CSS are referenced by absolute paths.
For example, the script needs to find each 'src' attribute and make sure it is an absolute path (containing the domain) rather than a relative one (not containing the domain).
Change from: /static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js to https://es.sopranodesign.com/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js and save the result as index2.html
Here is my code so far:
import os
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
from pathlib import Path
import os.path
import urllib.request, urllib.error, urllib.parse
from tldextract import extract
# Folder that receives every file this script writes.
dir_path = r"\Documents\python\public_html"
# Keep the Path object itself: Path.mkdir() returns None, so chaining the
# call into the assignment would leave `directory` bound to None.
directory = Path(dir_path)
directory.mkdir(parents=True, exist_ok=True)
# URL of the web page you want to extract
url = "https://es.sopranodesign.com/sei/login.do?customerId=270"
# initialize a session
session = requests.Session()
# set the User-agent as a regular browser
session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
# get the HTML content (raw response body)
html = session.get(url).content
# parse HTML using beautiful soup
soup = bs(html, "html.parser")
# Collect the absolute URL of every external asset referenced by the page.
# urljoin() resolves each reference against the page URL, so relative paths
# come out with scheme and host attached.

# JavaScript files: <script> tags that carry a non-empty 'src'
script_files = [
    urljoin(url, tag.attrs["src"])
    for tag in soup.find_all("script")
    if tag.attrs.get("src")
]
# CSS files: every <link> tag that carries a non-empty 'href'
# (this takes all <link> tags, not only rel="stylesheet" ones)
css_files = [
    urljoin(url, tag.attrs["href"])
    for tag in soup.find_all("link")
    if tag.attrs.get("href")
]
# image files: <img> tags that carry a non-empty 'src'
image_files = [
    urljoin(url, tag.attrs["src"])
    for tag in soup.find_all("img")
    if tag.attrs.get("src")
]
print("Total script files in the page:", len(script_files))
print("Total CSS files in the page:", len(css_files))
print("Total images files in the page:", len(image_files))
# write file links into files: one text file per asset kind, one URL per line
javascript_filename = "javascript_files.txt"
css_filename = "css_files.txt"
image_filename = "image_files.txt"
for list_name, links in (
    (javascript_filename, script_files),
    (css_filename, css_files),
    (image_filename, image_files),
):
    with open(os.path.join(dir_path, list_name), "w") as f:
        # each URL is followed by a newline, exactly as print(..., file=f) does
        f.writelines(link + "\n" for link in links)
def _local_path(asset_url):
    """Return the destination path inside dir_path for asset_url.

    The file name is taken from the URL *path* only, so a query string such
    as "?v=1" never ends up in the name. Returns None when the URL path has
    no final component (for example when it ends with "/").
    """
    name = os.path.basename(urllib.parse.urlparse(asset_url).path)
    return os.path.join(dir_path, name) if name else None


def _download(asset_url):
    """Fetch asset_url and store its body, byte for byte, inside dir_path.

    A failure is reported and the asset is skipped, so one bad URL does not
    stop the remaining downloads and does not go unnoticed either.
    """
    target = _local_path(asset_url)
    if target is None:
        print("Skipped (no file name in URL):", asset_url)
        return
    try:
        response = requests.get(asset_url)
        # Write the raw bytes: decoding to text and re-encoding as UTF-8
        # corrupts images and any other binary payload.
        with open(target, "wb") as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as exc:
        print("Could not save", asset_url, "->", exc)


# Download every collected asset into dir_path.
for js_file in script_files:
    _download(js_file)
for css_file in css_files:
    _download(css_file)
for image_file in image_files:
    _download(image_file)
# Split the URL into subdomain / domain / suffix. The parts are read by
# attribute instead of tuple-unpacking the result, so this does not depend
# on the result object being an exact 3-tuple.
parts = extract(url)
tsd, td, tsu = parts.subdomain, parts.domain, parts.suffix
domain = td  # used as the prefix of the saved HTML file name
print(domain)
# Fetch the page again with urllib and decode it as UTF-8.
with urllib.request.urlopen(url) as response:
    webContent = response.read().decode('UTF-8')
html_filename = domain + "test2.do.html"
# Write with an explicit encoding that matches the decode above, and let the
# `with` block close the file (the former bare `f.close` never called close).
with open(os.path.join(dir_path, html_filename), 'w', encoding='utf-8') as f:
    f.write(webContent)
Solution
You can simply assign the absolute URL back to the tag's attribute on the bs4 object, as per the link I provided:
for example:
# Rewrite each external <script> reference in place so that the parsed
# document itself carries the absolute URL.
for script in soup.find_all("script"):
    relative_src = script.attrs.get("src")
    if not relative_src:
        # no (or empty) 'src' attribute: nothing to rewrite
        continue
    # Original
    print('Original')
    print(script)
    absolute_src = urljoin(url, relative_src)
    script_files.append(absolute_src)
    # assigning to the tag's attribute updates the soup object
    script['src'] = absolute_src
    # Now changed
    print('New/Changed')
    print(script)
Output:
Original
<script src="/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js" type="text/javascript"></script>
New/Changed
<script src="https://es.sopranodesign.com/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js" type="text/javascript"></script>
And you can see in the HTML that it has changed (this is after the first iteration; it will continue doing this for the rest of the tags).
Answered By - chitown88
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.