Saturday, April 16, 2022

[FIXED] Edit html file using python

April 16, 2022 beautifulsoup, python, web-scraping No comments

Issue

I have scraped the content of a web page (CSS, JS and images).

Now I want to edit the downloaded HTML file so that it references the images, JS and CSS by absolute paths.

For example, the script needs to find each 'src' attribute and make sure its value is an absolute path (one that contains the domain) rather than a relative path (one that does not contain the domain).

change from: /static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js To https://es.sopranodesign.com/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js and save it as index2.html

Here is my code so far:

import os
import requests
from bs4 import BeautifulSoup as bs
from urllib.parse import urljoin
from pathlib import Path
import os.path
import urllib.request, urllib.error, urllib.parse
from tldextract import extract

# Folder that receives every downloaded asset and every report file.
dir_path = r"\Documents\python\public_html"

# Path.mkdir() returns None, so keep the Path object itself in `directory`
# and create the folder (and any missing parents) as a separate step.
directory = Path(dir_path)
directory.mkdir(parents=True, exist_ok=True)

# Page whose HTML and assets are going to be mirrored
url = "https://es.sopranodesign.com/sei/login.do?customerId=270"

# One HTTP session for the page request, presenting itself as a desktop browser
session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
})

# Raw bytes of the page body
html = session.get(url).content

# Build the parse tree with Python's built-in HTML parser
soup = bs(html, features="html.parser")

# Every asset reference is resolved against the page URL, so relative paths
# become absolute links. Tags without the relevant attribute are skipped.

# <script src="..."> -> JavaScript files (inline scripts have no 'src')
script_files = [
    urljoin(url, tag.attrs.get("src"))
    for tag in soup.find_all("script")
    if tag.attrs.get("src")
]

# <link href="..."> -> CSS files
css_files = [
    urljoin(url, tag.attrs.get("href"))
    for tag in soup.find_all("link")
    if tag.attrs.get("href")
]

# <img src="..."> -> image files
image_files = [
    urljoin(url, tag.attrs.get("src"))
    for tag in soup.find_all("img")
    if tag.attrs.get("src")
]

# Summary of what was found
print("Total script files in the page:", len(script_files))
print("Total CSS files in the page:", len(css_files))
print("Total images files in the page:", len(image_files))

# Names of the three plain-text listings, one absolute URL per line
javascript_filename = "javascript_files.txt"
css_filename = "css_files.txt"
image_filename = "image_files.txt"

# Write each listing into the output folder, in the order JS, CSS, images
for listing_name, links in (
    (javascript_filename, script_files),
    (css_filename, css_files),
    (image_filename, image_files),
):
    with open(os.path.join(dir_path, listing_name), "w") as f:
        f.writelines(f"{link}\n" for link in links)
        
        
def _save_asset(asset_url, binary=False):
    """Download one asset into ``dir_path`` and report whether it worked.

    The local file name is the last segment of the URL *path*, so a query
    string such as ``?v=3`` never ends up in the file name.

    Args:
        asset_url: Absolute URL of the file to fetch.
        binary: When true, the response bytes are written unchanged (needed
            for images). Otherwise the decoded text is written as UTF-8.

    Returns:
        True if the file was written, False if it was skipped or failed.
    """
    file_name = os.path.basename(urllib.parse.urlsplit(asset_url).path)
    if not file_name:
        # A URL ending in "/" has nothing usable as a file name.
        print("Skipping (no file name in URL):", asset_url)
        return False

    target = os.path.join(dir_path, file_name)
    try:
        response = requests.get(asset_url)
        # Do not store an HTTP error page under the asset's name.
        response.raise_for_status()
        if binary:
            with open(target, "wb") as f:
                f.write(response.content)
        else:
            with open(target, "w", encoding="utf-8") as f:
                f.write(response.text)
    except (requests.RequestException, OSError) as err:
        # Report the failure and carry on: one broken link must not stop
        # the remaining downloads, but it must not go unnoticed either.
        print("Could not download", asset_url, "->", err)
        return False
    return True


# JavaScript and CSS are text; images are binary and must be saved as bytes.
for js_file in script_files:
    _save_asset(js_file)

for css_file in css_files:
    _save_asset(css_file)

for image_file in image_files:
    _save_asset(image_file, binary=True)
        
# Take the `domain` field of tldextract's result to use as a file-name prefix.
# Attribute access does not depend on how many fields the result carries.
domain = extract(url).domain
print(domain)

# Fetch the page once more and decode it to text; the with-block closes
# the HTTP response when done.
with urllib.request.urlopen(url) as response:
    webContent = response.read().decode('UTF-8')

html_filename = domain + "test2.do.html"

# Write with an explicit UTF-8 encoding (matching the decode above) so the
# platform default code page cannot reject characters. The with-block
# guarantees the file is flushed and closed; the original `f.close` without
# parentheses never called the method.
with open(os.path.join(dir_path, html_filename), 'w', encoding='utf-8') as f:
    f.write(webContent)

Solution

You can simply reassign that as the attribute to the bs4 object, as per the link I provided:

for example:

for script in soup.find_all("script"):
    relative_src = script.attrs.get("src")
    if not relative_src:
        # Inline <script> blocks have no 'src' to rewrite
        continue

    # Tag as it appears in the downloaded page
    print('Original')
    print(script)

    # Resolve the reference against the page URL and remember it
    script_url = urljoin(url, relative_src)
    script_files.append(script_url)

    # Assigning to the tag's attribute updates the soup in place
    script['src'] = script_url

    # Same tag, now carrying the absolute URL
    print('New/Changed')
    print(script)

Output:

Original
<script src="/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js" type="text/javascript"></script>

New/Changed
<script src="https://es.sopranodesign.com/static_1.872.4/js/jquery_3.4.1/jquery-3.4.1.min.js" type="text/javascript"></script>

And you can see in the HTML that it has changed (this is after the first iteration; the loop will continue doing the same to the rest of the tags).

Answered By - chitown88

This Answer collected from stackoverflow and tested by PythonFixing community admins, is licensed under cc by-sa 2.5 , cc by-sa 3.0 and cc by-sa 4.0

Saturday, April 16, 2022

[FIXED] Edit html file using python

Issue

Solution

0 comments:

Post a Comment

Popular Posts

Labels