Issue
I have two HTML documents that look like this:
Document 1
<h3>
First heading
</h3>
<ol>
<li>
hi
</li>
</ol>
<h3>
Second
</h3>
<ol>
<li>
second
</li>
</ol>
Document 2
<h3>
First heading
</h3>
<ol>
<li>
hello
</li>
</ol>
<h3>
Second
</h3>
<ol>
<li>
second to second
</li>
</ol>
I need to add each li from the second document to the HTML of the first document, under the matching h3. This is my code:
# NOTE(review): indentation was lost in this listing; the loop bodies are
# presumably the statements that follow each `for` line.
#
# Purpose: merge the <li> items of two HTML documents heading by heading,
# write the merged HTML to check.html and convert it to test.docx.

# Parse the first document and pair every <h3> with the first <ol> after it.
soup = BeautifulSoup(html_string)
h3_tags = soup.find_all('h3')
ol_tags = [each_h3.find_next('ol') for each_h3 in h3_tags]
# Same for the second document. `soup` is rebound here; the tags found above
# stay referenced through h3_tags / ol_tags.
soup = BeautifulSoup(html_string_new)
h3_tags_new = soup.find_all('h3')
ol_tags_new = [each_h3.find_next('ol') for each_h3 in h3_tags_new]
# Heading texts of the first (old) and second (new) document, in page order.
# NOTE(review): the "countries" naming suggests the headings are country
# names in the real data -- confirm.
countries_old = []
countries_new = []
# Accumulates the merged output as a plain string.
html_new = ""
for i in h3_tags:
countries_old.append(i.text)
for i in h3_tags_new:
countries_new.append(i.text)
for country in countries_new:
# list.index raises ValueError when the heading does not exist in the first
# document.
idx = countries_old.index(country)
# Serialise the first document's <ol>, then drop the last 5 and the first 4
# characters -- presumably to strip "</ol>" and "<ol>"; this assumes the tag
# carries no attributes.
tag = str(ol_tags[idx])
tag = tag[:-5]
tag = tag[4:]
# Same stripping for the matching <ol> of the second document.
idx_new = countries_new.index(country)
tag_new = str(ol_tags_new[idx_new])
tag_new = tag_new[:-5]
tag_new = tag_new[4:]
# Re-wrap both item lists in a single <ol>: first-document items come first.
tag = "<ol>" + tag + tag_new + "</ol>"
# The list entry is replaced by the merged markup string.
ol_tags[idx] = tag
# NOTE: h3_tags[idx] is an element returned by find_all, not a str, while
# html_new is a str -- this `+=` is the str/Tag concatenation that fails with
# the TypeError reported for this code.
html_new += h3_tags[idx]
html_new += tag
with open("check.html", "w", encoding="utf8") as html_file:
html_file.write(html_new)
# The `with` statement already closes the file when its block exits.
html_file.close()
import pypandoc
# Convert the written HTML file to a Word document.
# NOTE(review): extra_args are presumably forwarded to the pandoc command
# line -- confirm the intended option order.
output = pypandoc.convert(source='check.html', format='html', to='docx', outputfile='test.docx', extra_args=["-M2GB", "+RTS", "-K64m", "-RTS"])
This code takes each h3 from the second document, looks up its index among the headings of the first document, and takes the ol at the matching index from each document. It then removes the ol tags from both and concatenates their contents inside a single ol. It keeps accumulating the results in html_new, which is finally written to html_file. But when I concatenate the h3 onto the output string, it gives this error:
TypeError: can only concatenate str (not "Tag") to str
Edit: expected output:
<h3>
First heading
</h3>
<ol>
<li>
hello
</li>
<li>
hi
</li>
</ol>
<h3>
Second
</h3>
<ol>
<li>
second to second
</li>
<li>
second
</li>
</ol>
Solution
Try:
from bs4 import BeautifulSoup

# First document: its <ol> lists receive the items of the second document.
html1 = """
<h3>
First heading
</h3>
<ol>
<li>
hi
</li>
</ol>
<h3>
Second
</h3>
<ol>
<li>
second
</li>
</ol>
"""

# Second document: every <li> is moved into the first document, under the
# <h3> that carries the same heading text.
html2 = """
<h3>
First heading
</h3>
<ol>
<li>
hello
</li>
</ol>
<h3>
Second
</h3>
<ol>
<li>
second to second
</li>
</ol>
"""

soup1 = BeautifulSoup(html1, "html.parser")
soup2 = BeautifulSoup(html2, "html.parser")

# Index the headings of the first document by their stripped text.  An exact
# lookup avoids substring false positives ("Second" vs "Second heading") and
# also works when an <h3> contains nested markup.  The first occurrence wins.
headings1 = {}
for h3 in soup1.find_all("h3"):
    headings1.setdefault(h3.get_text(strip=True), h3)

# Walk the items back to front: each one is inserted at position 0, so
# reverse iteration keeps several items of one list in their original order.
for li in reversed(soup2.select("h3 + ol > li")):
    h3_text = li.find_previous("h3").get_text(strip=True)
    h3_soup1 = headings1.get(h3_text)
    if h3_soup1 is None:
        # Heading only exists in the second document: nothing to merge into.
        continue
    target_ol = h3_soup1.find_next("ol")
    if target_ol is None:
        # Heading in the first document has no list after it.
        continue
    # insert() moves the tag: it is detached from soup2 and becomes the first
    # child of the target list in soup1.
    target_ol.insert(0, li)

print(soup1.prettify())
Prints:
<h3>
First heading
</h3>
<ol>
<li>
hello
</li>
<li>
hi
</li>
</ol>
<h3>
Second
</h3>
<ol>
<li>
second to second
</li>
<li>
second
</li>
</ol>
Answered By - Andrej Kesely
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.