Issue
I have 3 different types of HTML
snippets, each of which is part of a bigger page, as follows:
<html>
<body>
<span _ngcontent-dna-c199="" class="font-weight-bold">
<span _ngcontent-dna-c199="" class="ng-star-inserted">
<span _ngcontent-dna-c199="" translate="">
issue_number
</span>
4 Näköispainos
</span>
<span _ngcontent-dna-c199="" class="ng-star-inserted">
6.12.1939
</span>
</span>
</body>
</html>
and
<html>
<body>
<span _ngcontent-sut-c199="" class="font-weight-bold">
<span _ngcontent-sut-c199="" class="ng-star-inserted">
<span _ngcontent-sut-c199="" translate="">
issue_number
</span>
8
</span>
<span _ngcontent-sut-c199="" class="ng-star-inserted">
1998
</span>
</span>
</body>
</html>
and
<html>
<body>
<span _ngcontent-dgu-c199="" class="font-weight-bold">
<span _ngcontent-dgu-c199="" class="ng-star-inserted">
1905
</span>
</span>
</body>
</html>
Given the following code:
from bs4 import BeautifulSoup
# `html` stands for one of the three snippets above (html_1, html_2, html_3).
soup = BeautifulSoup(html, "lxml")
# Grab the outer bold <span> and break all of its text into whitespace-separated tokens.
bold_span = soup.find("span", class_="font-weight-bold")
tokens = bold_span.text.split()
print(tokens)
I get the following results:
['issue_number', '4', 'Näköispainos', '6.12.1939'] # html_1
['issue_number', '8', '1998'] # html_2
['1905'] # html_3
However, my desired custom-made list should always have 4 elements and look like this:
desired_list = ["issue_number", "number", "extension", "date"]
So if a piece of information is not available in an HTML snippet, I'd like to get None
(or simply "-")
in the corresponding position of my custom list, as follows:
['issue_number', '4', 'Näköispainos', '6.12.1939'] # html_1
['issue_number', '8', None, '1998'] # html_2
[None, None, None, '1905'] # html_3
Is there any way to manipulate the result list to obtain the desired list using soup.find()?
Solution
You could search for each part individually:
def bs2customList(soup):
    """Return ``[issue_number, number, extension, date]`` for a parsed snippet.

    The first ``<span class="font-weight-bold">`` in *soup* is located and
    every non-blank text node below it whose parent is a ``<span>`` is
    classified by three properties of that parent: its ``class`` attribute,
    its ``translate`` attribute, and how many ``<span>`` tags it contains.

    * no class, ``translate=""``, no nested span  -> issue label
    * ``ng-star-inserted``, no translate, one nested span -> "number extension"
    * ``ng-star-inserted``, no translate, no nested span  -> date

    The "number extension" text is split on whitespace: the first token is
    the number and the remaining tokens (joined with single spaces) are the
    extension.

    Args:
        soup: A parsed BeautifulSoup document (or tag) to search.

    Returns:
        A list of exactly four items, each a string or ``None`` when the
        corresponding piece is not present.
    """
    fwb = soup.find("span", class_="font-weight-bold")
    if fwb is None:
        return [None] * 4

    # Expected (class, translate, nested-span count) of the parent <span>
    # for each kind of text node we want to pick up.
    signatures = {
        'issue': ([], '', 0),
        'num_ext': (['ng-star-inserted'], None, 1),
        'date': (['ng-star-inserted'], None, 0),
    }

    found = {}
    for d in fwb.descendants:
        # Only plain text nodes sitting directly inside a <span> are relevant.
        # (The original answer also sketched a variant of this filter based on
        # `d.name is None` and `get_text(strip=True)`.)
        if 'NavigableString' not in str(type(d)) or d.parent.name != 'span':
            continue
        text = str(d).strip()
        if not text:
            continue
        parent = d.parent
        signature = (
            parent.get('class', []),
            parent.get('translate'),
            len(parent.select('span')),
        )
        for key, expected in signatures.items():
            if signature == expected:
                # Keep only the first text node seen for each kind.
                found.setdefault(key, text)

    num = ext = None
    if 'num_ext' in found:
        # `text` was stripped and is non-empty, so there is at least one token.
        # Use the whole first token as the number (indexing it again would
        # truncate e.g. '12' to '1') and keep every remaining word of the
        # extension rather than just the first.
        num, *rest = found['num_ext'].split()
        ext = ' '.join(rest) or None

    return [found.get('issue'), num, ext, found.get('date')]
or maybe this is more understandable:
def bs2customList(soup):
    """Build ``[issue_number, number, extension, date]`` from a parsed snippet.

    Looks inside the first ``<span class="font-weight-bold">`` for
    ``span.ng-star-inserted`` elements.  The first such span that contains a
    ``span[translate]`` supplies the issue label (the translate span's text)
    and the number/extension (the span's own direct text).  The first such
    span without a ``span[translate]`` supplies the date.

    Args:
        soup: A parsed BeautifulSoup document (or tag) to search.

    Returns:
        A four-item list; any piece that cannot be found is ``None``.
    """
    bold = soup.find("span", class_="font-weight-bold")
    if bold is None:
        return [None] * 4

    inserted = bold.select('span.ng-star-inserted')
    if not inserted:
        return [None] * 4

    # Partition the spans by whether they wrap a translated label.
    labelled, plain = [], []
    for span in inserted:
        bucket = labelled if span.select('span[translate]') else plain
        bucket.append(span)

    issue_num = num = ext = None
    if labelled:
        holder = labelled[0]
        issue_num = holder.select_one('span[translate]').get_text(strip=True)

        # First non-blank text node that is a direct child of the holder span.
        own_text = next(
            (
                str(child).strip()
                for child in holder.children
                if 'NavigableString' in str(type(child)) and str(child).strip()
            ),
            None,
        )
        if own_text is not None:
            tokens = own_text.split()
            # The isdigit() test is only appropriate if "number" is always an
            # integer; without a leading integer the whole text is the number.
            if len(tokens) > 1 and tokens[0].isdigit():
                num = tokens[0]
                ext = ' '.join(tokens[1:])
            else:
                num = ' '.join(tokens)

    ydate = plain[0].get_text(strip=True) if plain else None
    return [issue_num, num, ext, ydate]
Whichever version of the function is used, with the below test set:
# Test inputs for bs2customList: the three snippets from the question plus
# two degenerate cases.
htmls = [
# html_1: issue label, number with extension, and a full date.
'''
<html>
<body>
<span _ngcontent-dna-c199="" class="font-weight-bold">
<span _ngcontent-dna-c199="" class="ng-star-inserted">
<span _ngcontent-dna-c199="" translate="">
issue_number
</span>
4 Näköispainos
</span>
<span _ngcontent-dna-c199="" class="ng-star-inserted">
6.12.1939
</span>
</span>
</body>
</html>
''',
# html_2: issue label and number only (no extension), plus a year.
'''
<html>
<body>
<span _ngcontent-sut-c199="" class="font-weight-bold">
<span _ngcontent-sut-c199="" class="ng-star-inserted">
<span _ngcontent-sut-c199="" translate="">
issue_number
</span>
8
</span>
<span _ngcontent-sut-c199="" class="ng-star-inserted">
1998
</span>
</span>
</body>
</html>
''',
# html_3: only a year, no issue label or number.
'''
<html>
<body>
<span _ngcontent-dgu-c199="" class="font-weight-bold">
<span _ngcontent-dgu-c199="" class="ng-star-inserted">
1905
</span>
</span>
</body>
</html>
''',
# The bold span exists but has no text or child spans.
'<html><body><span class="font-weight-bold"></span></body></html>',
'' # empty str
]
printing with
# Parse every test snippet with lxml and show the extracted 4-item list.
for snippet in htmls:
    parsed = BeautifulSoup(snippet, 'lxml')
    print(bs2customList(parsed))
gives the same output [with both versions]:
['issue_number', '4', 'Näköispainos', '6.12.1939']
['issue_number', '8', None, '1998']
[None, None, None, '1905']
[None, None, None, None]
[None, None, None, None]
(The last 2 tests are with an empty [textless] html and an empty string.)
Answered By - Driftr95
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.