Issue
I have extracted the contents of a script tag with BeautifulSoup. The script contains JSON-like structured data.
I want to extract the three URLs from the first "content" group and the "defeatedBosses" value from the second "content" group.
This is the extracted script content (part of):
new WH.Wow.TodayInWow(WH.ge('tiw-standalone'), [{
"id": "dungeons-and-raids",
"groups": [{
"content": {
"lines": [{
"icon": "achievement_boss_archaedas",
"url": "\/affix=9\/tyrannical"
}, {
"icon": "spell_shaman_lavasurge",
"url": "\/affix=3\/volcanic"
}, {
"icon": "spell_shadow_bloodboil",
"url": "\/affix=8\/sanguine"
}],
"icons": "large"
},
"id": "mythicaffix",
}, {
"content": {
"defeatedBosses": 9,
},
"id": "mythic-progression",
"url": "\/aberrus-the-shadowed-crucible\/overview"
},
...
And my Python (3.11) script so far:
import re
import json
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
req = Request('https://www.wowhead.com/today-in-wow', headers={'User-Agent': 'Mozilla/5.0'})
html_page = urlopen(req).read()
soup = BeautifulSoup(html_page, "html.parser")
all_scripts = soup.find_all('script')
script_sp = all_scripts[36]
# My try
model_data = re.search(r"content = ({.*?});", script_sp, flags=re.S)
model_data = model_data.group(1)
model_data = json.loads(model_data)
print(model_data)
I get an error:
TypeError: expected string or bytes-like object, got 'Tag'
Solution
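First, about the error: all_scripts[36] is a BeautifulSoup Tag object, and re.search() only accepts a string (or bytes), hence the TypeError. If you want to stay with BeautifulSoup, convert the tag to text first. Here is a minimal sketch continuing from the script in the question, assuming script_sp really is the script tag that contains the data and reusing the TodayInWow(...) pattern from the page (the content = {...}; pattern does not appear in the script shown above):
# script_sp is a bs4 Tag; re.search() needs a plain string, so take the tag's text
script_text = script_sp.get_text()
# capture the JSON list passed to WH.Wow.TodayInWow(...) - same pattern as in the solution below
m = re.search(r"TodayInWow\(WH\.ge\('tiw-standalone'\), (.*), true\);", script_text)
if m:
    model_data = json.loads(m.group(1))
    print(model_data[0]['id'])  # 'dungeons-and-raids' in the snippet above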
Here is an example of how you can download the page, parse the required data, and print sample information (info about US Dungeons & Raids):
import re
import json
from urllib.request import Request, urlopen
req = Request('https://www.wowhead.com/today-in-wow', headers={'User-Agent': 'Mozilla/5.0'})
html_page = urlopen(req).read().decode('utf-8')
# extract the JSON argument passed to WH.Wow.TodayInWow(...) from the page source
json_data = re.search(r"TodayInWow\(WH\.ge\('tiw-standalone'\), (.*), true\);", html_page)
json_data = json.loads(json_data.group(1))
# uncomment to print all data:
# print(json.dumps(json_data, indent=4))
for part in json_data:
    if part['id'] == 'dungeons-and-raids' and part['regionId'] == 'US':
        for g in part['groups']:
            print(g['name'], g.get('url', '-'))
Prints:
Mythic+ Affixes /guides/mythic-keystones-and-dungeons
Aberrus, the Shadowed Crucible (Mythic) https://www.wowhead.com/guide/raids/aberrus-the-shadowed-crucible/overview
Conquest Points -
EDIT: For easier searching I recommend transforming the JSON data from a list into a dictionary:
import re
import json
from urllib.request import Request, urlopen
req = Request(
    "https://www.wowhead.com/today-in-wow", headers={"User-Agent": "Mozilla/5.0"}
)
html_page = urlopen(req).read().decode("utf-8")
json_data = re.search(
    r"TodayInWow\(WH\.ge\('tiw-standalone'\), (.*), true\);", html_page
)
json_data = json.loads(json_data.group(1))
# uncomment to print all data:
# print(json.dumps(json_data, indent=4))
# transform the received data from list to a dictionary (for easier search)
data = {
    (d["id"], d["regionId"]): {dd["id"]: dd for dd in d["groups"]} for d in json_data
}
for line in data[("dungeons-and-raids", "US")]["mythicaffix"]['content']['lines']:
    l = line['name'], line['url']
    if line['name'] == 'Tyrannical':
        print(' --> ', *l)
    else:
        print(' ', *l)
Prints:
--> Tyrannical /affix=9/tyrannical
Volcanic /affix=3/volcanic
Sanguine /affix=8/sanguine
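The same dictionary also gives you the two values asked for in the question. A short sketch, assuming the structure shown in the question's snippet (the three affix lines under "mythicaffix" and "defeatedBosses" under "mythic-progression"):
us_dnr = data[("dungeons-and-raids", "US")]
# the three URLs from the first "content" group
affix_urls = [line['url'] for line in us_dnr['mythicaffix']['content']['lines']]
print(affix_urls)  # ['/affix=9/tyrannical', '/affix=3/volcanic', '/affix=8/sanguine']
# defeatedBosses from the second "content" group
print(us_dnr['mythic-progression']['content']['defeatedBosses'])  # 9 in the snippet above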
Answered By - Andrej Kesely