Issue
I am trying to create a script that returns the durations of all Tonies at https://tonies.com/en-gb/tonies/. I also wanted to return the cost of each of them but am struggling. I have also looked at scripting this with Selenium but got stuck on the cookie-accept prompt, which sits inside a shadow DOM. I think I might be making this overly complex. I am a novice with programming and Python. Any advice is appreciated. The script in its current form only seems to scrape the first 21 items.
import re
import requests
from bs4 import BeautifulSoup
def get_tonie_info(tonie_url):
    """Fetch one Tonie product page and extract its "runTime" values.

    Args:
        tonie_url: Absolute URL of the product page to download.

    Returns:
        A dict with two keys: 'url' (the URL that was passed in) and
        'durations' (a list of ints). 'durations' is empty when no
        <script> tag on the page contains a "runTime" number.

    Raises:
        requests.exceptions.RequestException: If the page cannot be
            downloaded, including when the timeout expires.
    """
    # requests waits indefinitely by default; a timeout stops a single
    # stalled connection from hanging the whole scrape.
    response = requests.get(tonie_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    tonie_info = {'url': tonie_url, 'durations': []}
    for script_tag in soup.find_all('script'):
        script_content = script_tag.string
        if not script_content or 'runTime' not in script_content:
            continue
        matches = re.findall(r'"runTime":\s*(\d+)', script_content)
        if matches:
            # Each matching script replaces the previous result, so the
            # last script containing run times is the one that is kept.
            tonie_info['durations'] = [int(value) for value in matches]
    return tonie_info
def scrape_tonies(page_number=9):
    """Scrape name, URL and duration for every Tonie on one listing page.

    Args:
        page_number: Value appended to the listing URL's "?page=" query.
            Defaults to 9, the page the script originally hard-coded.

    Returns:
        A list of dicts as produced by get_tonie_info(), each extended
        with 'name' (the link text) and 'duration' (the last entry of
        'durations'). Products whose page yields no durations are
        reported on stdout and left out of the list.

    Raises:
        requests.exceptions.RequestException: If the listing page cannot
            be downloaded, including when the timeout expires.
    """
    all_tonie_info = []
    base_url = "https://tonies.com/en-gb/tonies/?page="
    current_url = base_url + str(page_number)

    # requests has no default timeout; set one so a stalled connection
    # cannot block the scrape forever.
    response = requests.get(current_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): this class string looks auto-generated and may change
    # whenever the site is rebuilt; verify it still matches product links.
    tonie_links = soup.find_all('a', class_='View__StretchedLink-sc-5t9da0-0 ivnTIu')

    for tonie_link in tonie_links:
        tonie_url = "https://tonies.com" + tonie_link['href']
        tonie_info = get_tonie_info(tonie_url)
        if not tonie_info['durations']:
            print(f"Could not retrieve information for {tonie_url}")
            continue
        tonie_info['name'] = tonie_link.text.strip()
        tonie_info['duration'] = tonie_info['durations'][-1]
        all_tonie_info.append(tonie_info)
    return all_tonie_info
if __name__ == "__main__":
    # Run the scrape, then print a numbered three-line summary per Tonie.
    tonies_info = scrape_tonies()
    for position, entry in enumerate(tonies_info, start=1):
        summary_lines = (
            f"Toni {position} Name: {entry['name']}",
            f" URL: {entry['url']}",
            f" Duration: {entry['duration']}",
        )
        print("\n".join(summary_lines))
Solution
You can collect the Tonies' data in JSON format, which the listing page already embeds, and then post-process it:
import json
url = "https://tonies.com/en-gb/tonies/"
# requests has no default timeout; set one so the call cannot hang forever.
response = requests.get(url, timeout=30)  # with optional headers
soup = BeautifulSoup(response.text, "html.parser")

# The page carries its product data as JSON inside the element with
# id "__NEXT_DATA__"; drill down to the list of product dicts.
data = (json.loads(soup.select_one("#__NEXT_DATA__").text)
        ["props"]["pageProps"]["page"]["productList"]["normalizedProducts"])

use_keys = ["name", "price", "runTime"]  # << ask for more if needed

tonies = [
    {
        # "price" is a nested object; a product without one yields None
        # instead of raising AttributeError on None.get(...).
        k: d.get(k) if k != "price" else (d.get(k) or {}).get("amount")
        for k in use_keys
    }
    for d in data
]
Output :
# len(tonies) # 196
print(json.dumps(tonies, indent=4))
[
{
"name": "Chase",
"price": 14.99,
"runTime": 54
},
{
"name": "Elmer and Friends Story Collection",
"price": 14.99,
"runTime": 62
},
{
"name": "Frozen",
"price": 14.99,
"runTime": 24
},
...
]
Answered By - Timeless
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.