Issue
I am new to Python and need to extract all the recording URLs from a website. I tried the program below, but it does not find the recording links; it only prints the other links on the web page. I am not familiar with how the website is built. I tried AI tools and Stack Overflow, but I find the same solution everywhere. Can you please tell me what mistake I am making here, or what other approach I need to follow to parse this?
Sample recording URL which I found from the webpage using inspect element:
Here is the code snippet I tried:
import requests
from bs4 import BeautifulSoup
# Browser-like User-Agent header sent along with the page request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/39.0.2171.95 Safari/537.36'
    ),
}
def parse_page(url):
    """Fetch *url* and print the ``href`` of every ``<a>`` tag in the response.

    Only the HTML returned by the server is parsed; links that the page adds
    later via JavaScript are not present in that HTML and are not printed.

    Args:
        url: Address of the page to download.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    # Without a timeout, requests may wait on an unresponsive server forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly instead of scraping the links of an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # href=True restricts the search to anchors that actually carry an href.
    for anchor in soup.find_all('a', href=True):
        print(anchor.get('href'))
# Search page for webinar recordings. Everything after '#' is a URL fragment:
# it is interpreted by the browser and is not sent to the server, so the
# filters below do not affect the HTML that parse_page() downloads.
base_url = (
    'https://www.vector.com/int/en/search/'
    '#type=%5B%22webinar_recording%22%5D'
    '&page=1&pageSize=50&sort=date&order=desc'
)
parse_page(base_url)
Solution
The data you see on the page is loaded via JavaScript, so BeautifulSoup doesn't see it. To simulate this request, you can try:
import requests
# JSON body for the site's search endpoint. The structure (aggs / query /
# function_score / suggest) looks like Elasticsearch query DSL -- presumably
# copied from the request the browser sends; verify in the network inspector.
payload = {
    # One "terms" aggregation per facet, all sorted by key ascending and
    # capped at 1000 buckets.
    "aggs": {
        facet: {"terms": {"field": field, "order": {"_key": "asc"}, "size": 1000}}
        for facet, field in (
            ("categories", "downloadType"),
            ("content_type", "type"),
            ("file_type", "fileType"),
            ("languages", "categoryFileLanguageDefaultLang"),
            ("products", "categoryProductDefaultLang"),
            ("standards", "categoryStandardDefaultLang"),
            ("topics", "categoryTopicDefaultLang"),
        )
    },
    "explain": False,
    "from": 0,
    "query": {
        "function_score": {
            "boost_mode": "multiply",
            "functions": [{"filter": {"match": {"type": "products"}}, "weight": 50}],
            "query": {
                "bool": {
                    # Restrict the results to webinar recordings.
                    "filter": [
                        {
                            "match": {
                                "type": {
                                    "boost": 1,
                                    "operator": "AND",
                                    "query": "webinar_recording",
                                }
                            }
                        }
                    ],
                    "must": [
                        # No "endtime" field, or an endtime that is >= now.
                        {
                            "bool": {
                                "should": [
                                    {"bool": {"must_not": {"exists": {"field": "endtime"}}}},
                                    {"range": {"endtime": {"gte": "now"}}},
                                ]
                            }
                        },
                        # No "starttime" field, or a starttime that is <= now.
                        {
                            "bool": {
                                "should": [
                                    {"bool": {"must_not": {"exists": {"field": "starttime"}}}},
                                    {"range": {"starttime": {"lte": "now"}}},
                                ]
                            }
                        },
                    ],
                    # Excluded document type plus the excluded download types.
                    "must_not": [
                        {"term": {"type": {"value": "marketingitems"}}},
                        *(
                            {"match": {"downloadType": {"query": kind}}}
                            for kind in ("demos", "software", "drivers", "freeware", "service")
                        ),
                    ],
                    "should": [],
                }
            },
            "score_mode": "first",
        }
    },
    "size": 50,
    "sort": [{"sortdate": "desc"}],
    "suggest": {
        "didYouMean": {
            "phrase": {
                "direct_generator": [
                    {"field": "didYouMean.trigram", "suggest_mode": "always"}
                ],
                "field": "didYouMean.trigram",
                "gram_size": 3,
                "size": 5,
            },
            "text": "",
        }
    },
}
# Search endpoint that receives the JSON query body.
api_url = "https://search.vector.com/int-en/_search/"
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0"
}

# POST the query as JSON. The timeout prevents hanging on an unresponsive
# server, and raise_for_status() surfaces HTTP errors before .json() is
# attempted on what might be an error page.
response = requests.post(api_url, headers=headers, json=payload, timeout=30)
response.raise_for_status()
data = response.json()

# Each hit keeps its document under "_source"; print the title and the
# recording link, followed by a separator line.
for h in data["hits"]["hits"]:
    print(h["_source"]["title"])
    print(h["_source"]["streamingUrl"])
    print("-" * 80)
Prints:
Simplify the Simulation, Testing and Measurement of 10BASE-T1S Networks With CANoe/CANalyzer
https://vector-group.webex.com/vector-group/ldr.php?RCID=0ce68a6a7132fb032088f53a6b5cd4b2
--------------------------------------------------------------------------------
Remote Diagnostics and Flashing
https://vector-group.webex.com/vector-group/ldr.php?RCID=7307e0a9000c63ad7dce5523ec058af2
--------------------------------------------------------------------------------
Maintain ODX-based Diagnostic Data Easily, Quickly and Effectively with ODXStudio
https://vector-group.webex.com/vector-group/ldr.php?RCID=4c3f1a4d894482e9899b2c0a64f1914b
--------------------------------------------------------------------------------
...
Answered By - Andrej Kesely
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.