Issue
I have a page that contains a script, and that script defines an array which I need (myDayHash).
<script type="text/javascript">
/**
 * Unchecks every checkbox in the form "thePage:SiteTemplate:theForm" whose
 * name differs from obj.name. Controls that are not checkboxes, and the
 * control(s) named obj.name, are left untouched.
 *
 * @param {{name: string}} obj - The control to keep as-is; only its `name` is read.
 * @returns {void}
 */
function toggleCheckBoxes(obj) {
  var theForm = document.getElementById("thePage:SiteTemplate:theForm");
  // getElementById yields null when the form is not on the page; without
  // this guard the loop below would throw a TypeError on theForm.elements.
  if (theForm == null) {
    return;
  }
  var controls = theForm.elements;
  for (var i = 0; i < controls.length; i++) {
    var control = controls[i];
    if (control.type === "checkbox" && control.name !== obj.name) {
      control.checked = false;
    }
  }
}
// ATLAS-1089: back & continue buttons showing twice for
// Reserved Group/Emergency Appointments
/**
 * Hides the second form's continue button when the first form also has a
 * continue button, so that only one of them is displayed.
 *
 * NOTE(review): the ticket text also mentions the back button, but this
 * function only ever hides the continue button. The back buttons
 * ("thePage:SiteTemplate:theForm:backBtn" and
 * "thePage:SiteTemplate:theForm2:form2BackBtn") are not touched here;
 * confirm whether the second one should be hidden as well.
 *
 * @returns {void}
 */
function checkIfButtonsShowTwice() {
  // From first form
  var continueBtn = document.getElementById("thePage:SiteTemplate:theForm:continueBtn");
  // From second form
  var continueBtnToHide = document.getElementById("thePage:SiteTemplate:theForm2:continueBtn");
  // The controller logic for rendering the buttons
  // is fragile so... front end solutions for the win
  if (continueBtn != null && continueBtnToHide != null) {
    continueBtnToHide.style.display = "none";
  }
}
// Lookup table of dates, keyed by 'D-M-YYYY' strings (day first, no zero
// padding); every listed date maps to true.
// The Array is used purely as a property bag: string keys like these do not
// become array elements, so myDayHash.length stays 0.
// NOTE(review): presumably these are the selectable appointment days — confirm.
var myDayHash = new Array();
myDayHash['14-9-2023'] = true;
myDayHash['4-12-2023'] = true;
myDayHash['31-1-2024'] = true;
myDayHash['1-2-2024'] = true;
myDayHash['27-2-2024'] = true;
myDayHash['28-2-2024'] = true;
myDayHash['4-3-2024'] = true;
myDayHash['5-3-2024'] = true;
myDayHash['6-3-2024'] = true;
myDayHash['7-3-2024'] = true;
myDayHash['11-3-2024'] = true;
myDayHash['12-3-2024'] = true;
myDayHash['13-3-2024'] = true;
myDayHash['14-3-2024'] = true;
myDayHash['18-3-2024'] = true;
myDayHash['19-3-2024'] = true;
myDayHash['20-3-2024'] = true;
myDayHash['21-3-2024'] = true;
myDayHash['25-3-2024'] = true;
myDayHash['26-3-2024'] = true;
myDayHash['27-3-2024'] = true;
// Declared as null, then immediately reassigned to the empty string.
var ofcAptDateStr = null;ofcAptDateStr = '';
// Split on spaces: ['Thu', 'Sep', '14', '00:00:00', 'GMT', '2023'].
var splitDate = 'Thu Sep 14 00:00:00 GMT 2023'.split(" ");
// Month, day and year joined with spaces: 'Sep 14 2023'.
var minApptDate = splitDate[1] + ' ' + splitDate[2] + ' ' + splitDate[5];
// NOTE(review): this closing brace has no matching opening brace in the
// snippet as shown — presumably the enclosing construct was cut off when the
// page source was pasted.
}
</script>
So I need to get the myDayHash array from it.
What I am trying to do:
# NOTE(review): '\t' in a regular (non-raw) string literal is a tab escape;
# presumably this path is a shortened placeholder for the real URL.
driver.get('\test.html')
# The script string has no `return`, so no value is handed back to Python.
element = driver.execute_script("myDayHash")
But it doesn’t return anything.
I also tried element = driver.execute_script("return myDayHash"), but it returns None.
But if I use the console in the Chrome browser and type "myDayHash" it prints my whole array.
How can I get this array to Python?
Solution
Here is a solution using Beautiful Soup and regular expressions.
Fetch the data
from bs4 import BeautifulSoup
import requests
import re
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get('http://website.com/test.html', timeout=10)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
r.raise_for_status()
# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(r.content, 'html.parser')
# Every <script> element found in the document.
array = soup.select('script')
Get the text from each script tag
# Concatenate the source of every <script> tag into one searchable string.
text = ' '.join(script_tag.text for script_tag in array)
Apply regex to get myDayHash
The regex below extracts the myDayHash entries
as a list of (key, value) tuples.
# Capture the quoted key and the assigned value of every
# "myDayHash['<key>'] = <value>;" statement. Whitespace around "=" is
# optional so differently formatted (e.g. minified) scripts still match.
myDayHash = re.findall(r"myDayHash\['(.*?)'\]\s*=\s*(.*?);", text)
Generating output:
# Turn the (key, value) pairs into a mapping and print it.
print({day: flag for day, flag in myDayHash})
Output
This gives us the expected output. Note that the values are the strings 'true', not Python booleans. Depending on your requirements, you can now store the key:value pairs in any data structure.
{
'14-9-2023': 'true',
'4-12-2023': 'true',
'31-1-2024': 'true',
'1-2-2024': 'true',
'27-2-2024': 'true',
'28-2-2024': 'true',
'4-3-2024': 'true',
'5-3-2024': 'true',
'6-3-2024': 'true',
'7-3-2024': 'true',
'11-3-2024': 'true',
'12-3-2024': 'true',
'13-3-2024': 'true',
'14-3-2024': 'true',
'18-3-2024': 'true',
'19-3-2024': 'true',
'20-3-2024': 'true',
'21-3-2024': 'true',
'25-3-2024': 'true',
'26-3-2024': 'true',
'27-3-2024': 'true'
}
TLDR
from bs4 import BeautifulSoup
import requests
import re
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get('http://website.com/test.html', timeout=10)
# Fail loudly on a 4xx/5xx response instead of parsing an error page.
r.raise_for_status()
# Name the parser explicitly so the result does not depend on which
# optional parsers happen to be installed.
soup = BeautifulSoup(r.content, 'html.parser')
# Every <script> element found in the document.
array = soup.select('script')
# Concatenate the source of every <script> tag into one searchable string.
text = ' '.join([elem.text for elem in array])
# Capture the quoted key and the assigned value of every
# "myDayHash['<key>'] = <value>;" statement; whitespace around "=" is optional.
myDayHash = re.findall(r"myDayHash\['(.*?)'\]\s*=\s*(.*?);", text)
# The (key, value) tuples become a dict mapping each date string to 'true'.
print(dict(myDayHash))
Answered By - Himanshuman
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.