Issue
I'm trying very hard to make a webscraping bot to retrieve my grades every hour. I have already coded the part where it logs in to the website but I can't figure out how to extract just the grade with bs4 and instead end up getting most of the page.
# Importing all modules
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup

# Opening myHIES through webdriver
# NOTE(review): newer Selenium 4 releases expect the driver path to be passed
# through a Service object instead of a positional string - confirm against
# the installed Selenium version.
driver = webdriver.Chrome("chromedriver.exe")
driver.get("https://hies.myschoolapp.com/app#login")
time.sleep(2.5)  # fixed pause so the login form has time to render

# Logging in to myHIES then going on algebra grade page
driver.find_element(By.ID, "Username").send_keys("myemail")
driver.find_element(By.ID, "nextBtn").click()
time.sleep(4)
driver.find_element(By.ID, "i0118").send_keys("mypassword")
driver.find_element(By.ID, "idSIButton9").click()
time.sleep(2)
# The same button id is clicked a second time after the password step.
driver.find_element(By.ID, "idSIButton9").click()
print("*Breaths Lightly* WERE IN BABY!")
time.sleep(3.0)
driver.find_element(By.CSS_SELECTOR, "div#showHideGrade > div > label > span").click()
time.sleep(1.3)
driver.find_element(By.XPATH, '//*[@id="coursesContainer"]/div[1]/div[4]/a[1]').click()
print("handing off to bs4")

# Handing off manipulated page to bs4
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'lxml')
print("handed off to bs4")

# find_all() without arguments matches every tag in the document, so this
# loop prints the text of each element - i.e. most of the page - rather than
# only the grade. This is the behaviour the question is asking about.
for tag in soup.find_all():
    print(tag.text)
print("should have printed tag text")
And this is the HTML that I am attempting to extract the grade from:
<div class="col-md-2"> <div class="text-align-center"> <h1> 69.00<span class="lead">%</span> </h1> <h6>marking period</h6> </div> <div class="text-align-center"> <h1>69.00<span class="lead">%</span></h1> <h6>year</h6> </div> </div>
The code I'm trying to use to extract (again)
<div class="col-md-2"> <div class="text-align-center"> <h1> 69.00<span class="lead">%</span> </h1> <h6>marking period</h6> </div> <div class="text-align-center"> <h1>69.00<span class="lead">%</span></h1> <h6>year</h6> </div> </div>
Solution
If the provided HTML section is part of your soup, you could try this:
from bs4 import BeautifulSoup

# Sample markup copied from the question: one "col-md-2" container holding
# <h1> grade / <h6> label pairs.
mystr_1 = '<div class="col-md-2"> <div class="text-align-center"> <h1> 69.00<span class="lead">%</span> </h1> <h6>marking period</h6> </div> <div class="text-align-center"> <h1>69.00<span class="lead">%</span></h1> <h6>year</h6> </div> </div>'
soup = BeautifulSoup(mystr_1, features='lxml')

# Maps 'grade_<index>' -> [grade text, label text].
out_dct = {}

main_div = soup.find('div', {'class': 'col-md-2'})
# find() returns None when the container is absent; leave the result empty
# in that case instead of failing on an attribute lookup.
if main_div is not None:
    data_tags = main_div.find_all('h1')
    data_notes = main_div.find_all('h6')
    # Pair each grade heading with its label; zip() copes with any number of
    # pairs instead of assuming there are exactly two.
    for i, (grade_tag, note_tag) in enumerate(zip(data_tags, data_notes)):
        # Drop every whitespace character so ' 69.00% ' becomes '69.00%'.
        grade = ''.join(grade_tag.get_text().split())
        note = note_tag.get_text(strip=True)
        out_dct['grade_' + str(i)] = [grade, note]

print(out_dct)

# Result:
# {'grade_0': ['69.00%', 'marking period'], 'grade_1': ['69.00%', 'year']}
EDITED ANSWER (after new html content)
I've created a file bb1.html to store the data you sent me.
The structure of the HTML is quite different from the one in your initial post. I tried to collect the grades and some other data connected to them. Here is the code:
from bs4 import BeautifulSoup

# Load the saved page. The encoding is given explicitly so the result does
# not depend on the platform default.
# NOTE(review): assumes bb1.html was saved as UTF-8 - confirm.
with open('bb1.html', encoding='utf-8') as f:
    mystr_1 = f.read()

soup = BeautifulSoup(mystr_1, features='lxml')
divs = soup.find_all('div', {'class': 'col-md-2'})

# One dict per course that has a grade: the grade itself, the anchor's
# 'data-analysis' value and, when it can be resolved, the class name.
lst = []
for div in divs:
    grade_tag = div.find('h3', {'class': 'showGrade'})
    if grade_tag is None:
        continue  # this column holds no grade heading

    # Remove all whitespace so the grade reads like '69.00%'.
    grade = ''.join(grade_tag.get_text().split())
    if grade.startswith('--'):
        continue  # '--' is shown in place of a grade; skip such entries

    # Grades are numbered from 1 in the order they are collected.
    out_dct = {'grade_' + str(len(lst) + 1): grade}

    link = div.find('a', {'class': 'btn btn-default'})
    analysis_id = link.get('data-analysis') if link is not None else None
    if analysis_id is not None:
        out_dct['tag_a_data-analysis'] = analysis_id
        # The class title sits in an <h3> inside the anchor whose href
        # embeds the same id.
        class_link = soup.find(
            'a',
            {'href': '#academicclass/' + analysis_id + '/undefined/bulletinboard'},
        )
        heading = class_link.find('h3') if class_link is not None else None
        if heading is not None:
            out_dct['academic_class'] = heading.text

    lst.append(out_dct)

print(lst)
.... and the result ...
''' R e s u l t i n g l i s t (lst):
[
{'grade_1': '69.00%', 'tag_a_data-analysis': '113087042', 'academic_class': 'Algebra I - 8th Grade - 3 (D)'},
{'grade_2': '58.33%', 'tag_a_data-analysis': '113087763', 'academic_class': 'Computer Science 8 - 1 (E)'},
{'grade_3': '82.40%', 'tag_a_data-analysis': '113086978', 'academic_class': 'English 8/Boys - 3 (G)'},
{'grade_4': '50.77%', 'tag_a_data-analysis': '113087637', 'academic_class': 'Leadership Seminar 8 - 1 (Q1B)'}
]
'''
I do hope that this is helpful. Regards...
Answered By - d r
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.