Issue
I have been stuck on this for hours.
The entire code has been uploaded below.
The url: "https://www.bostonpublicschools.org/Page/628" is of only 1 page. However, it has 5 pages within itself that can be loaded by clicking on the page-icon at the bottom. This loads a javascript file that cannot be opened on a new tab (trying to open results in page getting blocked). Also, it loads on the same page.
Along with that, I'm trying to fetch email IDs
by visiting the individual school websites. But there is a catch that I'm unable to overcome: some email IDs appear inside
one kind of tag and some inside another, and some at different paragraph positions, say index 3 or 2. Is there any way I can fetch all of them? Yes,
school.find('div', {'class': "ui-column-one-half region"}).find(class_="ui-article-description")
this code can fetch them but it's really hard to filter as I mentioned before.
So far I've looked at more than 100 solutions and tried many of them, but none of them worked for me. I do believe this could be solved using Selenium,
as it can interact with the UI.
Is there any way I can get the job done within BeautifulSoup?
from bs4 import BeautifulSoup as bs
import requests
import re
valid = False
def URL_Parser(url):
# Desktop user-agent
DESKTOP_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
# Mobile user-agent
MOBILE_USER_AGENT = "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"
headers = {"user-agent": MOBILE_USER_AGENT}
resp = requests.get(url, headers=headers)
return resp
def validate_mail(e_mail):
return bool(re.search(r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$", e_mail))
def main():
global valid
URL = "https://www.bostonpublicschools.org/Page/628"
response = URL_Parser(URL)
if response.status_code == 200:
soup = bs(response.content, "html.parser")
school_database = {"School Name": [],
"School Email ID": [],
"School Link": []
}
links = []
for link in soup.findAll('a', attrs={'href': re.compile("^https://www.bostonpublicschools.org/Page/")}):
if link.get('aria-invalid'):
links.append(link.get('href'))
for link in links:
school = bs(URL_Parser(link).content, 'html.parser')
school_name = school.find('div', {'class': "ui-widget app flexpage"}).find(class_="ui-widget-header").select('h1')[0].text.strip()
try:
school_email = school.find('div', {'class': "ui-column-one-half region"}).find(class_="ui-article-description").select('p')[2].text.strip()
valid = validate_mail(school_email)
if not valid:
school_email = school.find('div', {'class': "ui-column-one-half region"}).find(class_="ui-article-description").select('p')[1].text.strip()
valid = validate_mail(school_email)
if not valid:
school_email = school.find('div', {'class': "ui-column-one-half region"}).find(class_="ui-article-description").select('p')[3].text.strip()
valid = validate_mail(school_email)
except IndexError:
print(">>> Email is not in expected place!")
else:
school_database["School Name"].append(school_name)
school_database["School Email ID"].append(school_email)
school_database["School Link"].append(link)
print(school_database["School Link"])
print(school_database["School Name"])
print(school_database["School Email ID"])
if '__init__' == main():
main()
Solution
Try:
import re

import requests
from bs4 import BeautifulSoup

# Fetch the directory landing page once; the pagination widget on it is
# backed by an AJAX endpoint that we can call directly instead of clicking.
DIRECTORY_URL = "https://www.bostonpublicschools.org/Page/628"
page_html = requests.get(DIRECTORY_URL).text
landing = BeautifulSoup(page_html, "html.parser")

# The page's JavaScript opens a popup whose query string carries the
# identifiers the listing endpoint needs; extract them with regexes.
popup_target = re.search(r"window\.open\((.*)\);", page_html).group(1)
module_id = re.search(r"ModuleInstanceID=(\d+)", popup_target).group(1)
page_module_id = re.search(r"PageModuleInstanceID=(\d+)", popup_target).group(1)
directory_type = landing.select_one("[id$=displaytype]")["value"]
# The last pagination item holds the total page count.
page_count = int(landing.select(".ui-pagination-list li")[-1].text)

endpoint = "https://www.bostonpublicschools.org/site/UserControls/Minibase/MinibaseListWrapper.aspx"
query = {
    "ModuleInstanceID": module_id,
    "PageModuleInstanceID": page_module_id,
    "FilterFields": "",
    "DirectoryType": directory_type,
    "PageIndex": "",
}

# Walk every result page and print each school's name.
for page_number in range(1, page_count + 1):
    query["PageIndex"] = page_number
    listing = BeautifulSoup(
        requests.get(endpoint, params=query).content, "html.parser"
    )
    for anchor in listing.select(".sw-flex-item-group a"):
        print(anchor.text)
Prints:
...
UP Academy Holland
Warren/Prescott K-8
West Zone Early Learning Center
Winship, F. Lyman Elementary
Winthrop, John Elementary
Young Achievers Science & Math K-8
Answered By - Andrej Kesely
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.