Issue
I'm trying to list the position titles of all the jobs on a job-listing page. For some reason, when I apply the function to the BeautifulSoup object to get all the 'id's, it returns None for some of them. How do I remove the results that contain None?
my code:
from bs4 import BeautifulSoup
import requests
from sys import exit
import time
# Keyword to search for; it is interpolated into the listing URL below.
job_name = "python"

# Download the search-results page for that keyword and keep its HTML body.
html_text = requests.get(
    f"https://www.jobstreet.com.my/{job_name}-jobs/"
).text

# Parse the listing page with the lxml parser.
web_html = BeautifulSoup(html_text, features="lxml")
def info(link):
    """Fetch one job posting and print its position title.

    Args:
        link: Site-relative path of the posting; it is appended to the
            ``https://www.jobstreet.com.my`` host.

    Postings on which the title heading cannot be found are skipped, so no
    ``None`` result is printed and no ``AttributeError`` is raised.
    """
    page_job = requests.get(f"https://www.jobstreet.com.my{link}").text
    web_html2 = BeautifulSoup(page_job, "lxml")
    # find() returns None when nothing matches, so keep the tag itself and
    # check it before touching .text.
    position = web_html2.find("h1", class_="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21")
    if position is None:
        return
    print(f"Position : {position.text}")
# Locate the results container and the first job anchor inside it. find()
# returns None when the class string does not match (these class names look
# auto-generated and may change), so guard the chained lookup instead of
# crashing with AttributeError before the loop below ever runs.
list_job = web_html.find("div", class_="z1s6m00 _1hbhsw67i _1hbhsw66e _1hbhsw69q _1hbhsw68m _1hbhsw6n _1hbhsw65a _1hbhsw6ga _1hbhsw6fy")
open_listjob = (
    list_job.find('a', class_="jdlu994 jdlu996 jdlu999 y44q7i2 z1s6m00 z1s6m0f _1hbhsw6h")
    if list_job is not None
    else None
)

# Visit every anchor whose href points at a job posting; href=True already
# guarantees the attribute exists, so it is read once and reused.
for link in web_html.find_all('a', href=True):
    temp_link = link['href']
    if '/en/job' in temp_link:
        info(temp_link)
output:
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Internship for Software Engineer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Software Engineer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Software Engineer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Software Developer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Project Implementation Engineer (Software Engineer)</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Fresh/Junior Software Engineer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Lead Engineer - Cloud (Hybrid Working)</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Sofware Developer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Senior Python Developer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Robotics Software Engineer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Python Developer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Data Scientist / Analyst</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">R&D Software Developer (DevOps)</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Software Developer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Software Engineer (Junior)</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Software Developer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Software Engineer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Senior Software Programmer</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Data Scientist (AI Systems and Applications)</h1>
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Software Development Engineer</h1>
None
None
<h1 class="z1s6m00 _1hbhsw64y y44q7i0 y44q7il _1d0g9qk4 y44q7is y44q7i21">Software Engineer - Equipment Software Integration</h1>
None
None
None
None
None
None
None
expected output:
1. Position: Internship for Software Engineer
2. Position: Software Engineer
3. Position: Software Engineer
...
..
Solution
There's no need to specify the class attributes especially as they're likely to change from time to time - i.e., they are not constant.
I suggest a different structure that includes multithreading for optimal performance.
import requests
from bs4 import BeautifulSoup as BS
from threading import Thread
# Search keyword; it becomes part of the listing URL path.
job_name = "python"
# Search-results page that main() downloads to discover the job links.
url = f"https://www.jobstreet.com.my/{job_name}-jobs/"
def process(href):
    """Fetch one job posting and print the text of its first ``<h1>``.

    Args:
        href: Site-relative path of the posting; it is appended to the
            ``https://www.jobstreet.com.my`` host.

    The function is used as a thread target, so failures are reported on
    stderr instead of being raised: one bad page must not stop the others,
    but it should not disappear silently either. Pages without an ``<h1>``
    print nothing.
    """
    import sys  # local import: only needed for the error report below

    try:
        # The timeout keeps a stalled connection from blocking this worker
        # (and the join() that waits for it) indefinitely.
        with requests.get(f'https://www.jobstreet.com.my{href}', timeout=30) as response:
            response.raise_for_status()
            soup = BS(response.text, 'lxml')
            if (h := soup.find('h1')):
                print(h.getText())
    except Exception as exc:
        # Thread boundary: stay best-effort, but say which page was skipped.
        print(f'Skipping {href}: {exc!r}', file=sys.stderr)
def main():
    """Download the listing page and process every job posting it links to.

    Each anchor whose href starts with ``/en/job`` is handed to process() on
    its own thread; the function returns once all of those threads finish.

    Raises:
        requests.HTTPError: If the listing page responds with an error status.
    """
    with requests.get(url) as response:
        response.raise_for_status()
        soup = BS(response.text, 'lxml')
        # href=True skips anchors that have no href attribute; indexing
        # a['href'] on such a tag would raise KeyError.
        threads = [
            Thread(target=process, args=(a['href'],))
            for a in soup.find_all('a', href=True)
            if a['href'].startswith('/en/job')
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()


if __name__ == '__main__':
    main()
Output:
Fresh/Junior Software Engineer
Software Engineer
Internship for Software Engineer
Robotics Software Engineer
Application Engineer (Software Systems)
Data Scientist / Analyst
Software Developer
etc
Answered By - CtrlZ
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.