Issue
Currently attempting to select the "sector" for a job posting. Not having an luck with xpath or css selector thus far...any assistance would be appreciated! My code is already iterating through each job posting successfully and pulling company name, location, job description etc. My vs code is below.
Original code credit: Omer Sakarya and Ken Jee.
Here are some of my attempts:
'.//span[@class="css-1ff36h2 e1pvx6aw0"]'
'.//div[@id="EmpBasicInfo"]//div[@class="d-flex flex-wrap"]/div[5]/span[@class="css-1ff36h2 e1pvx6aw0"]'
'.//div[@class="EmpBasicInfo"]//span[text()="Sector"]//following-sibling::*'
Glassdoor Job Posting/Sector Element
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver.common.by import By
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def get_jobs(keyword, num_jobs, verbose, path, slp_time):
'''Gathers jobs as a dataframe, scraped from Glassdoor'''
#Initializing the webdriver
options = webdriver.ChromeOptions()
#Uncomment the line below if you'd like to scrape without a new Chrome window every time.
#options.add_argument('headless')
#Change the path to where chromedriver is in your home folder.
driver = webdriver.Chrome(executable_path=path, options=options)
driver.set_window_size(1120, 1000)
url='https://www.glassdoor.com/Job/' + keyword + '-jobs-SRCH_KO0,14.htm'
driver.get(url)
jobs = []
while len(jobs) < num_jobs: #If true, should be still looking for new jobs.
#Let the page load. Change this number based on your internet speed.
#Or, wait until the webpage is loaded, instead of hardcoding it.
time.sleep(slp_time)
#Test for the "Sign Up" prompt and get rid of it.
try:
driver.find_element(By.CSS_SELECTOR, '[data-selected="true"]').click()
except ElementClickInterceptedException:
pass
time.sleep(.1)
try:
driver.find_element(By.XPATH,('.//div[@id="JAModal"]//span[@alt="Close"]')).click()
except NoSuchElementException:
pass
#Going through each job in this page
job_buttons = driver.find_elements(By.CSS_SELECTOR,'[data-test="job-link"]') #jl for Job Listing. These are the buttons were going to click.
for job_button in job_buttons:
print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs)))
if len(jobs) >= num_jobs:
break
job_button.click() #You might
time.sleep(1)
collected_successfully = False
while not collected_successfully:
try:
company_name = driver.find_element(By.XPATH,'.//div[@class="css-xuk5ye e1tk4kwz5"]').text
location = driver.find_element(By.XPATH,'.//div[@class="css-56kyx5 e1tk4kwz1"]').text
job_title = driver.find_element(By.XPATH,'.//div[contains(@class, "css-1j389vi e1tk4kwz2")]').text
job_description = driver.find_element(By.XPATH,'.//div[@class="jobDescriptionContent desc"]').text
collected_successfully = True
except:
time.sleep(5)
try:
salary_estimate = driver.find_element(By.XPATH,'.//span[@class="css-1hbqxax e1wijj240"]').text
except NoSuchElementException:
salary_estimate = -1 #You need to set a "not found value. It's important."
try:
rating = driver.find_element(By.CSS_SELECTOR,'[data-test="detailRating"]').text
except NoSuchElementException:
rating = -1 #You need to set a "not found value. It's important."
#Printing for debugging
if verbose:
print("Job Title: {}".format(job_title))
print("Salary Estimate: {}".format(salary_estimate))
print("Job Description: {}".format(job_description[:500]))
print("Rating: {}".format(rating))
print("Company Name: {}".format(company_name))
print("Location: {}".format(location))
#Going to the Company tab...
#clicking on this:
#<div class="tab" data-tab-type="overview"><span>Company</span></div>
try:
driver.find_element(By.XPATH,'.//div[@class="tab" and @data-tab-type="overview"]').click()
try:
#<div class="infoEntity">
# <label>Headquarters</label>
# <span class="value">San Francisco, CA</span>
#</div>
headquarters = driver.find_element(By.XPATH,'.//div[@class="infoEntity"]//label[text()="Headquarters"]//following-sibling::*').text
except NoSuchElementException:
headquarters = -1
try:
size = driver.find_element(By.XPATH,'.//div[@id="EmpBasicInfo"]//div[@class="d-flex flex-wrap"]/div[1]/span[@class="css-1ff36h2 e1pvx6aw0"]').text
except NoSuchElementException:
size = -1
try:
founded = driver.find_element(By.XPATH,'.//div[@class="css-1pldt9b e1pvx6aw1"]//span[text()="Founded"]//following-sibling::*').text
except NoSuchElementException:
founded = -1
try:
type_of_ownership = driver.find_element(By.XPATH,'.//div[@class="infoEntity"]//label[text()="Type"]//following-sibling::*').text
except NoSuchElementException:
type_of_ownership = -1
try:
industry = driver.find_element(By.XPATH,'.//div[@id="EmpBasicInfo"]//div[@class="d-flex flex-wrap"]/div[4]/span[@class="css-1ff36h2 e1pvx6aw0"]').text
except NoSuchElementException:
industry = -1
try:
sector = driver.find_element(By.XPATH,".//div[@id='EmpBasicInfo']//div[@class='d-flex flex-wrap']/div[5]/span[@class='css-1pldt9b e1pvx6aw1']//following-sibling::*").text
except NoSuchElementException:
sector = -1
try:
revenue = driver.find_element(By.XPATH,'.//span[@class="css-1ff36h2 e1pvx6aw0"]').text
except NoSuchElementException:
revenue = -1
try:
competitors = driver.find_element(By.XPATH,'.//div[@class="infoEntity"]//label[text()="Competitors"]//following-sibling::*').text
except NoSuchElementException:
competitors = -1
except NoSuchElementException: #Rarely, some job postings do not have the "Company" tab.
headquarters = -1
size = -1
founded = -1
type_of_ownership = -1
industry = -1
sector = -1
revenue = -1
competitors = -1
if verbose:
print("Headquarters: {}".format(headquarters))
print("Size: {}".format(size))
print("Founded: {}".format(founded))
print("Type of Ownership: {}".format(type_of_ownership))
print("Industry: {}".format(industry))
print("Sector: {}".format(sector))
print("Revenue: {}".format(revenue))
print("Competitors: {}".format(competitors))
print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
jobs.append({"Job Title" : job_title,
"Salary Estimate" : salary_estimate,
"Job Description" : job_description,
"Rating" : rating,
"Company Name" : company_name,
"Location" : location,
"Headquarters" : headquarters,
"Size" : size,
"Founded" : founded,
"Type of ownership" : type_of_ownership,
"Industry" : industry,
"Sector" : sector,
"Revenue" : revenue,
"Competitors" : competitors})
#add job to jobs
#Clicking on the "next page" button
try:
driver.find_element(By.CSS_SELECTOR, "[alt='next-icon']").click
except NoSuchElementException:
print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
break
return pd.DataFrame(jobs) #This line converts the dictionary object into a pandas DataFrame.
Solution
Looks like I was missing the click through to the other tab on the page.
driver.find_element(By.XPATH,'.//div[@class="css-r7fjfn ead8scz1"]').click()
Thanks all that attempted to assist!
Answered By - BuffaloJ
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.