Issue
Following https://medium.com/analytics-vidhya/how-to-scrape-data-from-a-website-using-python-for-beginner-5c770a1fbe2d, I have started scraping data from a website that requires login information. My website is a bit different, and I do get a result, but it is not in the format I expected. Code:
from pprint import pprint
import datetime
import requests
from bs4 import BeautifulSoup
import pandas as pd  # required for pd.DataFrame at the end of the script

# Script purpose: request the job-search page with a copied browser session
# (cookies + headers), parse the HTML, and put the text of every matching
# <div> into a single-column DataFrame.

# NOTE(review): these are live session cookies copied from a browser. They
# act as credentials and expire; do not publish or commit real values.
cookies = {
    'CFID': '180615757',
    'CFTOKEN': '64089929988eb934-58E2ACC9-AD21-785B-2AFBCE86106B41FE',
    'visid_incap_2388351': '0Vmr7QpDRvmVw8fbXUJFkB5XEWMAAAAAQUIPAAAAAADtlXunU/D8GLU5VofHHier',
    '_ga_6ZQNJ4ELG2': 'GS1.1.1662315508.15.1.1662315668.0.0.0',
    '_ga': 'GA1.2.147261521.1662080801',
    '_gid': 'GA1.2.1149490171.1662080801',
    'reese84': '3:yMGXsdMquwoCj3IoSFRCMg==:Vf20HwL77P8oWYTTKbE0XigwyQE3d2lLQpPVoZYcoL8SJTmLeqAani+7GspfC2BiJYOOytBlkIp9MewLgs/XbkaiLrSvLnMdZ0aT8/M9FvBohByybnJXNl25ya/yfpGhL9oT1HKMZYnKqSR0Sg8+nHTUEO0/YErJgQmfoeYIT4kmE01S8cndGIemtuGjvq1hzB/D9VAQL7S3idutOumBNu84j5FyCdOBClCJTriE+X9j40lj1swIxFlryTmBAtLHnEvN9M57N4LMb13yuSBaCawrv4fnron0JnUvfKpLU0CXTnpcM9hJNGv9Ekb4Ap43CZDPdeLVzEmj+39wCVtXPtMqBNCU6mPVBSeJCRHyRuQjY+y0Sv5w7ME2LXhT8bEGHyE8yeuxddxvoG51STebu+pb0mSp5n+iKotUEn9h+sA=:WH64twwKGqtE4pUorYOeGylONeXRsfG+3Qe3zAfpdrs=',
    '__atuvc': '65%7C35%2C2%7C36',
    'COOKIESTATUS': 'ON',
    'HIDECOOKIEBANNER': 'TRUE',
    'nlbi_2388351': 'jGGxMFazFBqnU+x+okRrFAAAAAC/AJ/k+R2U+vs5Q4LIRTS7',
    'nlbi_2388351_2147483392': 'PUildkEvtiZ9uje3okRrFAAAAABv1NR/7gPLX7Lc/iS5ei8N',
    'incap_ses_989_2388351': 'mWy+Uq7aLX000xomDaO5DfTrFGMAAAAA6XmB42vG5CO6i609/RhyKg==',
    'incap_ses_468_2388351': 'sDNcR2labTHyNXYlUqx+BipAFGMAAAAAImV2A07lGANZGfpvhvPlLg==',
    '__atuvs': '6314ec0cdbe92a78001',
    '_gat_gtag_UA_12825325_1': '1',
}

# Browser-like request headers sent with the search request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    # 'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.higheredjobs.com/admin/',
    'Connection': 'keep-alive',
    # The raw 'Cookie' header is intentionally not set here: the same values
    # are passed through the cookies= argument instead (note that Requests
    # sorts cookies= alphabetically).
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    # Requests doesn't support trailers
    # 'TE': 'trailers',
}

# Query-string parameters for the search request.
params = {
    'JobCat': '141',
    'CatName': 'Academic Advising',
}

response = requests.get(
    'https://www.higheredjobs.com/admin/search.cfm',
    params=params,
    cookies=cookies,
    headers=headers,
)
soup = BeautifulSoup(response.text, 'html.parser')

# Every matching <div> contributes its complete, unsplit text, which is why
# the resulting DataFrame has a single column full of raw whitespace.
job_title = soup.find_all('div', class_=["row record", "col-sm-5 text-sm-right"])
jobs_list = [tag.text for tag in job_title]
df = pd.DataFrame({'Jobs title': jobs_list})
Present output:
df =
Jobs title
0 \n\nRe-Sort\n\n\r\n\t\t\tResults 1 - 70 of 70\...
1 \n\n\r\n\t\t\t\t\t\t\t\t\t\t\tAssistant Profes...
2 \r\n\t\t\t\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\t\t...
Expected output:
df =
Jobs title Company name location Posted
0 Assistant Professor/Associate University of Southern Indiana Evansville, IN 09/02/22
Professor of Engineering,
Pott College of Science,
Engineering, and Education - F22057F1
Solution
The main issue is that you try to create your DataFrame from unstructured data that is collected in your list. So structure it first, e.g. as a dict, append that dict to your list, and then create your DataFrame:
# Column names, paired in order with the text fragments of each record.
columns = ['title','university','location','study','date']

# One dict per '.row.record' element: zip() pairs each column name with the
# next whitespace-stripped string found inside that element.
jobs_list = []
for record in soup.select('.row.record'):
    jobs_list.append(dict(zip(columns, record.stripped_strings)))

# A list of dicts becomes one row per dict, one column per key.
df = pd.DataFrame(jobs_list)
Note: If you would like to change the headers, change this list -> ['title','university','location','study','date']
Example
from bs4 import BeautifulSoup
import pandas as pd  # required for pd.DataFrame below

# Self-contained example: parse two sample job records and turn them into a
# DataFrame with one row per record.
html ='''
<div class="row record">
<div class="col-sm-7"><a href="details.cfm?JobCode=178085874&Title=Assistant%20Professor%2FAssociate%20Professor%20of%20Engineering%2C%20Pott%20College%20of%20Science%2C%20Engineering%2C%20and%20Education%20%2D%20F22057F1">
Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1</a>
<br/>
University of Southern Indiana <br/>
Evansville, IN
</div>
<div class="col-sm-5 text-sm-right">
Electrical Engineering
<br/> Posted 09/02/22<br/>
</div>
</div>
<div class="row record">
<div class="col-sm-7">
<a href="details.cfm?JobCode=178085843&Title=Assistant%20Professor%20of%20Engineering%20F99507">
Assistant Professor of Engineering F99507</a>
<br/>
McNeese State University <br/>
Lake Charles, LA
</div>
<div class="col-sm-5 text-sm-right">
Electrical Engineering
<br/> Posted 09/02/22<br/>
</div>
</div>
'''

# Name the parser explicitly so the result does not depend on which parsers
# happen to be installed (and to avoid the "no parser specified" warning).
soup = BeautifulSoup(html, 'html.parser')

# Column names, paired in order with the text fragments of each record.
columns = ['title','university','location','study','date']

# One dict per '.row.record' element: zip() pairs each column name with the
# next whitespace-stripped string found inside that element.
jobs_list = [
    dict(zip(columns, record.stripped_strings))
    for record in soup.select('.row.record')
]

df = pd.DataFrame(jobs_list)
print(df)
Output
| | title | university | location | study | date |
|---|---|---|---|---|---|
| 0 | Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1 | University of Southern Indiana | Evansville, IN | Electrical Engineering | Posted 09/02/22 |
| 1 | Assistant Professor of Engineering F99507 | McNeese State University | Lake Charles, LA | Electrical Engineering | Posted 09/02/22 |
Answered By - HedgeHog
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.