Issue
I am trying to extract the annual-report PDF links from the website.
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Alaska Air investor-relations SEC filings page; the query string filters
# the listing to form group 471.
url1 = "https://investor.alaskaair.com/financial-information/sec-filings?field_nir_sec_form_group_target_id%5B%5D=471&field_nir_sec_date_filed_value=#views-exposed-form-widget-sec-filings-table"
# Without a timeout, requests waits indefinitely on an unresponsive server.
source = requests.get(url1, timeout=30)
# Stop on an HTTP error status instead of parsing an error page as if it
# were the filings table.
source.raise_for_status()
soup = BeautifulSoup(source.text, "html.parser")
I am trying to extract columns from the URL above, but the View column yields only 8 rows, so building the DataFrame raises `ValueError: arrays must all be same length`.
# Select the <div> elements carrying the filing-group field classes; their
# text becomes the "Filing Group" column.
tag2 = soup.find_all('div' , class_="field field--name-field-nir-sec-form-group field--type-entity-reference field--label-inline field__item")
def filing_group(tag2):
    """Return the whitespace-stripped text of every element in *tag2*.

    Args:
        tag2: Iterable of elements exposing a ``text`` attribute (here, the
            result of ``soup.find_all``).

    Returns:
        A list of strings, one per element, in input order.
    """
    # Built without a local named ``filing_group`` so the function's own
    # name is no longer shadowed inside its body.
    return [element.text.strip() for element in tag2]
# Called for inspection only; the returned list is not stored here.
filing_group(tag2)
# Select the <span> elements carrying the PDF file classes; view() reads the
# href of the <a> child of each one.
tag4 = soup.find_all('span' , class_ = "file file--mime-application-pdf file--application-pdf")
def view(tag4):
    """Return the link target of every element in *tag4*.

    Args:
        tag4: Iterable of elements whose ``a`` attribute is the anchor
            holding the link (here, the result of ``soup.find_all``).

    Returns:
        A list with one entry per input element, in input order: the
        anchor's ``href`` value, or ``None`` when the element has no usable
        anchor.
    """
    links = []
    for element in tag4:
        # Handle the failure per element. With the try around the whole
        # loop, the first element lacking an anchor ended the iteration and
        # every later link was silently dropped.
        try:
            links.append(element.a.get('href'))
        except AttributeError:
            links.append(None)
    return links
# Called for inspection only; the returned list is not stored here.
view(tag4)
def scrape_page():
    """Collect the scraped columns of the page into one dictionary.

    Reads the module-level ``tag2`` and ``tag4`` selections.

    Returns:
        A dict with two list-valued keys: ``"Filing Group"`` (from
        ``filing_group(tag2)``) and ``"View"`` (from ``view(tag4)``).
    """
    group_column = list(filing_group(tag2))
    link_column = list(view(tag4))
    return {
        "Filing Group": group_column,
        "View": link_column,
    }
# Build the DataFrame from the two columns. This is the line the post reports
# as failing: the "View" list has fewer entries than "Filing Group", and the
# constructor raises ValueError for columns of unequal length.
scrape_page_df = pd.DataFrame(scrape_page())
Solution
Use:
# Walk the filings table row by row so that each row contributes exactly one
# entry to both columns, keeping them the same length.
table = soup.find('table', {'class':"nirtable views-table views-view-table cols-5 collapse-table-wide"})
# One list of <td> cells per <tr>; rows without <td> cells give an empty list.
trs = [x.find_all('td') for x in table.find_all('tr')]
vs = []
ls = []
for tr in trs:
    # Skip rows that have no <td> cells.
    if not tr:
        continue
    # The second cell holds the link whose href is appended to the site root.
    v = 'https://investor.alaskaair.com/' + tr[1].a['href']
    print(v)
    try:
        l = tr[4].find('span', {'class':'file file--mime-application-pdf file--application-pdf'}).a.get('href')
    except (AttributeError, IndexError):
        # No fifth cell, no PDF span, or no anchor inside it: record None so
        # the row still gets an entry. Other exceptions now propagate instead
        # of being swallowed by a bare except.
        l = None
    print(l)
    vs.append(v)
    ls.append(l)
pd.DataFrame({'v':vs, 'l':ls})
Output:
Answered By - keramat
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.