Issue
I want to gather all of the Global 2000 data from Forbes: https://www.forbes.com/global2000/#23d5375d335d
I've been trying to find what other people have done. The most recent project I found on GitHub worked for these billionaire datasets. There isn't any documentation for the API, so I'm not sure whether there is an endpoint for the Global 2000 list.
I have never worked with APIs before. I want to make sure this is gatherable data...
"""
Scrapes lists from Forbes.
The Forbes API is undocumented, so this code could break if the URL structure changes.
"""
import requests
import numpy as np
from pandas import DataFrame
from pathlib import Path
# Forbes lists
lists = [
    { 'type': 'person', 'year': 2017, 'uri': 'billionaires' }, # World richest
    { 'type': 'person', 'year': 2017, 'uri': 'forbes-400' }, # American richest 400
    { 'type': 'person', 'year': 2017, 'uri': 'hong-kong-billionaires' }, # Hong Kong richest 50
    { 'type': 'person', 'year': 2017, 'uri': 'australia-billionaires' }, # Australia richest 50
    { 'type': 'person', 'year': 2017, 'uri': 'china-billionaires' }, # China richest 400
    { 'type': 'person', 'year': 2017, 'uri': 'taiwan-billionaires' }, # Taiwan richest 50
    { 'type': 'person', 'year': 2017, 'uri': 'india-billionaires' }, # India richest 100
    { 'type': 'person', 'year': 2017, 'uri': 'japan-billionaires' }, # Japan richest 50
    { 'type': 'person', 'year': 2017, 'uri': 'africa-billionaires' }, # Africa richest 50
    { 'type': 'person', 'year': 2017, 'uri': 'korea-billionaires' }, # Korea richest 50
    { 'type': 'person', 'year': 2017, 'uri': 'malaysia-billionaires' }, # Malaysia richest 50
    { 'type': 'person', 'year': 2017, 'uri': 'philippines-billionaires' }, # Philippines richest 50
    { 'type': 'person', 'year': 2017, 'uri': 'singapore-billionaires' }, # Singapore richest 50
    { 'type': 'person', 'year': 2017, 'uri': 'indonesia-billionaires' }, # Indonesia richest 50
    { 'type': 'person', 'year': 2017, 'uri': 'thailand-billionaires' }, # Thailand richest 50
    { 'type': 'person', 'year': 2017, 'uri': 'self-made-women' }, # American richest self-made women
    { 'type': 'person', 'year': 2017, 'uri': 'richest-in-tech' }, # tech richest
    { 'type': 'person', 'year': 2017, 'uri': 'hedge-fund-managers' }, # hedge fund highest-earning
    { 'type': 'person', 'year': 2016, 'uri': 'powerful-people' }, # world powerful
    { 'type': 'person', 'year': 2017, 'uri': 'power-women' }, # world powerful women
    { 'type': 'person', 'year': 0, 'uri': 'rtb' }, # real-time world billionaires
    { 'type': 'person', 'year': 0, 'uri': 'rtrl' }, # real-time American richest 400
]
url = 'http://www.forbes.com/ajax/list/data'
SOURCES_DIR = Path('./sources')
for forbes_list in lists:
    response = requests.get(url, params=forbes_list)
    if not SOURCES_DIR.exists():
        SOURCES_DIR.mkdir(exist_ok=True, parents=True)
    DataFrame(response.json()).to_csv('sources/forbes-{}.csv'.format(forbes_list['uri']))
Solution
If all you want is the Forbes Global 2000 list, then you might want to try this:
import pandas as pd
import requests
headers = {
"accept": "application/json, text/plain, */*",
"referer": "https://www.forbes.com/global2000/",
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36",
}
cookies = {
"notice_behavior": "expressed,eu",
"notice_gdpr_prefs": "0,1,2:1a8b5228dd7ff0717196863a5d28ce6c",
}
api_url = "https://www.forbes.com/forbesapi/org/global2000/2020/position/true.json?limit=2000"
response = requests.get(api_url, headers=headers, cookies=cookies).json()
sample_table = [
    [
        item["organizationName"],
        item["country"],
        item["revenue"],
        item["profits"],
        item["assets"],
        item["marketValue"],
    ]
    for item in sorted(
        response["organizationList"]["organizationsLists"],
        key=lambda k: k["position"],
    )
]
df = pd.DataFrame(sample_table, columns=["Company", "Country", "Sales", "Profits", "Assets", "Market Value"])
df.to_csv("forbes_2020.csv", index=False)
Output: a DataFrame of the 2000 ranked companies with Company, Country, Sales, Profits, Assets, and Market Value columns, written to forbes_2020.csv.
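As a quick sanity check, you could inspect the frame before relying on the CSV. This continues from the snippet above and assumes the request succeeded and df was built as shown:
print(df.shape)     # expect (2000, 6) if the full list came back
print(df.head(10))  # the ten highest-ranked companies, since rows are sorted by position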
Interestingly enough, if you want all of the fields for a given year, just pick the year and dump the response's list of dictionaries into a pandas DataFrame, and there you have it! =]
import pandas as pd
import requests
headers = {
"accept": "application/json, text/plain, */*",
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/87.0.4280.67 Safari/537.36",
}
cookies = {
"notice_behavior": "expressed,eu",
"notice_gdpr_prefs": "0,1,2:1a8b5228dd7ff0717196863a5d28ce6c",
}
year = 2019
api_url = f"https://www.forbes.com/forbesapi/org/global2000/{year}/position/true.json?limit=2000"
response = requests.get(api_url, headers=headers, cookies=cookies).json()
df = pd.DataFrame(
    sorted(
        response["organizationList"]["organizationsLists"],
        key=lambda k: k["position"],
    )
)
df.to_csv("forbes_2019.csv", index=False)
Answered By - baduker