Issue
So, I was trying to scrape a table from a website. I managed to scrape the table from the first URL, but I'm not sure how to iterate to the next URL.
Here's my code for one URL:
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import pandas as pd

u = 'https://www.bursamalaysia.com/market_information/announcements/company_announcement/announcement_details?ann_id=393739'
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# open the URL
driver = webdriver.Chrome('chromedriver', options=options)
driver.get(u)
html = driver.page_source
soup = bs(html, 'html.parser')
# the announcement body lives in an iframe; open its src directly
iframe = soup.find('iframe')['src']
driver.get(iframe)
iframehtml = driver.page_source
soupiframe = bs(iframehtml, 'html.parser')
# extract the tables
df = pd.read_html(iframehtml)
table1 = df[1]
table2 = df[2]
table3 = df[3]
# clean up: pivot each two-column key/value table into a single row
t1 = table1.set_index([0, table1.groupby(0).cumcount()])[1].unstack(0)
t1['Remarks'] = table2.iloc[1]
t3 = table3.set_index([0, table3.groupby(0).cumcount()])[1].unstack(0)
# join all tables
frame = [t1, t3]
merge = pd.concat(frame, axis=1, join="outer", ignore_index=False)
merge
And now I don't know how to iterate over two or more URLs in this script. I tried passing a set of URLs, but driver.get() only accepts a single URL string, so this doesn't work (the rest of the code is the same as above):

u = {'https://www.bursamalaysia.com/market_information/announcements/company_announcement/announcement_details?ann_id=393739', 'https://www.bursamalaysia.com/market_information/announcements/company_announcement/announcement_details?ann_id=393738'}
The output should look like this:
Solution
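The announcement page itself only embeds an iframe; that iframe's src points at https://disclosure.bursamalaysia.com/FileAccess/viewHtml with the ann_id passed as the e query parameter. So there's no need for a headless browser at all: request that endpoint directly with httpx, and let trio run a few workers concurrently, feeding the ids through a memory channel.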
import trio
import httpx
import pandas as pd

# ann_id values taken from the announcement URLs
keys = [393738, 393739]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"
}

# per-announcement frames, appended by the workers
allin = []
async def worker(channel):
    # each worker pulls announcement ids off the channel until it is closed
    async with channel:
        async for key_ in channel:
            async with httpx.AsyncClient(timeout=None) as client:
                client.headers.update(headers)
                params = {
                    "e": key_
                }
                r = await client.get(
                    'https://disclosure.bursamalaysia.com/FileAccess/viewHtml', params=params)
                tables = pd.read_html(r.text, index_col=0)
                # transpose the key/value tables into rows and join them
                df = tables[1].T.join(tables[-1].T)
                df['Remarks'] = tables[2].iloc[1].name
                allin.append(df)


async def main():
    async with trio.open_nursery() as nurse:
        sender, receiver = trio.open_memory_channel(0)
        async with receiver:
            # three workers share clones of the receive channel
            for _ in range(3):
                nurse.start_soon(worker, receiver.clone())
            async with sender:
                for k in keys:
                    await sender.send(k)
    # the nursery exits only after every worker is done
    finaldf = pd.concat(allin, ignore_index=True)
    print(finaldf)
    # finaldf.to_csv('data.csv', index=False)


if __name__ == "__main__":
    trio.run(main)
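A note on the design: trio.open_memory_channel(0) creates an unbuffered channel, so main blocks on each send until one of the three workers picks the id up. Closing the sender ends the workers' async for loops, and the nursery block only exits once every worker has finished, which is why pd.concat can safely run right after it.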
Output:
0 Date of change Type of change ... Reference No Remarks
0 11/11/2011 Resignation ... CC-111111-50017 Resigned as Chief Executive Officer of the Com...
1 31/12/2011 Others ... CC-110907-47379 It was Mr Yen Wen Hwa's desire to retire and t...
[2 rows x 17 columns]
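If the concurrency machinery is more than you need, the same idea works as a plain loop over the ids. A minimal synchronous sketch, assuming the same endpoint and table layout as the trio version above:

import httpx
import pandas as pd

keys = [393738, 393739]
frames = []
with httpx.Client(headers={"User-Agent": "Mozilla/5.0"}, timeout=None) as client:
    for key in keys:
        # fetch the iframe document for this announcement id
        r = client.get(
            'https://disclosure.bursamalaysia.com/FileAccess/viewHtml',
            params={"e": key},
        )
        tables = pd.read_html(r.text, index_col=0)
        df = tables[1].T.join(tables[-1].T)
        df['Remarks'] = tables[2].iloc[1].name
        frames.append(df)

print(pd.concat(frames, ignore_index=True))

This fetches each page in turn; the trio version above does the same work concurrently, which pays off as the list of ann_id values grows.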
Answered By - αԋɱҽԃ αмєяιcαη