Issue
The task is to get data from the site. I have 800 URLs to request, but it takes a long time. I use aiohttp. At this stage I have collected the links from the main page; following each of them yields a further set of links. I applied aiohttp, but the code is still slow: 390.9560036659241 sec. Sorry if this is a simple kind of question, but I have little experience with asyncio, so any help would be massively appreciated. Thanks
import json
import time
import requests
from bs4 import BeautifulSoup
import datetime
import csv
import asyncio
import aiohttp
iso_data = []
iso_list = []
iso_catalogue = []
iso_links = ''
start_time = time.time()
async def get_page_data(session, url):  # get links (256) from main page
    url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"
    async with session.get(url=url) as response:
        response_text = await response.text()
        soup = BeautifulSoup(response_text, "lxml")
        iso_link = soup.find("tbody")
        for iso in iso_link.find_all("tr"):
            iso_url = iso.find('a').attrs['href']
            d = iso.find('a').text
            m = iso.find('td', {'data-title': 'Title'}).text
            try:
                level_2 = (f'{d}{m}').strip()
            except:
                level_2 = "nothing"
            iso_links = f'https://www.iso.org{iso_url}'
            iso_list.append(iso_links)
            iso_data.append({'level_1': 'tc', 'level_2': level_2})
    return iso_list


async def collect_data():  # get 800 links
    async with aiohttp.ClientSession() as session:
        for i in iso_list:
            response = await session.get(url=i)
            soup = BeautifulSoup(await response.text(), "lxml")
            row = soup.find_all('td', attrs={'data-title': 'Subcommittee'})
            if row:
                for el in row:
                    a = el.find('a').attrs['href']
                    iso_catalogue.append(f'https://www.iso.org{a}')
            else:
                iso_catalogue.append(iso_links)
    return iso_catalogue


async def gather_data():
    url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"
    async with aiohttp.ClientSession() as session:
        response = await session.get(url=url)
        soup = BeautifulSoup(await response.text(), "lxml")
        tasks = []
        task = asyncio.create_task(get_page_data(session, url))
        tasks.append(task)
        await asyncio.gather(*tasks)


async def worker_iso(q):
    for urls in out:
        while True:
            response = await q.get(urls)
            soup = BeautifulSoup(await response.text(), "lxml")
            for i in soup.find_all('tr', {'ng-show': 'pChecked || pChecked == null'}):
                a1 = i.find('a').attrs['href']
                iso_standarts = f'https://www.iso.org{a1}'
                iso_standart.append(iso_standarts)
            q.task_done()


def main():
    asyncio.run(gather_data())
    asyncio.run(collect_data())
    cur_time = datetime.datetime.now().strftime("%d_%m_%Y_%H_%M")
    finish_time = time.time() - start_time
    print(f"Spend time: {finish_time}")


if __name__ == "__main__":
    main()
Solution
I slightly reworked your example from the question. Right now you're opening the 256 links from the main page serially, which is why it takes so long.
In my example I create 16 workers (coroutines) that share one Queue. The workers wait for new values to be put on the queue and then process the requests.
The 256 pages are opened and processed on my computer in ~19 seconds:
import tqdm  # <-- I use this for nice progress bar/timing
import asyncio
import aiohttp
from bs4 import BeautifulSoup

out = []


async def get_soup(session, url):
    async with session.get(url=url) as resp:
        return BeautifulSoup(await resp.text(), "lxml")


async def worker(session, q):
    while True:
        url, link_name, title = await q.get()
        soup = await get_soup(session, url)
        links = soup.select('[data-title="Subcommittee"] a')
        if links:
            for a in links:
                out.append("https://www.iso.org" + a["href"])
        else:
            out.append(url)
        q.task_done()


async def main():
    url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"
    async with aiohttp.ClientSession() as session:
        soup = await get_soup(session, url)
        titles = soup.select('td[data-title="Title"]')
        links = soup.select('td[data-title="Committee"] a')

        committees = []
        for a, t in zip(links, titles):
            committees.append(
                [
                    "https://www.iso.org" + a["href"],
                    a.get_text(strip=True),
                    t.get_text(strip=True),
                ]
            )

        queue = asyncio.Queue(maxsize=16)
        tasks = []

        # create 16 workers that will process data in parallel
        for i in range(16):
            task = asyncio.create_task(worker(session, queue))
            tasks.append(task)

        # put some data to worker queue
        for c in tqdm.tqdm(committees):
            await queue.put(c)

        # wait for all data to be processed
        await queue.join()

        # cancel all worker tasks
        for task in tasks:
            task.cancel()

        # Wait until all worker tasks are cancelled.
        await asyncio.gather(*tasks, return_exceptions=True)

    print(len(out))


if __name__ == "__main__":
    asyncio.run(main())
Prints:
100%|██████████████████████████████████████████████████████████████████| 256/256 [00:19<00:00, 13.18it/s]
653
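A note on the design: maxsize=16 on the queue also throttles the producer, since queue.put waits once 16 items are pending. If you would rather not manage explicit worker tasks, the same concurrency limit can be expressed with asyncio.Semaphore plus asyncio.gather. The following is a minimal sketch of that alternative, not part of the answer above; the fetch_all helper, the limit of 16, and the example URL are illustrative assumptions:

import asyncio
import aiohttp
from bs4 import BeautifulSoup


async def fetch_all(urls, limit=16):
    # Bound concurrency with a semaphore instead of explicit worker tasks.
    sem = asyncio.Semaphore(limit)

    async def fetch_one(session, url):
        async with sem:  # at most `limit` requests are in flight at once
            async with session.get(url) as resp:
                return BeautifulSoup(await resp.text(), "lxml")

    async with aiohttp.ClientSession() as session:
        # gather() schedules all coroutines at once; the semaphore throttles them
        return await asyncio.gather(*(fetch_one(session, u) for u in urls))


if __name__ == "__main__":
    # hypothetical usage: replace with the committee URLs collected above
    example_urls = ["https://www.iso.org/standards-catalogue/browse-by-tc.html"]
    soups = asyncio.run(fetch_all(example_urls))
    print(len(soups))

Both approaches bound the number of in-flight requests; the queue/worker version is handy when URLs are produced incrementally, while the semaphore version is shorter when you already have the full list.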
Answered By - Andrej Kesely