Issue
The task is to get data from the site. I have 800 URLs to request, but it takes a long time. I use aiohttp. At this stage I have collected the links from the main page; following each of them yields a further set of links. I applied aiohttp, but the code is still slow: 390.9560036659241 sec. Sorry if this is a simple kind of question, but I have little experience with asyncio, so any help would be massively appreciated. Thanks
import json
import time
import requests
from bs4 import BeautifulSoup
import datetime
import csv
import asyncio
import aiohttp
iso_data = []
iso_list = []
iso_catalogue = []
iso_links = ''
start_time = time.time()
async def get_page_data(session, url):  # get links (256) from main page
    url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"
    async with session.get(url=url) as response:
        response_text = await response.text()
        soup = BeautifulSoup(response_text, "lxml")
        iso_link = soup.find("tbody")
        for iso in iso_link.find_all("tr"):
            iso_url = iso.find('a').attrs['href']
            d = iso.find('a').text
            m = iso.find('td', {'data-title': 'Title'}).text
            try:
                level_2 = (f'{d}{m}').strip()
            except:
                level_2 = "nothing"
            iso_links = f'https://www.iso.org{iso_url}'
            iso_list.append(iso_links)
            iso_data.append({'level_1': 'tc', 'level_2': level_2})
    return iso_list


async def collect_data():  # get 800 links
    async with aiohttp.ClientSession() as session:
        for i in iso_list:
            response = await session.get(url=i)
            soup = BeautifulSoup(await response.text(), "lxml")
            row = soup.find_all('td', attrs={'data-title': 'Subcommittee'})
            if row:
                for el in row:
                    a = el.find('a').attrs['href']
                    iso_catalogue.append(f'https://www.iso.org{a}')
            else:
                iso_catalogue.append(iso_links)
    return iso_catalogue


async def gather_data():
    url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"
    async with aiohttp.ClientSession() as session:
        response = await session.get(url=url)
        soup = BeautifulSoup(await response.text(), "lxml")
        tasks = []
        task = asyncio.create_task(get_page_data(session, url))
        tasks.append(task)
        await asyncio.gather(*tasks)


async def worker_iso(q):
    for urls in out:
        while True:
            response = await q.get(urls)
            soup = BeautifulSoup(await response.text(), "lxml")
            for i in soup.find_all('tr', {'ng-show': 'pChecked || pChecked == null'}):
                a1 = i.find('a').attrs['href']
                iso_standarts = f'https://www.iso.org{a1}'
                iso_standart.append(iso_standarts)
            q.task_done()


def main():
    asyncio.run(gather_data())
    asyncio.run(collect_data())
    cur_time = datetime.datetime.now().strftime("%d_%m_%Y_%H_%M")
    finish_time = time.time() - start_time
    print(f"Spend time: {finish_time}")


if __name__ == "__main__":
    main()
Solution
I slightly reworked your example from the question. Right now you're opening the 256 links from the main page serially, which is why it takes so long.
In my example I create 16 workers (coroutines) that share one Queue. The workers wait for new values to be put on the queue and then process the requests.
The 256 pages are opened and processed on my computer in ~19 seconds:
import tqdm  # <-- I use this for nice progress bar/timing
import asyncio
import aiohttp
from bs4 import BeautifulSoup

out = []


async def get_soup(session, url):
    async with session.get(url=url) as resp:
        return BeautifulSoup(await resp.text(), "lxml")


async def worker(session, q):
    while True:
        url, link_name, title = await q.get()
        soup = await get_soup(session, url)
        links = soup.select('[data-title="Subcommittee"] a')
        if links:
            for a in links:
                out.append("https://www.iso.org" + a["href"])
        else:
            out.append(url)
        q.task_done()


async def main():
    url = "https://www.iso.org/standards-catalogue/browse-by-tc.html"
    async with aiohttp.ClientSession() as session:
        soup = await get_soup(session, url)
        titles = soup.select('td[data-title="Title"]')
        links = soup.select('td[data-title="Committee"] a')

        committees = []
        for a, t in zip(links, titles):
            committees.append(
                [
                    "https://www.iso.org" + a["href"],
                    a.get_text(strip=True),
                    t.get_text(strip=True),
                ]
            )

        queue = asyncio.Queue(maxsize=16)
        tasks = []

        # create 16 workers that will process data in parallel
        for i in range(16):
            task = asyncio.create_task(worker(session, queue))
            tasks.append(task)

        # put some data to worker queue
        for c in tqdm.tqdm(committees):
            await queue.put(c)

        # wait for all data to be processed
        await queue.join()

        # cancel all worker tasks
        for task in tasks:
            task.cancel()

        # Wait until all worker tasks are cancelled.
        await asyncio.gather(*tasks, return_exceptions=True)

    print(len(out))


if __name__ == "__main__":
    asyncio.run(main())
Prints:
100%|██████████████████████████████████████████████████████████████████| 256/256 [00:19<00:00, 13.18it/s]
653
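A note on the design: maxsize=16 on the queue also throttles the producer, since queue.put waits once 16 items are pending. If you would rather not manage explicit worker tasks, the same concurrency limit can be expressed with asyncio.Semaphore plus asyncio.gather. The following is a minimal sketch of that alternative, not part of the answer above; the fetch_all helper, the limit of 16, and the example URL are illustrative assumptions:

import asyncio
import aiohttp
from bs4 import BeautifulSoup


async def fetch_all(urls, limit=16):
    # Bound concurrency with a semaphore instead of explicit worker tasks.
    sem = asyncio.Semaphore(limit)

    async def fetch_one(session, url):
        async with sem:  # at most `limit` requests are in flight at once
            async with session.get(url) as resp:
                return BeautifulSoup(await resp.text(), "lxml")

    async with aiohttp.ClientSession() as session:
        # gather() schedules all coroutines at once; the semaphore throttles them
        return await asyncio.gather(*(fetch_one(session, u) for u in urls))


if __name__ == "__main__":
    # hypothetical usage: replace with the committee URLs collected above
    example_urls = ["https://www.iso.org/standards-catalogue/browse-by-tc.html"]
    soups = asyncio.run(fetch_all(example_urls))
    print(len(soups))

Both approaches bound the number of in-flight requests; the queue/worker version is handy when URLs are produced incrementally, while the semaphore version is shorter when you already have the full list.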
Answered By - Andrej Kesely