Issue
I'm trying to write an async parser for a local store, but the results are unstable: it should fetch ~11k items, but sometimes it returns a random amount without raising any exception.
What might be the problem and how can I catch/log it?
import asyncio

import aiohttp
from bs4 import BeautifulSoup

URL = 'https://shop.samberi.com'
HEADERS = {
    'Accept': '*/*',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/101.0.4951.54 Safari/537.36'
}

all_products = []


async def get_products(url):
    async with aiohttp.ClientSession() as session:
        res = await session.get(url=url, headers=HEADERS)
        bs = BeautifulSoup(await res.text(), 'lxml')
        cats = [URL + cat.get('href') + '?SHOWALL_1=1'
                for cat in bs.find('ul', id='vertical-multilevel-menu')
                             .find_all('a', class_='parent')] + [
            # Workaround: I can't get these links automatically :(
            'https://shop.samberi.com/catalog/aziya/?SHOWALL_1=1',
            'https://shop.samberi.com/catalog/sportivnye_tovary/?SHOWALL_1=1',
            'https://shop.samberi.com/catalog/upakovka/?SHOWALL_1=1'
        ]
        tasks = [asyncio.shield(parse_page(session, url, max_s)) for url in cats]
        await asyncio.gather(*tasks)


async def parse_page(session, cat_url, max_s):
    async with session.get(url=cat_url, headers=HEADERS) as res:
        res_text = await res.text()
        pagebs = BeautifulSoup(res_text, 'lxml')
        products_on_page = pagebs.find_all('div', class_='product-item')
        for product in products_on_page:
            name = product.find('div', class_='product-item-title').text.strip()
            price = product.find('span', class_='product-item-price-current') \
                .text.strip().strip('₽').strip()
            all_products.append([name, price])


def main():
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(get_products(URL))
Solution
You are probably getting kicked out by the server because you make too many requests simultaneously. Inspect the HTTP responses to investigate the issue further.
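For example, you can log the response status before parsing and let aiohttp raise on error codes. This is a minimal sketch (the logging setup and the rewritten parse_page body are illustrative, not from the original code); aiohttp's ClientResponse exposes status, and raise_for_status() turns 4xx/5xx answers into a ClientResponseError instead of a silently empty page:

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

async def parse_page(session, cat_url):
    async with session.get(url=cat_url, headers=HEADERS) as res:
        if res.status != 200:
            # A throttling server typically answers 403/429/5xx here,
            # which would explain the "random" item counts.
            log.warning('GET %s returned HTTP %s', cat_url, res.status)
        res.raise_for_status()  # raises aiohttp.ClientResponseError on 4xx/5xx
        res_text = await res.text()
        ...  # parse res_text as before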
If this is indeed the issue, you can limit the number of simultaneous requests by using a TCPConnector. The following program, which uses a limit of 8, consistently returns 10886 products:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import json

URL = "https://shop.samberi.com"
HEADERS = {
    "Accept": "*/*",
}


async def get_products(url: str) -> list:
    connector = aiohttp.TCPConnector(limit=8, limit_per_host=8)
    async with aiohttp.ClientSession(connector=connector) as session:
        res = await session.get(url=url, headers=HEADERS)
        bs = BeautifulSoup(await res.text(), "lxml")
        cats = [
            URL + cat.get("href") + "?SHOWALL_1=1"
            for cat in bs.find("ul", id="vertical-multilevel-menu").find_all(
                "a", class_="parent"
            )
        ] + [
            # Workaround: I can't get these links automatically :(
            "https://shop.samberi.com/catalog/aziya/?SHOWALL_1=1",
            "https://shop.samberi.com/catalog/sportivnye_tovary/?SHOWALL_1=1",
            "https://shop.samberi.com/catalog/upakovka/?SHOWALL_1=1",
        ]
        tasks = [parse_page(session, url) for url in cats]
        print(f"Fetching {len(tasks)} pages")
        results = await asyncio.gather(*tasks)
        return [product for products in results for product in products]


async def parse_page(session: aiohttp.ClientSession, cat_url: str) -> list:
    all_products = []
    async with session.get(url=cat_url, headers=HEADERS) as res:
        res_text = await res.text()
        pagebs = BeautifulSoup(res_text, "lxml")
        products_on_page = pagebs.find_all("div", class_="product-item")
        print(f"Found {len(products_on_page)} products")
        for product in products_on_page:
            name = product.find("div", class_="product-item-title").text.strip()
            price = (
                product.find("span", class_="product-item-price-current")
                .text.strip()
                .strip("₽")
                .strip()
            )
            all_products.append([name, price])
    return all_products


async def main():
    products = await get_products(URL)
    print(len(products))
    with open("products.json", "w") as f:
        json.dump({"products": products}, f)


if __name__ == "__main__":
    asyncio.run(main())
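If you prefer not to touch the connector, you can cap concurrency with an asyncio.Semaphore around each request instead. This is a sketch, not part of the original answer (the unused max_s parameter in the question suggests something like this was intended):

async def get_products(url: str) -> list:
    max_s = asyncio.Semaphore(8)  # at most 8 requests in flight at once
    async with aiohttp.ClientSession() as session:
        ...
        tasks = [parse_page(session, url, max_s) for url in cats]
        results = await asyncio.gather(*tasks)
        ...

async def parse_page(session, cat_url, max_s):
    async with max_s:  # waits here while 8 other requests are running
        async with session.get(url=cat_url, headers=HEADERS) as res:
            res_text = await res.text()
    # parsing res_text can happen outside the semaphore
    ...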
Answered By - Louis Lac