Issue
I'm trying to write an async parser for a local store, but the results are unstable: it should fetch ~11k items, but sometimes it returns a random amount without raising any exception.
What might be the problem and how can I catch/log it?
import asyncio

import aiohttp
from bs4 import BeautifulSoup

URL = 'https://shop.samberi.com'
HEADERS = {
    'Accept': '*/*',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/101.0.4951.54 Safari/537.36'
}

all_products = []


async def get_products(url):
    async with aiohttp.ClientSession() as session:
        res = await session.get(url=url, headers=HEADERS)
        bs = BeautifulSoup(await res.text(), 'lxml')
        cats = [URL + cat.get('href') + '?SHOWALL_1=1'
                for cat in bs.find('ul', id='vertical-multilevel-menu')
                             .find_all('a', class_='parent')] + [
            # Workaround: I can't get these links automatically :(
            'https://shop.samberi.com/catalog/aziya/?SHOWALL_1=1',
            'https://shop.samberi.com/catalog/sportivnye_tovary/?SHOWALL_1=1',
            'https://shop.samberi.com/catalog/upakovka/?SHOWALL_1=1'
        ]
        tasks = [asyncio.shield(parse_page(session, url, max_s)) for url in cats]
        await asyncio.gather(*tasks)


async def parse_page(session, cat_url, max_s):
    async with session.get(url=cat_url, headers=HEADERS) as res:
        res_text = await res.text()
        pagebs = BeautifulSoup(res_text, 'lxml')
        products_on_page = pagebs.find_all('div', class_='product-item')
        for product in products_on_page:
            name = product.find('div', class_='product-item-title').text.strip()
            price = product.find('span', class_='product-item-price-current') \
                .text.strip().strip('₽').strip()
            all_products.append([name, price])


def main():
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(get_products(URL))
Solution
You are probably getting kicked out by the server because you make too many requests simultaneously. Inspect the HTTP responses to investigate the issue further.
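For example, you can log the response status before parsing and let aiohttp raise on error codes. This is a minimal sketch (the logging setup and the rewritten parse_page body are illustrative, not from the original code); aiohttp's ClientResponse exposes status, and raise_for_status() turns 4xx/5xx answers into a ClientResponseError instead of a silently empty page:

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

async def parse_page(session, cat_url):
    async with session.get(url=cat_url, headers=HEADERS) as res:
        if res.status != 200:
            # A throttling server typically answers 403/429/5xx here,
            # which would explain the "random" item counts.
            log.warning('GET %s returned HTTP %s', cat_url, res.status)
        res.raise_for_status()  # raises aiohttp.ClientResponseError on 4xx/5xx
        res_text = await res.text()
        ...  # parse res_text as before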
If this is indeed the issue, you can limit the number of simultaneous requests by using a TCPConnector. The following program, which uses a limit of 8, consistently returns 10886 products:
import asyncio
import aiohttp
from bs4 import BeautifulSoup
import json

URL = "https://shop.samberi.com"
HEADERS = {
    "Accept": "*/*",
}


async def get_products(url: str) -> list:
    connector = aiohttp.TCPConnector(limit=8, limit_per_host=8)
    async with aiohttp.ClientSession(connector=connector) as session:
        res = await session.get(url=url, headers=HEADERS)
        bs = BeautifulSoup(await res.text(), "lxml")
        cats = [
            URL + cat.get("href") + "?SHOWALL_1=1"
            for cat in bs.find("ul", id="vertical-multilevel-menu").find_all(
                "a", class_="parent"
            )
        ] + [
            # Workaround: I can't get these links automatically :(
            "https://shop.samberi.com/catalog/aziya/?SHOWALL_1=1",
            "https://shop.samberi.com/catalog/sportivnye_tovary/?SHOWALL_1=1",
            "https://shop.samberi.com/catalog/upakovka/?SHOWALL_1=1",
        ]
        tasks = [parse_page(session, url) for url in cats]
        print(f"Fetching {len(tasks)} pages")
        results = await asyncio.gather(*tasks)
        return [product for products in results for product in products]


async def parse_page(session: aiohttp.ClientSession, cat_url: str) -> list:
    all_products = []
    async with session.get(url=cat_url, headers=HEADERS) as res:
        res_text = await res.text()
        pagebs = BeautifulSoup(res_text, "lxml")
        products_on_page = pagebs.find_all("div", class_="product-item")
        print(f"Found {len(products_on_page)} products")
        for product in products_on_page:
            name = product.find("div", class_="product-item-title").text.strip()
            price = (
                product.find("span", class_="product-item-price-current")
                .text.strip()
                .strip("₽")
                .strip()
            )
            all_products.append([name, price])
    return all_products


async def main():
    products = await get_products(URL)
    print(len(products))
    with open("products.json", "w") as f:
        json.dump({"products": products}, f)


if __name__ == "__main__":
    asyncio.run(main())
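If you prefer not to touch the connector, you can cap concurrency with an asyncio.Semaphore around each request instead. This is a sketch, not part of the original answer (the unused max_s parameter in the question suggests something like this was intended):

async def get_products(url: str) -> list:
    max_s = asyncio.Semaphore(8)  # at most 8 requests in flight at once
    async with aiohttp.ClientSession() as session:
        ...
        tasks = [parse_page(session, url, max_s) for url in cats]
        results = await asyncio.gather(*tasks)
        ...

async def parse_page(session, cat_url, max_s):
    async with max_s:  # waits here while 8 other requests are running
        async with session.get(url=cat_url, headers=HEADERS) as res:
            res_text = await res.text()
    # parsing res_text can happen outside the semaphore
    ...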
Answered By - Louis Lac