Issue
I'm using aiohttp
to asynchronously download images from different sites using their image URLs. Before, I used requests.get
to do the same synchronously. I am able to download the images successfully using requests.get
but the same URL throws a 403 Forbidden
error when I try to download the image using aiohttp. I tried to find out what the issue could be, but I haven't had any success so far. requests.get
doesn't need any extra headers to get the image. The URL is important because it is this site's URLs that are getting the 403 error.
The requests.get
version:
import requests
from io import BytesIO
async def download_image(self, url: str):
    """Download the image at ``url`` with ``requests`` and return it in memory.

    NOTE(review): this is declared ``async`` and takes ``self`` although it
    only makes a blocking ``requests.get`` call and never uses ``self`` --
    presumably it was lifted out of a class; confirm how it is meant to be
    called.

    Args:
        url (str): image url

    Returns:
        ``None`` when ``url`` fails ``is_url`` or when the response's
        Content-Type is not a key of ``VALID_MIME_TYPES``; otherwise a
        ``(BytesIO, file_name, mimetype)`` tuple holding the response body.
    """
    # is_url is just a small function which returns True if url is valid
    if not is_url(url):
        return None

    VALID_MIME_TYPES = {
        "image/jpeg": ".jpeg",
        "image/png": ".png",
        "image/jpg": ".jpg",
        "image/gif": ".gif",
        "image/tiff": ".tiff",
        "image/webp": ".webp",
        "image/apng": ".apng",
        "image/svg+xml": ".svg",
        # get_file_extension_from_url used to get image's type from the URL
        "application/octet-stream": get_file_extension_from_url(url=url),
    }

    response = requests.get(url)  # Worked successfully and downloads the image
    mimetype = response.headers.get("Content-Type", "").lower()
    if mimetype not in VALID_MIME_TYPES:
        return None

    # The literal extensions in the table already start with ".", so joining
    # them onto "cimage." gave names like "cimage..jpeg". Strip any leading
    # dot first; this also copes with a helper result that has no dot.
    extension = str(VALID_MIME_TYPES[mimetype]).lstrip(".")
    file_name = f"cimage.{extension}"

    # converting to BytesIO stream
    return BytesIO(response.content), file_name, mimetype
# Sample image URL from the site whose URLs produce the 403 response with aiohttp.
image_url = "https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec"
# NOTE(review): download_image above is declared `async def download_image(self, url)`,
# so this one-argument, un-awaited call does not match that signature -- confirm
# the intended way of calling it.
image_output = download_image(image_url)
The aiohttp
version:
import aiohttp, asyncio
from io import BytesIO
async def download_image(self, url: str, session: aiohttp.ClientSession):
    """Download the image at ``url`` through a shared aiohttp session.

    NOTE(review): the signature takes ``self`` but the ``__main__`` driver
    calls this with only ``url`` and ``session`` -- presumably it was lifted
    out of a class; confirm the intended call.

    Args:
        url (str): image url
        session (aiohttp.ClientSession): Using a common aiohttp session for speedup

    Returns:
        ``None`` when ``url`` fails ``is_url`` or when the response's
        Content-Type is not a key of ``VALID_MIME_TYPES``; otherwise a
        ``(BytesIO, file_name, mimetype)`` tuple holding the response body.
    """
    if not is_url(url):
        return None

    VALID_MIME_TYPES = {
        "image/jpeg": ".jpeg",
        "image/png": ".png",
        "image/jpg": ".jpg",
        "image/gif": ".gif",
        "image/tiff": ".tiff",
        "image/webp": ".webp",
        "image/apng": ".apng",
        "image/svg+xml": ".svg",
        "application/octet-stream": get_file_extension_from_url(url=url),
    }

    # The URL is still handed over as a plain str, exactly as in the question.
    # `async with` releases the connection back to the session on every path,
    # including the early return where the body is never read.
    async with session.request(method="GET", url=url) as res:
        mimetype = res.headers.get("Content-Type", "").lower()
        if mimetype not in VALID_MIME_TYPES:
            return None

        # The literal extensions in the table already start with ".", so
        # joining them onto "cimage." gave names like "cimage..jpeg". Strip
        # any leading dot first.
        extension = str(VALID_MIME_TYPES[mimetype]).lstrip(".")
        file_name = f"cimage.{extension}"
        content = await res.read()

    return BytesIO(content), file_name, mimetype
if __name__ == "__main__":
    image_url = "https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec"
    # Raise the multiplier to fire more concurrent requests while testing.
    image_urls = [image_url] * 1

    async def main():
        """Schedule one download task per URL on a shared session and await them all."""
        async with aiohttp.ClientSession(trust_env=True) as session:
            pending = [
                asyncio.create_task(download_image(url=url, session=session))
                for url in image_urls
            ]
            # One result per task, in the same order as image_urls.
            images = await asyncio.gather(*pending)

    asyncio.run(main())
Here's the printed output of what res
returns after the aiohttp request is done:
<ClientResponse(https://img.evbuc.com/https:%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format,compress&q=75&sharp=10&rect=0,15,1200,600&s=13645e838fd09f2552c8f8500410abec) [403 Forbidden]>
<CIMultiDictProxy('Content-Type': 'text/plain', 'Content-Length': '14', 'Connection': 'keep-alive', 'Cache-Control': 'public, max-age=5', 'Server': 'imgix', 'x-imgix-id': 'e3fb1d9c2f4cdf79dc45ca6fa20455560bdc05a5', 'x-imgix-proxy-status': '403', 'x-imgix-proxy-reason': '', 'X-Imgix-Render-Farm': '01.140360', 'Date': 'Wed, 18 Oct 2023 20:11:20 GMT', 'Accept-Ranges': 'bytes', 'Access-Control-Allow-Origin': '*', 'Timing-Allow-Origin': '*', 'Cross-Origin-Resource-Policy': 'cross-origin', 'X-Content-Type-Options': 'nosniff', 'X-Served-By': 'cache-sjc10076-SJC, cache-bom4734-BOM', 'X-Cache': 'Error from cloudfront', 'Via': '1.1 9e8c29342ff6f7610166562f3559cbe4.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'BOM78-P1', 'X-Amz-Cf-Id': 'cFMR0YKkz5pLrgzH-IkmYd0JTYqgZPT-wDKbTdxDiOr_ZJH_v3xLeg==', 'Age': '0')>
What is the issue in my situation?
I hope I get the solution. Thanks.
Just to be clear, this aiohttp code works on other URLs, but I'm facing this strange issue with this type of URL.
Solution
aiohttp
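normalizes the URL before sending it: in the 403 response shown above, the %3A and %2C escapes from the original URL appear decoded as : and , (requests
does not do this, so the request succeeds). You can disable this behavior using yarl.URL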
with encoded=True
(aiohttp
uses Yarl for URL processing):
import asyncio
import aiohttp
import yarl
async def download_image(url: str, session: aiohttp.ClientSession):
    """Request ``url`` exactly as written and print the response.

    Args:
        url (str): image url
        session (aiohttp.ClientSession): Using a common aiohttp session for speedup
    """
    # encoded=True marks the string as already percent-encoded, so the escapes
    # in the URL are sent as written instead of being normalized again.
    exact_url = yarl.URL(url, encoded=True)

    # `async with` releases the connection back to the session once the
    # response has been printed; the body itself is never read here.
    async with session.request(method="GET", url=exact_url) as res:
        print(res)
        print()
if __name__ == "__main__":
    image_url = "https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec"
    # Raise the multiplier to fire more concurrent requests while testing.
    image_urls = [image_url] * 1

    async def main():
        """Schedule one download task per URL on a shared session and await them all."""
        async with aiohttp.ClientSession(trust_env=True) as session:
            pending = [
                asyncio.create_task(download_image(url=url, session=session))
                for url in image_urls
            ]
            images = await asyncio.gather(*pending)

    asyncio.run(main())
Prints:
<ClientResponse(https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F602272019%2F1430182031443%2F1%2Foriginal.20230920-130504?w=940&auto=format%2Ccompress&q=75&sharp=10&rect=0%2C15%2C1200%2C600&s=13645e838fd09f2552c8f8500410abec) [200 OK]>
<CIMultiDictProxy('Content-Type': 'image/jpeg', 'Content-Length': '94266', 'Connection': 'keep-alive', 'Last-Modified': 'Wed, 20 Sep 2023 13:07:41 GMT', 'Cache-Control': 'public, max-age=315360001', 'Server': 'imgix', 'x-imgix-id': 'ec36b5116879ca860a0352578065a6c96481160e', 'X-Imgix-Render-Farm': '01.140360', 'Date': 'Wed, 18 Oct 2023 21:05:54 GMT', 'Accept-Ranges': 'bytes', 'Access-Control-Allow-Origin': '*', 'Timing-Allow-Origin': '*', 'Cross-Origin-Resource-Policy': 'cross-origin', 'X-Content-Type-Options': 'nosniff', 'X-Served-By': 'cache-sjc10040-SJC, cache-fra-eddf8230087-FRA', 'X-Cache': 'Miss from cloudfront', 'Via': '1.1 41b7bdf4fb536a6c72b9f49d9b6affe8.cloudfront.net (CloudFront)', 'X-Amz-Cf-Pop': 'PRG50-C1', 'X-Amz-Cf-Id': 'iYy-EXyB519KYFu_luKo9bAMnMvANxcrHEj6-Sps0LYWJ5cx60Rbvg==', 'Age': '2447892')>
Answered By - Andrej Kesely
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.