Source code for imgutils.data.url

"""
This module provides utilities for downloading and handling images from URLs, with special support for GitHub and Hugging Face URLs.

The module includes functions for:

- Downloading images from URLs with progress tracking
- URL validation and processing
- Special handling for GitHub and Hugging Face hosted images

Main components:

- download_image_from_url: Downloads and returns an image from a given URL
- is_http_url: Checks if a given URL is a valid HTTP/HTTPS URL
- Internal utilities for processing GitHub and Hugging Face URLs
"""

import io
from typing import Optional

import pyrfc6266
from PIL import Image
from hbutils.system import urlsplit
from huggingface_hub import get_session
from tqdm import tqdm
from urlobject import URLObject

__all__ = [
    'download_image_from_url',
    'is_http_url',
]


def download_image_from_url(url: str, silent: bool = False, expected_size: Optional[int] = None,
                            **kwargs) -> Image.Image:
    """
    Download an image from a URL and return it as a PIL Image object.

    GitHub and Hugging Face URLs are first rewritten by the module's internal helpers
    so that the request targets the file content rather than the web page.

    :param url: URL of the image to download
    :type url: str
    :param silent: If True, suppress progress bar display
    :type silent: bool
    :param expected_size: Expected file size in bytes, used for progress bar. When it is
        ``None`` (or ``0``), the ``Content-Length`` response header is used if present.
    :type expected_size: Optional[int]
    :param kwargs: Additional keyword arguments passed to the session.get() method

    :return: Downloaded image as PIL Image object
    :rtype: Image.Image

    :raises ValueError: If the URL is not supported (especially for HF URLs)
    :raises requests.RequestException: If download fails, including when the server
        answers with an HTTP error status
    :raises PIL.UnidentifiedImageError: If downloaded content is not a valid image

    :example:
        >>> image = download_image_from_url('https://example.com/image.jpg')
        >>> image.show()
    """
    if _is_github_url(url):
        url = _process_github_url_for_downloading(url)
    elif _is_hf_url(url):
        url = _process_hf_url_for_downloading(url)

    session = get_session()
    with session.get(url, stream=True, allow_redirects=True, **kwargs) as response:
        # Fail on 4xx/5xx before touching the body; otherwise the error page would be
        # buffered and passed to PIL, surfacing as a misleading "unidentified image" error.
        response.raise_for_status()

        # A caller-supplied size wins; fall back to the server-reported length.
        expected_size = expected_size or response.headers.get('Content-Length', None)
        expected_size = int(expected_size) if expected_size is not None else expected_size

        # Progress bar label: prefer the server-provided filename, else the URL's filename.
        filename = None
        if response.headers.get('Content-Disposition'):
            filename = pyrfc6266.parse_filename(response.headers.get('Content-Disposition'))
        filename = filename or urlsplit(url).filename

        with io.BytesIO() as bf:
            with tqdm(total=expected_size, unit='B', unit_scale=True, unit_divisor=1024,
                      desc=filename, disable=silent) as pbar:
                for chunk in response.iter_content(chunk_size=1024):
                    bf.write(chunk)
                    pbar.update(len(chunk))

            bf.seek(0)
            image = Image.open(bf)
            # Decode the pixel data now, while the in-memory buffer is still open.
            image.load()
            return image
def is_http_url(url: str) -> bool:
    """
    Tell whether the given value is a URL using the HTTP or HTTPS scheme.

    Anything that is not a ``str`` is rejected without being parsed.

    :param url: URL to check
    :type url: str

    :return: True if the URL's scheme is ``http`` or ``https``, False otherwise
    :rtype: bool

    :example:
        >>> is_http_url('https://example.com')
        True
        >>> is_http_url('ftp://example.com')
        False
    """
    if isinstance(url, str):
        # Only the parsed scheme decides the outcome.
        return urlsplit(url).scheme in ('http', 'https')
    return False
# Last two dot-separated host labels that identify GitHub.
_GITHUB_SUFFIX = {('github', 'com')}


def _is_github_url(url: str) -> bool:
    """
    Tell whether a URL points at GitHub.

    The decision is made on the last two dot-separated labels of the URL's host.

    :param url: URL to check
    :type url: str

    :return: True if URL is a GitHub URL, False otherwise
    :rtype: bool
    """
    host_labels = urlsplit(url).host.split('.')
    return tuple(host_labels[-2:]) in _GITHUB_SUFFIX


def _process_github_url_for_downloading(url: str) -> str:
    """
    Rewrite a GitHub URL so that it requests the raw file.

    The URL's query is set to ``raw=True``.

    :param url: GitHub URL to process
    :type url: str

    :return: Processed URL for downloading
    :rtype: str
    """
    raw_url = URLObject(url).with_query('raw=True')
    return str(raw_url)


# Last two dot-separated host labels that identify Hugging Face.
_HF_SUFFIX = {('hf', 'co'), ('huggingface', 'co')}


def _is_hf_url(url: str) -> bool:
    """
    Tell whether a URL points at Hugging Face.

    The decision is made on the last two dot-separated labels of the URL's host.

    :param url: URL to check
    :type url: str

    :return: True if URL is a Hugging Face URL, False otherwise
    :rtype: bool
    """
    host_labels = urlsplit(url).host.split('.')
    return tuple(host_labels[-2:]) in _HF_SUFFIX


def _process_hf_url_for_downloading(url: str) -> str:
    """
    Rewrite a Hugging Face URL so that it targets the file itself.

    A ``blob`` path segment at the expected position is replaced by ``resolve``;
    a URL that already has ``resolve`` there keeps its path. The result is rebuilt
    from scheme, host and path segments only.

    :param url: Hugging Face URL to process
    :type url: str

    :return: Processed URL for downloading
    :rtype: str

    :raises ValueError: If the URL format is not supported
    """
    parts = urlsplit(url)
    # Work on a copy so the marker can be overwritten in place.
    pieces = list(parts.path_segments)

    # URLs whose segment 1 is 'datasets' or 'spaces' carry one extra leading segment,
    # which shifts the blob/resolve marker one position to the right.
    # NOTE(review): indices presumably assume a leading empty segment for the root
    # slash (the final join adds no separator after the host) - confirm against urlsplit.
    shifted = len(pieces) >= 2 and pieces[1] in ('datasets', 'spaces')
    marker_index = 4 if shifted else 3

    marker = pieces[marker_index] if len(pieces) > marker_index else None
    if marker not in ('blob', 'resolve'):
        raise ValueError(f'Unsupported huggingface URL - {url!r}.')

    # No-op when the marker is already 'resolve'.
    pieces[marker_index] = 'resolve'
    return f'{parts.scheme}://{parts.host}{"/".join(pieces)}'