Source code for imgutils.tagging.deepdanbooru
"""
Overview:
Tagging utils based on deepdanbooru.
.. warning::
Due to the usage of an outdated model and training data in deepdanbooru,
its performance is limited, and it is **not suitable for use as the main tagging model anymore**.
The integration of this model within the present project serves only as a baseline for comparison,
and it is advisable to avoid using this model extensively in practical applications.
"""
from functools import lru_cache
from typing import Tuple, List
import numpy as np
import pandas as pd
from PIL import Image
from huggingface_hub import hf_hub_download
from .overlap import drop_overlap_tags
from ..data import ImageTyping, load_image
from ..utils import open_onnx_model
@lru_cache()
def _get_deepdanbooru_labels():
    """
    Overview:
        Load the deepdanbooru tag table and group its row positions by tag category.

        The CSV file is located through ``hf_hub_download`` (repository
        ``deepghs/imgutils-models``) and parsed with pandas. The result is memoized
        by ``lru_cache``, so every caller receives the very same list objects.

    :return: A 5-tuple ``(tag_names, tag_real_names, rating_indexes, general_indexes,
        character_indexes)``. The first two are the ``name`` and ``real_name`` columns as lists;
        the last three are lists of positional row indexes whose ``category`` column equals
        ``9``, ``0`` and ``4`` respectively.
    """
    tags_path = hf_hub_download('deepghs/imgutils-models', 'deepdanbooru/deepdanbooru_tags.csv')
    table = pd.read_csv(tags_path)
    categories = table["category"]

    def _rows_of(category_id):
        # Positional indexes of the rows that belong to the given category.
        return list(np.where(categories == category_id)[0])

    return (
        table["name"].tolist(),
        table['real_name'].tolist(),
        _rows_of(9),  # rating tags
        _rows_of(0),  # general tags
        _rows_of(4),  # character tags
    )
@lru_cache()
def _get_deepdanbooru_model():
    """
    Overview:
        Open the deepdanbooru ONNX model.

        The model file ``deepdanbooru/deepdanbooru.onnx`` is located through
        ``hf_hub_download`` (repository ``deepghs/imgutils-models``) and handed to
        ``open_onnx_model``. The returned object is memoized by ``lru_cache``, so the
        model is only opened once per process.

    :return: Whatever ``open_onnx_model`` returns for the model file.
    """
    model_path = hf_hub_download(
        'deepghs/imgutils-models',
        'deepdanbooru/deepdanbooru.onnx',
    )
    return open_onnx_model(model_path)
def _image_preprocess(image: Image.Image) -> np.ndarray:
    """
    Overview:
        Convert an image into the ``1 x 512 x 512 x 3`` float32 array fed to the model.

        The image is resized so that its longer side becomes 512 pixels (aspect ratio kept),
        its pixel values are divided by 255, and the result is zero-padded on both sides
        of the shorter axis so that the picture sits in the middle of a 512 x 512 canvas.

    :param image: PIL image to preprocess. It is expected to have exactly 3 channels
        (the public entry point loads images in ``RGB`` mode before calling this).
    :return: Array of shape ``(1, 512, 512, 3)`` (batch, height, width, channel), dtype float32.
    :raises AssertionError: If the padded array is not of shape ``(512, 512, 3)``.
    """
    o_width, o_height = image.size
    scale = 512.0 / max(o_width, o_height)
    # Truncation keeps the original sizing behaviour; the lower bound of 1 protects
    # against aspect ratios above 512:1, where the short side would truncate to 0
    # and a zero-sized resize target is rejected by Pillow.
    f_width, f_height = (max(1, int(side * scale)) for side in (o_width, o_height))
    image = image.resize((f_width, f_height))
    data = np.asarray(image).astype(np.float32) / 255  # H x W x C

    # Split the missing rows/columns as evenly as possible; the odd pixel (if any)
    # goes to the bottom/right side.
    height_pad_left = (512 - f_height) // 2
    height_pad_right = 512 - f_height - height_pad_left
    width_pad_left = (512 - f_width) // 2
    width_pad_right = 512 - f_width - width_pad_left
    data = np.pad(
        data,
        ((height_pad_left, height_pad_right), (width_pad_left, width_pad_right), (0, 0)),
        mode='constant', constant_values=0.0,
    )

    assert data.shape == (512, 512, 3), f'Shape (512, 512, 3) expected, but {data.shape!r} found.'
    return data.reshape((1, 512, 512, 3))  # B x H x W x C
def get_deepdanbooru_tags(image: ImageTyping, use_real_name: bool = False,
                          general_threshold: float = 0.5, character_threshold: float = 0.5,
                          drop_overlap: bool = False):
    """
    Overview:
        Get tags for anime image based on ``deepdanbooru`` model.

    :param image: Image to tag.
    :param use_real_name: Use real name on danbooru. Due to the renaming and redirection of many tags
        on the Danbooru website after the training of ``deepdanbooru``,
        it may be necessary to use the latest tag names in some application scenarios.
        The default value of ``False`` indicates the use of the original tag names.
    :param general_threshold: Threshold for general tags, default is ``0.5``.
        Only tags whose score is strictly greater than this value are kept.
    :param character_threshold: Threshold for character tags, default is ``0.5``.
        Only tags whose score is strictly greater than this value are kept.
    :param drop_overlap: Drop overlap tags or not, default is ``False``.
        Only applied to the general tags.
    :return: A tuple of three dictionaries mapping tag name to score: the rating scores
        (all ratings, unfiltered), the general tags (features) and the character tags.

    Example:
        Here are some images for example

        .. image:: tagging_demo.plot.py.svg
           :align: center

        >>> from imgutils.tagging import get_deepdanbooru_tags
        >>>
        >>> rating, features, chars = get_deepdanbooru_tags('skadi.jpg')
        >>> rating
        {'rating:safe': 0.9897817373275757, 'rating:questionable': 0.010265946388244629, 'rating:explicit': 5.2809715270996094e-05}
        >>> features
        {'1girl': 0.9939777851104736, 'bangs': 0.5032387375831604, 'black_border': 0.9943548440933228, 'black_gloves': 0.5011609792709351, 'blue_sky': 0.6877802610397339, 'blush': 0.5543792843818665, 'breasts': 0.8268730640411377, 'cloud': 0.8504303693771362, 'cowboy_shot': 0.6008237600326538, 'crop_top': 0.6635787487030029, 'day': 0.8496965765953064, 'gloves': 0.6107005476951599, 'hair_between_eyes': 0.668294370174408, 'holding': 0.5619469285011292, 'holding_baseball_bat': 0.5141720771789551, 'letterboxed': 1.0, 'long_hair': 0.9884189963340759, 'looking_at_viewer': 0.5673105120658875, 'midriff': 0.6290556192398071, 'navel': 0.9631235003471375, 'no_hat': 0.7978747487068176, 'no_headwear': 0.7577926516532898, 'outdoors': 0.7118550539016724, 'parted_lips': 0.5452839136123657, 'pillarboxed': 0.9841411709785461, 'red_eyes': 0.958786129951477, 'shirt': 0.6720131039619446, 'short_sleeves': 0.7077711820602417, 'silver_hair': 0.6673924326896667, 'sky': 0.8709812760353088, 'solo': 0.9614333510398865, 'sportswear': 0.7786177396774292, 'standing': 0.6842771172523499, 'sweat': 0.9076308012008667, 'thighs': 0.580970823764801}
        >>> chars
        {'skadi_(arknights)': 0.9633345007896423}
        >>>
        >>> rating, features, chars = get_deepdanbooru_tags('hutao.jpg')
        >>> rating
        {'rating:safe': 0.9988503456115723, 'rating:questionable': 0.001651763916015625, 'rating:explicit': 0.00012505054473876953}
        >>> features
        {'1girl': 0.9829280972480774, ':p': 0.894218385219574, 'ahoge': 0.8733789920806885, 'backpack': 0.6322951316833496, 'bag': 0.9987058639526367, 'bag_charm': 0.9754379987716675, 'bangs': 0.6810564994812012, 'black_border': 0.9708781838417053, 'blush': 0.6356008052825928, 'bow': 0.5633733868598938, 'brick_wall': 0.5315935611724854, 'brown_hair': 0.9397273659706116, 'building': 0.9229896664619446, 'charm_(object)': 0.9006357789039612, 'city': 0.9020784497261047, 'cityscape': 0.9547432661056519, 'cowboy_shot': 0.5296419262886047, 'flower': 0.8253412246704102, 'hair_between_eyes': 0.5619839429855347, 'hair_flower': 0.8277763724327087, 'hair_ornament': 0.9356368780136108, 'hair_ribbon': 0.5288072824478149, 'jacket': 0.6336134076118469, 'letterboxed': 1.0, 'long_hair': 0.9703260064125061, 'looking_at_viewer': 0.8188960552215576, 'phone_screen': 0.9579574465751648, 'pillarboxed': 0.9954615235328674, 'plaid': 0.9725285172462463, 'plaid_skirt': 0.9638455510139465, 'pleated_skirt': 0.7226815819740295, 'red_eyes': 0.5321241021156311, 'red_nails': 0.5493080615997314, 'school_bag': 0.9863407611846924, 'school_uniform': 0.6794284582138062, 'shirt': 0.5062428116798401, 'shoulder_bag': 0.9325523972511292, 'skirt': 0.92237788438797, 'skyscraper': 0.7728171348571777, 'sleeves_past_wrists': 0.7257086038589478, 'smile': 0.5357837080955505, 'solo': 0.6939404010772705, 'thighhighs': 0.7054293155670166, 'tongue': 0.9990814924240112, 'tongue_out': 0.9992498755455017, 'twintails': 0.5012534260749817, 'very_long_hair': 0.7461410164833069}
        >>> chars
        {}
    """
    session = _get_deepdanbooru_model()
    input_data = _image_preprocess(load_image(image, mode='RGB'))

    # Run the model on the single-image batch; only the first output is used.
    input_name = session.get_inputs()[0].name
    output_names = [output.name for output in session.get_outputs()]
    probs = session.run(output_names, {input_name: input_data})[0]

    tag_names, tag_real_names, rating_indexes, general_indexes, character_indexes = \
        _get_deepdanbooru_labels()
    names = tag_real_names if use_real_name else tag_names
    scores = probs[0].astype(float).tolist()  # scores of the first (only) batch item
    labels: List[Tuple[str, float]] = list(zip(names, scores))

    # Ratings: every rating score is returned as-is, no threshold and no argmax.
    rating = {labels[i][0]: labels[i][1] for i in rating_indexes}

    # General tags: keep those whose score is strictly above the threshold.
    general_res = {
        labels[i][0]: labels[i][1]
        for i in general_indexes
        if labels[i][1] > general_threshold
    }
    if drop_overlap:
        general_res = drop_overlap_tags(general_res)

    # Character tags: keep those whose score is strictly above the threshold.
    character_res = {
        labels[i][0]: labels[i][1]
        for i in character_indexes
        if labels[i][1] > character_threshold
    }

    return rating, general_res, character_res