ml-finetuning/stable-diffusion/preprocess_training_data.py

import random
import os
from glob import glob
from PIL import Image

TRAINING_SIZE = 768
MARSEY_SIZE = TRAINING_SIZE * 0.65  # Downscale the Marsey so its longest side fits within this.
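# (A TRAINING_SIZE of 768 matches the native resolution of Stable Diffusion 2.x
# checkpoints, which is presumably the fine-tuning target here.)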


# Apply the alpha channel from marseys/?????.png to upscaled/?????_Swin2SR.png.
def remask(upscaled_path):
    basename = upscaled_path.split("/")[-1]
    # Strip the _Swin2SR suffix to recover the original filename.
    original_basename = basename.split("_Swin2SR")[0] + ".png"
    original = Image.open(f"marseys/{original_basename}")
    upscaled = Image.open(upscaled_path)
    try:
        # Upscaling discards transparency, so reapply the original's alpha mask,
        # resized to the upscaled dimensions.
        mask = original.getchannel("A").resize((upscaled.width, upscaled.height))
        upscaled.putalpha(mask)
        return (original_basename, upscaled)
    except ValueError as e:  # Usually means the image is animated (no "A" channel).
        print(e, upscaled_path)
        return None
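
# Illustrative call (the sample filename below is hypothetical; the "_Swin2SR"
# suffix is presumably appended by the upscaling step that fills upscaled/):
#   remask("upscaled/marsey_dance_Swin2SR.png") pulls the alpha mask from
#   "marseys/marsey_dance.png" and returns ("marsey_dance.png", <RGBA image>),
#   or None if the original has no alpha channel.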


# Remasks the images in upscaled/ with alpha masks from marseys/, resizes the result,
# and saves one copy on a white background and one on a random pastel background color.
def generate_training_set():
    backgrounds = [
        "#fabfb7",
        "#ffda9e",
        "#c5c6c8",
        "#b2e2f2",
        "#b0c2f2",
        "#b0f2c2",
        "#fdcae1",
        "#8f7193",
        "#77DD77",
        "#836953",
        "#89cff0",
        "#99c5c4",
        "#9adedb",
        "#aa9499",
        "#aaf0d1",
        "#b2fba5",
        "#b39eb5",
        "#bdb0d0",
        "#bee7a5",
        "#befd73",
        "#c1c6fc",
        "#c6a4a4",
        "#cb99c9",
        "#ff6961",
        "#ff694f",
        "#ff9899",
        "#ffb7ce",
        "#ca9bf7",
    ]
    os.makedirs("training-white", exist_ok=True)
    os.makedirs("training-colors", exist_ok=True)
    remasked_images = [remask(path) for path in glob("upscaled/*.png")]
    remasked_images = [i for i in remasked_images if i is not None]
    for basename, marsey in remasked_images:
        # Downscale so the longest side is at most MARSEY_SIZE; never upscale.
        max_dim = max(marsey.size)
        scale_factor = 1
        if max_dim > MARSEY_SIZE:
            scale_factor = MARSEY_SIZE / max_dim
        marsey = marsey.resize(
            (
                round(marsey.size[0] * scale_factor),
                round(marsey.size[1] * scale_factor),
            ),
            resample=Image.Resampling.LANCZOS,
        )
        # Upper-left corner that centers the Marsey on the square canvas.
        box = (
            TRAINING_SIZE // 2 - marsey.size[0] // 2,
            TRAINING_SIZE // 2 - marsey.size[1] // 2,
        )
        white_bg = Image.new("RGBA", (TRAINING_SIZE, TRAINING_SIZE), "#ffffff")
        white_bg.paste(marsey, box, marsey)
        white_bg.save(f"training-white/{basename}")
        color_bg = Image.new(
            "RGBA", (TRAINING_SIZE, TRAINING_SIZE), random.choice(backgrounds)
        )
        color_bg.paste(marsey, box, marsey)
        color_bg.save(f"training-colors/{basename}")


# Creates a metadata.jsonl in the format expected by HuggingFace's `datasets`
# library: one JSON object per line with "file_name" and "text" keys.
def generate_metadata(training_dir):
    # Open in write mode so rerunning the script doesn't append duplicate rows.
    with open(f"{training_dir}/metadata.jsonl", "w") as f:
        for path in glob(f"{training_dir}/*.png"):
            # "3 - Marsey walking.png" -> "Marsey walking"
            stem = path.split("/")[-1].split(".")[0]
            caption = stem.split("- ")[-1]
            caption = caption.replace("Marsey", "Marsey the cat")
            f.write(f'{{"file_name": "{stem}.png", "text": "{caption}"}}\n')
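
# Example metadata.jsonl line produced for the file shown above:
#   {"file_name": "3 - Marsey walking.png", "text": "Marsey the cat walking"}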


generate_training_set()
generate_metadata("training-colors")
generate_metadata("training-white")