Source code for text_renderer.dataset

"""
Dataset utilities for text rendering operations.

This module provides dataset classes for storing and retrieving generated text images
and their corresponding labels. It supports both image file storage and LMDB database storage.
"""

import json
import os
from typing import Dict, Tuple

import cv2
import lmdb
import numpy as np


[docs]class Dataset:
    """
    Abstract base class for dataset storage and retrieval.

    This class provides a common interface for storing generated text images
    and their corresponding labels. It supports both image file storage and
    database storage formats.

    Args:
        data_dir (str): Directory path for storing dataset files
        jpg_quality (int): JPEG compression quality (1-100, default: 95)
    """

    def __init__(self, data_dir: str, jpg_quality: int = 95):
        self.data_dir = data_dir
        if not os.path.exists(data_dir):
            os.makedirs(data_dir)
        self.jpg_quality = jpg_quality

[docs]    def encode_param(self) -> list:
        """
        Get JPEG encoding parameters for image compression.

        Returns:
            list: OpenCV JPEG encoding parameters
        """
        return [int(cv2.IMWRITE_JPEG_QUALITY), self.jpg_quality]

[docs]    def write(self, name: str, image: np.ndarray, label: str):
        """
        Write an image and its label to the dataset.

        Args:
            name (str): Unique identifier for the image
            image (np.ndarray): Image data as numpy array
            label (str): Text label corresponding to the image
        """
        pass

[docs]    def read(self, name: str) -> Dict:
        """
        Read an image and its metadata from the dataset.

        Args:
            name (str): Unique identifier for the image

        Returns:
            Dict: Dictionary containing:
                - "image": Image data as numpy array
                - "label": Text label for the image
                - "size": [width, height] of the image
        """
        pass

[docs]    def read_count(self) -> int:
        """
        Get the total number of samples in the dataset.

        Returns:
            int: Number of samples in the dataset
        """
        pass

[docs]    def write_count(self, count: int):
        """
        Write the total count of samples to the dataset.

        Args:
            count (int): Total number of samples
        """
        pass

[docs]    def close(self):
        """
        Close the dataset and release any resources.
        """
        pass

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.close()


[docs]class ImgDataset(Dataset):
    """
    Save generated images as JPEG files with labels and metadata in JSON.

    This dataset implementation stores images as individual JPEG files in an
    'images' subdirectory and maintains a JSON file with labels and metadata.

    JSON file format:
        {
            "labels": {
                "000000000": "test",
                "000000001": "text2"
            },
            "sizes": {
                "000000000": [width, height],
                "000000001": [width, height]
            },
            "num-samples": 2
        }

    Args:
        data_dir (str): Directory path for storing dataset files
    """

    LABEL_NAME = "labels.json"

    def __init__(self, data_dir: str):
        super().__init__(data_dir)
        self._img_dir = os.path.join(data_dir, "images")
        if not os.path.exists(self._img_dir):
            os.makedirs(self._img_dir)
        self._label_path = os.path.join(data_dir, self.LABEL_NAME)

        self._data = {"num-samples": 0, "labels": {}, "sizes": {}}
        if os.path.exists(self._label_path):
            with open(self._label_path, "r", encoding="utf-8") as f:
                self._data = json.load(f)

    def write(self, name: str, image: np.ndarray, label: str):
        """
        Write an image as JPEG file and update the JSON metadata.

        Args:
            name (str): Unique identifier for the image
            image (np.ndarray): Image data as numpy array
            label (str): Text label corresponding to the image
        """
        img_path = os.path.join(self._img_dir, name + ".jpg")
        cv2.imwrite(img_path, image, self.encode_param())
        self._data["labels"][name] = label

        height, width = image.shape[:2]
        self._data["sizes"][name] = (width, height)

    def read(self, name: str) -> Dict:
        img_path = os.path.join(self._img_dir, name + ".jpg")
        image = cv2.imread(img_path)
        label = self._data["labels"][name]
        size = self._data["sizes"][name]
        return {"image": image, "label": label, "size": size}

    def read_size(self, name: str) -> Tuple[int, int]:
        """
        Read only the size information for an image.

        Args:
            name (str): Unique identifier for the image

        Returns:
            Tuple[int, int]: (width, height) of the image
        """
        return self._data["sizes"][name]

    def read_count(self) -> int:
        return self._data.get("num-samples", 0)

    def write_count(self, count: int):
        self._data["num-samples"] = count

    def close(self):
        with open(self._label_path, "w", encoding="utf-8") as f:
            json.dump(self._data, f, indent=2, ensure_ascii=False)


[docs]class LmdbDataset(Dataset):
    """
    Save generated images into LMDB database format.

    This dataset implementation stores images in LMDB (Lightning Memory-Mapped Database)
    format, which is compatible with PaddleOCR and provides efficient storage and
    retrieval for large datasets.

    LMDB Keys format:
        - image-{name}: Image raw bytes (JPEG encoded)
        - label-{name}: Text label as string
        - size-{name}: Image dimensions as "width,height" string
        - num-samples: Total number of samples in the dataset

    Args:
        data_dir (str): Directory path for the LMDB database
    """

    def __init__(self, data_dir: str):
        """
        Initialize the LMDB dataset.

        Args:
            data_dir (str): Directory path for the LMDB database
        """
        super().__init__(data_dir)
        self._lmdb_env = lmdb.open(self.data_dir, map_size=1099511627776)  # 1T
        self._lmdb_txn = self._lmdb_env.begin(write=True)

    def write(self, name: str, image: np.ndarray, label: str):
        """
        Write an image and its label to the LMDB database.

        Args:
            name (str): Unique identifier for the image
            image (np.ndarray): Image data as numpy array
            label (str): Text label corresponding to the image
        """
        self._lmdb_txn.put(
            self.image_key(name),
            cv2.imencode(".jpg", image, self.encode_param())[1].tobytes(),
        )
        self._lmdb_txn.put(self.label_key(name), label.encode())

        height, width = image.shape[:2]
        self._lmdb_txn.put(self.size_key(name), f"{width},{height}".encode())

    def read(self, name: str) -> Dict:
        """
        Read an image and its metadata from the LMDB database.

        Args:
            name (str): Unique identifier for the image

        Returns:
            Dict: Dictionary containing:
                - "image": Image data as numpy array
                - "label": Text label for the image
                - "size": [width, height] of the image
        """
        label = self._lmdb_txn.get(self.label_key(name)).decode()
        size_str = self._lmdb_txn.get(self.size_key(name)).decode()
        size = [int(it) for it in size_str.split(",")]

        image_bytes = self._lmdb_txn.get(self.image_key(name))
        image_buf = np.frombuffer(image_bytes, dtype=np.uint8)
        image = cv2.imdecode(image_buf, cv2.IMREAD_UNCHANGED)

        return {"image": image, "label": label, "size": size}

    def read_size(self, name: str) -> Tuple[int, int]:
        """
        Read only the size information for an image from the LMDB database.

        Args:
            name (str): Unique identifier for the image

        Returns:
            Tuple[int, int]: (width, height) of the image
        """
        size_str = self._lmdb_txn.get(self.size_key(name)).decode()
        width, height = map(int, size_str.split(","))

        return width, height

    def read_count(self) -> int:
        """
        Get the total number of samples in the LMDB dataset.

        Returns:
            int: Number of samples in the dataset
        """
        count = self._lmdb_txn.get("num-samples".encode())
        if count is None:
            return 0
        return int(count)

    def write_count(self, count: int):
        """
        Write the total count of samples to the LMDB dataset.

        Args:
            count (int): Total number of samples
        """
        self._lmdb_txn.put("num-samples".encode(), str(count).encode())

    def image_key(self, name: str) -> bytes:
        """
        Generate the LMDB key for image data.

        Args:
            name (str): Image identifier

        Returns:
            bytes: Encoded key for image data
        """
        return f"image-{name}".encode()

    def label_key(self, name: str) -> bytes:
        """
        Generate the LMDB key for label data.

        Args:
            name (str): Image identifier

        Returns:
            bytes: Encoded key for label data
        """
        return f"label-{name}".encode()

    def size_key(self, name: str) -> bytes:
        """
        Generate the LMDB key for size data.

        Args:
            name (str): Image identifier

        Returns:
            bytes: Encoded key for size data
        """
        return f"size-{name}".encode()

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Context manager exit.

        This method properly closes the LMDB transaction and environment
        to ensure data is committed and resources are released.
        """
        self._lmdb_txn.__exit__(exc_type, exc_value, traceback)
        self._lmdb_env.close()


if __name__ == "__main__":
    # image = cv2.imread("f_004.jpg")
    # label = "test"

    with LmdbDataset("./test/train") as writer:
        # writer.write("test", image, label)
        writer.write_count(1)
        print(writer.read_count())

    # with LmdbDataset("train") as ld:
    #     data = ld.read("test")
    #     cv2.imwrite("test.jpg", data["image"])