Source code for text_renderer.corpus.word_corpus

from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Tuple

import numpy as np
from loguru import logger

from text_renderer.utils.errors import PanicError

from .corpus import Corpus, CorpusCfg


@dataclass
class WordCorpusCfg(CorpusCfg):
    """
    Word corpus config

    args:
        text_paths (List[Path]): Text file paths
        separator (str): word separator of texts and join char in get_text()
        num_word (Tuple[int, int]): Range of output word count  [min_length, max_length).
                                    If both values are equal, exactly that many words are output.
        filter_by_chars (bool): If True, filtering text by character set
        chars_file (Path): Character set
        filter_font (bool): Only work when filter_by_chars is True. If True filter font file
                            by intersection of font support chars with chars file
        filter_font_min_support_chars (int): If intersection of font support chars with chars file is lower
                                             than filter_font_min_support_chars, filter this font file.

    """

    # Mutable default must go through default_factory so instances do not share one list.
    text_paths: List[Path] = field(default_factory=list)
    separator: str = " "
    # Half-open range [min, max) of the number of words returned per sample.
    num_word: Tuple[int, int] = (1, 5)
    filter_by_chars: bool = False
    # None means "no character set file configured".
    chars_file: Path = None
    filter_font: bool = False
    filter_font_min_support_chars: int = 100


class WordCorpus(Corpus):
    """
    Output contiguous words of a certain length

    Every file in ``cfg.text_paths`` is read at construction time, split on
    ``cfg.separator`` and flattened into a single word list. get_text() then
    returns a random contiguous run of words from that list.
    """

    def __init__(self, cfg: "CorpusCfg"):
        """
        Load the corpus texts and split them into words.

        args:
            cfg (CorpusCfg): Corpus config; the attributes read here are those of WordCorpusCfg

        raises:
            PanicError: If text_paths is empty, or if fewer words than num_word[1] were loaded
        """
        super().__init__(cfg)

        self.cfg: WordCorpusCfg
        if not self.cfg.text_paths:
            raise PanicError("text_paths must not be empty")

        self.words: List[str] = []

        texts = []
        for text_path in self.cfg.text_paths:
            with open(text_path, "r", encoding="utf-8") as f:
                texts.append(f.read().strip())

        # The chars file is handed to the font manager whenever it is configured,
        # independent of filter_by_chars.
        if self.cfg.chars_file is not None:
            self.font_manager.update_font_support_chars(self.cfg.chars_file)

        if self.cfg.filter_by_chars:
            texts = Corpus.filter_by_chars(texts, self.cfg.chars_file)
            if self.cfg.filter_font:
                self.font_manager.filter_font_path(self.cfg.filter_font_min_support_chars)

        for text in texts:
            # Consecutive separators (or an empty text) make split() yield empty
            # strings; drop them so they are not counted or emitted as words.
            self.words.extend(
                word for word in text.split(self.cfg.separator) if word
            )

        logger.info(f"Load {len(self.words)} words")

        # Guarantees that get_text() can always slice a run of the requested length.
        if len(self.words) < self.cfg.num_word[1]:
            raise PanicError("too few words")

    def get_text(self) -> str:
        """
        Return a random contiguous run of words joined by the separator.

        The number of words is drawn uniformly from [num_word[0], num_word[1]);
        when both bounds are equal that exact count is used.

        returns:
            str: The selected words joined with cfg.separator
        """
        self.cfg: WordCorpusCfg
        min_words, max_words = self.cfg.num_word
        if min_words == max_words:
            # np.random.randint rejects an empty range, so a fixed count is handled directly.
            length = min_words
        else:
            length = np.random.randint(min_words, max_words)

        # Upper bound is exclusive, so the last valid start index is len(words) - length.
        start = np.random.randint(0, len(self.words) - length + 1)
        words = self.words[start : start + length]
        return self.cfg.separator.join(words)