Source code for nexusLIMS.extractors.plugins.preview_generators.text_preview

"""Text file preview generator."""

import logging
import textwrap
from pathlib import Path
from typing import ClassVar, Union

import matplotlib.pyplot as plt
from matplotlib.figure import Figure
from PIL import Image

from nexusLIMS.extractors.base import ExtractionContext

_logger = logging.getLogger(__name__)

_LANCZOS = Image.Resampling.LANCZOS

# Constants for text preview formatting
_MAX_ROWS_NOTE = 18  # Maximum rows for note-style text
_MAX_ROWS_DATA = 17  # Maximum rows for data-style text
_MAX_COLS = 44  # Maximum columns for text display
_DEFAULT_SIZE = 5  # default size in inches for the preview


def _pad_to_square(im_path: Path, new_width: int = 500) -> None:
    """
    Pad an image to square.

    Helper method to pad an image saved on disk to a square with size
    ``new_width x new_width``. This ensures consistent display on the
    front-end web page. The image is first scaled so its longest side equals
    ``new_width`` (aspect ratio preserved), then centered on an empty RGBA
    canvas. The original image is overwritten.

    Method adapted from:
    https://jdhao.github.io/2017/11/06/resize-image-to-square-with-padding/

    Parameters
    ----------
    im_path
        The path to the image that should be resized/padded
    new_width
        Desired output width/height of the image (in pixels)
    """
    # Open through a context manager so the source file handle is released
    # before the same path is overwritten below.
    with Image.open(im_path) as image:
        old_size = image.size  # (width, height)
        ratio = float(new_width) / max(old_size)
        # Clamp to at least one pixel: for very elongated images the short
        # side would otherwise truncate to 0, which ``resize`` rejects.
        new_size = tuple(max(1, int(x * ratio)) for x in old_size)
        resized = image.resize(new_size, _LANCZOS)

    new_im = Image.new("RGBA", (new_width, new_width))
    # Center the scaled image on the square canvas.
    new_im.paste(
        resized,
        ((new_width - new_size[0]) // 2, (new_width - new_size[1]) // 2),
    )
    new_im.save(im_path)


def _read_text_file(f: Path) -> Union[str, None]:
    """Read *f* and decode it, returning None if reading or decoding fails."""
    try:
        raw_bytes = f.read_bytes()
    except Exception as e:
        _logger.warning("Failed to read text file %s: %s", f, e)
        return None

    # Try encodings in order of preference
    for encoding in ("utf-8", "windows-1250", "windows-1252"):
        try:
            content = raw_bytes.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            continue
        _logger.debug("Successfully decoded %s with %s encoding", f, encoding)
        return content

    _logger.warning("Failed to decode text file %s with any supported encoding", f)
    return None


def _format_preview_text(content: str) -> str:
    """Trim/wrap *content* to the preview grid and escape it for matplotlib."""
    # Normalize line endings (CRLF/CR to LF) for consistent handling
    content = content.replace("\r\n", "\n").replace("\r", "\n")
    # Expand tabs to spaces (tabs can render as black squares in matplotlib)
    content = content.expandtabs(tabsize=4)

    lines = content.split("\n")
    # More newlines than fit in a note box means the file is treated as data.
    if content.count("\n") > _MAX_ROWS_NOTE:
        # Data mode: first _MAX_COLS characters of first _MAX_ROWS_DATA lines
        formatted_text = "\n".join(
            line[:_MAX_COLS] for line in lines[:_MAX_ROWS_DATA]
        )
    else:
        # Note mode: wrap to _MAX_COLS columns, keep up to _MAX_ROWS_NOTE rows
        wrapper = textwrap.TextWrapper(width=_MAX_COLS)
        wrapped_lines = []
        for line in lines:
            if line.strip():
                wrapped_lines.extend(wrapper.wrap(line))
            else:
                # Preserve empty lines
                wrapped_lines.append("")
        formatted_text = "\n".join(wrapped_lines[:_MAX_ROWS_NOTE])

    # Escape every "$" so matplotlib never sees an unescaped pair and switches
    # to mathtext. Backslashes are deliberately left alone: matplotlib draws
    # them literally in non-math text, so doubling them would show "\\" in the
    # preview wherever the file contains a single "\".
    return formatted_text.replace("$", r"\$")


def text_to_thumbnail(
    f: Path,
    out_path: Path,
    output_size: int = 500,
) -> Union[Figure, bool]:
    """
    Generate a preview thumbnail from a text file.

    The contents of the text file are formatted and drawn onto a square
    figure of 5 in by 5 in, saved at ``output_size`` x ``output_size`` pixels
    (500 x 500 by default) and then padded to square. If the text file has
    more than 18 newlines, it is treated as data and the first 44 characters
    of each of the first 17 lines are written to the image. Otherwise it is
    treated as a manually written note and the text is wrapped into a
    44 column, 18 row box until the space is exhausted.

    The file is decoded by trying UTF-8, then Windows-1250, then Windows-1252.

    Parameters
    ----------
    f
        The path of a text file for which a thumbnail should be generated.
    out_path
        A path to the desired thumbnail filename. The format is chosen from
        the extension by :py:meth:`~matplotlib.figure.Figure.savefig`; because
        the padding step re-saves the file as an RGBA image, formats without
        alpha support (such as JPEG) may fail to save.
    output_size : int
        The pixel width (and height, since the image is padded to square)
        of the saved image file.

    Returns
    -------
    f : :py:class:`matplotlib.figure.Figure` or bool
        Handle to the (already closed) matplotlib Figure, or the value False
        if the file could not be read/decoded or the image could not be saved
    """
    plt.close("all")
    plt.rcParams["image.cmap"] = "gray"

    content = _read_text_file(f)
    if content is None:
        return False

    formatted_text = _format_preview_text(content)

    # Create a matplotlib figure with no frame
    fig = plt.figure(
        figsize=(_DEFAULT_SIZE, _DEFAULT_SIZE),
        dpi=output_size / _DEFAULT_SIZE,
    )
    try:
        plt.axis("off")

        # Monospace text, left-aligned at the top of the figure.
        # DejaVu Sans Mono ships with matplotlib and has wider Unicode
        # coverage than the generic monospace family.
        fig.text(
            0.02,
            0.97,
            formatted_text,
            fontfamily="DejaVu Sans Mono",
            fontsize=12,
            verticalalignment="top",
            horizontalalignment="left",
            usetex=False,
            linespacing=1.7,  # Increase line spacing (default is 1.2)
        )
        fig.tight_layout()

        # Save the figure
        try:
            fig.savefig(out_path, dpi=output_size / _DEFAULT_SIZE)
            _pad_to_square(out_path, output_size)
        except Exception as e:
            _logger.warning("Failed to save text thumbnail to %s: %s", out_path, e)
            return False
    finally:
        # Always release the figure, including when rendering itself raises.
        plt.close(fig)

    return fig
class TextPreviewGenerator:
    """
    Preview generator for text files.

    This generator creates thumbnail previews of text files by rendering
    the first few lines of text as an image.
    """

    name = "text_preview"
    priority = 100
    # Lower-case file extensions (without the leading dot) handled here.
    supported_extensions: ClassVar = {"txt"}

    def supports(self, context: ExtractionContext) -> bool:
        """
        Check if this generator supports the given file.

        Parameters
        ----------
        context
            The extraction context containing file information

        Returns
        -------
        bool
            True if the file's extension (case-insensitive, without the dot)
            is listed in ``supported_extensions``
        """
        extension = context.file_path.suffix.lower().lstrip(".")
        # Consult the class attribute so the declared extensions and this
        # check cannot drift apart.
        return extension in self.supported_extensions

    def generate(self, context: ExtractionContext, output_path: Path) -> bool:
        """
        Generate a thumbnail preview from a text file.

        Parameters
        ----------
        context
            The extraction context containing file information
        output_path
            Path where the preview image should be saved

        Returns
        -------
        bool
            True if preview was successfully generated, False otherwise
        """
        try:
            _logger.debug("Generating text preview for: %s", context.file_path)

            # Generate the thumbnail using the local function
            result = text_to_thumbnail(
                context.file_path,
                output_path,
                output_size=500,
            )
            # ``text_to_thumbnail`` signals failure by returning False; honor
            # it so a stale file already at ``output_path`` is not reported
            # as a freshly generated preview.
            if result is False:
                return False

            return output_path.exists()
        except Exception as e:
            _logger.warning(
                "Failed to generate text preview for %s: %s",
                context.file_path,
                e,
            )
            return False