"""
Extract metadata from various electron microscopy file types.
Extractors should return a list of dictionaries, where each dictionary contains
the extracted metadata under the key ``nx_meta``. The ``nx_meta`` structure is
validated against the :class:`~nexusLIMS.schemas.metadata.NexusMetadata` Pydantic
schema to ensure consistency across all extractors.
Required Fields
---------------
All extractors must include these fields in ``nx_meta``:
* ``'Creation Time'`` - ISO-8601 timestamp string **with timezone** (e.g.,
``"2024-01-15T10:30:00-05:00"`` or ``"2024-01-15T15:30:00Z"``)
* ``'Data Type'`` - Human-readable description using underscores (e.g.,
``"STEM_Imaging"``, ``"TEM_EDS"``, ``"SEM_Imaging"``)
* ``'DatasetType'`` - Schema-defined category, must be one of: ``"Image"``,
``"Spectrum"``, ``"SpectrumImage"``, ``"Diffraction"``, ``"Misc"``, or ``"Unknown"``
Optional Fields
---------------
Common optional fields include:
* ``'Data Dimensions'`` - Dataset shape as string (e.g., ``"(1024, 1024)"``)
* ``'Instrument ID'`` - Instrument PID from database (e.g., ``"FEI-Titan-TEM-635816"``)
* ``'warnings'`` - List of warning messages or [message, context] pairs
Additional instrument-specific fields are allowed beyond these standard fields.
Schema Validation
-----------------
The ``nx_meta`` structure is validated using Pydantic strict mode. Validation occurs
after default values are set (e.g., missing ``DatasetType`` defaults to ``"Misc"``).
If validation fails, a ``pydantic.ValidationError`` is raised with detailed information
about which fields are invalid.
For complete schema details, see :class:`~nexusLIMS.schemas.metadata.NexusMetadata`.
"""
import base64
import inspect
import json
import logging
import shutil
from datetime import datetime as dt
from decimal import Decimal
from pathlib import Path
from typing import Any, Callable, Dict, Tuple
import hyperspy.api as hs
import numpy as np
from benedict import benedict
from pydantic import ValidationError
from nexusLIMS.extractors.base import ExtractionContext
from nexusLIMS.extractors.registry import get_registry
from nexusLIMS.instruments import get_instr_from_filepath
from nexusLIMS.schemas.metadata import (
DiffractionMetadata,
ImageMetadata,
NexusMetadata,
SpectrumImageMetadata,
SpectrumMetadata,
)
from nexusLIMS.schemas.units import ureg
from nexusLIMS.utils.paths import replace_instrument_data_path
from nexusLIMS.utils.time import current_system_tz
from nexusLIMS.version import __version__
from . import utils
from .plugins.preview_generators.hyperspy_preview import sig_to_thumbnail
from .plugins.preview_generators.image_preview import (
down_sample_image,
image_to_square_thumbnail,
)
from .plugins.preview_generators.text_preview import text_to_thumbnail
_logger = logging.getLogger(__name__)
def _config_available() -> bool:
"""Return True if NexusLIMS settings can be loaded without error."""
try:
from nexusLIMS.config import settings # noqa: PLC0415
_ = settings.NX_DATA_PATH
except Exception:
return False
else:
return True
PLACEHOLDER_PREVIEW = Path(__file__).parent / "assets" / "extractor_error.png"
"""Path to placeholder preview image used when preview generation fails."""
__all__ = [
"PLACEHOLDER_PREVIEW",
"_logger",
"create_preview",
"down_sample_image",
"flatten_dict",
"get_instr_from_filepath",
"get_registry",
"image_to_square_thumbnail",
"parse_metadata",
"sig_to_thumbnail",
"text_to_thumbnail",
"unextracted_preview_map",
"utils",
"validate_nx_meta",
]
unextracted_preview_map = {
"txt": text_to_thumbnail,
"png": image_to_square_thumbnail,
"tiff": image_to_square_thumbnail,
"bmp": image_to_square_thumbnail,
"gif": image_to_square_thumbnail,
"jpg": image_to_square_thumbnail,
"jpeg": image_to_square_thumbnail,
}
"""Filetypes that will only have basic metadata extracted but will nonetheless
have a custom preview image generated"""
def _add_extraction_details(
nx_meta: Dict,
extractor_module: Callable,
) -> Dict[str, str]:
"""
Add extraction details to the NexusLIMS metadata.
Adds metadata about the extraction process, given an extractor module
to the ``nx_meta`` metadata dictionary under the ``'NexusLIMS Extraction'``
sub-key. The ``'Extractor Module'`` metadata key will contain the fully
qualified path of a given extractor, e.g.
``nexusLIMS.extractors.basic_metadata``.
Note
----
If the ``'NexusLIMS Extraction'`` key already exists in the ``nx_meta``
metadata dictionary, this method *will* overwrite its value.
Parameters
----------
nx_meta
The metadata dictionary as returend by :py:meth:`parse_metadata`
extractor_module
The (callable) module for a specific metadata extractor from the
:py:mod:`~nexusLIMS.extractors` module.
Returns
-------
dict
An updated ``nx_meta`` dictionary, containing extraction details
"""
# PHASE 1 MIGRATION: Handle both old-style functions and new-style extractors
# Try to get the module name in different ways for backward compatibility
module_name = None
# Try __module__ attribute first (works for new extractor system)
if hasattr(extractor_module, "__module__"):
module_name = extractor_module.__module__
# Fallback to inspect.getmodule() for old-style functions
if module_name is None: # pragma: no cover
module = inspect.getmodule(extractor_module) # pragma: no cover
# Last resort - use "unknown"
module_name = ( # pragma: no cover
module.__name__ if module is not None else "unknown"
)
# Build NexusLIMS Extraction details
extraction_details = {
"Date": dt.now(tz=current_system_tz()).isoformat(),
"Module": module_name,
"Version": __version__,
}
# Move "Extractor Warnings" from nx_meta to extraction details if present
# Check both nx_meta and extensions (some extractors migrate it to extensions)
if "Extractor Warnings" in nx_meta["nx_meta"]:
extraction_details["Extractor Warnings"] = nx_meta["nx_meta"].pop(
"Extractor Warnings"
)
elif (
"extensions" in nx_meta["nx_meta"]
and "Extractor Warnings" in nx_meta["nx_meta"]["extensions"]
):
extraction_details["Extractor Warnings"] = nx_meta["nx_meta"]["extensions"].pop(
"Extractor Warnings"
)
nx_meta["nx_meta"]["NexusLIMS Extraction"] = extraction_details
return nx_meta
[docs]
def get_schema_for_dataset_type(dataset_type: str) -> type[NexusMetadata]:
"""
Select the appropriate schema class based on DatasetType.
This function maps dataset types to their corresponding type-specific
metadata schemas. Type-specific schemas (ImageMetadata, SpectrumMetadata, etc.)
provide stricter validation of fields appropriate for each data type.
Parameters
----------
dataset_type : str
The value of the 'DatasetType' field. Must be one of: 'Image', 'Spectrum',
'SpectrumImage', 'Diffraction', 'Misc', or 'Unknown'.
Returns
-------
type[NexusMetadata]
The schema class to use for validation. Returns a type-specific schema
(ImageMetadata, SpectrumMetadata, etc.) for known dataset types, or the
base NexusMetadata schema for 'Misc' and 'Unknown' types.
Notes
-----
Schema mapping:
- 'Image' → ImageMetadata (SEM/TEM/STEM images)
- 'Spectrum' → SpectrumMetadata (EDS/EELS spectra)
- 'SpectrumImage' → SpectrumImageMetadata (hyperspectral data)
- 'Diffraction' → DiffractionMetadata (diffraction patterns)
- 'Misc' → NexusMetadata (base schema)
- 'Unknown' → NexusMetadata (base schema)
- Other values → NexusMetadata (fallback)
Examples
--------
>>> schema = get_schema_for_dataset_type("Image")
>>> schema.__name__
'ImageMetadata'
>>> schema = get_schema_for_dataset_type("Unknown")
>>> schema.__name__
'NexusMetadata'
"""
schema_mapping = {
"Image": ImageMetadata,
"Spectrum": SpectrumMetadata,
"SpectrumImage": SpectrumImageMetadata,
"Diffraction": DiffractionMetadata,
"Misc": NexusMetadata,
"Unknown": NexusMetadata,
}
return schema_mapping.get(dataset_type, NexusMetadata)
[docs]
def create_preview( # noqa: PLR0911, PLR0912, PLR0915
fname: Path, *, overwrite: bool, signal_index: int | None = None
) -> Path | None:
"""
Generate a preview image for a given file using the plugin system.
This method uses the preview generator plugin system to create thumbnail
previews. It first tries to find a suitable preview generator plugin, and
falls back to legacy methods if no plugin is found.
Parameters
----------
fname
The filename from which to read data
overwrite
Whether to overwrite the .json metadata file and thumbnail
image if either exists
signal_index
For files with multiple signals, the index of the signal to preview.
If None, generates a single preview (legacy behavior). If an int,
generates preview with _signalN suffix in filename.
Returns
-------
preview_fname : Optional[pathlib.Path]
The filename of the generated preview image; if None, a preview could not be
successfully generated.
"""
# Generate preview filename with signal index suffix if provided
if signal_index is None:
preview_fname = replace_instrument_data_path(fname, ".thumb.png")
else:
preview_fname = replace_instrument_data_path(
fname, f"_signal{signal_index}.thumb.png"
)
# Skip if preview exists and overwrite is False
if preview_fname.is_file() and not overwrite:
_logger.info("Preview already exists: %s", preview_fname)
return preview_fname
# Create context for preview generation
instrument = get_instr_from_filepath(fname)
context = ExtractionContext(
file_path=fname, instrument=instrument, signal_index=signal_index
)
# Try to get a preview generator from the registry
registry = get_registry()
generator = registry.get_preview_generator(context)
if generator:
# Use plugin-based preview generation
_logger.info("Generating preview using %s: %s", generator.name, preview_fname)
# Create the directory for the thumbnail, if needed
preview_fname.parent.mkdir(parents=True, exist_ok=True)
success = generator.generate(context, preview_fname)
if success:
return preview_fname
_logger.warning(
"Preview generator %s failed for %s",
generator.name,
fname,
)
# Fall through to legacy methods
# Legacy fallback for .tif files (special case with downsampling)
extension = fname.suffix[1:]
if extension == "tif":
_logger.info("Using legacy downsampling for .tif: %s", preview_fname)
preview_fname.parent.mkdir(parents=True, exist_ok=True)
factor = 2
down_sample_image(fname, out_path=preview_fname, factor=factor)
return preview_fname
# Legacy fallback for files in unextracted_preview_map
if extension in unextracted_preview_map:
_logger.info("Using legacy preview map for %s: %s", extension, preview_fname)
preview_fname.parent.mkdir(parents=True, exist_ok=True)
preview_return = unextracted_preview_map[extension](
f=fname,
out_path=preview_fname,
output_size=500,
)
# handle the case where PIL cannot open an image
if preview_return is False:
return None
return preview_fname
# Legacy fallback for HyperSpy-loadable files
_logger.info("Trying legacy HyperSpy preview generation: %s", preview_fname)
load_options = {"lazy": True}
if extension == "ser":
load_options["only_valid_data"] = True
# noinspection PyBroadException
try:
s = hs.load(fname, **load_options)
except Exception: # pylint: disable=broad-exception-caught
_logger.warning(
"Signal could not be loaded by HyperSpy. "
"Using placeholder image for preview.",
)
preview_fname.parent.mkdir(parents=True, exist_ok=True)
shutil.copyfile(PLACEHOLDER_PREVIEW, preview_fname)
return preview_fname
# If s is a list of signals, select the appropriate one
if isinstance(s, list):
num_sigs = len(s)
original_fname = s[0].metadata.General.original_filename
if signal_index is not None:
# Use specified signal index
s = s[signal_index]
s.metadata.General.title = (
s.metadata.General.title
+ f" (signal {signal_index + 1} of "
+ f'{num_sigs} in file "{original_fname}")'
)
else:
# Legacy: use first signal only
s = s[0]
s.metadata.General.title = (
s.metadata.General.title
+ f' (1 of {num_sigs} total signals in file "{original_fname}")'
)
elif not s.metadata.General.title:
s.metadata.General.title = s.metadata.General.original_filename.replace(
extension,
"",
).strip(".")
# Generate the preview
_logger.info("Generating HyperSpy preview: %s", preview_fname)
preview_fname.parent.mkdir(parents=True, exist_ok=True)
s.compute(show_progressbar=False)
sig_to_thumbnail(s, out_path=preview_fname)
return preview_fname
[docs]
def flatten_dict(_dict, parent_key="", separator=" "): # noqa: ARG001
"""
Flatten a nested dictionary into a single level.
Utility method to take a nested dictionary structure and flatten it into a
single level, separating the levels by a string as specified by
``separator``.
Uses python-benedict for robust nested dictionary operations.
Parameters
----------
_dict : dict
The dictionary to flatten
parent_key : str
The "root" key to add to the existing keys (unused in current implementation)
separator : str
The string to use to separate values in the flattened keys (i.e.
{'a': {'b': 'c'}} would become {'a' + sep + 'b': 'c'})
Returns
-------
flattened_dict : str
The dictionary with depth one, with nested dictionaries flattened
into root-level keys
"""
# Disable keypath_separator to avoid conflicts with keys containing
# dots or other special chars
return benedict(_dict, keypath_separator=None).flatten(separator=separator)
class _CustomEncoder(json.JSONEncoder):
"""
Allow non-serializable types to be written in a JSON format.
A custom JSON Encoder class that will allow certain types to be serialized that are
not able to be by default (taken from https://stackoverflow.com/a/27050186).
"""
def default(self, o): # noqa: PLR0911
if isinstance(o, np.integer):
return int(o)
if isinstance(o, np.floating):
return float(o)
if isinstance(o, np.ndarray):
return o.tolist()
if isinstance(o, np.bytes_):
return o.decode()
if isinstance(o, np.void):
# np.void array may contain arbitary binary, so base64 encode it
return base64.b64encode(o.tolist()).decode("utf-8")
# Handle Pint Quantity objects
if isinstance(o, ureg.Quantity):
return {"value": float(o.magnitude), "unit": str(o.units)}
# Handle Decimal objects (convert to float for JSON serialization)
if isinstance(o, Decimal):
return float(o)
return super().default(o)