# Source code for nexusLIMS.extractors.base

"""Base protocols and data structures for the extractor plugin system.

This module defines the core interfaces that all extractors must implement,
along with supporting data structures for passing context to extractors.

The plugin system uses Protocol-based structural typing (PEP 544) rather than
inheritance, allowing flexibility in implementation while maintaining type safety.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Callable, NamedTuple, Protocol

if TYPE_CHECKING:
    from pathlib import Path

    from nexusLIMS.instruments import Instrument

# Module-level logger, named after this module's import path.
_logger = logging.getLogger(__name__)

# Names exported by ``from nexusLIMS.extractors.base import *``.
# Kept in alphabetical order; every public class defined in this module is listed.
__all__ = [
    "BaseExtractor",
    "ExtractionContext",
    "FieldDefinition",
    "InstrumentProfile",
    "PreviewGenerator",
]


class FieldDefinition(NamedTuple):
    """
    Configuration for extracting a single metadata field.

    This NamedTuple provides a declarative way to define how metadata fields
    should be extracted from instrument data files. It's used by TIFF-based
    extractors (Quanta, Tescan, Orion HIM) to reduce code duplication.

    Attributes
    ----------
    section : str
        Section name in metadata dict (e.g., "Beam", "User", "System").
        For nested dicts, this is the top-level key.
    source_key : str
        Key within the section to extract the value from.
    output_key : str | list[str]
        Output key in nx_meta. Can be a string for flat keys or a list for
        nested paths (e.g., ["Stage Position", "X"]).
    factor : float
        Unit conversion factor. The extracted value is multiplied by this.
        Use 1.0 for no conversion. For SI unit conversions, use powers of 10
        (e.g., 1e6 to convert meters to micrometers).
    is_string : bool
        If True, keep value as string. If False, attempt numeric conversion
        with Decimal for precision.
    suppress_zero : bool
        If True, skip field if the numeric value equals zero. Only applies
        when is_string=False. Defaults to False.
    target_unit : str or None
        Pint unit string for the output value (e.g., "kilovolt",
        "millimeter"). If provided, the value will be converted to a Pint
        Quantity with this unit. The factor is still applied before creating
        the Quantity. If None, numeric values remain as floats (legacy
        behavior). Defaults to None.

    Examples
    --------
    Simple numeric field with unit conversion (m → μm):

    >>> hfw = FieldDefinition("Beam", "HFW", "Horizontal Field Width (μm)", 1e6, False)
    >>> hfw.factor
    1000000.0

    String field (no conversion):

    >>> chamber = FieldDefinition("System", "Chamber", "Chamber ID", 1.0, True)
    >>> chamber.is_string
    True

    Nested output path:

    >>> stage_x = FieldDefinition("Beam", "StageX", ["Stage Position", "X"], 1.0, False)
    >>> stage_x.output_key
    ['Stage Position', 'X']

    Suppress zero values:

    >>> shift = FieldDefinition("Beam", "BeamShiftX", "Beam Shift X",
    ...                         1.0, False, suppress_zero=True)
    >>> shift.suppress_zero
    True

    Pint Quantity output (new approach) -- note the keyword is ``target_unit``:

    >>> hv = FieldDefinition("Beam", "HV", "Voltage", 1.0, False,
    ...                      target_unit="kilovolt")
    >>> hv.target_unit
    'kilovolt'
    """

    section: str
    source_key: str
    output_key: str | list[str]
    factor: float
    is_string: bool
    suppress_zero: bool = False
    # Pint unit string (e.g., "kilovolt", "millimeter"); None keeps plain numbers
    target_unit: str | None = None
@dataclass
class ExtractionContext:
    """
    Bundle of inputs handed to extractors and preview generators.

    Everything an extractor needs in order to process one file travels in
    this single object. Passing a context object (instead of individual
    arguments) means new parameters can be added later as extra fields
    without breaking existing extractors.

    Attributes
    ----------
    file_path
        Path to the file to be processed.
    instrument
        The instrument that created this file, if known. ``None`` for files
        that cannot be associated with a specific instrument.
    signal_index
        For files with multiple signals, the index of the signal to process.
        ``None`` means process all signals or default to the first signal.

    Examples
    --------
    >>> from pathlib import Path
    >>> from nexusLIMS.instruments import get_instr_from_filepath
    >>> file_path = Path("/path/to/data.dm3")
    >>> instrument = get_instr_from_filepath(file_path)
    >>> context = ExtractionContext(file_path, instrument)
    """

    file_path: Path
    instrument: Instrument | None = None
    signal_index: int | None = None
class BaseExtractor(Protocol):
    """
    Protocol defining the interface for metadata extractors.

    This is a Protocol (structural subtype, PEP 544) rather than an ABC,
    meaning any class that implements these attributes and methods is
    automatically considered a valid extractor - no inheritance required.

    All extractors MUST implement defensive error handling:

    - Never raise exceptions from extract() - catch all and return minimal
      metadata
    - Always return a list of metadata dicts (one per signal)
    - Log errors for debugging but don't propagate them

    Attributes
    ----------
    name : str
        Unique identifier for this extractor (e.g., "dm3_extractor").
        Should be a valid Python identifier.
    priority : int
        Priority for this extractor (0-1000, higher = preferred).
        See notes below for conventions.
    supported_extensions : set[str] | None
        File extensions this extractor supports (without dots). Set to None
        for wildcard extractors that support all files. Empty set means no
        extensions are directly supported (content sniffing only).

    Notes
    -----
    **Priority Conventions:**

    - 0-49: Low priority (generic/fallback extractors)
    - 50-149: Normal priority (standard extractors)
    - 150-249: High priority (specialized/optimized extractors)
    - 250+: Override priority (force specific behavior)

    When multiple extractors support the same file, the registry will try
    them in descending priority order until one's supports() method returns
    True.

    Examples
    --------
    >>> class DM3Extractor:
    ...     \"\"\"Extract metadata from DigitalMicrograph .dm3/.dm4 files.\"\"\"
    ...
    ...     name = "dm3_extractor"
    ...     priority = 100
    ...     supported_extensions = {"dm3", "dm4"}
    ...
    ...     def supports(self, context: ExtractionContext) -> bool:
    ...         ext = context.file_path.suffix.lower().lstrip('.')
    ...         return ext in self.supported_extensions
    ...
    ...     def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
    ...         # Extraction logic here
    ...         return [{"nx_meta": {...}}]
    """

    name: str
    priority: int
    supported_extensions: set[str] | None

    def supports(self, context: ExtractionContext) -> bool:
        """
        Determine if this extractor can handle the given file.

        This method allows complex logic beyond simple extension matching:

        - Content sniffing (read file headers)
        - File size checks
        - Instrument-specific handling
        - Metadata validation

        The registry will call supports() on extractors in priority order
        until one returns True.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.

        Returns
        -------
        bool
            True if this extractor can handle this file, False otherwise

        Examples
        --------
        Extension-based matching:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     ext = context.file_path.suffix.lower().lstrip('.')
        ...     return ext in ('dm3', 'dm4')

        Content sniffing:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     if context.file_path.suffix.lower() != '.tif':
        ...         return False
        ...     with open(context.file_path, 'rb') as f:
        ...         header = f.read(1024)
        ...         return b'[User]' in header  # FEI signature

        Instrument-specific:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     return (context.instrument is not None and
        ...             context.instrument.name.startswith("FEI-Quanta"))
        """
        ...  # pragma: no cover

    def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        """
        Extract metadata from the file.

        CRITICAL: This method MUST follow defensive design principles:

        - Never raise exceptions - catch all errors and return minimal
          metadata
        - Always return a list of metadata dicts where each contains an
          'nx_meta' key
        - Log errors for debugging but continue gracefully

        Return Format:

        All extractors return a list of metadata dicts. Each dict contains:

        - 'nx_meta': Required - NexusLIMS-specific metadata (dict)
        - Other keys: Optional - Raw metadata extracted from the file

        Single-signal files return a list with one element. Multi-signal
        files return a list with one element per signal. This consistent
        list-based approach allows the Activity layer to expand multi-signal
        files into multiple datasets.

        Each 'nx_meta' dict MUST contain these required fields (validated
        against :class:`~nexusLIMS.schemas.metadata.NexusMetadata`):

        - 'Creation Time': ISO-8601 timestamp string **with timezone**
          (REQUIRED). Examples: "2024-01-15T10:30:00-05:00" or
          "2024-01-15T15:30:00Z"
        - 'Data Type': Human-readable data type (e.g., "STEM_Imaging")
          (REQUIRED)
        - 'DatasetType': Must be one of: "Image", "Spectrum",
          "SpectrumImage", "Diffraction", "Misc", or "Unknown" (REQUIRED)

        Optional standard fields:

        - 'Data Dimensions': String like "(1024, 1024)" or
          "(12, 1024, 1024)"
        - 'Instrument ID': Instrument PID from database
        - 'warnings': List of warning messages (string or
          [message, context] pairs)

        Additional instrument-specific fields beyond these are allowed.

        The nx_meta structure is strictly validated after extraction -
        validation failures will raise pydantic.ValidationError with detailed
        field errors.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc. For
            multi-signal files, signal_index indicates which signal to
            process. If None, extractors may return all signals or the first
            signal.

        Returns
        -------
        list[dict]
            List of metadata dicts (one per signal). Each dict contains
            'nx_meta' key with NexusLIMS-specific metadata, plus optional raw
            metadata keys.

        Examples
        --------
        Single-signal extraction:

        >>> def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        ...     try:
        ...         metadata = [{"nx_meta": {
        ...             "Creation Time": "2024-01-15T10:30:00-05:00",
        ...             "Data Type": "STEM_Imaging",
        ...             "DatasetType": "Image",
        ...             "Data Dimensions": "(1024, 1024)",
        ...             "Instrument ID": "643-Titan"
        ...         }}]
        ...         return metadata
        ...     except Exception as e:
        ...         logger.error(f"Extraction failed: {e}")
        ...         return self._minimal_metadata(context)

        Multi-signal extraction:

        >>> def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        ...     try:
        ...         # For a file with 2 signals
        ...         return [
        ...             {"nx_meta": {
        ...                 "Creation Time": "2024-01-15T10:30:00-05:00",
        ...                 "Data Type": "STEM_Imaging", ...}},
        ...             {"nx_meta": {
        ...                 "Creation Time": "2024-01-15T10:30:00-05:00",
        ...                 "Data Type": "EDS_Spectrum", ...}}
        ...         ]
        ...     except Exception as e:
        ...         logger.error(f"Extraction failed: {e}")
        ...         return self._minimal_metadata(context)

        Minimal metadata on error. 'Creation Time' must be an ISO-8601 string
        with timezone, so the file's modification time is converted rather
        than passed through as a raw float:

        >>> from datetime import datetime, timezone
        >>> def _minimal_metadata(self, context: ExtractionContext) -> list[dict]:
        ...     mtime = context.file_path.stat().st_mtime
        ...     return [{
        ...         "nx_meta": {
        ...             "DatasetType": "Unknown",
        ...             "Data Type": "Unknown",
        ...             "Creation Time": datetime.fromtimestamp(
        ...                 mtime, tz=timezone.utc
        ...             ).isoformat(),
        ...             "Instrument ID": None,
        ...             "warnings": ["Extraction failed"]
        ...         }
        ...     }]
        """
        ...  # pragma: no cover
class PreviewGenerator(Protocol):
    """
    Protocol for thumbnail/preview image generation.

    Preview generators are separate from extractors to allow:

    - Different preview strategies for the same file type
    - Reusable preview logic across extractors
    - Batch preview generation independent of extraction

    Like BaseExtractor, this is a Protocol (structural subtype): a class only
    needs to provide the attributes and methods below, no inheritance is
    required.

    Attributes
    ----------
    name : str
        Unique identifier for this generator
    priority : int
        Priority (same conventions as BaseExtractor)
    supported_extensions : set[str] | None
        File extensions this generator supports (without dots). Set to None
        for wildcard generators that support all files. Empty set means no
        extensions are directly supported (content sniffing only).

    Examples
    --------
    >>> class HyperSpyPreview:
    ...     \"\"\"Generate previews using HyperSpy.\"\"\"
    ...
    ...     name = "hyperspy_preview"
    ...     priority = 100
    ...     supported_extensions = {"dm3", "dm4", "ser"}
    ...
    ...     def supports(self, context: ExtractionContext) -> bool:
    ...         ext = context.file_path.suffix.lower().lstrip('.')
    ...         return ext in self.supported_extensions
    ...
    ...     def generate(self, context: ExtractionContext,
    ...                  output_path: Path) -> bool:
    ...         # Preview generation logic
    ...         return True
    """

    name: str
    priority: int
    supported_extensions: set[str] | None

    def supports(self, context: ExtractionContext) -> bool:
        """
        Determine if this generator can create a preview for the given file.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.

        Returns
        -------
        bool
            True if this generator can handle this file
        """
        ...  # pragma: no cover

    def generate(self, context: ExtractionContext, output_path: Path) -> bool:
        """
        Generate a thumbnail preview and save to output_path.

        This method should:

        - Create a square thumbnail (typically 500x500 pixels)
        - Save to output_path as PNG
        - Return True on success, False on failure
        - Never raise exceptions (catch all and return False)

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.
        output_path
            Where to save the generated preview PNG

        Returns
        -------
        bool
            True if preview was successfully generated, False otherwise

        Examples
        --------
        >>> def generate(self, context: ExtractionContext,
        ...              output_path: Path) -> bool:
        ...     try:
        ...         # Create thumbnail
        ...         output_path.parent.mkdir(parents=True, exist_ok=True)
        ...         # ... generation logic ...
        ...         return True
        ...     except Exception as e:
        ...         logger.error(f"Preview generation failed: {e}")
        ...         return False
        """
        ...  # pragma: no cover
@dataclass
class InstrumentProfile:
    """
    Per-instrument customization profile.

    A profile keeps instrument-specific logic out of the extractors
    themselves, so custom behavior for a particular microscope can be added
    without modifying extractor code. Every NexusLIMS installation has its
    own set of instruments, which makes this the key extensibility hook of
    the plugin system.

    Attributes
    ----------
    instrument_id
        Instrument identifier (e.g., "FEI-Titan-STEM-630901")
    parsers
        Custom metadata parsing functions for this instrument. Keys are
        parser names, values are callables. Defaults to an empty dict.
    transformations
        Metadata transformation functions applied after extraction. Keys are
        transform names, values are callables. Defaults to an empty dict.
    extension_fields
        Metadata to inject into the extensions section for all files. Keys
        are field names, values are static values. These populate the
        nx_meta.extensions dict. Defaults to an empty dict.

    Examples
    --------
    Creating a custom profile for FEI Titan STEM:

    >>> def parse_643_titan_microscope(metadata: dict) -> dict:
    ...     # Custom parsing logic
    ...     return metadata
    >>>
    >>> titan_stem_profile = InstrumentProfile(
    ...     instrument_id="FEI-Titan-STEM-630901",
    ...     parsers={
    ...         "microscope_info": parse_643_titan_microscope,
    ...     },
    ...     extension_fields={
    ...         "facility": "Nexus Facility",
    ...         "building": "Bldg. 1",
    ...     }
    ... )
    """

    instrument_id: str
    # default_factory gives every profile its own dict instead of a shared one
    parsers: dict[str, Callable] = field(default_factory=dict)
    transformations: dict[str, Callable] = field(default_factory=dict)
    extension_fields: dict[str, Any] = field(default_factory=dict)