# Source code for nexusLIMS.extractors.base
"""Base protocols and data structures for the extractor plugin system.
This module defines the core interfaces that all extractors must implement,
along with supporting data structures for passing context to extractors.
The plugin system uses Protocol-based structural typing (PEP 544) rather than
inheritance, allowing flexibility in implementation while maintaining type safety.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Callable, NamedTuple, Protocol
if TYPE_CHECKING:
from pathlib import Path
from nexusLIMS.instruments import Instrument
# Module-level logger, named after this module's import path.
_logger = logging.getLogger(__name__)

# Public API of this module. ``InstrumentProfile`` is defined and documented
# below just like the other types, so it is exported here as well; without it
# ``from nexusLIMS.extractors.base import *`` would silently omit the class.
__all__ = [
    "BaseExtractor",
    "ExtractionContext",
    "FieldDefinition",
    "InstrumentProfile",
    "PreviewGenerator",
]
[docs]
class FieldDefinition(NamedTuple):
    """
    Configuration for extracting a single metadata field.

    This NamedTuple provides a declarative way to define how metadata fields
    should be extracted from instrument data files. It's used by TIFF-based
    extractors (Quanta, Tescan, Orion HIM) to reduce code duplication.

    Attributes
    ----------
    section : str
        Section name in metadata dict (e.g., "Beam", "User", "System").
        For nested dicts, this is the top-level key.
    source_key : str
        Key within the section to extract the value from.
    output_key : str | list[str]
        Output key in nx_meta. Can be a string for flat keys or a list
        for nested paths (e.g., ["Stage Position", "X"]).
    factor : float
        Unit conversion factor. The extracted value is multiplied by this.
        Use 1.0 for no conversion. For SI unit conversions, use powers of 10
        (e.g., 1e6 to convert meters to micrometers).
    is_string : bool
        If True, keep value as string. If False, attempt numeric conversion
        with Decimal for precision.
    suppress_zero : bool
        If True, skip field if the numeric value equals zero.
        Only applies when is_string=False. Defaults to False.
    target_unit : str | None
        Pint unit string for the output value (e.g., "kilovolt", "millimeter").
        If provided, the value will be converted to a Pint Quantity with this
        unit. The factor is still applied before creating the Quantity.
        If None, numeric values remain as floats (legacy behavior).
        Defaults to None.

    Examples
    --------
    >>> # Simple numeric field with unit conversion (m → μm)
    >>> FieldDefinition("Beam", "HFW", "Horizontal Field Width (μm)", 1e6, False)
    >>> # String field (no conversion)
    >>> FieldDefinition("System", "Chamber", "Chamber ID", 1.0, True)
    >>> # Nested output path
    >>> FieldDefinition("Beam", "StageX", ["Stage Position", "X"], 1.0, False)
    >>> # Suppress zero values
    >>> FieldDefinition("Beam", "BeamShiftX", "Beam Shift X",
    ...                 1.0, False, suppress_zero=True)
    >>> # Pint Quantity output (new approach); the keyword is ``target_unit``
    >>> FieldDefinition("Beam", "HV", "Voltage", 1.0, False,
    ...                 target_unit="kilovolt")
    """

    # Field order is part of the public interface: instances are routinely
    # built positionally, so new fields may only be appended with defaults.
    section: str
    source_key: str
    output_key: str | list[str]
    factor: float
    is_string: bool
    suppress_zero: bool = False
    target_unit: str | None = None  # Pint unit string (e.g., "kilovolt", "millimeter")
[docs]
@dataclass
class ExtractionContext:
    """
    Bundle of inputs handed to extractors and preview generators.

    Everything an extractor needs in order to process one file travels in
    this single object, so additional parameters can be introduced later
    without changing the signature of existing extractors.

    Attributes
    ----------
    file_path
        Path to the file to be processed.
    instrument
        The instrument that created this file, if known. ``None`` is used
        for files that cannot be associated with a specific instrument.
    signal_index
        For files holding multiple signals, the index of the signal to
        process. When ``None``, all signals are processed or the first
        signal is used by default.

    Examples
    --------
    >>> from pathlib import Path
    >>> from nexusLIMS.instruments import get_instr_from_filepath
    >>> file_path = Path("/path/to/data.dm3")
    >>> instrument = get_instr_from_filepath(file_path)
    >>> context = ExtractionContext(file_path, instrument)
    """

    # Only ``file_path`` is mandatory; the remaining fields default to None.
    file_path: Path
    instrument: Instrument | None = None
    signal_index: int | None = None
[docs]
class BaseExtractor(Protocol):
    """
    Protocol defining the interface for metadata extractors.

    This is a Protocol (structural subtype) rather than an ABC, meaning any class
    that implements these attributes and methods is automatically considered a
    valid extractor - no inheritance required.

    All extractors MUST implement defensive error handling:

    - Never raise exceptions from extract() - catch all and return minimal metadata
    - Always return a list of metadata dicts (one per signal)
    - Log errors for debugging but don't propagate them

    Attributes
    ----------
    name : str
        Unique identifier for this extractor (e.g., "dm3_extractor").
        Should be a valid Python identifier.
    priority : int
        Priority for this extractor (0-1000, higher = preferred).
        See notes below for conventions.
    supported_extensions : set[str] | None
        File extensions this extractor supports (without dots).
        Set to None for wildcard extractors that support all files.
        Empty set means no extensions are directly supported (content
        sniffing only).

    Notes
    -----
    **Priority Conventions:**

    - 0-49: Low priority (generic/fallback extractors)
    - 50-149: Normal priority (standard extractors)
    - 150-249: High priority (specialized/optimized extractors)
    - 250+: Override priority (force specific behavior)

    When multiple extractors support the same file, the registry will
    try them in descending priority order until one's supports() method
    returns True.

    Examples
    --------
    >>> class DM3Extractor:
    ...     \"\"\"Extract metadata from DigitalMicrograph .dm3/.dm4 files.\"\"\"
    ...
    ...     name = "dm3_extractor"
    ...     priority = 100
    ...     supported_extensions = {"dm3", "dm4"}
    ...
    ...     def supports(self, context: ExtractionContext) -> bool:
    ...         ext = context.file_path.suffix.lower().lstrip('.')
    ...         return ext in ('dm3', 'dm4')
    ...
    ...     def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
    ...         # Extraction logic here
    ...         return [{"nx_meta": {...}}]
    """

    name: str
    priority: int
    supported_extensions: set[str] | None

    def supports(self, context: ExtractionContext) -> bool:
        """
        Determine if this extractor can handle the given file.

        This method allows complex logic beyond simple extension matching:

        - Content sniffing (read file headers)
        - File size checks
        - Instrument-specific handling
        - Metadata validation

        The registry will call supports() on extractors in priority order
        until one returns True.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.

        Returns
        -------
        bool
            True if this extractor can handle this file, False otherwise

        Examples
        --------
        Extension-based matching:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     ext = context.file_path.suffix.lower().lstrip('.')
        ...     return ext in ('dm3', 'dm4')

        Content sniffing:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     if context.file_path.suffix.lower() != '.tif':
        ...         return False
        ...     with open(context.file_path, 'rb') as f:
        ...         header = f.read(1024)
        ...         return b'[User]' in header  # FEI signature

        Instrument-specific:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     return (context.instrument is not None and
        ...             context.instrument.name.startswith("FEI-Quanta"))
        """
        ...  # pragma: no cover

    # The return annotation is a list (one dict per signal), matching the
    # contract spelled out in the docstrings and every example below.
    def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        """
        Extract metadata from the file.

        CRITICAL: This method MUST follow defensive design principles:

        - Never raise exceptions - catch all errors and return minimal metadata
        - Always return a list of metadata dicts where each contains an
          'nx_meta' key
        - Log errors for debugging but continue gracefully

        Return Format:

        All extractors return a list of metadata dicts. Each dict contains:

        - 'nx_meta': Required - NexusLIMS-specific metadata (dict)
        - Other keys: Optional - Raw metadata extracted from the file

        Single-signal files return a list with one element. Multi-signal files
        return a list with one element per signal. This consistent list-based
        approach allows the Activity layer to expand multi-signal files into
        multiple datasets.

        Each 'nx_meta' dict MUST contain these required fields (validated against
        :class:`~nexusLIMS.schemas.metadata.NexusMetadata`):

        - 'Creation Time': ISO-8601 timestamp string **with timezone** (REQUIRED)
          Examples: "2024-01-15T10:30:00-05:00" or "2024-01-15T15:30:00Z"
        - 'Data Type': Human-readable data type (e.g., "STEM_Imaging") (REQUIRED)
        - 'DatasetType': Must be one of: "Image", "Spectrum", "SpectrumImage",
          "Diffraction", "Misc", or "Unknown" (REQUIRED)

        Optional standard fields:

        - 'Data Dimensions': String like "(1024, 1024)" or "(12, 1024, 1024)"
        - 'Instrument ID': Instrument PID from database
        - 'warnings': List of warning messages (string or [message, context]
          pairs)

        Additional instrument-specific fields beyond these are allowed.

        The nx_meta structure is strictly validated after extraction - validation
        failures will raise pydantic.ValidationError with detailed field errors.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.
            For multi-signal files, signal_index indicates which signal to
            process. If None, extractors may return all signals or the first
            signal.

        Returns
        -------
        list[dict]
            List of metadata dicts (one per signal). Each dict contains 'nx_meta'
            key with NexusLIMS-specific metadata, plus optional raw metadata keys.

        Examples
        --------
        Single-signal extraction:

        >>> def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        ...     try:
        ...         metadata = [{"nx_meta": {
        ...             "Creation Time": "2024-01-15T10:30:00-05:00",
        ...             "Data Type": "STEM_Imaging",
        ...             "DatasetType": "Image",
        ...             "Data Dimensions": "(1024, 1024)",
        ...             "Instrument ID": "643-Titan"
        ...         }}]
        ...         return metadata
        ...     except Exception as e:
        ...         logger.error(f"Extraction failed: {e}")
        ...         return self._minimal_metadata(context)

        Multi-signal extraction:

        >>> def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        ...     try:
        ...         # For a file with 2 signals
        ...         return [
        ...             {"nx_meta": {
        ...                 "Creation Time": "2024-01-15T10:30:00-05:00",
        ...                 "Data Type": "STEM_Imaging", ...}},
        ...             {"nx_meta": {
        ...                 "Creation Time": "2024-01-15T10:30:00-05:00",
        ...                 "Data Type": "EDS_Spectrum", ...}}
        ...         ]
        ...     except Exception as e:
        ...         logger.error(f"Extraction failed: {e}")
        ...         return self._minimal_metadata(context)

        Minimal metadata on error ('Creation Time' must still be a
        timezone-aware ISO-8601 string, so the file mtime is converted):

        >>> from datetime import datetime, timezone
        >>> def _minimal_metadata(self, context: ExtractionContext) -> list[dict]:
        ...     mtime = context.file_path.stat().st_mtime
        ...     return [{
        ...         "nx_meta": {
        ...             "DatasetType": "Unknown",
        ...             "Data Type": "Unknown",
        ...             "Creation Time": datetime.fromtimestamp(
        ...                 mtime, tz=timezone.utc
        ...             ).isoformat(),
        ...             "Instrument ID": None,
        ...             "warnings": ["Extraction failed"]
        ...         }
        ...     }]
        """
        ...  # pragma: no cover
[docs]
class PreviewGenerator(Protocol):
    """
    Structural interface for thumbnail/preview image generators.

    Preview generation is kept apart from metadata extraction so that:

    - the same file type can have several preview strategies,
    - preview logic can be reused across extractors, and
    - previews can be produced in batch, independently of extraction.

    As with BaseExtractor, this is a Protocol (structural subtype): a class
    only needs to provide the members below, without inheriting from it.

    Attributes
    ----------
    name : str
        Unique identifier for this generator.
    priority : int
        Priority (same conventions as BaseExtractor).
    supported_extensions : set[str] | None
        File extensions this generator supports (without dots).
        None marks a wildcard generator that supports all files.
        An empty set means no extensions are directly supported
        (content sniffing only).

    Examples
    --------
    >>> class HyperSpyPreview:
    ...     \"\"\"Generate previews using HyperSpy.\"\"\"
    ...
    ...     name = "hyperspy_preview"
    ...     priority = 100
    ...     supported_extensions = {"dm3", "dm4", "ser"}
    ...
    ...     def supports(self, context: ExtractionContext) -> bool:
    ...         ext = context.file_path.suffix.lower().lstrip('.')
    ...         return ext in ('dm3', 'dm4', 'ser')
    ...
    ...     def generate(self, context: ExtractionContext,
    ...                  output_path: Path) -> bool:
    ...         # Preview generation logic
    ...         return True
    """

    name: str
    priority: int
    supported_extensions: set[str] | None

    def supports(self, context: ExtractionContext) -> bool:
        """
        Report whether this generator can create a preview for the file.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.

        Returns
        -------
        bool
            True if this generator can handle this file.
        """
        ...  # pragma: no cover

    def generate(self, context: ExtractionContext, output_path: Path) -> bool:
        """
        Produce a thumbnail preview and write it to ``output_path``.

        Implementations are expected to:

        - create a square thumbnail (typically 500x500 pixels),
        - save it to ``output_path`` as PNG,
        - return True on success and False on failure, and
        - never raise exceptions (catch everything and return False).

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.
        output_path
            Where to save the generated preview PNG.

        Returns
        -------
        bool
            True if the preview was successfully generated, False otherwise.

        Examples
        --------
        >>> def generate(self, context: ExtractionContext,
        ...              output_path: Path) -> bool:
        ...     try:
        ...         # Create thumbnail
        ...         output_path.parent.mkdir(parents=True, exist_ok=True)
        ...         # ... generation logic ...
        ...         return True
        ...     except Exception as e:
        ...         logger.error(f"Preview generation failed: {e}")
        ...         return False
        """
        ...  # pragma: no cover
[docs]
@dataclass
class InstrumentProfile:
    """
    Per-instrument customization hooks.

    A profile keeps instrument-specific logic out of the extractors, so
    behavior for a particular microscope can be added without modifying
    extractor code. Every NexusLIMS installation has its own set of
    instruments, which makes this the central extension point.

    Attributes
    ----------
    instrument_id
        Instrument identifier (e.g., "FEI-Titan-STEM-630901").
    parsers
        Custom metadata parsing functions for this instrument, keyed by
        parser name.
    transformations
        Metadata transformation functions applied after extraction, keyed
        by transform name.
    extension_fields
        Static values injected into the extensions section for all files,
        keyed by field name. These populate the nx_meta.extensions dict.

    Examples
    --------
    Creating a custom profile for FEI Titan STEM:

    >>> def parse_643_titan_microscope(metadata: dict) -> dict:
    ...     # Custom parsing logic
    ...     return metadata
    >>>
    >>> titan_stem_profile = InstrumentProfile(
    ...     instrument_id="FEI-Titan-STEM-630901",
    ...     parsers={
    ...         "microscope_info": parse_643_titan_microscope,
    ...     },
    ...     extension_fields={
    ...         "facility": "Nexus Facility",
    ...         "building": "Bldg. 1",
    ...     }
    ... )
    """

    instrument_id: str
    # Each mapping uses default_factory so every profile gets its own dict
    # instead of sharing one mutable default between instances.
    parsers: dict[str, Callable] = field(default_factory=dict)
    transformations: dict[str, Callable] = field(default_factory=dict)
    extension_fields: dict[str, Any] = field(default_factory=dict)