Coverage for nexusLIMS/extractors/base.py: 100%
37 statements
« prev ^ index » next coverage.py v7.11.3, created at 2026-03-24 05:23 +0000
« prev ^ index » next coverage.py v7.11.3, created at 2026-03-24 05:23 +0000
1"""Base protocols and data structures for the extractor plugin system.
3This module defines the core interfaces that all extractors must implement,
4along with supporting data structures for passing context to extractors.
6The plugin system uses Protocol-based structural typing (PEP 544) rather than
7inheritance, allowing flexibility in implementation while maintaining type safety.
8"""
10from __future__ import annotations
12import logging
13from dataclasses import dataclass, field
14from typing import TYPE_CHECKING, Any, Callable, NamedTuple, Protocol
16if TYPE_CHECKING:
17 from pathlib import Path
19 from nexusLIMS.instruments import Instrument
# Module-level logger named after this module.
_logger = logging.getLogger(__name__)

# Public API of this module. Every public class defined in this file is
# listed (alphabetically) so that star-imports and tools that honour
# ``__all__`` see the complete interface, including ``InstrumentProfile``.
__all__ = [
    "BaseExtractor",
    "ExtractionContext",
    "FieldDefinition",
    "InstrumentProfile",
    "PreviewGenerator",
]
class FieldDefinition(NamedTuple):
    """
    Configuration for extracting a single metadata field.

    This NamedTuple provides a declarative way to define how metadata fields
    should be extracted from instrument data files. It's used by TIFF-based
    extractors (Quanta, Tescan, Orion HIM) to reduce code duplication.

    Attributes
    ----------
    section : str
        Section name in metadata dict (e.g., "Beam", "User", "System").
        For nested dicts, this is the top-level key.
    source_key : str
        Key within the section to extract the value from.
    output_key : str | list[str]
        Output key in nx_meta. Can be a string for flat keys or a list
        for nested paths (e.g., ["Stage Position", "X"]).
    factor : float
        Unit conversion factor. The extracted value is multiplied by this.
        Use 1.0 for no conversion. For SI unit conversions, use powers of 10
        (e.g., 1e6 to convert meters to micrometers).
    is_string : bool
        If True, keep value as string. If False, attempt numeric conversion
        with Decimal for precision.
    suppress_zero : bool
        If True, skip field if the numeric value equals zero.
        Only applies when is_string=False. Defaults to False.
    target_unit : str or None
        Pint unit string for the output value (e.g., "kilovolt", "millimeter").
        If provided, the value will be converted to a Pint Quantity with this unit.
        The factor is still applied before creating the Quantity.
        If None, numeric values remain as floats (legacy behavior). Defaults to None.

    Examples
    --------
    >>> # Simple numeric field with unit conversion (m → μm)
    >>> FieldDefinition("Beam", "HFW", "Horizontal Field Width (μm)", 1e6, False)

    >>> # String field (no conversion)
    >>> FieldDefinition("System", "Chamber", "Chamber ID", 1.0, True)

    >>> # Nested output path
    >>> FieldDefinition("Beam", "StageX", ["Stage Position", "X"], 1.0, False)

    >>> # Suppress zero values
    >>> FieldDefinition("Beam", "BeamShiftX", "Beam Shift X",
    ...                 1.0, False, suppress_zero=True)

    >>> # Pint Quantity output (new approach); the keyword is ``target_unit``
    >>> FieldDefinition("Beam", "HV", "Voltage",
    ...                 1.0, False, target_unit="kilovolt")
    """

    section: str
    source_key: str
    output_key: str | list[str]
    factor: float
    is_string: bool
    suppress_zero: bool = False
    target_unit: str | None = None  # Pint unit string (e.g., "kilovolt", "millimeter")
@dataclass
class ExtractionContext:
    """
    Bundle of inputs handed to extractors and preview generators.

    Everything an extractor needs in order to process a file travels in this
    single object. Because callers pass one context rather than a list of
    positional arguments, further parameters can be introduced later without
    breaking extractors that already exist.

    Attributes
    ----------
    file_path
        Path to the file to be processed.
    instrument
        The instrument that produced the file, when that is known. Left as
        None for files that cannot be tied to a specific instrument.
    signal_index
        Index of the signal to process, for files holding several signals.
        When None, all signals are processed or the first one is used.

    Examples
    --------
    >>> from pathlib import Path
    >>> from nexusLIMS.instruments import get_instr_from_filepath
    >>> file_path = Path("/path/to/data.dm3")
    >>> instrument = get_instr_from_filepath(file_path)
    >>> context = ExtractionContext(file_path, instrument)
    """

    # Required: there is no sensible default for the file being processed.
    file_path: Path
    # Optional pieces of context; both fall back to None when not supplied.
    instrument: Instrument | None = field(default=None)
    signal_index: int | None = field(default=None)
class BaseExtractor(Protocol):
    """
    Protocol defining the interface for metadata extractors.

    This is a Protocol (structural subtype) rather than an ABC, meaning any class
    that implements these attributes and methods is automatically considered a
    valid extractor - no inheritance required.

    All extractors MUST implement defensive error handling:
    - Never raise exceptions from extract() - catch all and return minimal metadata
    - Always return a list of metadata dicts (one per signal)
    - Log errors for debugging but don't propagate them

    Attributes
    ----------
    name : str
        Unique identifier for this extractor (e.g., "dm3_extractor").
        Should be a valid Python identifier.
    priority : int
        Priority for this extractor (0-1000, higher = preferred).
        See notes below for conventions.
    supported_extensions : set[str] | None
        File extensions this extractor supports (without dots).
        Set to None for wildcard extractors that support all files.
        Empty set means no extensions are directly supported (content sniffing only).

    Notes
    -----
    **Priority Conventions:**

    - 0-49: Low priority (generic/fallback extractors)
    - 50-149: Normal priority (standard extractors)
    - 150-249: High priority (specialized/optimized extractors)
    - 250+: Override priority (force specific behavior)

    When multiple extractors support the same file, the registry will
    try them in descending priority order until one's supports() method
    returns True.

    Examples
    --------
    >>> class DM3Extractor:
    ...     \"\"\"Extract metadata from DigitalMicrograph .dm3/.dm4 files.\"\"\"
    ...
    ...     name = "dm3_extractor"
    ...     priority = 100
    ...     supported_extensions = {"dm3", "dm4"}
    ...
    ...     def supports(self, context: ExtractionContext) -> bool:
    ...         ext = context.file_path.suffix.lower().lstrip('.')
    ...         return ext in ('dm3', 'dm4')
    ...
    ...     def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
    ...         # Extraction logic here
    ...         return [{"nx_meta": {...}}]
    """

    name: str
    priority: int
    supported_extensions: set[str] | None

    def supports(self, context: ExtractionContext) -> bool:
        """
        Determine if this extractor can handle the given file.

        This method allows complex logic beyond simple extension matching:
        - Content sniffing (read file headers)
        - File size checks
        - Instrument-specific handling
        - Metadata validation

        The registry will call supports() on extractors in priority order
        until one returns True.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.

        Returns
        -------
        bool
            True if this extractor can handle this file, False otherwise

        Examples
        --------
        Extension-based matching:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     ext = context.file_path.suffix.lower().lstrip('.')
        ...     return ext in ('dm3', 'dm4')

        Content sniffing:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     if context.file_path.suffix.lower() != '.tif':
        ...         return False
        ...     with open(context.file_path, 'rb') as f:
        ...         header = f.read(1024)
        ...         return b'[User]' in header  # FEI signature

        Instrument-specific:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     return (context.instrument is not None and
        ...             context.instrument.name.startswith("FEI-Quanta"))
        """
        ...  # pragma: no cover

    def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        """
        Extract metadata from the file.

        CRITICAL: This method MUST follow defensive design principles:
        - Never raise exceptions - catch all errors and return minimal metadata
        - Always return a list of metadata dicts where each contains an 'nx_meta' key
        - Log errors for debugging but continue gracefully

        Return Format:
            All extractors return a list of metadata dicts. Each dict contains:
            - 'nx_meta': Required - NexusLIMS-specific metadata (dict)
            - Other keys: Optional - Raw metadata extracted from the file

        Single-signal files return a list with one element. Multi-signal files return
        a list with one element per signal. This consistent list-based approach allows
        the Activity layer to expand multi-signal files into multiple datasets.

        Each 'nx_meta' dict MUST contain these required fields (validated against
        :class:`~nexusLIMS.schemas.metadata.NexusMetadata`):

        - 'Creation Time': ISO-8601 timestamp string **with timezone** (REQUIRED)
          Examples: "2024-01-15T10:30:00-05:00" or "2024-01-15T15:30:00Z"
        - 'Data Type': Human-readable data type (e.g., "STEM_Imaging") (REQUIRED)
        - 'DatasetType': Must be one of: "Image", "Spectrum", "SpectrumImage",
          "Diffraction", "Misc", or "Unknown" (REQUIRED)

        Optional standard fields:
        - 'Data Dimensions': String like "(1024, 1024)" or "(12, 1024, 1024)"
        - 'Instrument ID': Instrument PID from database
        - 'warnings': List of warning messages (string or [message, context] pairs)

        Additional instrument-specific fields beyond these are allowed.
        The nx_meta structure is strictly validated after extraction - validation
        failures will raise pydantic.ValidationError with detailed field errors.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.
            For multi-signal files, signal_index indicates which signal to process.
            If None, extractors may return all signals or the first signal.

        Returns
        -------
        list[dict]
            List of metadata dicts (one per signal). Each dict contains 'nx_meta'
            key with NexusLIMS-specific metadata, plus optional raw metadata keys.

        Examples
        --------
        Single-signal extraction:

        >>> def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        ...     try:
        ...         metadata = [{"nx_meta": {
        ...             "Creation Time": "2024-01-15T10:30:00-05:00",
        ...             "Data Type": "STEM_Imaging",
        ...             "DatasetType": "Image",
        ...             "Data Dimensions": "(1024, 1024)",
        ...             "Instrument ID": "643-Titan"
        ...         }}]
        ...         return metadata
        ...     except Exception as e:
        ...         logger.error(f"Extraction failed: {e}")
        ...         return self._minimal_metadata(context)

        Multi-signal extraction:

        >>> def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        ...     try:
        ...         # For a file with 2 signals
        ...         return [
        ...             {"nx_meta": {
        ...                 "Creation Time": "2024-01-15T10:30:00-05:00",
        ...                 "Data Type": "STEM_Imaging", ...}},
        ...             {"nx_meta": {
        ...                 "Creation Time": "2024-01-15T10:30:00-05:00",
        ...                 "Data Type": "EDS_Spectrum", ...}}
        ...         ]
        ...     except Exception as e:
        ...         logger.error(f"Extraction failed: {e}")
        ...         return self._minimal_metadata(context)

        Minimal metadata on error ('Creation Time' must still be a
        timezone-aware ISO-8601 string, so the file mtime is converted):

        >>> from datetime import datetime, timezone
        >>> def _minimal_metadata(self, context: ExtractionContext) -> list[dict]:
        ...     mtime = context.file_path.stat().st_mtime
        ...     return [{
        ...         "nx_meta": {
        ...             "DatasetType": "Unknown",
        ...             "Data Type": "Unknown",
        ...             "Creation Time": datetime.fromtimestamp(
        ...                 mtime, tz=timezone.utc
        ...             ).isoformat(),
        ...             "Instrument ID": None,
        ...             "warnings": ["Extraction failed"]
        ...         }
        ...     }]
        """
        ...  # pragma: no cover
class PreviewGenerator(Protocol):
    """
    Protocol for thumbnail/preview image generation.

    Preview generators are separate from extractors to allow:
    - Different preview strategies for the same file type
    - Reusable preview logic across extractors
    - Batch preview generation independent of extraction

    Like BaseExtractor, this is a Protocol (structural subtype).

    Attributes
    ----------
    name : str
        Unique identifier for this generator
    priority : int
        Priority (same conventions as BaseExtractor)
    supported_extensions : set[str] | None
        File extensions this generator supports (without dots).
        Set to None for wildcard generators that support all files.
        Empty set means no extensions are directly supported (content sniffing only).

    Examples
    --------
    >>> class HyperSpyPreview:
    ...     \"\"\"Generate previews using HyperSpy.\"\"\"
    ...
    ...     name = "hyperspy_preview"
    ...     priority = 100
    ...     supported_extensions = {"dm3", "dm4", "ser"}
    ...
    ...     def supports(self, context: ExtractionContext) -> bool:
    ...         ext = context.file_path.suffix.lower().lstrip('.')
    ...         return ext in ('dm3', 'dm4', 'ser')
    ...
    ...     def generate(self, context: ExtractionContext,
    ...                  output_path: Path) -> bool:
    ...         # Preview generation logic
    ...         return True
    """

    name: str
    priority: int
    supported_extensions: set[str] | None

    def supports(self, context: ExtractionContext) -> bool:
        """
        Determine if this generator can create a preview for the given file.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.

        Returns
        -------
        bool
            True if this generator can handle this file
        """
        ...  # pragma: no cover

    def generate(self, context: ExtractionContext, output_path: Path) -> bool:
        """
        Generate a thumbnail preview and save to output_path.

        This method should:
        - Create a square thumbnail (typically 500x500 pixels)
        - Save to output_path as PNG
        - Return True on success, False on failure
        - Never raise exceptions (catch all and return False)

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.
        output_path
            Where to save the generated preview PNG

        Returns
        -------
        bool
            True if preview was successfully generated, False otherwise

        Examples
        --------
        >>> def generate(self, context: ExtractionContext,
        ...              output_path: Path) -> bool:
        ...     try:
        ...         # Create thumbnail
        ...         output_path.parent.mkdir(parents=True, exist_ok=True)
        ...         # ... generation logic ...
        ...         return True
        ...     except Exception as e:
        ...         logger.error(f"Preview generation failed: {e}")
        ...         return False
        """
        ...  # pragma: no cover
@dataclass
class InstrumentProfile:
    """
    Per-instrument customization profile.

    A profile keeps instrument-specific logic out of the extractors
    themselves, so behavior for a particular microscope can be customized
    without editing extractor code. Every NexusLIMS installation has its own
    set of instruments, which makes this the key extension point of the
    plugin system.

    Attributes
    ----------
    instrument_id
        Identifier of the instrument (e.g., "FEI-Titan-STEM-630901").
    parsers
        Custom metadata parsing functions for this instrument, as a mapping
        of parser name to callable.
    transformations
        Metadata transformation functions applied after extraction, as a
        mapping of transform name to callable.
    extension_fields
        Static metadata injected into the extensions section for all files,
        as a mapping of field name to value. These populate the
        nx_meta.extensions dict.

    Examples
    --------
    Creating a custom profile for FEI Titan STEM:

    >>> def parse_643_titan_microscope(metadata: dict) -> dict:
    ...     # Custom parsing logic
    ...     return metadata
    >>>
    >>> titan_stem_profile = InstrumentProfile(
    ...     instrument_id="FEI-Titan-STEM-630901",
    ...     parsers={
    ...         "microscope_info": parse_643_titan_microscope,
    ...     },
    ...     extension_fields={
    ...         "facility": "Nexus Facility",
    ...         "building": "Bldg. 1",
    ...     }
    ... )
    """

    # The only required field; the mappings below are optional.
    instrument_id: str
    # default_factory gives every profile its own fresh dict, so mutating one
    # profile's mapping never leaks into another profile.
    parsers: dict[str, Callable] = field(default_factory=dict)
    transformations: dict[str, Callable] = field(default_factory=dict)
    extension_fields: dict[str, Any] = field(default_factory=dict)