Coverage for nexusLIMS/extractors/base.py: 100%

1"""Base protocols and data structures for the extractor plugin system.

3This module defines the core interfaces that all extractors must implement,

4along with supporting data structures for passing context to extractors.

6The plugin system uses Protocol-based structural typing (PEP 544) rather than

7inheritance, allowing flexibility in implementation while maintaining type safety.

8"""

10from __future__ import annotations

12import logging

13from dataclasses import dataclass, field

14from typing import TYPE_CHECKING, Any, Callable, NamedTuple, Protocol

16if TYPE_CHECKING:

17 from pathlib import Path

19 from nexusLIMS.instruments import Instrument

# Module-level logger, named after this module's import path.
_logger = logging.getLogger(__name__)

# Names exported by ``from nexusLIMS.extractors.base import *``.
# Every public class defined in this module is listed, in alphabetical order.
__all__ = [
    "BaseExtractor",
    "ExtractionContext",
    "FieldDefinition",
    "InstrumentProfile",
    "PreviewGenerator",
]

class FieldDefinition(NamedTuple):
    """
    Configuration for extracting a single metadata field.

    This NamedTuple provides a declarative way to define how metadata fields
    should be extracted from instrument data files. It's used by TIFF-based
    extractors (Quanta, Tescan, Orion HIM) to reduce code duplication.

    Attributes
    ----------
    section : str
        Section name in metadata dict (e.g., "Beam", "User", "System").
        For nested dicts, this is the top-level key.
    source_key : str
        Key within the section to extract the value from.
    output_key : str | list[str]
        Output key in nx_meta. Can be a string for flat keys or a list
        for nested paths (e.g., ["Stage Position", "X"]).
    factor : float
        Unit conversion factor. The extracted value is multiplied by this.
        Use 1.0 for no conversion. For SI unit conversions, use powers of 10
        (e.g., 1e6 to convert meters to micrometers).
    is_string : bool
        If True, keep value as string. If False, attempt numeric conversion
        with Decimal for precision.
    suppress_zero : bool
        If True, skip field if the numeric value equals zero.
        Only applies when is_string=False. Defaults to False.
    target_unit : str or None
        Pint unit string for the output value (e.g., "kilovolt", "millimeter").
        If provided, the value will be converted to a Pint Quantity with this
        unit. The factor is still applied before creating the Quantity.
        If None, numeric values remain as floats (legacy behavior).
        Defaults to None.

    Examples
    --------
    >>> # Simple numeric field with unit conversion (m → μm)
    >>> FieldDefinition("Beam", "HFW", "Horizontal Field Width (μm)", 1e6, False)

    >>> # String field (no conversion)
    >>> FieldDefinition("System", "Chamber", "Chamber ID", 1.0, True)

    >>> # Nested output path
    >>> FieldDefinition("Beam", "StageX", ["Stage Position", "X"], 1.0, False)

    >>> # Suppress zero values
    >>> FieldDefinition("Beam", "BeamShiftX", "Beam Shift X",
    ...                 1.0, False, suppress_zero=True)

    >>> # Pint Quantity output (new approach)
    >>> FieldDefinition("Beam", "HV", "Voltage", 1.0, False,
    ...                 target_unit="kilovolt")
    """

    section: str
    source_key: str
    output_key: str | list[str]
    factor: float
    is_string: bool
    suppress_zero: bool = False
    target_unit: str | None = None  # Pint unit string (e.g., "kilovolt", "millimeter")

@dataclass
class ExtractionContext:
    """
    Bundle of inputs handed to extractors and preview generators.

    Everything an extractor needs in order to process one file travels in
    this single object. Because callers pass a context instead of individual
    arguments, new parameters can be introduced later without breaking the
    signatures of existing extractors.

    Attributes
    ----------
    file_path
        Location of the file that should be processed.
    instrument
        Instrument that produced the file, when it is known. ``None`` is
        allowed for files that cannot be tied to a specific instrument.
    signal_index
        Index of the signal to process in a multi-signal file. When ``None``,
        all signals are processed or the first signal is used by default.

    Examples
    --------
    >>> from pathlib import Path
    >>> from nexusLIMS.instruments import get_instr_from_filepath
    >>> file_path = Path("/path/to/data.dm3")
    >>> instrument = get_instr_from_filepath(file_path)
    >>> context = ExtractionContext(file_path, instrument)
    """

    file_path: Path
    instrument: Instrument | None = None
    signal_index: int | None = None

125

126

class BaseExtractor(Protocol):
    """
    Protocol defining the interface for metadata extractors.

    This is a Protocol (structural subtype) rather than an ABC, meaning any class
    that implements these attributes and methods is automatically considered a
    valid extractor - no inheritance required.

    All extractors MUST implement defensive error handling:
    - Never raise exceptions from extract() - catch all and return minimal metadata
    - Always return a list of metadata dicts (one per signal)
    - Log errors for debugging but don't propagate them

    Attributes
    ----------
    name : str
        Unique identifier for this extractor (e.g., "dm3_extractor").
        Should be a valid Python identifier.
    priority : int
        Priority for this extractor (0-1000, higher = preferred).
        See notes below for conventions.
    supported_extensions : set[str] | None
        File extensions this extractor supports (without dots).
        Set to None for wildcard extractors that support all files.
        Empty set means no extensions are directly supported (content sniffing
        only).

    Notes
    -----
    **Priority Conventions:**

    - 0-49: Low priority (generic/fallback extractors)
    - 50-149: Normal priority (standard extractors)
    - 150-249: High priority (specialized/optimized extractors)
    - 250+: Override priority (force specific behavior)

    When multiple extractors support the same file, the registry will
    try them in descending priority order until one's supports() method
    returns True.

    Examples
    --------
    >>> class DM3Extractor:
    ...     \"\"\"Extract metadata from DigitalMicrograph .dm3/.dm4 files.\"\"\"
    ...
    ...     name = "dm3_extractor"
    ...     priority = 100
    ...     supported_extensions = {"dm3", "dm4"}
    ...
    ...     def supports(self, context: ExtractionContext) -> bool:
    ...         ext = context.file_path.suffix.lower().lstrip('.')
    ...         return ext in ('dm3', 'dm4')
    ...
    ...     def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
    ...         # Extraction logic here
    ...         return [{"nx_meta": {...}}]
    """

    name: str
    priority: int
    supported_extensions: set[str] | None

    def supports(self, context: ExtractionContext) -> bool:
        """
        Determine if this extractor can handle the given file.

        This method allows complex logic beyond simple extension matching:
        - Content sniffing (read file headers)
        - File size checks
        - Instrument-specific handling
        - Metadata validation

        The registry will call supports() on extractors in priority order
        until one returns True.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.

        Returns
        -------
        bool
            True if this extractor can handle this file, False otherwise

        Examples
        --------
        Extension-based matching:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     ext = context.file_path.suffix.lower().lstrip('.')
        ...     return ext in ('dm3', 'dm4')

        Content sniffing:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     if context.file_path.suffix.lower() != '.tif':
        ...         return False
        ...     with open(context.file_path, 'rb') as f:
        ...         header = f.read(1024)
        ...         return b'[User]' in header  # FEI signature

        Instrument-specific:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     return (context.instrument is not None and
        ...             context.instrument.name.startswith("FEI-Quanta"))
        """
        ...  # pragma: no cover

    def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        """
        Extract metadata from the file.

        CRITICAL: This method MUST follow defensive design principles:
        - Never raise exceptions - catch all errors and return minimal metadata
        - Always return a list of metadata dicts where each contains an
          'nx_meta' key
        - Log errors for debugging but continue gracefully

        Return Format:
        All extractors return a list of metadata dicts. Each dict contains:
        - 'nx_meta': Required - NexusLIMS-specific metadata (dict)
        - Other keys: Optional - Raw metadata extracted from the file

        Single-signal files return a list with one element. Multi-signal files
        return a list with one element per signal. This consistent list-based
        approach allows the Activity layer to expand multi-signal files into
        multiple datasets.

        Each 'nx_meta' dict MUST contain these required fields (validated against
        :class:`~nexusLIMS.schemas.metadata.NexusMetadata`):

        - 'Creation Time': ISO-8601 timestamp string **with timezone** (REQUIRED)
          Examples: "2024-01-15T10:30:00-05:00" or "2024-01-15T15:30:00Z"
        - 'Data Type': Human-readable data type (e.g., "STEM_Imaging") (REQUIRED)
        - 'DatasetType': Must be one of: "Image", "Spectrum", "SpectrumImage",
          "Diffraction", "Misc", or "Unknown" (REQUIRED)

        Optional standard fields:
        - 'Data Dimensions': String like "(1024, 1024)" or "(12, 1024, 1024)"
        - 'Instrument ID': Instrument PID from database
        - 'warnings': List of warning messages (string or [message, context]
          pairs)

        Additional instrument-specific fields beyond these are allowed.
        The nx_meta structure is strictly validated after extraction - validation
        failures will raise pydantic.ValidationError with detailed field errors.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.
            For multi-signal files, signal_index indicates which signal to
            process. If None, extractors may return all signals or the first
            signal.

        Returns
        -------
        list[dict]
            List of metadata dicts (one per signal). Each dict contains 'nx_meta'
            key with NexusLIMS-specific metadata, plus optional raw metadata keys.

        Examples
        --------
        Single-signal extraction:

        >>> def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        ...     try:
        ...         metadata = [{"nx_meta": {
        ...             "Creation Time": "2024-01-15T10:30:00-05:00",
        ...             "Data Type": "STEM_Imaging",
        ...             "DatasetType": "Image",
        ...             "Data Dimensions": "(1024, 1024)",
        ...             "Instrument ID": "643-Titan"
        ...         }}]
        ...         return metadata
        ...     except Exception as e:
        ...         logger.error(f"Extraction failed: {e}")
        ...         return self._minimal_metadata(context)

        Multi-signal extraction:

        >>> def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        ...     try:
        ...         # For a file with 2 signals
        ...         return [
        ...             {"nx_meta": {
        ...                 "Creation Time": "2024-01-15T10:30:00-05:00",
        ...                 "Data Type": "STEM_Imaging", ...}},
        ...             {"nx_meta": {
        ...                 "Creation Time": "2024-01-15T10:30:00-05:00",
        ...                 "Data Type": "EDS_Spectrum", ...}}
        ...         ]
        ...     except Exception as e:
        ...         logger.error(f"Extraction failed: {e}")
        ...         return self._minimal_metadata(context)

        Minimal metadata on error ('Creation Time' must still be a
        timezone-aware ISO-8601 string, so the file's mtime is converted):

        >>> from datetime import datetime, timezone
        >>> def _minimal_metadata(self, context: ExtractionContext) -> list[dict]:
        ...     mtime = context.file_path.stat().st_mtime
        ...     return [{
        ...         "nx_meta": {
        ...             "DatasetType": "Unknown",
        ...             "Data Type": "Unknown",
        ...             "Creation Time": datetime.fromtimestamp(
        ...                 mtime, tz=timezone.utc
        ...             ).isoformat(),
        ...             "Instrument ID": None,
        ...             "warnings": ["Extraction failed"]
        ...         }
        ...     }]
        """
        ...  # pragma: no cover

333

334

class PreviewGenerator(Protocol):
    """
    Structural interface for thumbnail/preview image generators.

    Preview generation is kept apart from metadata extraction so that:
    - the same file type can have several preview strategies
    - preview logic can be shared between extractors
    - previews can be produced in batches, independent of extraction

    As with BaseExtractor, this is a Protocol: any class providing these
    members conforms, without inheriting from it.

    Attributes
    ----------
    name : str
        Unique identifier for this generator
    priority : int
        Priority (same conventions as BaseExtractor)
    supported_extensions : set[str] | None
        File extensions this generator supports (without dots).
        None marks a wildcard generator that supports all files.
        An empty set means no extension is supported directly (content
        sniffing only).

    Examples
    --------
    >>> class HyperSpyPreview:
    ...     \"\"\"Generate previews using HyperSpy.\"\"\"
    ...
    ...     name = "hyperspy_preview"
    ...     priority = 100
    ...     supported_extensions = {"dm3", "dm4", "ser"}
    ...
    ...     def supports(self, context: ExtractionContext) -> bool:
    ...         ext = context.file_path.suffix.lower().lstrip('.')
    ...         return ext in ('dm3', 'dm4', 'ser')
    ...
    ...     def generate(self, context: ExtractionContext,
    ...                  output_path: Path) -> bool:
    ...         # Preview generation logic
    ...         return True
    """

    name: str
    priority: int
    supported_extensions: set[str] | None

    def supports(self, context: ExtractionContext) -> bool:
        """
        Report whether this generator can build a preview for the file.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.

        Returns
        -------
        bool
            True if this generator can handle this file
        """
        ...  # pragma: no cover

    def generate(self, context: ExtractionContext, output_path: Path) -> bool:
        """
        Render a thumbnail preview and write it to output_path.

        Implementations are expected to:
        - Produce a square thumbnail (typically 500x500 pixels)
        - Write it to output_path as a PNG
        - Return True on success and False on failure
        - Never raise exceptions (catch everything and return False)

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.
        output_path
            Where to save the generated preview PNG

        Returns
        -------
        bool
            True if preview was successfully generated, False otherwise

        Examples
        --------
        >>> def generate(self, context: ExtractionContext,
        ...              output_path: Path) -> bool:
        ...     try:
        ...         # Create thumbnail
        ...         output_path.parent.mkdir(parents=True, exist_ok=True)
        ...         # ... generation logic ...
        ...         return True
        ...     except Exception as e:
        ...         logger.error(f"Preview generation failed: {e}")
        ...         return False
        """
        ...  # pragma: no cover

431

432

@dataclass
class InstrumentProfile:
    """
    Per-instrument customization profile.

    Keeps instrument-specific logic out of the extractors themselves, so
    custom behavior for a particular microscope can be added without editing
    extractor code.

    This is the CRITICAL component for extensibility: every NexusLIMS
    installation has its own set of instruments, and profiles make adding
    customizations for them trivial.

    Attributes
    ----------
    instrument_id
        Instrument identifier (e.g., "FEI-Titan-STEM-630901")
    parsers
        Custom metadata parsing functions for this instrument, as a mapping
        of parser name to callable.
    transformations
        Metadata transformation functions applied after extraction, as a
        mapping of transform name to callable.
    extension_fields
        Static metadata injected into the extensions section for all files,
        as a mapping of field name to value. These populate the
        nx_meta.extensions dict.

    Examples
    --------
    Creating a custom profile for FEI Titan STEM:

    >>> def parse_643_titan_microscope(metadata: dict) -> dict:
    ...     # Custom parsing logic
    ...     return metadata
    >>>
    >>> titan_stem_profile = InstrumentProfile(
    ...     instrument_id="FEI-Titan-STEM-630901",
    ...     parsers={
    ...         "microscope_info": parse_643_titan_microscope,
    ...     },
    ...     extension_fields={
    ...         "facility": "Nexus Facility",
    ...         "building": "Bldg. 1",
    ...     }
    ... )
    """

    instrument_id: str
    parsers: dict[str, Callable] = field(default_factory=dict)
    transformations: dict[str, Callable] = field(default_factory=dict)
    extension_fields: dict[str, Any] = field(default_factory=dict)

482 extension_fields: dict[str, Any] = field(default_factory=dict)