Coverage for nexusLIMS/extractors/base.py: 100%

37 statements  

« prev     ^ index     » next       coverage.py v7.11.3, created at 2026-03-24 05:23 +0000

1"""Base protocols and data structures for the extractor plugin system. 

2 

3This module defines the core interfaces that all extractors must implement, 

4along with supporting data structures for passing context to extractors. 

5 

6The plugin system uses Protocol-based structural typing (PEP 544) rather than 

7inheritance, allowing flexibility in implementation while maintaining type safety. 

8""" 

9 

10from __future__ import annotations 

11 

12import logging 

13from dataclasses import dataclass, field 

14from typing import TYPE_CHECKING, Any, Callable, NamedTuple, Protocol 

15 

16if TYPE_CHECKING: 

17 from pathlib import Path 

18 

19 from nexusLIMS.instruments import Instrument 

20 

# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)

# Public API of this module. Every public class defined in this file is
# listed, so star-imports and tools that honour ``__all__`` see all of them.
__all__ = [
    "BaseExtractor",
    "ExtractionContext",
    "FieldDefinition",
    "InstrumentProfile",
    "PreviewGenerator",
]

29 

30 

class FieldDefinition(NamedTuple):
    """
    Configuration for extracting a single metadata field.

    This NamedTuple provides a declarative way to define how metadata fields
    should be extracted from instrument data files. It's used by TIFF-based
    extractors (Quanta, Tescan, Orion HIM) to reduce code duplication.

    Attributes
    ----------
    section : str
        Section name in metadata dict (e.g., "Beam", "User", "System").
        For nested dicts, this is the top-level key.
    source_key : str
        Key within the section to extract the value from.
    output_key : str | list[str]
        Output key in nx_meta. Can be a string for flat keys or a list
        for nested paths (e.g., ["Stage Position", "X"]).
    factor : float
        Unit conversion factor. The extracted value is multiplied by this.
        Use 1.0 for no conversion. For SI unit conversions, use powers of 10
        (e.g., 1e6 to convert meters to micrometers).
    is_string : bool
        If True, keep value as string. If False, attempt numeric conversion
        with Decimal for precision.
    suppress_zero : bool
        If True, skip field if the numeric value equals zero.
        Only applies when is_string=False. Defaults to False.
    target_unit : str or None
        Pint unit string for the output value (e.g., "kilovolt", "millimeter").
        If provided, the value will be converted to a Pint Quantity with this unit.
        The factor is still applied before creating the Quantity.
        If None, numeric values remain as floats (legacy behavior). Defaults to None.

    Examples
    --------
    >>> # Simple numeric field with unit conversion (m → μm)
    >>> FieldDefinition("Beam", "HFW", "Horizontal Field Width (μm)", 1e6, False)

    >>> # String field (no conversion)
    >>> FieldDefinition("System", "Chamber", "Chamber ID", 1.0, True)

    >>> # Nested output path
    >>> FieldDefinition("Beam", "StageX", ["Stage Position", "X"], 1.0, False)

    >>> # Suppress zero values
    >>> FieldDefinition("Beam", "BeamShiftX", "Beam Shift X",
    ...                 1.0, False, suppress_zero=True)

    >>> # Pint Quantity output (new approach); the keyword is ``target_unit``
    >>> FieldDefinition("Beam", "HV", "Voltage", 1.0, False,
    ...                 target_unit="kilovolt")
    """

    section: str
    source_key: str
    output_key: str | list[str]
    factor: float
    is_string: bool
    suppress_zero: bool = False
    target_unit: str | None = None  # Pint unit string (e.g., "kilovolt", "millimeter")

91 

92 

@dataclass
class ExtractionContext:
    """
    Bundle of inputs handed to extractors and preview generators.

    Everything an extractor needs in order to process one file travels in
    this single object. Passing a context (instead of separate arguments)
    means new parameters can be introduced later without breaking
    extractors that already exist.

    Attributes
    ----------
    file_path
        Path to the file to be processed
    instrument
        The instrument that created this file, if known. May be None when
        the file cannot be associated with a specific instrument.
    signal_index
        Index of the signal to process, for files holding several signals.
        If None, processes all signals or defaults to the first signal.

    Examples
    --------
    >>> from pathlib import Path
    >>> from nexusLIMS.instruments import get_instr_from_filepath
    >>> file_path = Path("/path/to/data.dm3")
    >>> instrument = get_instr_from_filepath(file_path)
    >>> context = ExtractionContext(file_path, instrument)
    """

    # Only the file location is required; the rest default to "unknown".
    file_path: Path
    instrument: Instrument | None = None
    signal_index: int | None = None

125 

126 

class BaseExtractor(Protocol):
    """
    Protocol defining the interface for metadata extractors.

    This is a Protocol (structural subtype) rather than an ABC, meaning any class
    that implements these attributes and methods is automatically considered a
    valid extractor - no inheritance required.

    All extractors MUST implement defensive error handling:
    - Never raise exceptions from extract() - catch all and return minimal metadata
    - Always return a list of metadata dicts (one per signal)
    - Log errors for debugging but don't propagate them

    Attributes
    ----------
    name : str
        Unique identifier for this extractor (e.g., "dm3_extractor").
        Should be a valid Python identifier.
    priority : int
        Priority for this extractor (0-1000, higher = preferred).
        See notes below for conventions.
    supported_extensions : set[str] | None
        File extensions this extractor supports (without dots).
        Set to None for wildcard extractors that support all files.
        Empty set means no extensions are directly supported (content sniffing only).

    Notes
    -----
    **Priority Conventions:**

    - 0-49: Low priority (generic/fallback extractors)
    - 50-149: Normal priority (standard extractors)
    - 150-249: High priority (specialized/optimized extractors)
    - 250+: Override priority (force specific behavior)

    When multiple extractors support the same file, the registry will
    try them in descending priority order until one's supports() method
    returns True.

    Examples
    --------
    >>> class DM3Extractor:
    ...     \"\"\"Extract metadata from DigitalMicrograph .dm3/.dm4 files.\"\"\"
    ...
    ...     name = "dm3_extractor"
    ...     priority = 100
    ...     supported_extensions = {"dm3", "dm4"}
    ...
    ...     def supports(self, context: ExtractionContext) -> bool:
    ...         ext = context.file_path.suffix.lower().lstrip('.')
    ...         return ext in ('dm3', 'dm4')
    ...
    ...     def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
    ...         # Extraction logic here
    ...         return [{"nx_meta": {...}}]
    """

    name: str
    priority: int
    supported_extensions: set[str] | None

    def supports(self, context: ExtractionContext) -> bool:
        """
        Determine if this extractor can handle the given file.

        This method allows complex logic beyond simple extension matching:
        - Content sniffing (read file headers)
        - File size checks
        - Instrument-specific handling
        - Metadata validation

        The registry will call supports() on extractors in priority order
        until one returns True.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.

        Returns
        -------
        bool
            True if this extractor can handle this file, False otherwise

        Examples
        --------
        Extension-based matching:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     ext = context.file_path.suffix.lower().lstrip('.')
        ...     return ext in ('dm3', 'dm4')

        Content sniffing:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     if context.file_path.suffix.lower() != '.tif':
        ...         return False
        ...     with open(context.file_path, 'rb') as f:
        ...         header = f.read(1024)
        ...         return b'[User]' in header  # FEI signature

        Instrument-specific:

        >>> def supports(self, context: ExtractionContext) -> bool:
        ...     return (context.instrument is not None and
        ...             context.instrument.name.startswith("FEI-Quanta"))
        """
        ...  # pragma: no cover

    def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        """
        Extract metadata from the file.

        CRITICAL: This method MUST follow defensive design principles:
        - Never raise exceptions - catch all errors and return minimal metadata
        - Always return a list of metadata dicts where each contains an 'nx_meta' key
        - Log errors for debugging but continue gracefully

        Return Format:
        All extractors return a list of metadata dicts. Each dict contains:
        - 'nx_meta': Required - NexusLIMS-specific metadata (dict)
        - Other keys: Optional - Raw metadata extracted from the file

        Single-signal files return a list with one element. Multi-signal files return
        a list with one element per signal. This consistent list-based approach allows
        the Activity layer to expand multi-signal files into multiple datasets.

        Each 'nx_meta' dict MUST contain these required fields (validated against
        :class:`~nexusLIMS.schemas.metadata.NexusMetadata`):

        - 'Creation Time': ISO-8601 timestamp string **with timezone** (REQUIRED)
          Examples: "2024-01-15T10:30:00-05:00" or "2024-01-15T15:30:00Z"
        - 'Data Type': Human-readable data type (e.g., "STEM_Imaging") (REQUIRED)
        - 'DatasetType': Must be one of: "Image", "Spectrum", "SpectrumImage",
          "Diffraction", "Misc", or "Unknown" (REQUIRED)

        Optional standard fields:
        - 'Data Dimensions': String like "(1024, 1024)" or "(12, 1024, 1024)"
        - 'Instrument ID': Instrument PID from database
        - 'warnings': List of warning messages (string or [message, context] pairs)

        Additional instrument-specific fields beyond these are allowed.
        The nx_meta structure is strictly validated after extraction - validation
        failures will raise pydantic.ValidationError with detailed field errors.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.
            For multi-signal files, signal_index indicates which signal to process.
            If None, extractors may return all signals or the first signal.

        Returns
        -------
        list[dict[str, Any]]
            List of metadata dicts (one per signal). Each dict contains 'nx_meta'
            key with NexusLIMS-specific metadata, plus optional raw metadata keys.

        Examples
        --------
        Single-signal extraction:

        >>> def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        ...     try:
        ...         metadata = [{"nx_meta": {
        ...             "Creation Time": "2024-01-15T10:30:00-05:00",
        ...             "Data Type": "STEM_Imaging",
        ...             "DatasetType": "Image",
        ...             "Data Dimensions": "(1024, 1024)",
        ...             "Instrument ID": "643-Titan"
        ...         }}]
        ...         return metadata
        ...     except Exception as e:
        ...         logger.error(f"Extraction failed: {e}")
        ...         return self._minimal_metadata(context)

        Multi-signal extraction:

        >>> def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
        ...     try:
        ...         # For a file with 2 signals
        ...         return [
        ...             {"nx_meta": {
        ...                 "Creation Time": "2024-01-15T10:30:00-05:00",
        ...                 "Data Type": "STEM_Imaging", ...}},
        ...             {"nx_meta": {
        ...                 "Creation Time": "2024-01-15T10:30:00-05:00",
        ...                 "Data Type": "EDS_Spectrum", ...}}
        ...         ]
        ...     except Exception as e:
        ...         logger.error(f"Extraction failed: {e}")
        ...         return self._minimal_metadata(context)

        Minimal metadata on error (note that 'Creation Time' must be a
        timezone-aware ISO-8601 string, not a raw epoch float):

        >>> def _minimal_metadata(self, context: ExtractionContext) -> list[dict]:
        ...     from datetime import datetime, timezone
        ...     mtime = context.file_path.stat().st_mtime
        ...     return [{
        ...         "nx_meta": {
        ...             "DatasetType": "Unknown",
        ...             "Data Type": "Unknown",
        ...             "Creation Time": datetime.fromtimestamp(
        ...                 mtime, tz=timezone.utc
        ...             ).isoformat(),
        ...             "Instrument ID": None,
        ...             "warnings": ["Extraction failed"]
        ...         }
        ...     }]
        """
        ...  # pragma: no cover

333 

334 

class PreviewGenerator(Protocol):
    """
    Structural interface for thumbnail/preview image generators.

    Previews are produced by their own objects, separate from extractors,
    so that:
    - the same file type can have different preview strategies
    - preview logic can be reused across extractors
    - previews can be generated in batch, independent of extraction

    As with BaseExtractor, this is a Protocol: any class providing these
    attributes and methods qualifies, with no inheritance required.

    Attributes
    ----------
    name : str
        Unique identifier for this generator
    priority : int
        Priority (same conventions as BaseExtractor)
    supported_extensions : set[str] | None
        File extensions this generator supports (without dots).
        Set to None for wildcard generators that support all files.
        Empty set means no extensions are directly supported (content sniffing only).

    Examples
    --------
    >>> class HyperSpyPreview:
    ...     \"\"\"Generate previews using HyperSpy.\"\"\"
    ...
    ...     name = "hyperspy_preview"
    ...     priority = 100
    ...     supported_extensions = {"dm3", "dm4", "ser"}
    ...
    ...     def supports(self, context: ExtractionContext) -> bool:
    ...         ext = context.file_path.suffix.lower().lstrip('.')
    ...         return ext in ('dm3', 'dm4', 'ser')
    ...
    ...     def generate(self, context: ExtractionContext,
    ...                  output_path: Path) -> bool:
    ...         # Preview generation logic
    ...         return True
    """

    name: str
    priority: int
    supported_extensions: set[str] | None

    def supports(self, context: ExtractionContext) -> bool:
        """
        Report whether this generator can build a preview for the file.

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.

        Returns
        -------
        bool
            True if this generator can handle this file
        """
        ...  # pragma: no cover

    def generate(self, context: ExtractionContext, output_path: Path) -> bool:
        """
        Render a thumbnail preview and write it to output_path.

        Implementations are expected to:
        - Create a square thumbnail (typically 500x500 pixels)
        - Save to output_path as PNG
        - Return True on success, False on failure
        - Never raise exceptions (catch all and return False)

        Parameters
        ----------
        context
            Context containing file path, instrument info, etc.
        output_path
            Where to save the generated preview PNG

        Returns
        -------
        bool
            True if preview was successfully generated, False otherwise

        Examples
        --------
        >>> def generate(self, context: ExtractionContext,
        ...              output_path: Path) -> bool:
        ...     try:
        ...         # Create thumbnail
        ...         output_path.parent.mkdir(parents=True, exist_ok=True)
        ...         # ... generation logic ...
        ...         return True
        ...     except Exception as e:
        ...         logger.error(f"Preview generation failed: {e}")
        ...         return False
        """
        ...  # pragma: no cover

431 

432 

@dataclass
class InstrumentProfile:
    """
    Per-instrument customization profile.

    Keeps instrument-specific logic out of the extractors themselves, so
    custom behavior for a particular microscope can be added without
    touching extractor code.

    This is the CRITICAL component for extensibility - every NexusLIMS
    installation has its own set of instruments, and profiles are how
    their customizations are added.

    Attributes
    ----------
    instrument_id
        Instrument identifier (e.g., "FEI-Titan-STEM-630901")
    parsers
        Custom metadata parsing functions for this instrument.
        Keys are parser names, values are callables.
    transformations
        Metadata transformation functions applied after extraction.
        Keys are transform names, values are callables.
    extension_fields
        Metadata to inject into the extensions section for all files.
        Keys are field names, values are static values.
        These populate the nx_meta.extensions dict.

    Examples
    --------
    Creating a custom profile for FEI Titan STEM:

    >>> def parse_643_titan_microscope(metadata: dict) -> dict:
    ...     # Custom parsing logic
    ...     return metadata
    >>>
    >>> titan_stem_profile = InstrumentProfile(
    ...     instrument_id="FEI-Titan-STEM-630901",
    ...     parsers={
    ...         "microscope_info": parse_643_titan_microscope,
    ...     },
    ...     extension_fields={
    ...         "facility": "Nexus Facility",
    ...         "building": "Bldg. 1",
    ...     }
    ... )
    """

    instrument_id: str
    # Each mapping gets its own fresh dict per instance via default_factory,
    # so profiles never share mutable state.
    parsers: dict[str, Callable] = field(default_factory=dict)
    transformations: dict[str, Callable] = field(default_factory=dict)
    extension_fields: dict[str, Any] = field(default_factory=dict)