Coverage for nexusLIMS/extractors/plugins/tescan_tif.py: 100%

247 statements  

« prev     ^ index     » next       coverage.py v7.11.3, created at 2026-03-24 05:23 +0000

1# ruff: noqa: N817, FBT003 

2"""Tescan (P)FIB/SEM TIFF extractor plugin.""" 

3 

4import configparser 

5import contextlib 

6import io 

7import logging 

8from decimal import Decimal 

9from pathlib import Path 

10from typing import Any, ClassVar 

11 

12from PIL import Image 

13 

14from nexusLIMS.extractors.base import ExtractionContext 

15from nexusLIMS.extractors.base import FieldDefinition as FD 

16from nexusLIMS.extractors.utils import _set_instr_name_and_time, add_to_extensions 

17from nexusLIMS.schemas.units import ureg 

18from nexusLIMS.utils.dicts import set_nested_dict_value, sort_dict 

19 

20TESCAN_TIFF_TAG = 50431 

21""" 

22TIFF tag ID where Tescan stores INI-style metadata in TIFF files. 

23The tag holds instrument configuration, beam parameters, stage position, 

24detector settings, and other acquisition metadata. 

25""" 

26 

27_MAX_ASCII_VALUE = 128 

28"""Maximum value for ASCII characters. Used to filter non-ASCII binary data.""" 

29 

30_logger = logging.getLogger(__name__) 

31 

32 

33class TescanTiffExtractor: 

34 """ 

35 Extractor for Tescan FIB/SEM TIFF files. 

36 

37 This extractor handles metadata extraction from .tif files saved by 

38 Tescan FIB and SEM instruments (e.g., AMBER X). The extractor uses 

39 a two-tier strategy: 

40 

41 1. Primary: Parse INI metadata embedded in TIFF tag 50431, else a sidecar .hdr file 

42 2. Supplement: Always add basic metadata from standard TIFF tags 

43 

44 The .hdr file contains comprehensive acquisition parameters in two sections: 

45 [MAIN] and [SEM], which are parsed using Python's configparser. 

46 """ 

47 

48 name = "tescan_tif_extractor" 

49 priority = 150 

50 supported_extensions: ClassVar = {"tif", "tiff"} 

51 

52 def supports(self, context: ExtractionContext) -> bool: 

53 """ 

54 Check if this extractor supports the given file. 

55 

56 Performs content sniffing to verify this is a Tescan TIFF file by: 

57 1. Checking file extension (.tif or .tiff) 

58 2. Looking for either a sidecar .hdr file or Tescan-specific TIFF tags 

59 

60 Parameters 

61 ---------- 

62 context 

63 The extraction context containing file information 

64 

65 Returns 

66 ------- 

67 bool 

68 True if this appears to be a Tescan TIFF file 

69 """ 

70 extension = context.file_path.suffix.lower().lstrip(".") 

71 if extension not in {"tif", "tiff"}: 

72 return False 

73 

74 # Check for sidecar HDR file 

75 hdr_file = self._find_hdr_file(context.file_path) 

76 if hdr_file is not None and self._is_tescan_hdr(hdr_file): 

77 return True 

78 

79 # Fallback: check TIFF tags for Tescan signature 

80 try: 

81 with Image.open(context.file_path) as img: 

82 # Check for TESCAN in Make tag (271) or Software tag (305) 

83 make = img.tag_v2.get(271, "") 

84 software = img.tag_v2.get(305, "") 

85 if "TESCAN" in str(make).upper() or "TESCAN" in str(software).upper(): 

86 return True 

87 # check for custom Tescan metadata tag 

88 tescan_metadata = img.tag_v2.get(TESCAN_TIFF_TAG, "") 

89 if tescan_metadata != "": 

90 return True 

91 except Exception as e: 

92 _logger.debug( 

93 "Could not read TIFF tags from %s: %s", 

94 context.file_path, 

95 e, 

96 ) 

97 return False 

98 

99 return False 

100 

101 def extract(self, context: ExtractionContext) -> list[dict[str, Any]]: 

102 """ 

103 Extract metadata from a Tescan FIB/SEM TIFF file. 

104 

105 Returns the metadata (as a list of dictionaries) from a .tif file saved by 

106 Tescan instruments. Uses a three-tier extraction strategy: 

107 1. Try to parse embedded HDR metadata from TIFF Tag 50431 

108 2. If that fails, look for a sidecar .hdr file 

109 3. Always extract basic TIFF tags as well 

110 

111 Parameters 

112 ---------- 

113 context 

114 The extraction context containing file information 

115 

116 Returns 

117 ------- 

118 list[dict] 

119 List containing a single metadata dict with 'nx_meta' key 

120 """ 

121 filename = context.file_path 

122 _logger.debug("Extracting metadata from Tescan TIFF file: %s", filename) 

123 

124 mdict = {"nx_meta": {}} 

125 # Assume all datasets coming from Tescan are SEM Images, originally 

126 mdict["nx_meta"]["DatasetType"] = "Image" 

127 mdict["nx_meta"]["Data Type"] = "SEM_Imaging" 

128 

129 _set_instr_name_and_time(mdict, filename) 

130 

131 hdr_parsed = False 

132 

133 # Strategy 1: Try to parse embedded HDR metadata from TIFF tag 50431 

134 try: 

135 embedded_metadata = self._extract_embedded_hdr(filename) 

136 if embedded_metadata: 

137 mdict.update(embedded_metadata) 

138 mdict = self._parse_nx_meta(mdict) 

139 hdr_parsed = True 

140 _logger.debug("Successfully parsed embedded HDR from TIFF tag") 

141 except Exception as e: 

142 _logger.debug("Could not parse embedded HDR metadata: %s", e) 

143 

144 # Strategy 2: If embedded parsing failed, try sidecar HDR file 

145 if not hdr_parsed: 

146 hdr_file = self._find_hdr_file(filename) 

147 if hdr_file is not None and self._is_tescan_hdr(hdr_file): 

148 try: 

149 hdr_metadata = self._read_hdr_metadata(hdr_file) 

150 mdict.update(hdr_metadata) 

151 mdict = self._parse_nx_meta(mdict) 

152 hdr_parsed = True 

153 _logger.debug("Successfully parsed sidecar HDR file") 

154 except Exception as e: 

155 _logger.warning( 

156 "Failed to parse HDR file %s: %s", 

157 hdr_file, 

158 e, 

159 ) 

160 

161 # Strategy 3: Always extract basic TIFF tags (may supplement or override) 

162 self._extract_from_tiff_tags(filename, mdict) 

163 

164 # Migrate metadata to schema-compliant format 

165 mdict = self._migrate_to_schema_compliant_metadata(mdict) 

166 

167 # Sort the nx_meta dictionary (recursively) for nicer display 

168 mdict["nx_meta"] = sort_dict(mdict["nx_meta"]) 

169 

170 return [mdict] 

171 

172 def _find_hdr_file(self, tiff_path: Path) -> Path | None: 

173 """ 

174 Find the sidecar .hdr file for a given TIFF file. 

175 

176 Parameters 

177 ---------- 

178 tiff_path 

179 Path to the TIFF file 

180 

181 Returns 

182 ------- 

183 Path or None 

184 Path to the .hdr file if it exists, None otherwise 

185 """ 

186 hdr_path = tiff_path.with_suffix(".hdr") 

187 if hdr_path.exists(): 

188 return hdr_path 

189 return None 

190 

191 def _is_tescan_hdr(self, hdr_path: Path) -> bool: 

192 """ 

193 Verify that an HDR file is a Tescan format file. 

194 

195 Checks for the presence of [MAIN] and [SEM] sections which are 

196 characteristic of Tescan HDR files. 

197 

198 Parameters 

199 ---------- 

200 hdr_path 

201 Path to the .hdr file 

202 

203 Returns 

204 ------- 

205 bool 

206 True if this appears to be a Tescan HDR file 

207 """ 

208 try: 

209 with hdr_path.open("r", encoding="utf-8", errors="ignore") as f: 

210 content = f.read(500) # Read first 500 chars 

211 # Look for characteristic Tescan sections 

212 return "[MAIN]" in content or "Device=TESCAN" in content 

213 except Exception as e: 

214 _logger.debug("Could not verify HDR file %s: %s", hdr_path, e) 

215 return False 

216 

217 def _extract_embedded_hdr( 

218 self, tiff_path: Path 

219 ) -> dict[str, dict[str, str]] | None: 

220 """ 

221 Extract embedded HDR metadata from TIFF Tag TESCAN_TIFF_TAG. 

222 

223 Tescan embeds the complete HDR metadata in TIFF tag TESCAN_TIFF_TAG as a 

224 binary blob containing the INI-formatted text. The tag may contain binary 

225 garbage at the beginning before the actual metadata starts. 

226 

227 Parameters 

228 ---------- 

229 tiff_path 

230 Path to the TIFF file 

231 

232 Returns 

233 ------- 

234 dict or None 

235 Dictionary with section names as keys and key-value dicts as values, 

236 or None if tag is not present or cannot be parsed 

237 """ 

238 try: 

239 with Image.open(tiff_path) as img: 

240 metadata_tag = img.tag_v2.get(TESCAN_TIFF_TAG) 

241 if metadata_tag is None: 

242 return None 

243 

244 # Convert tag to bytes 

245 metadata_bytes = self._tag_to_bytes(metadata_tag) 

246 

247 # Extract metadata string from binary data 

248 metadata_str = self._extract_metadata_string(metadata_bytes) 

249 

250 # Clean up non-printable characters 

251 metadata_str = self._clean_metadata_string(metadata_str) 

252 

253 # Add section headers if missing 

254 metadata_str = self._add_section_headers_if_needed(metadata_str) 

255 

256 # Parse as INI format 

257 return self._parse_hdr_string(metadata_str) 

258 

259 except Exception as e: 

260 _logger.debug("Failed to extract embedded HDR from tag 50431: %s", e) 

261 return None 

262 

263 def _tag_to_bytes(self, metadata_tag: Any) -> bytes: 

264 """Convert TIFF tag data to bytes. 

265 

266 Parameters 

267 ---------- 

268 metadata_tag 

269 Tag data in various formats (bytes, str, etc.) 

270 

271 Returns 

272 ------- 

273 bytes 

274 Converted bytes 

275 

276 Raises 

277 ------ 

278 TypeError 

279 If tag data is not bytes or str 

280 """ 

281 if isinstance(metadata_tag, bytes): 

282 return metadata_tag 

283 if isinstance(metadata_tag, str): 

284 return metadata_tag.encode("utf-8") 

285 msg = f"Unsupported metadata tag type: {type(metadata_tag)}" 

286 raise TypeError(msg) 

287 

288 def _extract_metadata_string(self, metadata_bytes: bytes) -> str: 

289 """Extract metadata string from binary data by removing garbage. 

290 

291 The tag may contain binary garbage at the beginning. This method looks 

292 for known keys to find the start of actual metadata. 

293 

294 Parameters 

295 ---------- 

296 metadata_bytes 

297 Raw binary metadata from TIFF tag 

298 

299 Returns 

300 ------- 

301 str 

302 Cleaned metadata string 

303 """ 

304 # Look for the start of metadata by searching for known keys 

305 search_keys = [b"[MAIN]", b"AccFrames=", b"AccType=", b"Company=", b"Date="] 

306 for search_key in search_keys: 

307 pos = metadata_bytes.find(search_key) 

308 if pos >= 0: 

309 metadata_bytes = metadata_bytes[pos:] 

310 return metadata_bytes.replace(b"\x00", b"").decode( 

311 "utf-8", errors="ignore" 

312 ) 

313 

314 # Fallback: decode whole thing 

315 return metadata_bytes.replace(b"\x00", b"").decode("utf-8", errors="ignore") 

316 

317 def _clean_metadata_string(self, metadata_str: str) -> str: 

318 """Remove non-printable binary characters from metadata string. 

319 

320 Parameters 

321 ---------- 

322 metadata_str 

323 Metadata string that may contain non-printable characters 

324 

325 Returns 

326 ------- 

327 str 

328 Cleaned metadata string 

329 """ 

330 return "".join( 

331 c 

332 for c in metadata_str 

333 if ord(c) < _MAX_ASCII_VALUE and (c.isprintable() or c in "\n\r\t") 

334 ) 

335 

336 def _add_section_headers_if_needed(self, metadata_str: str) -> str: 

337 """Add [MAIN] and [SEM] section headers if missing. 

338 

339 Tescan's embedded metadata doesn't include section headers, so this 

340 method detects where the SEM section starts and inserts headers. 

341 

342 Parameters 

343 ---------- 

344 metadata_str 

345 Metadata string potentially without section headers 

346 

347 Returns 

348 ------- 

349 str 

350 Metadata string with section headers 

351 """ 

352 if "[MAIN]" in metadata_str or "[SEM]" in metadata_str: 

353 return metadata_str 

354 

355 # Find where SEM section starts by looking for known SEM keys 

356 sem_keys = [ 

357 "AcceleratorVoltage=", 

358 "ApertureDiameter=", 

359 "ApertureOptimization=", 

360 "ChamberPressure=", 

361 "CrossFree=", 

362 "HV=", 

363 ] 

364 sem_start_pos = self._find_sem_section_start(metadata_str, sem_keys) 

365 

366 # Insert section headers at line boundaries 

367 if sem_start_pos < len(metadata_str): 

368 line_start = metadata_str.rfind("\n", 0, sem_start_pos) 

369 if line_start < 0: 

370 line_start = 0 

371 else: 

372 line_start += 1 # Move past the \n 

373 return ( 

374 "[MAIN]\n" 

375 + metadata_str[:line_start] 

376 + "[SEM]\n" 

377 + metadata_str[line_start:] 

378 ) 

379 

380 # No SEM section found 

381 return "[MAIN]\n" + metadata_str 

382 

383 def _find_sem_section_start(self, metadata_str: str, sem_keys: list[str]) -> int: 

384 """Find the position where SEM section starts. 

385 

386 Parameters 

387 ---------- 

388 metadata_str 

389 Metadata string to search 

390 sem_keys 

391 List of keys that typically appear in SEM section 

392 

393 Returns 

394 ------- 

395 int 

396 Position of first SEM key, or length of string if not found 

397 """ 

398 sem_start_pos = len(metadata_str) 

399 for sem_key in sem_keys: 

400 pos = metadata_str.find(sem_key) 

401 if pos >= 0 and pos < sem_start_pos: 

402 sem_start_pos = pos 

403 return sem_start_pos 

404 

405 def _parse_hdr_string(self, hdr_string: str) -> dict[str, dict[str, str]]: 

406 """ 

407 Parse HDR metadata from a string in INI format. 

408 

409 Parameters 

410 ---------- 

411 hdr_string 

412 HDR metadata as a string in INI format 

413 

414 Returns 

415 ------- 

416 dict 

417 Dictionary with section names as keys and key-value dicts as values 

418 """ 

419 # Normalize line endings 

420 hdr_string = hdr_string.replace("\r\n", "\n").replace("\r", "\n") 

421 

422 # Parse with ConfigParser 

423 config = configparser.ConfigParser() 

424 # Make ConfigParser respect upper/lowercase values 

425 config.optionxform = lambda option: option 

426 

427 # Use StringIO to read from string 

428 buf = io.StringIO(hdr_string) 

429 config.read_file(buf) 

430 

431 metadata = {} 

432 for section in config.sections(): 

433 metadata[section] = dict(config.items(section)) 

434 

435 return metadata 

436 

437 def _read_hdr_metadata(self, hdr_path: Path) -> dict[str, dict[str, str]]: 

438 """ 

439 Read and parse a Tescan .hdr file. 

440 

441 The .hdr file is in INI format with sections like [MAIN] and [SEM]. 

442 

443 Parameters 

444 ---------- 

445 hdr_path 

446 Path to the .hdr file 

447 

448 Returns 

449 ------- 

450 dict 

451 Dictionary with section names as keys and key-value dicts as values 

452 """ 

453 with hdr_path.open("r", encoding="utf-8", errors="ignore") as f: 

454 hdr_string = f.read() 

455 

456 return self._parse_hdr_string(hdr_string) 

457 

458 def _extract_from_tiff_tags(self, filename: Path, mdict: dict) -> None: 

459 """ 

460 Extract basic metadata from TIFF tags. 

461 

462 This supplements metadata from HDR files with standard TIFF tags. 

463 Only adds fields that haven't already been set by HDR parsing. 

464 Updates mdict in place. 

465 

466 Parameters 

467 ---------- 

468 filename 

469 Path to the TIFF file 

470 mdict 

471 Metadata dictionary to update 

472 """ 

473 try: 

474 with Image.open(filename) as img: 

475 # Extract standard TIFF tags 

476 # 271 = Make 

477 # 272 = Model 

478 # 305 = Software 

479 # 306 = DateTime 

480 # 315 = Artist (username) 

481 

482 # Only add Make if not already present 

483 if "Make" not in mdict["nx_meta"]: 

484 make = img.tag_v2.get(271) 

485 if make: 

486 mdict["nx_meta"]["Make"] = make 

487 

488 # Only add Model if not already present 

489 if "Model" not in mdict["nx_meta"]: 

490 model = img.tag_v2.get(272) 

491 if model: 

492 mdict["nx_meta"]["Model"] = model 

493 

494 # Only add Software Version if not already present 

495 if "Software Version" not in mdict["nx_meta"]: 

496 software = img.tag_v2.get(305) 

497 if software: 

498 mdict["nx_meta"]["Software Version"] = software 

499 

500 # Always add TIFF DateTime as supplemental info 

501 datetime_str = img.tag_v2.get(306) 

502 if datetime_str: 

503 mdict["nx_meta"]["TIFF DateTime"] = datetime_str 

504 

505 # Only add Operator from Artist tag if not already present 

506 if "Operator" not in mdict["nx_meta"]: 

507 artist = img.tag_v2.get(315) 

508 if artist: 

509 mdict["nx_meta"]["Operator"] = artist 

510 

511 # Only add dimensions if not already present 

512 if "Data Dimensions" not in mdict["nx_meta"]: 

513 width = img.tag_v2.get(256) # ImageWidth 

514 height = img.tag_v2.get(257) # ImageLength 

515 if width and height: 

516 mdict["nx_meta"]["Data Dimensions"] = str((width, height)) 

517 

518 except Exception as e: 

519 _logger.warning("Failed to extract TIFF tags from %s: %s", filename, e) 

520 mdict["nx_meta"]["Extractor Warnings"] = f"Failed to extract TIFF tags: {e}" 

521 

522 def _get_field_definitions(self) -> list: 

523 """ 

524 Get field definitions for metadata extraction. 

525 

526 Returns 

527 ------- 

528 list 

529 List of FieldDefinition tuples 

530 """ 

531 return [ 

532 # [MAIN] section - in order as they appear in HDR file 

533 FD("MAIN", "AccFrames", "Accumulated Frames", 1, False), 

534 FD("MAIN", "AccType", "Accumulation Type", 1, True), 

535 FD("MAIN", "Company", "Company", 1, True), 

536 FD("MAIN", "Date", "Acquisition Date", 1, True), 

537 FD("MAIN", "Description", "Description", 1, True), 

538 FD("MAIN", "Device", "Device", 1, True), 

539 FD("MAIN", "DeviceModel", "Device Model", 1, True), 

540 FD("MAIN", "FullUserName", "Full User Name", 1, True), 

541 FD("MAIN", "ImageStripSize", "Image Strip Size", 1, False), 

542 FD( 

543 "MAIN", 

544 "Magnification", 

545 "Magnification", 

546 1e-3, 

547 False, 

548 target_unit="kiloX", 

549 ), 

550 FD("MAIN", "MagnificationReference", "Magnification Reference", 1, False), 

551 FD("MAIN", "OrigFileName", "Original Filename", 1, True), 

552 FD( 

553 "MAIN", "PixelSizeX", "Pixel Width", 1e9, False, target_unit="nanometer" 

554 ), 

555 FD( 

556 "MAIN", 

557 "PixelSizeY", 

558 "Pixel Height", 

559 1e9, 

560 False, 

561 target_unit="nanometer", 

562 ), 

563 FD("MAIN", "SerialNumber", "Serial Number", 1, True), 

564 FD("MAIN", "Sign", "Sign", 1, True), 

565 FD("MAIN", "SoftwareVersion", "Software Version", 1, True), 

566 FD("MAIN", "Time", "Acquisition Time", 1, True), 

567 FD("MAIN", "UserName", "User Name", 1, True), 

568 FD("MAIN", "ViewFieldsCountX", "View Fields Count X", 1, False), 

569 FD("MAIN", "ViewFieldsCountY", "View Fields Count Y", 1, False), 

570 # [SEM] section - in order as they appear in HDR file 

571 FD( 

572 "SEM", 

573 "AcceleratorVoltage", 

574 "Accelerator Voltage", 

575 1e-3, 

576 False, 

577 target_unit="kilovolt", 

578 ), 

579 FD( 

580 "SEM", 

581 "ApertureDiameter", 

582 "Aperture Diameter", 

583 1e6, 

584 False, 

585 target_unit="micrometer", 

586 ), 

587 FD("SEM", "ApertureOptimization", "Aperture Optimization", 1, False), 

588 FD( 

589 "SEM", 

590 "ChamberPressure", 

591 "Chamber Pressure", 

592 1e3, 

593 False, 

594 target_unit="millipascal", 

595 ), 

596 FD("SEM", "CrossFree", "Cross Free", 1, False), 

597 FD( 

598 "SEM", 

599 "CrossSectionShiftX", 

600 "Cross Section Shift X", 

601 1e6, 

602 False, 

603 target_unit="micrometer", 

604 ), 

605 FD( 

606 "SEM", 

607 "CrossSectionShiftY", 

608 "Cross Section Shift Y", 

609 1e6, 

610 False, 

611 target_unit="micrometer", 

612 ), 

613 FD( 

614 "SEM", 

615 "DepthOfFocus", 

616 "Depth of Focus", 

617 1e6, 

618 False, 

619 target_unit="micrometer", 

620 ), 

621 FD("SEM", "Detector", "Detector Name", 1, True), 

622 FD("SEM", "Detector0", "Detector 0", 1, True), 

623 FD("SEM", "Detector0FlatField", "Detector 0 Flat Field", 1, False), 

624 FD("SEM", "Detector0Gain", "Detector 0 Gain", 1, False), 

625 FD("SEM", "Detector0Offset", "Detector 0 Offset", 1, False), 

626 FD( 

627 "SEM", 

628 "DwellTime", 

629 "Pixel Dwell Time", 

630 1e6, 

631 False, 

632 target_unit="microsecond", 

633 ), 

634 FD( 

635 "SEM", 

636 "EmissionCurrent", 

637 "Emission Current", 

638 1e6, 

639 False, 

640 target_unit="microampere", 

641 ), 

642 FD("SEM", "Gun", "Gun Type", 1, True), 

643 FD("SEM", "GunShiftX", "Gun Shift X", 1, False), 

644 FD("SEM", "GunShiftY", "Gun Shift Y", 1, False), 

645 FD("SEM", "GunTiltX", "Gun Tilt X", 1, False), 

646 FD("SEM", "GunTiltY", "Gun Tilt Y", 1, False), 

647 FD("SEM", "HV", "HV Voltage", 1e-3, False, target_unit="kilovolt"), 

648 FD("SEM", "IMLCenteringX", "IML Centering X", 1, False), 

649 FD("SEM", "IMLCenteringY", "IML Centering Y", 1, False), 

650 FD( 

651 "SEM", 

652 "ImageShiftX", 

653 "Image Shift X", 

654 1e9, 

655 False, 

656 target_unit="nanometer", 

657 ), 

658 FD( 

659 "SEM", 

660 "ImageShiftY", 

661 "Image Shift Y", 

662 1e9, 

663 False, 

664 target_unit="nanometer", 

665 ), 

666 FD("SEM", "InjectedGas", "Injected Gas", 1, True), 

667 FD("SEM", "LUTGamma", "LUT Gamma", 1, False), 

668 FD("SEM", "LUTMaximum", "LUT Maximum", 1, False), 

669 FD("SEM", "LUTMinimum", "LUT Minimum", 1, False), 

670 FD("SEM", "MTDGrid", "MTD Grid", 1e-3, False, target_unit="kilovolt"), 

671 FD( 

672 "SEM", 

673 "MTDScintillator", 

674 "MTD Scintillator", 

675 1e-3, 

676 False, 

677 target_unit="kilovolt", 

678 ), 

679 FD("SEM", "OBJCenteringX", "OBJ Centering X", 1, False), 

680 FD("SEM", "OBJCenteringY", "OBJ Centering Y", 1, False), 

681 FD("SEM", "OBJPreCenteringX", "OBJ Pre-Centering X", 1, False), 

682 FD("SEM", "OBJPreCenteringY", "OBJ Pre-Centering Y", 1, False), 

683 FD("SEM", "PotentialMode", "Potential Mode", 1, True), 

684 FD( 

685 "SEM", 

686 "PredictedBeamCurrent", 

687 "Predicted Beam Current", 

688 1e12, 

689 False, 

690 target_unit="picoampere", 

691 ), 

692 FD("SEM", "PrimaryDetectorGain", "Primary Detector Gain", 1, False), 

693 FD("SEM", "PrimaryDetectorOffset", "Primary Detector Offset", 1, False), 

694 FD("SEM", "SampleVoltage", "Sample Voltage", 1, False, target_unit="volt"), 

695 FD("SEM", "ScanID", "Scan ID", 1, False), 

696 FD("SEM", "ScanMode", "Scan Mode", 1, True), 

697 FD("SEM", "ScanRotation", "Scan Rotation", 1, False, target_unit="degree"), 

698 FD("SEM", "ScanSpeed", "Scan Speed", 1, False), 

699 FD("SEM", "SessionID", "Session ID", 1, True), 

700 FD( 

701 "SEM", 

702 "SpecimenCurrent", 

703 "Specimen Current", 

704 1e12, 

705 False, 

706 target_unit="picoampere", 

707 ), 

708 FD("SEM", "SpotSize", "Spot Size", 1e9, False, target_unit="nanometer"), 

709 FD( 

710 "SEM", 

711 "StageRotation", 

712 ["Stage Position", "Rotation"], 

713 1, 

714 False, 

715 target_unit="degree", 

716 ), 

717 FD( 

718 "SEM", 

719 "StageTilt", 

720 ["Stage Position", "Tilt"], 

721 1, 

722 False, 

723 target_unit="degree", 

724 ), 

725 FD("SEM", "StageX", ["Stage Position", "X"], 1, False, target_unit="meter"), 

726 FD("SEM", "StageY", ["Stage Position", "Y"], 1, False, target_unit="meter"), 

727 FD("SEM", "StageZ", ["Stage Position", "Z"], 1, False, target_unit="meter"), 

728 FD("SEM", "StigmatorX", "Stigmator X Value", 1, False), 

729 FD("SEM", "StigmatorY", "Stigmator Y Value", 1, False), 

730 FD( 

731 "SEM", 

732 "SymmetrizationVoltage", 

733 "Symmetrization Voltage", 

734 1e-3, 

735 False, 

736 target_unit="kilovolt", 

737 ), 

738 FD("SEM", "SyncMains", "Sync to Mains", 1, True), 

739 FD("SEM", "TiltCorrection", "Tilt Correction", 1, False), 

740 FD( 

741 "SEM", 

742 "TubeVoltage", 

743 "Tube Voltage", 

744 1e-3, 

745 False, 

746 target_unit="kilovolt", 

747 ), 

748 FD( 

749 "SEM", 

750 "VirtualObserverDistance", 

751 "Virtual Observer Distance", 

752 1e3, 

753 False, 

754 target_unit="millimeter", 

755 ), 

756 FD("SEM", "WD", "Working Distance", 1e3, False, target_unit="millimeter"), 

757 ] 

758 

759 def _parse_nx_meta(self, mdict: dict) -> dict: # noqa: PLR0912 

760 """ 

761 Parse metadata into NexusLIMS format. 

762 

763 Extracts important metadata from the [MAIN] and [SEM] sections 

764 of the HDR file and places them in standardized locations under 

765 the nx_meta key. 

766 

767 Parameters 

768 ---------- 

769 mdict 

770 Metadata dictionary with [MAIN] and [SEM] sections 

771 

772 Returns 

773 ------- 

774 dict 

775 Updated metadata dictionary with parsed nx_meta fields 

776 """ 

777 # Initialize warnings list 

778 if "warnings" not in mdict["nx_meta"]: 

779 mdict["nx_meta"]["warnings"] = [] 

780 

781 main_section = mdict.get("MAIN", {}) 

782 sem_section = mdict.get("SEM", {}) 

783 

784 # Get field definitions 

785 fields = self._get_field_definitions() 

786 

787 # Extract standard fields 

788 for field in fields: 

789 section = main_section if field.section == "MAIN" else sem_section 

790 value = section.get(field.source_key) 

791 

792 # Try fallback keys for some fields 

793 if value is None and field.source_key == "HV": 

794 value = sem_section.get("AcceleratorVoltage") 

795 elif value is None and field.source_key == "Detector0Gain": 

796 value = sem_section.get("PrimaryDetectorGain") 

797 elif value is None and field.source_key == "Detector0Offset": 

798 value = sem_section.get("PrimaryDetectorOffset") 

799 

800 if value: 

801 if field.is_string: 

802 # Handle nested dict paths vs flat keys 

803 # (impossible to test with existing metadata structure, 

804 # so exclude from coverage) 

805 if isinstance(field.output_key, list): # pragma: no cover 

806 set_nested_dict_value( 

807 mdict, ["nx_meta", *field.output_key], value 

808 ) 

809 else: 

810 mdict["nx_meta"][field.output_key] = value 

811 else: 

812 with contextlib.suppress(ValueError): 

813 # Convert to Decimal to preserve precision through unit 

814 # conversions. The ureg uses non_int_type=Decimal to avoid 

815 # floating-point errors during internal conversions. 

816 # Also apply scaling factor for unit conversion 

817 decimal_val = Decimal(value) * Decimal(str(field.factor)) 

818 

819 # Skip if suppress_zero is True and value is zero 

820 if field.suppress_zero and decimal_val == 0: 

821 continue 

822 

823 # Create Pint Quantity if unit is specified 

824 if field.target_unit: 

825 # Create Quantity with the value after factor conversion 

826 quantity = ureg.Quantity(decimal_val, field.target_unit) 

827 

828 if isinstance(field.output_key, list): 

829 set_nested_dict_value( 

830 mdict, ["nx_meta", *field.output_key], quantity 

831 ) 

832 else: 

833 mdict["nx_meta"][field.output_key] = quantity 

834 # No unit specified, keep as Decimal for precision 

835 elif isinstance(field.output_key, list): 

836 set_nested_dict_value( 

837 mdict, ["nx_meta", *field.output_key], decimal_val 

838 ) 

839 else: 

840 mdict["nx_meta"][field.output_key] = decimal_val 

841 

842 # Handle user information (prefer FullUserName over UserName) 

843 full_username = main_section.get("FullUserName") 

844 username = main_section.get("UserName") 

845 if full_username or username: 

846 mdict["nx_meta"]["Operator"] = full_username or username 

847 mdict["nx_meta"]["warnings"].append(["Operator"]) 

848 

849 return mdict 

850 

851 def _migrate_to_schema_compliant_metadata(self, mdict: dict) -> dict: 

852 """ 

853 Migrate metadata to schema-compliant format. 

854 

855 Reorganizes metadata to conform to type-specific Pydantic schemas: 

856 - Extracts core EM Glossary fields to top level with standardized names 

857 - Moves vendor-specific nested dictionaries and fields to extensions section 

858 - Preserves existing extensions from instrument profiles 

859 

860 Parameters 

861 ---------- 

862 mdict 

863 Metadata dictionary with nx_meta containing extracted fields 

864 

865 Returns 

866 ------- 

867 dict 

868 Metadata dictionary with schema-compliant nx_meta structure 

869 """ 

870 nx_meta = mdict.get("nx_meta", {}) 

871 

872 # Preserve existing extensions from instrument profiles 

873 extensions = ( 

874 nx_meta.get("extensions", {}).copy() if "extensions" in nx_meta else {} 

875 ) 

876 

877 # Field mappings from display names to EM Glossary names 

878 field_mappings = { 

879 "HV Voltage": "acceleration_voltage", 

880 "Accelerator Voltage": "acceleration_voltage", 

881 "Working Distance": "working_distance", 

882 "Beam Current": "beam_current", 

883 "Emission Current": "emission_current", 

884 "Pixel Dwell Time": "dwell_time", 

885 "Horizontal Field Width": "horizontal_field_width", 

886 "Pixel Width": "pixel_width", 

887 "Pixel Height": "pixel_height", 

888 } 

889 

890 # Tescan-specific fields that go to extensions (ALL non-core fields) 

891 # Since tescan extractor currently extracts many individual fields at top level, 

892 # we move them all to extensions except the core EM Glossary ones 

893 extension_field_names = { 

894 "Operator", # User info 

895 # Any other Tescan-specific fields we discover 

896 } 

897 

898 # Build new nx_meta with proper field organization 

899 new_nx_meta = {} 

900 

901 # Copy required fields 

902 for field in ["DatasetType", "Data Type", "Creation Time"]: 

903 if field in nx_meta: 

904 new_nx_meta[field] = nx_meta[field] 

905 

906 # Copy instrument identification 

907 if "Instrument ID" in nx_meta: 

908 new_nx_meta["Instrument ID"] = nx_meta["Instrument ID"] 

909 

910 # Process all fields and categorize 

911 for old_name, value in nx_meta.items(): 

912 # Skip fields we've already handled 

913 if old_name in [ 

914 "DatasetType", 

915 "Data Type", 

916 "Creation Time", 

917 "Instrument ID", 

918 "Extractor Warnings", 

919 "warnings", 

920 "extensions", 

921 ]: 

922 continue 

923 

924 # Check if this is a core field that needs renaming 

925 if old_name in field_mappings: 

926 emg_name = field_mappings[old_name] 

927 new_nx_meta[emg_name] = value 

928 continue 

929 

930 # Fields explicitly marked as extensions 

931 if old_name in extension_field_names: 

932 extensions[old_name] = value 

933 continue 

934 

935 # Everything else goes to extensions (Tescan-specific fields) 

936 # This is the safest approach since most Tescan fields are vendor-specific 

937 extensions[old_name] = value 

938 

939 # Copy warnings if present 

940 if "warnings" in nx_meta: 

941 new_nx_meta["warnings"] = nx_meta["warnings"] 

942 

943 # Add extensions section if we have any 

944 for key, value in extensions.items(): 

945 add_to_extensions(new_nx_meta, key, value) 

946 

947 mdict["nx_meta"] = new_nx_meta 

948 return mdict