Coverage for nexusLIMS/extractors/plugins/tescan_tif.py: 100%
247 statements
« prev ^ index » next coverage.py v7.11.3, created at 2026-03-24 05:23 +0000
« prev ^ index » next coverage.py v7.11.3, created at 2026-03-24 05:23 +0000
1# ruff: noqa: N817, FBT003
2"""Tescan (P)FIB/SEM TIFF extractor plugin."""
import configparser
import contextlib
import io
import logging
from decimal import Decimal, InvalidOperation
from pathlib import Path
from typing import Any, ClassVar

from PIL import Image

from nexusLIMS.extractors.base import ExtractionContext
from nexusLIMS.extractors.base import FieldDefinition as FD
from nexusLIMS.extractors.utils import _set_instr_name_and_time, add_to_extensions
from nexusLIMS.schemas.units import ureg
from nexusLIMS.utils.dicts import set_nested_dict_value, sort_dict
TESCAN_TIFF_TAG = 50431
"""
TIFF tag ID where Tescan stores INI-style metadata in TIFF files.
The tag holds instrument configuration, beam parameters, stage position,
detector settings, and other acquisition metadata.
"""

_MAX_ASCII_VALUE = 128
"""Exclusive upper bound of the ASCII code-point range (valid values: 0-127).

Characters whose code point is greater than or equal to this value are treated
as non-ASCII binary data and filtered out of embedded metadata.
"""

_logger = logging.getLogger(__name__)
33class TescanTiffExtractor:
34 """
35 Extractor for Tescan FIB/SEM TIFF files.
37 This extractor handles metadata extraction from .tif files saved by
38 Tescan FIB and SEM instruments (e.g., AMBER X). The extractor uses
39 a tiered strategy (see ``extract``):
41 1. Primary: Parse the INI-format metadata embedded in TIFF tag 50431, or, if that yields nothing, a sidecar .hdr file
42 2. Supplement: Always read the standard TIFF tags as well, filling in fields the HDR metadata did not provide
44 The .hdr file contains comprehensive acquisition parameters in two sections:
45 [MAIN] and [SEM], which are parsed using Python's configparser.
46 """
48 name = "tescan_tif_extractor"
49 priority = 150
50 supported_extensions: ClassVar = {"tif", "tiff"}
52 def supports(self, context: ExtractionContext) -> bool:
53 """
54 Check if this extractor supports the given file.
56 Performs content sniffing to verify this is a Tescan TIFF file by:
57 1. Checking file extension (.tif or .tiff)
58 2. Looking for either a sidecar .hdr file or Tescan-specific TIFF tags
60 Parameters
61 ----------
62 context
63 The extraction context containing file information
65 Returns
66 -------
67 bool
68 True if this appears to be a Tescan TIFF file
69 """
70 extension = context.file_path.suffix.lower().lstrip(".")
71 if extension not in {"tif", "tiff"}:
72 return False
74 # Check for sidecar HDR file
75 hdr_file = self._find_hdr_file(context.file_path)
76 if hdr_file is not None and self._is_tescan_hdr(hdr_file):
77 return True
79 # Fallback: check TIFF tags for Tescan signature
80 try:
81 with Image.open(context.file_path) as img:
82 # Check for TESCAN in Make tag (271) or Software tag (305)
83 make = img.tag_v2.get(271, "")
84 software = img.tag_v2.get(305, "")
85 if "TESCAN" in str(make).upper() or "TESCAN" in str(software).upper():
86 return True
87 # check for custom Tescan metadata tag
88 tescan_metadata = img.tag_v2.get(TESCAN_TIFF_TAG, "")
89 if tescan_metadata != "":
90 return True
91 except Exception as e:
92 _logger.debug(
93 "Could not read TIFF tags from %s: %s",
94 context.file_path,
95 e,
96 )
97 return False
99 return False
101 def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
102 """
103 Extract metadata from a Tescan FIB/SEM TIFF file.
105 Returns the metadata (as a list of dictionaries) from a .tif file saved by
106 Tescan instruments. Uses a three-tier extraction strategy:
107 1. Try to parse embedded HDR metadata from TIFF Tag 50431
108 2. If that fails, look for a sidecar .hdr file
109 3. Always extract basic TIFF tags as well
111 Parameters
112 ----------
113 context
114 The extraction context containing file information
116 Returns
117 -------
118 list[dict]
119 List containing a single metadata dict with 'nx_meta' key
120 """
121 filename = context.file_path
122 _logger.debug("Extracting metadata from Tescan TIFF file: %s", filename)
124 mdict = {"nx_meta": {}}
125 # Assume all datasets coming from Tescan are SEM Images, originally
126 mdict["nx_meta"]["DatasetType"] = "Image"
127 mdict["nx_meta"]["Data Type"] = "SEM_Imaging"
129 _set_instr_name_and_time(mdict, filename)
131 hdr_parsed = False
133 # Strategy 1: Try to parse embedded HDR metadata from TIFF tag 50431
134 try:
135 embedded_metadata = self._extract_embedded_hdr(filename)
136 if embedded_metadata:
137 mdict.update(embedded_metadata)
138 mdict = self._parse_nx_meta(mdict)
139 hdr_parsed = True
140 _logger.debug("Successfully parsed embedded HDR from TIFF tag")
141 except Exception as e:
142 _logger.debug("Could not parse embedded HDR metadata: %s", e)
144 # Strategy 2: If embedded parsing failed, try sidecar HDR file
145 if not hdr_parsed:
146 hdr_file = self._find_hdr_file(filename)
147 if hdr_file is not None and self._is_tescan_hdr(hdr_file):
148 try:
149 hdr_metadata = self._read_hdr_metadata(hdr_file)
150 mdict.update(hdr_metadata)
151 mdict = self._parse_nx_meta(mdict)
152 hdr_parsed = True
153 _logger.debug("Successfully parsed sidecar HDR file")
154 except Exception as e:
155 _logger.warning(
156 "Failed to parse HDR file %s: %s",
157 hdr_file,
158 e,
159 )
161 # Strategy 3: Always extract basic TIFF tags (may supplement or override)
162 self._extract_from_tiff_tags(filename, mdict)
164 # Migrate metadata to schema-compliant format
165 mdict = self._migrate_to_schema_compliant_metadata(mdict)
167 # Sort the nx_meta dictionary (recursively) for nicer display
168 mdict["nx_meta"] = sort_dict(mdict["nx_meta"])
170 return [mdict]
172 def _find_hdr_file(self, tiff_path: Path) -> Path | None:
173 """
174 Find the sidecar .hdr file for a given TIFF file.
176 Parameters
177 ----------
178 tiff_path
179 Path to the TIFF file
181 Returns
182 -------
183 Path or None
184 Path to the .hdr file if it exists, None otherwise
185 """
186 hdr_path = tiff_path.with_suffix(".hdr")
187 if hdr_path.exists():
188 return hdr_path
189 return None
191 def _is_tescan_hdr(self, hdr_path: Path) -> bool:
192 """
193 Verify that an HDR file is a Tescan format file.
195 Checks for the presence of [MAIN] and [SEM] sections which are
196 characteristic of Tescan HDR files.
198 Parameters
199 ----------
200 hdr_path
201 Path to the .hdr file
203 Returns
204 -------
205 bool
206 True if this appears to be a Tescan HDR file
207 """
208 try:
209 with hdr_path.open("r", encoding="utf-8", errors="ignore") as f:
210 content = f.read(500) # Read first 500 chars
211 # Look for characteristic Tescan sections
212 return "[MAIN]" in content or "Device=TESCAN" in content
213 except Exception as e:
214 _logger.debug("Could not verify HDR file %s: %s", hdr_path, e)
215 return False
217 def _extract_embedded_hdr(
218 self, tiff_path: Path
219 ) -> dict[str, dict[str, str]] | None:
220 """
221 Extract embedded HDR metadata from TIFF Tag TESCAN_TIFF_TAG.
223 Tescan embeds the complete HDR metadata in TIFF tag TESCAN_TIFF_TAG as a
224 binary blob containing the INI-formatted text. The tag may contain binary
225 garbage at the beginning before the actual metadata starts.
227 Parameters
228 ----------
229 tiff_path
230 Path to the TIFF file
232 Returns
233 -------
234 dict or None
235 Dictionary with section names as keys and key-value dicts as values,
236 or None if tag is not present or cannot be parsed
237 """
238 try:
239 with Image.open(tiff_path) as img:
240 metadata_tag = img.tag_v2.get(TESCAN_TIFF_TAG)
241 if metadata_tag is None:
242 return None
244 # Convert tag to bytes
245 metadata_bytes = self._tag_to_bytes(metadata_tag)
247 # Extract metadata string from binary data
248 metadata_str = self._extract_metadata_string(metadata_bytes)
250 # Clean up non-printable characters
251 metadata_str = self._clean_metadata_string(metadata_str)
253 # Add section headers if missing
254 metadata_str = self._add_section_headers_if_needed(metadata_str)
256 # Parse as INI format
257 return self._parse_hdr_string(metadata_str)
259 except Exception as e:
260 _logger.debug("Failed to extract embedded HDR from tag 50431: %s", e)
261 return None
263 def _tag_to_bytes(self, metadata_tag: Any) -> bytes:
264 """Convert TIFF tag data to bytes.
266 Parameters
267 ----------
268 metadata_tag
269 Tag data in various formats (bytes, str, etc.)
271 Returns
272 -------
273 bytes
274 Converted bytes
276 Raises
277 ------
278 TypeError
279 If tag data is not bytes or str
280 """
281 if isinstance(metadata_tag, bytes):
282 return metadata_tag
283 if isinstance(metadata_tag, str):
284 return metadata_tag.encode("utf-8")
285 msg = f"Unsupported metadata tag type: {type(metadata_tag)}"
286 raise TypeError(msg)
288 def _extract_metadata_string(self, metadata_bytes: bytes) -> str:
289 """Extract metadata string from binary data by removing garbage.
291 The tag may contain binary garbage at the beginning. This method looks
292 for known keys to find the start of actual metadata.
294 Parameters
295 ----------
296 metadata_bytes
297 Raw binary metadata from TIFF tag
299 Returns
300 -------
301 str
302 Cleaned metadata string
303 """
304 # Look for the start of metadata by searching for known keys
305 search_keys = [b"[MAIN]", b"AccFrames=", b"AccType=", b"Company=", b"Date="]
306 for search_key in search_keys:
307 pos = metadata_bytes.find(search_key)
308 if pos >= 0:
309 metadata_bytes = metadata_bytes[pos:]
310 return metadata_bytes.replace(b"\x00", b"").decode(
311 "utf-8", errors="ignore"
312 )
314 # Fallback: decode whole thing
315 return metadata_bytes.replace(b"\x00", b"").decode("utf-8", errors="ignore")
317 def _clean_metadata_string(self, metadata_str: str) -> str:
318 """Remove non-printable binary characters from metadata string.
320 Parameters
321 ----------
322 metadata_str
323 Metadata string that may contain non-printable characters
325 Returns
326 -------
327 str
328 Cleaned metadata string
329 """
330 return "".join(
331 c
332 for c in metadata_str
333 if ord(c) < _MAX_ASCII_VALUE and (c.isprintable() or c in "\n\r\t")
334 )
336 def _add_section_headers_if_needed(self, metadata_str: str) -> str:
337 """Add [MAIN] and [SEM] section headers if missing.
339 Tescan's embedded metadata doesn't include section headers, so this
340 method detects where the SEM section starts and inserts headers.
342 Parameters
343 ----------
344 metadata_str
345 Metadata string potentially without section headers
347 Returns
348 -------
349 str
350 Metadata string with section headers
351 """
352 if "[MAIN]" in metadata_str or "[SEM]" in metadata_str:
353 return metadata_str
355 # Find where SEM section starts by looking for known SEM keys
356 sem_keys = [
357 "AcceleratorVoltage=",
358 "ApertureDiameter=",
359 "ApertureOptimization=",
360 "ChamberPressure=",
361 "CrossFree=",
362 "HV=",
363 ]
364 sem_start_pos = self._find_sem_section_start(metadata_str, sem_keys)
366 # Insert section headers at line boundaries
367 if sem_start_pos < len(metadata_str):
368 line_start = metadata_str.rfind("\n", 0, sem_start_pos)
369 if line_start < 0:
370 line_start = 0
371 else:
372 line_start += 1 # Move past the \n
373 return (
374 "[MAIN]\n"
375 + metadata_str[:line_start]
376 + "[SEM]\n"
377 + metadata_str[line_start:]
378 )
380 # No SEM section found
381 return "[MAIN]\n" + metadata_str
383 def _find_sem_section_start(self, metadata_str: str, sem_keys: list[str]) -> int:
384 """Find the position where SEM section starts.
386 Parameters
387 ----------
388 metadata_str
389 Metadata string to search
390 sem_keys
391 List of keys that typically appear in SEM section
393 Returns
394 -------
395 int
396 Position of first SEM key, or length of string if not found
397 """
398 sem_start_pos = len(metadata_str)
399 for sem_key in sem_keys:
400 pos = metadata_str.find(sem_key)
401 if pos >= 0 and pos < sem_start_pos:
402 sem_start_pos = pos
403 return sem_start_pos
405 def _parse_hdr_string(self, hdr_string: str) -> dict[str, dict[str, str]]:
406 """
407 Parse HDR metadata from a string in INI format.
409 Parameters
410 ----------
411 hdr_string
412 HDR metadata as a string in INI format
414 Returns
415 -------
416 dict
417 Dictionary with section names as keys and key-value dicts as values
418 """
419 # Normalize line endings
420 hdr_string = hdr_string.replace("\r\n", "\n").replace("\r", "\n")
422 # Parse with ConfigParser
423 config = configparser.ConfigParser()
424 # Make ConfigParser respect upper/lowercase values
425 config.optionxform = lambda option: option
427 # Use StringIO to read from string
428 buf = io.StringIO(hdr_string)
429 config.read_file(buf)
431 metadata = {}
432 for section in config.sections():
433 metadata[section] = dict(config.items(section))
435 return metadata
437 def _read_hdr_metadata(self, hdr_path: Path) -> dict[str, dict[str, str]]:
438 """
439 Read and parse a Tescan .hdr file.
441 The .hdr file is in INI format with sections like [MAIN] and [SEM].
443 Parameters
444 ----------
445 hdr_path
446 Path to the .hdr file
448 Returns
449 -------
450 dict
451 Dictionary with section names as keys and key-value dicts as values
452 """
453 with hdr_path.open("r", encoding="utf-8", errors="ignore") as f:
454 hdr_string = f.read()
456 return self._parse_hdr_string(hdr_string)
458 def _extract_from_tiff_tags(self, filename: Path, mdict: dict) -> None:
459 """
460 Extract basic metadata from TIFF tags.
462 This supplements metadata from HDR files with standard TIFF tags.
463 Only adds fields that haven't already been set by HDR parsing.
464 Updates mdict in place.
466 Parameters
467 ----------
468 filename
469 Path to the TIFF file
470 mdict
471 Metadata dictionary to update
472 """
473 try:
474 with Image.open(filename) as img:
475 # Extract standard TIFF tags
476 # 271 = Make
477 # 272 = Model
478 # 305 = Software
479 # 306 = DateTime
480 # 315 = Artist (username)
482 # Only add Make if not already present
483 if "Make" not in mdict["nx_meta"]:
484 make = img.tag_v2.get(271)
485 if make:
486 mdict["nx_meta"]["Make"] = make
488 # Only add Model if not already present
489 if "Model" not in mdict["nx_meta"]:
490 model = img.tag_v2.get(272)
491 if model:
492 mdict["nx_meta"]["Model"] = model
494 # Only add Software Version if not already present
495 if "Software Version" not in mdict["nx_meta"]:
496 software = img.tag_v2.get(305)
497 if software:
498 mdict["nx_meta"]["Software Version"] = software
500 # Always add TIFF DateTime as supplemental info
501 datetime_str = img.tag_v2.get(306)
502 if datetime_str:
503 mdict["nx_meta"]["TIFF DateTime"] = datetime_str
505 # Only add Operator from Artist tag if not already present
506 if "Operator" not in mdict["nx_meta"]:
507 artist = img.tag_v2.get(315)
508 if artist:
509 mdict["nx_meta"]["Operator"] = artist
511 # Only add dimensions if not already present
512 if "Data Dimensions" not in mdict["nx_meta"]:
513 width = img.tag_v2.get(256) # ImageWidth
514 height = img.tag_v2.get(257) # ImageLength
515 if width and height:
516 mdict["nx_meta"]["Data Dimensions"] = str((width, height))
518 except Exception as e:
519 _logger.warning("Failed to extract TIFF tags from %s: %s", filename, e)
520 mdict["nx_meta"]["Extractor Warnings"] = f"Failed to extract TIFF tags: {e}"
522 def _get_field_definitions(self) -> list:
523 """
524 Get field definitions for metadata extraction.
526 Returns
527 -------
528 list
529 List of FieldDefinition tuples
530 """
531 return [
532 # [MAIN] section - in order as they appear in HDR file
533 FD("MAIN", "AccFrames", "Accumulated Frames", 1, False),
534 FD("MAIN", "AccType", "Accumulation Type", 1, True),
535 FD("MAIN", "Company", "Company", 1, True),
536 FD("MAIN", "Date", "Acquisition Date", 1, True),
537 FD("MAIN", "Description", "Description", 1, True),
538 FD("MAIN", "Device", "Device", 1, True),
539 FD("MAIN", "DeviceModel", "Device Model", 1, True),
540 FD("MAIN", "FullUserName", "Full User Name", 1, True),
541 FD("MAIN", "ImageStripSize", "Image Strip Size", 1, False),
542 FD(
543 "MAIN",
544 "Magnification",
545 "Magnification",
546 1e-3,
547 False,
548 target_unit="kiloX",
549 ),
550 FD("MAIN", "MagnificationReference", "Magnification Reference", 1, False),
551 FD("MAIN", "OrigFileName", "Original Filename", 1, True),
552 FD(
553 "MAIN", "PixelSizeX", "Pixel Width", 1e9, False, target_unit="nanometer"
554 ),
555 FD(
556 "MAIN",
557 "PixelSizeY",
558 "Pixel Height",
559 1e9,
560 False,
561 target_unit="nanometer",
562 ),
563 FD("MAIN", "SerialNumber", "Serial Number", 1, True),
564 FD("MAIN", "Sign", "Sign", 1, True),
565 FD("MAIN", "SoftwareVersion", "Software Version", 1, True),
566 FD("MAIN", "Time", "Acquisition Time", 1, True),
567 FD("MAIN", "UserName", "User Name", 1, True),
568 FD("MAIN", "ViewFieldsCountX", "View Fields Count X", 1, False),
569 FD("MAIN", "ViewFieldsCountY", "View Fields Count Y", 1, False),
570 # [SEM] section - in order as they appear in HDR file
571 FD(
572 "SEM",
573 "AcceleratorVoltage",
574 "Accelerator Voltage",
575 1e-3,
576 False,
577 target_unit="kilovolt",
578 ),
579 FD(
580 "SEM",
581 "ApertureDiameter",
582 "Aperture Diameter",
583 1e6,
584 False,
585 target_unit="micrometer",
586 ),
587 FD("SEM", "ApertureOptimization", "Aperture Optimization", 1, False),
588 FD(
589 "SEM",
590 "ChamberPressure",
591 "Chamber Pressure",
592 1e3,
593 False,
594 target_unit="millipascal",
595 ),
596 FD("SEM", "CrossFree", "Cross Free", 1, False),
597 FD(
598 "SEM",
599 "CrossSectionShiftX",
600 "Cross Section Shift X",
601 1e6,
602 False,
603 target_unit="micrometer",
604 ),
605 FD(
606 "SEM",
607 "CrossSectionShiftY",
608 "Cross Section Shift Y",
609 1e6,
610 False,
611 target_unit="micrometer",
612 ),
613 FD(
614 "SEM",
615 "DepthOfFocus",
616 "Depth of Focus",
617 1e6,
618 False,
619 target_unit="micrometer",
620 ),
621 FD("SEM", "Detector", "Detector Name", 1, True),
622 FD("SEM", "Detector0", "Detector 0", 1, True),
623 FD("SEM", "Detector0FlatField", "Detector 0 Flat Field", 1, False),
624 FD("SEM", "Detector0Gain", "Detector 0 Gain", 1, False),
625 FD("SEM", "Detector0Offset", "Detector 0 Offset", 1, False),
626 FD(
627 "SEM",
628 "DwellTime",
629 "Pixel Dwell Time",
630 1e6,
631 False,
632 target_unit="microsecond",
633 ),
634 FD(
635 "SEM",
636 "EmissionCurrent",
637 "Emission Current",
638 1e6,
639 False,
640 target_unit="microampere",
641 ),
642 FD("SEM", "Gun", "Gun Type", 1, True),
643 FD("SEM", "GunShiftX", "Gun Shift X", 1, False),
644 FD("SEM", "GunShiftY", "Gun Shift Y", 1, False),
645 FD("SEM", "GunTiltX", "Gun Tilt X", 1, False),
646 FD("SEM", "GunTiltY", "Gun Tilt Y", 1, False),
647 FD("SEM", "HV", "HV Voltage", 1e-3, False, target_unit="kilovolt"),
648 FD("SEM", "IMLCenteringX", "IML Centering X", 1, False),
649 FD("SEM", "IMLCenteringY", "IML Centering Y", 1, False),
650 FD(
651 "SEM",
652 "ImageShiftX",
653 "Image Shift X",
654 1e9,
655 False,
656 target_unit="nanometer",
657 ),
658 FD(
659 "SEM",
660 "ImageShiftY",
661 "Image Shift Y",
662 1e9,
663 False,
664 target_unit="nanometer",
665 ),
666 FD("SEM", "InjectedGas", "Injected Gas", 1, True),
667 FD("SEM", "LUTGamma", "LUT Gamma", 1, False),
668 FD("SEM", "LUTMaximum", "LUT Maximum", 1, False),
669 FD("SEM", "LUTMinimum", "LUT Minimum", 1, False),
670 FD("SEM", "MTDGrid", "MTD Grid", 1e-3, False, target_unit="kilovolt"),
671 FD(
672 "SEM",
673 "MTDScintillator",
674 "MTD Scintillator",
675 1e-3,
676 False,
677 target_unit="kilovolt",
678 ),
679 FD("SEM", "OBJCenteringX", "OBJ Centering X", 1, False),
680 FD("SEM", "OBJCenteringY", "OBJ Centering Y", 1, False),
681 FD("SEM", "OBJPreCenteringX", "OBJ Pre-Centering X", 1, False),
682 FD("SEM", "OBJPreCenteringY", "OBJ Pre-Centering Y", 1, False),
683 FD("SEM", "PotentialMode", "Potential Mode", 1, True),
684 FD(
685 "SEM",
686 "PredictedBeamCurrent",
687 "Predicted Beam Current",
688 1e12,
689 False,
690 target_unit="picoampere",
691 ),
692 FD("SEM", "PrimaryDetectorGain", "Primary Detector Gain", 1, False),
693 FD("SEM", "PrimaryDetectorOffset", "Primary Detector Offset", 1, False),
694 FD("SEM", "SampleVoltage", "Sample Voltage", 1, False, target_unit="volt"),
695 FD("SEM", "ScanID", "Scan ID", 1, False),
696 FD("SEM", "ScanMode", "Scan Mode", 1, True),
697 FD("SEM", "ScanRotation", "Scan Rotation", 1, False, target_unit="degree"),
698 FD("SEM", "ScanSpeed", "Scan Speed", 1, False),
699 FD("SEM", "SessionID", "Session ID", 1, True),
700 FD(
701 "SEM",
702 "SpecimenCurrent",
703 "Specimen Current",
704 1e12,
705 False,
706 target_unit="picoampere",
707 ),
708 FD("SEM", "SpotSize", "Spot Size", 1e9, False, target_unit="nanometer"),
709 FD(
710 "SEM",
711 "StageRotation",
712 ["Stage Position", "Rotation"],
713 1,
714 False,
715 target_unit="degree",
716 ),
717 FD(
718 "SEM",
719 "StageTilt",
720 ["Stage Position", "Tilt"],
721 1,
722 False,
723 target_unit="degree",
724 ),
725 FD("SEM", "StageX", ["Stage Position", "X"], 1, False, target_unit="meter"),
726 FD("SEM", "StageY", ["Stage Position", "Y"], 1, False, target_unit="meter"),
727 FD("SEM", "StageZ", ["Stage Position", "Z"], 1, False, target_unit="meter"),
728 FD("SEM", "StigmatorX", "Stigmator X Value", 1, False),
729 FD("SEM", "StigmatorY", "Stigmator Y Value", 1, False),
730 FD(
731 "SEM",
732 "SymmetrizationVoltage",
733 "Symmetrization Voltage",
734 1e-3,
735 False,
736 target_unit="kilovolt",
737 ),
738 FD("SEM", "SyncMains", "Sync to Mains", 1, True),
739 FD("SEM", "TiltCorrection", "Tilt Correction", 1, False),
740 FD(
741 "SEM",
742 "TubeVoltage",
743 "Tube Voltage",
744 1e-3,
745 False,
746 target_unit="kilovolt",
747 ),
748 FD(
749 "SEM",
750 "VirtualObserverDistance",
751 "Virtual Observer Distance",
752 1e3,
753 False,
754 target_unit="millimeter",
755 ),
756 FD("SEM", "WD", "Working Distance", 1e3, False, target_unit="millimeter"),
757 ]
759 def _parse_nx_meta(self, mdict: dict) -> dict: # noqa: PLR0912
760 """
761 Parse metadata into NexusLIMS format.
763 Extracts important metadata from the [MAIN] and [SEM] sections
764 of the HDR file and places them in standardized locations under
765 the nx_meta key.
767 Parameters
768 ----------
769 mdict
770 Metadata dictionary with [MAIN] and [SEM] sections
772 Returns
773 -------
774 dict
775 Updated metadata dictionary with parsed nx_meta fields
776 """
777 # Initialize warnings list
778 if "warnings" not in mdict["nx_meta"]:
779 mdict["nx_meta"]["warnings"] = []
781 main_section = mdict.get("MAIN", {})
782 sem_section = mdict.get("SEM", {})
784 # Get field definitions
785 fields = self._get_field_definitions()
787 # Extract standard fields
788 for field in fields:
789 section = main_section if field.section == "MAIN" else sem_section
790 value = section.get(field.source_key)
792 # Try fallback keys for some fields
793 if value is None and field.source_key == "HV":
794 value = sem_section.get("AcceleratorVoltage")
795 elif value is None and field.source_key == "Detector0Gain":
796 value = sem_section.get("PrimaryDetectorGain")
797 elif value is None and field.source_key == "Detector0Offset":
798 value = sem_section.get("PrimaryDetectorOffset")
800 if value:
801 if field.is_string:
802 # Handle nested dict paths vs flat keys
803 # (impossible to test with existing metadata structure,
804 # so exclude from coverage)
805 if isinstance(field.output_key, list): # pragma: no cover
806 set_nested_dict_value(
807 mdict, ["nx_meta", *field.output_key], value
808 )
809 else:
810 mdict["nx_meta"][field.output_key] = value
811 else:
812 with contextlib.suppress(ValueError):
813 # Convert to Decimal to preserve precision through unit
814 # conversions. The ureg uses non_int_type=Decimal to avoid
815 # floating-point errors during internal conversions.
816 # Also apply scaling factor for unit conversion
817 decimal_val = Decimal(value) * Decimal(str(field.factor))
819 # Skip if suppress_zero is True and value is zero
820 if field.suppress_zero and decimal_val == 0:
821 continue
823 # Create Pint Quantity if unit is specified
824 if field.target_unit:
825 # Create Quantity with the value after factor conversion
826 quantity = ureg.Quantity(decimal_val, field.target_unit)
828 if isinstance(field.output_key, list):
829 set_nested_dict_value(
830 mdict, ["nx_meta", *field.output_key], quantity
831 )
832 else:
833 mdict["nx_meta"][field.output_key] = quantity
834 # No unit specified, keep as Decimal for precision
835 elif isinstance(field.output_key, list):
836 set_nested_dict_value(
837 mdict, ["nx_meta", *field.output_key], decimal_val
838 )
839 else:
840 mdict["nx_meta"][field.output_key] = decimal_val
842 # Handle user information (prefer FullUserName over UserName)
843 full_username = main_section.get("FullUserName")
844 username = main_section.get("UserName")
845 if full_username or username:
846 mdict["nx_meta"]["Operator"] = full_username or username
847 mdict["nx_meta"]["warnings"].append(["Operator"])
849 return mdict
851 def _migrate_to_schema_compliant_metadata(self, mdict: dict) -> dict:
852 """
853 Migrate metadata to schema-compliant format.
855 Reorganizes metadata to conform to type-specific Pydantic schemas:
856 - Extracts core EM Glossary fields to top level with standardized names
857 - Moves vendor-specific nested dictionaries and fields to extensions section
858 - Preserves existing extensions from instrument profiles
860 Parameters
861 ----------
862 mdict
863 Metadata dictionary with nx_meta containing extracted fields
865 Returns
866 -------
867 dict
868 Metadata dictionary with schema-compliant nx_meta structure
869 """
870 nx_meta = mdict.get("nx_meta", {})
872 # Preserve existing extensions from instrument profiles
873 extensions = (
874 nx_meta.get("extensions", {}).copy() if "extensions" in nx_meta else {}
875 )
877 # Field mappings from display names to EM Glossary names
878 field_mappings = {
879 "HV Voltage": "acceleration_voltage",
880 "Accelerator Voltage": "acceleration_voltage",
881 "Working Distance": "working_distance",
882 "Beam Current": "beam_current",
883 "Emission Current": "emission_current",
884 "Pixel Dwell Time": "dwell_time",
885 "Horizontal Field Width": "horizontal_field_width",
886 "Pixel Width": "pixel_width",
887 "Pixel Height": "pixel_height",
888 }
890 # Tescan-specific fields that go to extensions (ALL non-core fields)
891 # Since tescan extractor currently extracts many individual fields at top level,
892 # we move them all to extensions except the core EM Glossary ones
893 extension_field_names = {
894 "Operator", # User info
895 # Any other Tescan-specific fields we discover
896 }
898 # Build new nx_meta with proper field organization
899 new_nx_meta = {}
901 # Copy required fields
902 for field in ["DatasetType", "Data Type", "Creation Time"]:
903 if field in nx_meta:
904 new_nx_meta[field] = nx_meta[field]
906 # Copy instrument identification
907 if "Instrument ID" in nx_meta:
908 new_nx_meta["Instrument ID"] = nx_meta["Instrument ID"]
910 # Process all fields and categorize
911 for old_name, value in nx_meta.items():
912 # Skip fields we've already handled
913 if old_name in [
914 "DatasetType",
915 "Data Type",
916 "Creation Time",
917 "Instrument ID",
918 "Extractor Warnings",
919 "warnings",
920 "extensions",
921 ]:
922 continue
924 # Check if this is a core field that needs renaming
925 if old_name in field_mappings:
926 emg_name = field_mappings[old_name]
927 new_nx_meta[emg_name] = value
928 continue
930 # Fields explicitly marked as extensions
931 if old_name in extension_field_names:
932 extensions[old_name] = value
933 continue
935 # Everything else goes to extensions (Tescan-specific fields)
936 # This is the safest approach since most Tescan fields are vendor-specific
937 extensions[old_name] = value
939 # Copy warnings if present
940 if "warnings" in nx_meta:
941 new_nx_meta["warnings"] = nx_meta["warnings"]
943 # Add extensions section if we have any
944 for key, value in extensions.items():
945 add_to_extensions(new_nx_meta, key, value)
947 mdict["nx_meta"] = new_nx_meta
948 return mdict