Source code for nexusLIMS.extractors.plugins.tescan_tif
# ruff: noqa: N817, FBT003
"""Tescan (P)FIB/SEM TIFF extractor plugin."""
import configparser
import contextlib
import io
import logging
from decimal import Decimal
from pathlib import Path
from typing import Any, ClassVar
from PIL import Image
from nexusLIMS.extractors.base import ExtractionContext
from nexusLIMS.extractors.base import FieldDefinition as FD
from nexusLIMS.extractors.utils import _set_instr_name_and_time, add_to_extensions
from nexusLIMS.schemas.units import ureg
from nexusLIMS.utils.dicts import set_nested_dict_value, sort_dict
TESCAN_TIFF_TAG = 50431
"""
TIFF tag ID where Tescan stores INI-style metadata in TIFF files.
The tag holds instrument configuration, beam parameters, stage position,
detector settings, and other acquisition metadata.
"""
_MAX_ASCII_VALUE = 128
"""Exclusive upper bound for ASCII code points (valid values are 0-127). Used to filter non-ASCII binary data."""
_logger = logging.getLogger(__name__)
[docs]
class TescanTiffExtractor:
"""
Extractor for Tescan FIB/SEM TIFF files.
This extractor handles metadata extraction from .tif files saved by
Tescan FIB and SEM instruments (e.g., AMBER X). The extractor uses
a three-tier strategy:
1. Primary: Parse the HDR metadata embedded in TIFF tag 50431
2. Fallback: Look for a sidecar .hdr file with full metadata in INI format
3. Always: Supplement the result with basic metadata from standard TIFF tags
The HDR metadata contains comprehensive acquisition parameters in two sections:
[MAIN] and [SEM], which are parsed using Python's configparser.
"""
name = "tescan_tif_extractor"
priority = 150
supported_extensions: ClassVar = {"tif", "tiff"}
[docs]
def supports(self, context: ExtractionContext) -> bool:
    """
    Decide whether this extractor can handle the file in ``context``.

    A file is accepted when its extension is ``.tif``/``.tiff`` and at least
    one of the following holds:

    1. a sidecar ``.hdr`` file exists and passes ``_is_tescan_hdr``
    2. the TIFF Make (271) or Software (305) tag contains "TESCAN"
       (case-insensitive)
    3. the custom metadata tag ``TESCAN_TIFF_TAG`` is present and non-empty

    Parameters
    ----------
    context
        The extraction context containing file information

    Returns
    -------
    bool
        True if this appears to be a Tescan TIFF file. Any error while
        reading the TIFF tags is logged at debug level and yields False.
    """
    path = context.file_path
    if path.suffix.lower().lstrip(".") not in {"tif", "tiff"}:
        return False

    # Cheapest positive signal: a Tescan sidecar header next to the image
    sidecar = self._find_hdr_file(path)
    if sidecar is not None and self._is_tescan_hdr(sidecar):
        return True

    # Otherwise sniff the TIFF tags themselves
    try:
        with Image.open(path) as img:
            make = img.tag_v2.get(271, "")
            software = img.tag_v2.get(305, "")
            if any("TESCAN" in str(tag).upper() for tag in (make, software)):
                return True

            # Last resort: the vendor-specific metadata tag
            return img.tag_v2.get(TESCAN_TIFF_TAG, "") != ""
    except Exception as e:
        _logger.debug(
            "Could not read TIFF tags from %s: %s",
            path,
            e,
        )
        return False
[docs]
def extract(self, context: ExtractionContext) -> list[dict[str, Any]]:
    """
    Extract metadata from a Tescan FIB/SEM TIFF file.

    The metadata is gathered in three steps:

    1. Parse the HDR metadata embedded in TIFF tag 50431
    2. If step 1 produced nothing (or raised), parse a sidecar .hdr file
    3. Always read the basic TIFF tags as a supplement

    The result is then migrated to the schema-compliant layout and the
    ``nx_meta`` dictionary is sorted for display.

    Parameters
    ----------
    context
        The extraction context containing file information

    Returns
    -------
    list[dict]
        List containing a single metadata dict with 'nx_meta' key
    """
    tiff_path = context.file_path
    _logger.debug("Extracting metadata from Tescan TIFF file: %s", tiff_path)

    # Every Tescan dataset starts out classified as an SEM image
    mdict = {"nx_meta": {"DatasetType": "Image", "Data Type": "SEM_Imaging"}}
    _set_instr_name_and_time(mdict, tiff_path)

    embedded_ok = False

    # Step 1: HDR text embedded in the TIFF itself
    try:
        embedded = self._extract_embedded_hdr(tiff_path)
        if embedded:
            mdict.update(embedded)
            mdict = self._parse_nx_meta(mdict)
            embedded_ok = True
            _logger.debug("Successfully parsed embedded HDR from TIFF tag")
    except Exception as e:
        _logger.debug("Could not parse embedded HDR metadata: %s", e)

    # Step 2: sidecar .hdr file, only when step 1 did not succeed
    if not embedded_ok:
        sidecar = self._find_hdr_file(tiff_path)
        if sidecar is not None and self._is_tescan_hdr(sidecar):
            try:
                mdict.update(self._read_hdr_metadata(sidecar))
                mdict = self._parse_nx_meta(mdict)
                _logger.debug("Successfully parsed sidecar HDR file")
            except Exception as e:
                _logger.warning(
                    "Failed to parse HDR file %s: %s",
                    sidecar,
                    e,
                )

    # Step 3: standard TIFF tags are always consulted
    self._extract_from_tiff_tags(tiff_path, mdict)

    mdict = self._migrate_to_schema_compliant_metadata(mdict)

    # Recursively sorted nx_meta reads more nicely when displayed
    mdict["nx_meta"] = sort_dict(mdict["nx_meta"])
    return [mdict]
def _find_hdr_file(self, tiff_path: Path) -> Path | None:
"""
Find the sidecar .hdr file for a given TIFF file.
Parameters
----------
tiff_path
Path to the TIFF file
Returns
-------
Path or None
Path to the .hdr file if it exists, None otherwise
"""
hdr_path = tiff_path.with_suffix(".hdr")
if hdr_path.exists():
return hdr_path
return None
def _is_tescan_hdr(self, hdr_path: Path) -> bool:
"""
Verify that an HDR file is a Tescan format file.
Checks for the presence of [MAIN] and [SEM] sections which are
characteristic of Tescan HDR files.
Parameters
----------
hdr_path
Path to the .hdr file
Returns
-------
bool
True if this appears to be a Tescan HDR file
"""
try:
with hdr_path.open("r", encoding="utf-8", errors="ignore") as f:
content = f.read(500) # Read first 500 chars
# Look for characteristic Tescan sections
return "[MAIN]" in content or "Device=TESCAN" in content
except Exception as e:
_logger.debug("Could not verify HDR file %s: %s", hdr_path, e)
return False
def _extract_embedded_hdr(
    self, tiff_path: Path
) -> dict[str, dict[str, str]] | None:
    """
    Read and parse the HDR metadata stored in TIFF tag TESCAN_TIFF_TAG.

    The tag value is converted to bytes, trimmed to where the metadata text
    starts, stripped of non-printable characters, given ``[MAIN]``/``[SEM]``
    headers when they are missing, and finally parsed as INI text.

    Parameters
    ----------
    tiff_path
        Path to the TIFF file

    Returns
    -------
    dict or None
        Dictionary with section names as keys and key-value dicts as values,
        or None if the tag is not present or any step fails (failures are
        logged at debug level)
    """
    try:
        with Image.open(tiff_path) as img:
            raw_tag = img.tag_v2.get(TESCAN_TIFF_TAG)

        if raw_tag is None:
            return None

        # bytes -> text starting at the first known key -> printable text
        text = self._extract_metadata_string(self._tag_to_bytes(raw_tag))
        text = self._clean_metadata_string(text)
        # The embedded form may lack section headers; add them before parsing
        text = self._add_section_headers_if_needed(text)
        return self._parse_hdr_string(text)
    except Exception as e:
        _logger.debug("Failed to extract embedded HDR from tag 50431: %s", e)
        return None
def _tag_to_bytes(self, metadata_tag: Any) -> bytes:
"""Convert TIFF tag data to bytes.
Parameters
----------
metadata_tag
Tag data in various formats (bytes, str, etc.)
Returns
-------
bytes
Converted bytes
Raises
------
TypeError
If tag data is not bytes or str
"""
if isinstance(metadata_tag, bytes):
return metadata_tag
if isinstance(metadata_tag, str):
return metadata_tag.encode("utf-8")
msg = f"Unsupported metadata tag type: {type(metadata_tag)}"
raise TypeError(msg)
def _extract_metadata_string(self, metadata_bytes: bytes) -> str:
"""Extract metadata string from binary data by removing garbage.
The tag may contain binary garbage at the beginning. This method looks
for known keys to find the start of actual metadata.
Parameters
----------
metadata_bytes
Raw binary metadata from TIFF tag
Returns
-------
str
Cleaned metadata string
"""
# Look for the start of metadata by searching for known keys
search_keys = [b"[MAIN]", b"AccFrames=", b"AccType=", b"Company=", b"Date="]
for search_key in search_keys:
pos = metadata_bytes.find(search_key)
if pos >= 0:
metadata_bytes = metadata_bytes[pos:]
return metadata_bytes.replace(b"\x00", b"").decode(
"utf-8", errors="ignore"
)
# Fallback: decode whole thing
return metadata_bytes.replace(b"\x00", b"").decode("utf-8", errors="ignore")
def _clean_metadata_string(self, metadata_str: str) -> str:
    """Drop characters that are not printable ASCII or basic whitespace.

    A character is kept when its code point is below ``_MAX_ASCII_VALUE``
    and it is either printable or one of newline, carriage return, or tab.

    Parameters
    ----------
    metadata_str
        Metadata string that may contain non-printable characters

    Returns
    -------
    str
        Cleaned metadata string
    """

    def keep(char: str) -> bool:
        if ord(char) >= _MAX_ASCII_VALUE:
            return False
        return char in "\n\r\t" or char.isprintable()

    return "".join(filter(keep, metadata_str))
def _add_section_headers_if_needed(self, metadata_str: str) -> str:
"""Add [MAIN] and [SEM] section headers if missing.
Tescan's embedded metadata doesn't include section headers, so this
method detects where the SEM section starts and inserts headers.
Parameters
----------
metadata_str
Metadata string potentially without section headers
Returns
-------
str
Metadata string with section headers
"""
if "[MAIN]" in metadata_str or "[SEM]" in metadata_str:
return metadata_str
# Find where SEM section starts by looking for known SEM keys
sem_keys = [
"AcceleratorVoltage=",
"ApertureDiameter=",
"ApertureOptimization=",
"ChamberPressure=",
"CrossFree=",
"HV=",
]
sem_start_pos = self._find_sem_section_start(metadata_str, sem_keys)
# Insert section headers at line boundaries
if sem_start_pos < len(metadata_str):
line_start = metadata_str.rfind("\n", 0, sem_start_pos)
if line_start < 0:
line_start = 0
else:
line_start += 1 # Move past the \n
return (
"[MAIN]\n"
+ metadata_str[:line_start]
+ "[SEM]\n"
+ metadata_str[line_start:]
)
# No SEM section found
return "[MAIN]\n" + metadata_str
def _find_sem_section_start(self, metadata_str: str, sem_keys: list[str]) -> int:
"""Find the position where SEM section starts.
Parameters
----------
metadata_str
Metadata string to search
sem_keys
List of keys that typically appear in SEM section
Returns
-------
int
Position of first SEM key, or length of string if not found
"""
sem_start_pos = len(metadata_str)
for sem_key in sem_keys:
pos = metadata_str.find(sem_key)
if pos >= 0 and pos < sem_start_pos:
sem_start_pos = pos
return sem_start_pos
def _parse_hdr_string(self, hdr_string: str) -> dict[str, dict[str, str]]:
"""
Parse HDR metadata from a string in INI format.
Parameters
----------
hdr_string
HDR metadata as a string in INI format
Returns
-------
dict
Dictionary with section names as keys and key-value dicts as values
"""
# Normalize line endings
hdr_string = hdr_string.replace("\r\n", "\n").replace("\r", "\n")
# Parse with ConfigParser
config = configparser.ConfigParser()
# Make ConfigParser respect upper/lowercase values
config.optionxform = lambda option: option
# Use StringIO to read from string
buf = io.StringIO(hdr_string)
config.read_file(buf)
metadata = {}
for section in config.sections():
metadata[section] = dict(config.items(section))
return metadata
def _read_hdr_metadata(self, hdr_path: Path) -> dict[str, dict[str, str]]:
"""
Read and parse a Tescan .hdr file.
The .hdr file is in INI format with sections like [MAIN] and [SEM].
Parameters
----------
hdr_path
Path to the .hdr file
Returns
-------
dict
Dictionary with section names as keys and key-value dicts as values
"""
with hdr_path.open("r", encoding="utf-8", errors="ignore") as f:
hdr_string = f.read()
return self._parse_hdr_string(hdr_string)
def _extract_from_tiff_tags(self, filename: Path, mdict: dict) -> None:
    """
    Supplement ``mdict['nx_meta']`` with values from standard TIFF tags.

    Make, Model, Software Version, Operator and Data Dimensions are only
    written when the key is not already present; "TIFF DateTime" is written
    whenever tag 306 has a value. ``mdict`` is updated in place. If reading
    the tags fails, a warning is logged and recorded under
    "Extractor Warnings".

    Parameters
    ----------
    filename
        Path to the TIFF file
    mdict
        Metadata dictionary to update
    """
    # (nx_meta key, TIFF tag id, keep a value that is already present)
    # 271 = Make, 272 = Model, 305 = Software, 306 = DateTime, 315 = Artist
    simple_tags = (
        ("Make", 271, True),
        ("Model", 272, True),
        ("Software Version", 305, True),
        ("TIFF DateTime", 306, False),
        ("Operator", 315, True),
    )
    try:
        with Image.open(filename) as img:
            for key, tag_id, keep_existing in simple_tags:
                if keep_existing and key in mdict["nx_meta"]:
                    continue
                tag_value = img.tag_v2.get(tag_id)
                if tag_value:
                    mdict["nx_meta"][key] = tag_value

            if "Data Dimensions" not in mdict["nx_meta"]:
                width = img.tag_v2.get(256)  # ImageWidth
                height = img.tag_v2.get(257)  # ImageLength
                if width and height:
                    mdict["nx_meta"]["Data Dimensions"] = str((width, height))
    except Exception as e:
        _logger.warning("Failed to extract TIFF tags from %s: %s", filename, e)
        mdict["nx_meta"]["Extractor Warnings"] = f"Failed to extract TIFF tags: {e}"
def _get_field_definitions(self) -> list:
    """
    Build the field definitions used to map HDR keys into ``nx_meta``.

    Each table row is ``(source_key, output_key, factor, is_string)`` with an
    optional fifth element naming the target unit. Rows are listed in the
    order the keys appear in the HDR file, [MAIN] first and then [SEM].

    Returns
    -------
    list
        List of FieldDefinition objects, one per row, in table order
    """
    main_rows = [
        ("AccFrames", "Accumulated Frames", 1, False),
        ("AccType", "Accumulation Type", 1, True),
        ("Company", "Company", 1, True),
        ("Date", "Acquisition Date", 1, True),
        ("Description", "Description", 1, True),
        ("Device", "Device", 1, True),
        ("DeviceModel", "Device Model", 1, True),
        ("FullUserName", "Full User Name", 1, True),
        ("ImageStripSize", "Image Strip Size", 1, False),
        ("Magnification", "Magnification", 1e-3, False, "kiloX"),
        ("MagnificationReference", "Magnification Reference", 1, False),
        ("OrigFileName", "Original Filename", 1, True),
        ("PixelSizeX", "Pixel Width", 1e9, False, "nanometer"),
        ("PixelSizeY", "Pixel Height", 1e9, False, "nanometer"),
        ("SerialNumber", "Serial Number", 1, True),
        ("Sign", "Sign", 1, True),
        ("SoftwareVersion", "Software Version", 1, True),
        ("Time", "Acquisition Time", 1, True),
        ("UserName", "User Name", 1, True),
        ("ViewFieldsCountX", "View Fields Count X", 1, False),
        ("ViewFieldsCountY", "View Fields Count Y", 1, False),
    ]
    sem_rows = [
        ("AcceleratorVoltage", "Accelerator Voltage", 1e-3, False, "kilovolt"),
        ("ApertureDiameter", "Aperture Diameter", 1e6, False, "micrometer"),
        ("ApertureOptimization", "Aperture Optimization", 1, False),
        ("ChamberPressure", "Chamber Pressure", 1e3, False, "millipascal"),
        ("CrossFree", "Cross Free", 1, False),
        ("CrossSectionShiftX", "Cross Section Shift X", 1e6, False, "micrometer"),
        ("CrossSectionShiftY", "Cross Section Shift Y", 1e6, False, "micrometer"),
        ("DepthOfFocus", "Depth of Focus", 1e6, False, "micrometer"),
        ("Detector", "Detector Name", 1, True),
        ("Detector0", "Detector 0", 1, True),
        ("Detector0FlatField", "Detector 0 Flat Field", 1, False),
        ("Detector0Gain", "Detector 0 Gain", 1, False),
        ("Detector0Offset", "Detector 0 Offset", 1, False),
        ("DwellTime", "Pixel Dwell Time", 1e6, False, "microsecond"),
        ("EmissionCurrent", "Emission Current", 1e6, False, "microampere"),
        ("Gun", "Gun Type", 1, True),
        ("GunShiftX", "Gun Shift X", 1, False),
        ("GunShiftY", "Gun Shift Y", 1, False),
        ("GunTiltX", "Gun Tilt X", 1, False),
        ("GunTiltY", "Gun Tilt Y", 1, False),
        ("HV", "HV Voltage", 1e-3, False, "kilovolt"),
        ("IMLCenteringX", "IML Centering X", 1, False),
        ("IMLCenteringY", "IML Centering Y", 1, False),
        ("ImageShiftX", "Image Shift X", 1e9, False, "nanometer"),
        ("ImageShiftY", "Image Shift Y", 1e9, False, "nanometer"),
        ("InjectedGas", "Injected Gas", 1, True),
        ("LUTGamma", "LUT Gamma", 1, False),
        ("LUTMaximum", "LUT Maximum", 1, False),
        ("LUTMinimum", "LUT Minimum", 1, False),
        ("MTDGrid", "MTD Grid", 1e-3, False, "kilovolt"),
        ("MTDScintillator", "MTD Scintillator", 1e-3, False, "kilovolt"),
        ("OBJCenteringX", "OBJ Centering X", 1, False),
        ("OBJCenteringY", "OBJ Centering Y", 1, False),
        ("OBJPreCenteringX", "OBJ Pre-Centering X", 1, False),
        ("OBJPreCenteringY", "OBJ Pre-Centering Y", 1, False),
        ("PotentialMode", "Potential Mode", 1, True),
        ("PredictedBeamCurrent", "Predicted Beam Current", 1e12, False, "picoampere"),
        ("PrimaryDetectorGain", "Primary Detector Gain", 1, False),
        ("PrimaryDetectorOffset", "Primary Detector Offset", 1, False),
        ("SampleVoltage", "Sample Voltage", 1, False, "volt"),
        ("ScanID", "Scan ID", 1, False),
        ("ScanMode", "Scan Mode", 1, True),
        ("ScanRotation", "Scan Rotation", 1, False, "degree"),
        ("ScanSpeed", "Scan Speed", 1, False),
        ("SessionID", "Session ID", 1, True),
        ("SpecimenCurrent", "Specimen Current", 1e12, False, "picoampere"),
        ("SpotSize", "Spot Size", 1e9, False, "nanometer"),
        ("StageRotation", ["Stage Position", "Rotation"], 1, False, "degree"),
        ("StageTilt", ["Stage Position", "Tilt"], 1, False, "degree"),
        ("StageX", ["Stage Position", "X"], 1, False, "meter"),
        ("StageY", ["Stage Position", "Y"], 1, False, "meter"),
        ("StageZ", ["Stage Position", "Z"], 1, False, "meter"),
        ("StigmatorX", "Stigmator X Value", 1, False),
        ("StigmatorY", "Stigmator Y Value", 1, False),
        ("SymmetrizationVoltage", "Symmetrization Voltage", 1e-3, False, "kilovolt"),
        ("SyncMains", "Sync to Mains", 1, True),
        ("TiltCorrection", "Tilt Correction", 1, False),
        ("TubeVoltage", "Tube Voltage", 1e-3, False, "kilovolt"),
        ("VirtualObserverDistance", "Virtual Observer Distance", 1e3, False, "millimeter"),
        ("WD", "Working Distance", 1e3, False, "millimeter"),
    ]

    definitions = []
    for section, rows in (("MAIN", main_rows), ("SEM", sem_rows)):
        for source_key, output_key, factor, is_string, *unit in rows:
            # Pass target_unit only for rows that name one, so unit-less
            # fields keep whatever default FD defines
            extra = {"target_unit": unit[0]} if unit else {}
            definitions.append(
                FD(section, source_key, output_key, factor, is_string, **extra)
            )
    return definitions
def _parse_nx_meta(self, mdict: dict) -> dict: # noqa: PLR0912
"""
Parse metadata into NexusLIMS format.
Extracts important metadata from the [MAIN] and [SEM] sections
of the HDR file and places them in standardized locations under
the nx_meta key.
Parameters
----------
mdict
Metadata dictionary with [MAIN] and [SEM] sections
Returns
-------
dict
Updated metadata dictionary with parsed nx_meta fields
"""
# Initialize warnings list
if "warnings" not in mdict["nx_meta"]:
mdict["nx_meta"]["warnings"] = []
main_section = mdict.get("MAIN", {})
sem_section = mdict.get("SEM", {})
# Get field definitions
fields = self._get_field_definitions()
# Extract standard fields
for field in fields:
section = main_section if field.section == "MAIN" else sem_section
value = section.get(field.source_key)
# Try fallback keys for some fields
if value is None and field.source_key == "HV":
value = sem_section.get("AcceleratorVoltage")
elif value is None and field.source_key == "Detector0Gain":
value = sem_section.get("PrimaryDetectorGain")
elif value is None and field.source_key == "Detector0Offset":
value = sem_section.get("PrimaryDetectorOffset")
if value:
if field.is_string:
# Handle nested dict paths vs flat keys
# (impossible to test with existing metadata structure,
# so exclude from coverage)
if isinstance(field.output_key, list): # pragma: no cover
set_nested_dict_value(
mdict, ["nx_meta", *field.output_key], value
)
else:
mdict["nx_meta"][field.output_key] = value
else:
with contextlib.suppress(ValueError):
# Convert to Decimal to preserve precision through unit
# conversions. The ureg uses non_int_type=Decimal to avoid
# floating-point errors during internal conversions.
# Also apply scaling factor for unit conversion
decimal_val = Decimal(value) * Decimal(str(field.factor))
# Skip if suppress_zero is True and value is zero
if field.suppress_zero and decimal_val == 0:
continue
# Create Pint Quantity if unit is specified
if field.target_unit:
# Create Quantity with the value after factor conversion
quantity = ureg.Quantity(decimal_val, field.target_unit)
if isinstance(field.output_key, list):
set_nested_dict_value(
mdict, ["nx_meta", *field.output_key], quantity
)
else:
mdict["nx_meta"][field.output_key] = quantity
# No unit specified, keep as Decimal for precision
elif isinstance(field.output_key, list):
set_nested_dict_value(
mdict, ["nx_meta", *field.output_key], decimal_val
)
else:
mdict["nx_meta"][field.output_key] = decimal_val
# Handle user information (prefer FullUserName over UserName)
full_username = main_section.get("FullUserName")
username = main_section.get("UserName")
if full_username or username:
mdict["nx_meta"]["Operator"] = full_username or username
mdict["nx_meta"]["warnings"].append(["Operator"])
return mdict
def _migrate_to_schema_compliant_metadata(self, mdict: dict) -> dict:
"""
Migrate metadata to schema-compliant format.
Reorganizes metadata to conform to type-specific Pydantic schemas:
- Extracts core EM Glossary fields to top level with standardized names
- Moves vendor-specific nested dictionaries and fields to extensions section
- Preserves existing extensions from instrument profiles
Parameters
----------
mdict
Metadata dictionary with nx_meta containing extracted fields
Returns
-------
dict
Metadata dictionary with schema-compliant nx_meta structure
"""
nx_meta = mdict.get("nx_meta", {})
# Preserve existing extensions from instrument profiles
extensions = (
nx_meta.get("extensions", {}).copy() if "extensions" in nx_meta else {}
)
# Field mappings from display names to EM Glossary names
field_mappings = {
"HV Voltage": "acceleration_voltage",
"Accelerator Voltage": "acceleration_voltage",
"Working Distance": "working_distance",
"Beam Current": "beam_current",
"Emission Current": "emission_current",
"Pixel Dwell Time": "dwell_time",
"Horizontal Field Width": "horizontal_field_width",
"Pixel Width": "pixel_width",
"Pixel Height": "pixel_height",
}
# Tescan-specific fields that go to extensions (ALL non-core fields)
# Since tescan extractor currently extracts many individual fields at top level,
# we move them all to extensions except the core EM Glossary ones
extension_field_names = {
"Operator", # User info
# Any other Tescan-specific fields we discover
}
# Build new nx_meta with proper field organization
new_nx_meta = {}
# Copy required fields
for field in ["DatasetType", "Data Type", "Creation Time"]:
if field in nx_meta:
new_nx_meta[field] = nx_meta[field]
# Copy instrument identification
if "Instrument ID" in nx_meta:
new_nx_meta["Instrument ID"] = nx_meta["Instrument ID"]
# Process all fields and categorize
for old_name, value in nx_meta.items():
# Skip fields we've already handled
if old_name in [
"DatasetType",
"Data Type",
"Creation Time",
"Instrument ID",
"Extractor Warnings",
"warnings",
"extensions",
]:
continue
# Check if this is a core field that needs renaming
if old_name in field_mappings:
emg_name = field_mappings[old_name]
new_nx_meta[emg_name] = value
continue
# Fields explicitly marked as extensions
if old_name in extension_field_names:
extensions[old_name] = value
continue
# Everything else goes to extensions (Tescan-specific fields)
# This is the safest approach since most Tescan fields are vendor-specific
extensions[old_name] = value
# Copy warnings if present
if "warnings" in nx_meta:
new_nx_meta["warnings"] = nx_meta["warnings"]
# Add extensions section if we have any
for key, value in extensions.items():
add_to_extensions(new_nx_meta, key, value)
mdict["nx_meta"] = new_nx_meta
return mdict