"""
Pint unit registry and utilities for NexusLIMS metadata.
This module provides a centralized Pint unit registry for handling physical quantities
with units in NexusLIMS metadata. It defines preferred units for different measurement
types and provides utilities for normalizing quantities to these preferred units.
The module supports three-tiered unit serialization:
- **Tier 1 (Internal)**: Pint Quantity objects with QUDT/EMG mappings
- **Tier 2 (XML)**: Clean name/value/unit separation using XSD unit attribute
- **Tier 3 (Future)**: Optional QUDT/EMG URIs for semantic web integration
Examples
--------
Create and normalize quantities:
>>> from nexusLIMS.schemas.units import ureg, normalize_quantity
>>> voltage = ureg.Quantity(10000, "volt")
>>> normalized = normalize_quantity("acceleration_voltage", voltage)
>>> print(normalized)
10.0 kilovolt
Parse from strings:
>>> from nexusLIMS.schemas.units import parse_quantity
>>> voltage = parse_quantity("acceleration_voltage", "10 kV")
>>> print(voltage)
10.0 kilovolt
Serialize for XML:
>>> from nexusLIMS.schemas.units import quantity_to_xml_parts
>>> name, value, unit = quantity_to_xml_parts("acceleration_voltage", voltage)
>>> print(f"<meta name='{name}' unit='{unit}'>{value}</meta>")
<meta name='Voltage' unit='kV'>10.0</meta>
"""
import logging
from decimal import Decimal
from functools import lru_cache
from pathlib import Path
from typing import Any
import numpy as np
from pint import UnitRegistry
from rdflib import RDFS, Graph, Namespace
# Module-level logger, named after this module
logger = logging.getLogger(__name__)
# Singleton Pint unit registry for the entire application.
# Sharing one registry ensures consistent unit definitions across all modules.
# Decimal is used for non-integer types to avoid floating-point precision issues
# (e.g., 1.5625 instead of 1.5624999999999998 when converting units)
ureg = UnitRegistry(non_int_type=Decimal)
# Save reference to the original Quantity class for isinstance checks
_OriginalQuantity = ureg.Quantity
# Capture the unpatched __new__ before it is monkey-patched further down, so the
# replacement can delegate to it after converting float magnitudes to Decimal.
# The conversion prevents type errors when Quantities with different magnitude
# types (float vs. Decimal) are mixed.
_original_new = _OriginalQuantity.__new__
def _quantity_new_with_decimal_conversion(cls, value, units=None):
    """
    Build a Quantity, coercing float magnitudes to ``Decimal`` beforehand.

    The registry is configured with ``non_int_type=Decimal``, but float
    inputs are not converted automatically. Left alone, that produces a mix
    of float and Decimal magnitudes that breaks unit conversions, so Python
    and NumPy floats are converted here (through their ``str`` form) before
    delegating to the original ``__new__``.

    Parameters
    ----------
    cls : type
        The Quantity class being instantiated.
    value : Any
        The magnitude (or any other value the original ``__new__`` accepts).
    units : Any, optional
        Units forwarded unchanged to the original ``__new__``.

    Returns
    -------
    Any
        Whatever the original ``__new__`` returns for the adjusted arguments.
    """
    is_float_like = isinstance(value, (float, np.floating))
    magnitude = Decimal(str(value)) if is_float_like else value
    # Everything that is not a float is handed through untouched
    return _original_new(cls, magnitude, units)
# Install the Decimal-converting constructor on the existing Quantity class.
# Patching __new__ in place (rather than subclassing) keeps the class object
# intact, so isinstance() checks against ureg.Quantity keep working.
_OriginalQuantity.__new__ = staticmethod(_quantity_new_with_decimal_conversion)
# Path to the QUDT unit vocabulary (Turtle) file, located next to this module
QUDT_UNIT_TTL_PATH = Path(__file__).parent / "references" / "qudt_unit.ttl"
QUDT_VERSION = "3.1.9"
# RDF namespace for QUDT unit URIs; used to filter subjects when loading the file
QUDT_UNIT = Namespace("http://qudt.org/vocab/unit/")
# Define custom microscopy units
ureg.define("kiloX = 1000 = kX") # Magnification in thousands (e.g., 160 kX = 160000x)
# Magnitude bounds used by quantity_to_xml_parts to choose between plain
# and scientific notation when formatting a value
_MIN_MAGNITUDE_FOR_NORMAL_NOTATION = 1e-3
_MAX_MAGNITUDE_FOR_NORMAL_NOTATION = 1e6
# Preferred units for each field type
# These define the canonical units that quantities should be normalized to
# before serialization to XML or storage.
# Keys are internal metadata field names; values are Pint units from ``ureg``.
# Fields that are absent from this mapping are left in whatever unit they have.
PREFERRED_UNITS = {
# Image acquisition parameters
"acceleration_voltage": ureg.kilovolt,
"working_distance": ureg.millimeter,
"beam_current": ureg.picoampere,
"emission_current": ureg.microampere,
"dwell_time": ureg.microsecond,
"magnification": ureg.dimensionless, # Magnification has no units
"horizontal_field_width": ureg.micrometer,
"pixel_width": ureg.nanometer,
"pixel_height": ureg.nanometer,
"scan_rotation": ureg.degree,
# Stage position components
"stage_x": ureg.micrometer,
"stage_y": ureg.micrometer,
"stage_z": ureg.millimeter,
"stage_tilt": ureg.degree,
"stage_rotation": ureg.degree,
"stage_alpha": ureg.degree,
"stage_beta": ureg.degree,
# Spectrum acquisition parameters
"acquisition_time": ureg.second,
"live_time": ureg.second,
"detector_energy_resolution": ureg.eV,
"channel_size": ureg.eV,
"starting_energy": ureg.keV,
"azimuthal_angle": ureg.degree,
"elevation_angle": ureg.degree,
"takeoff_angle": ureg.degree,
# Diffraction parameters
"camera_length": ureg.millimeter,
"convergence_angle": ureg.milliradian,
# Environmental parameters
"temperature": ureg.kelvin,
"pressure": ureg.pascal,
"chamber_pressure": ureg.pascal,
}
@lru_cache(maxsize=1)
def _load_qudt_units() -> dict[str, str]:
    """
    Load QUDT unit URIs from the Turtle file.

    Parses the QUDT unit vocabulary and builds a lookup table from
    normalized ``rdfs:label`` text (lower-cased, spaces removed) to the URI
    of the unit carrying that label. Only subjects inside the QUDT unit
    namespace are considered.

    Returns
    -------
    dict[str, str]
        Mapping from normalized unit label -> QUDT URI. Empty if the
        vocabulary file is missing or cannot be parsed (the problem is
        logged rather than raised).

    Examples
    --------
    >>> units = _load_qudt_units()
    >>> units.get("kilovolt")
    'http://qudt.org/vocab/unit/KiloV'

    Notes
    -----
    Results are cached for performance; the same dict object is returned on
    every call, so callers should treat it as read-only.

    If two different units share a normalized label, the one visited last
    wins.
    """
    if not QUDT_UNIT_TTL_PATH.exists():
        logger.warning("QUDT unit file not found at %s", QUDT_UNIT_TTL_PATH)
        return {}

    graph = Graph()
    try:
        graph.parse(QUDT_UNIT_TTL_PATH, format="turtle")
    except Exception:
        # Best effort: a broken vocabulary file must not break unit handling
        logger.exception("Failed to parse QUDT unit file.")
        return {}
    else:
        logger.debug("Loaded QUDT unit vocabulary from %s", QUDT_UNIT_TTL_PATH)

    # Hoisted out of the loop: the namespace prefix never changes
    unit_prefix = str(QUDT_UNIT)
    unit_map: dict[str, str] = {}

    # Single pass over every (subject, label) pair. Iterating subjects and
    # then re-querying their labels would visit each label once per label
    # triple of the same subject (quadratic in the number of labels).
    for subject, label_obj in graph.subject_objects(predicate=RDFS.label):
        uri = str(subject)
        if not uri.startswith(unit_prefix):
            continue
        label = str(label_obj).lower().replace(" ", "")
        unit_map[label] = uri

    logger.debug("Loaded %s QUDT unit mappings", len(unit_map))
    return unit_map
# QUDT label -> URI table; built on first use and memoized via lru_cache
@lru_cache(maxsize=1)
def _get_qudt_uri_mapping() -> dict[str, str]:
    """Return the QUDT unit URI mapping, loading it on the first call."""
    qudt_mapping = _load_qudt_units()
    return qudt_mapping
[docs]
def normalize_quantity(field_name: str, quantity: Any) -> Any:
    """
    Convert a quantity to the preferred unit registered for a field.

    Only Pint Quantity objects are converted, and only when
    ``PREFERRED_UNITS`` has an entry for ``field_name``. Every other input
    is handed back untouched, as is a Quantity whose conversion fails.

    Parameters
    ----------
    field_name : str
        The metadata field name (e.g., "acceleration_voltage",
        "working_distance") used to look up the preferred unit.
    quantity : Any
        The value to normalize:

        - Pint Quantity: converted to the preferred unit
        - anything else (string, number, None, ...): returned unchanged;
          strings should go through ``parse_quantity`` first

    Returns
    -------
    Any
        The Quantity expressed in the preferred unit, or the original
        value when it is not a Quantity, the field has no preferred unit,
        or the conversion raised (a warning is logged in that last case).

    Examples
    --------
    >>> voltage = ureg.Quantity(10000, "volt")
    >>> print(normalize_quantity("acceleration_voltage", voltage))
    10.0 kilovolt

    >>> normalize_quantity("unknown_field", "some string")
    'some string'

    >>> qty = ureg.Quantity(5.0, "furlong")
    >>> normalize_quantity("custom_field", qty) == qty
    True
    """
    if isinstance(quantity, ureg.Quantity):
        target_unit = PREFERRED_UNITS.get(field_name)
        if target_unit is not None:
            try:
                converted = quantity.to(target_unit)
            except Exception as err:
                # A failed conversion is reported but never fatal; the
                # caller gets the quantity back in its original unit.
                logger.warning(
                    "Could not convert %s from %s to %s: %s. Returning original value.",
                    field_name,
                    quantity.units,
                    target_unit,
                    err,
                )
            else:
                return converted

    # Not a Quantity, no preferred unit, or conversion failed
    return quantity
[docs]
def parse_quantity(field_name: str, value: Any) -> Any:
    """
    Parse a value into a Pint Quantity and normalize to preferred units.

    Accepts multiple input types:

    - Pint Quantity: normalized to preferred units
    - String: parsed as a quantity (e.g., "10 kV", "5.2 mm"), then normalized
    - Numeric (``int``, ``float``, ``Decimal``, NumPy integer/floating
      scalars): assumed to already be in the preferred unit for the field
    - ``None`` and ``bool``: passed through unchanged

    Parameters
    ----------
    field_name : str
        The metadata field name (e.g., "acceleration_voltage")
    value : Any
        The value to parse. Can be Quantity, string, numeric, bool, or None

    Returns
    -------
    Any
        Pint Quantity in preferred units, or the original value if it
        cannot be interpreted as a quantity (unparseable string, unknown
        type, or a number for a field without a preferred unit)

    Examples
    --------
    >>> qty = parse_quantity("acceleration_voltage", "10 kV")
    >>> print(qty)
    10.0 kilovolt

    >>> qty = parse_quantity("working_distance", 5.2)  # Assumes mm
    >>> print(qty)
    5.2 millimeter

    >>> qty = parse_quantity("beam_current", ureg.Quantity(0.1, "nA"))
    >>> print(qty)
    100.0 picoampere

    >>> parse_quantity("operator", None) is None
    True

    >>> parse_quantity("stage_tilt", True)
    True
    """
    # None is "no value"; booleans are flags, not measurements. ``bool`` is a
    # subclass of ``int``, so without this guard True/False would reach the
    # numeric branch below and be handed to Pint as a magnitude.
    if value is None or isinstance(value, bool):
        return value

    # If already a Quantity, normalize it
    if isinstance(value, ureg.Quantity):
        return normalize_quantity(field_name, value)

    # Try parsing string as quantity
    if isinstance(value, str):
        try:
            parsed = ureg.Quantity(value)
        except Exception as e:
            logger.debug(
                "Could not parse '%s' as quantity for %s: %s", value, field_name, e
            )
            # Unparseable text is returned as-is
            return value
        return normalize_quantity(field_name, parsed)

    # For numeric values, assume they're in the preferred unit. Decimal and
    # NumPy scalars are accepted alongside int/float so that e.g. np.float32
    # or np.int64 metadata values are not silently left without a unit.
    if isinstance(value, (int, float, Decimal, np.integer, np.floating)):
        preferred_unit = PREFERRED_UNITS.get(field_name)
        if preferred_unit is not None:
            # NumPy integers become plain ints so later conversions (which
            # use Decimal factors) operate on a native Python number.
            magnitude = int(value) if isinstance(value, np.integer) else value
            return ureg.Quantity(magnitude, preferred_unit)

    # All other cases (unknown types, or no preferred unit)
    return value
[docs]
def quantity_to_xml_parts(
    field_name: str, quantity: Any
) -> tuple[str, str, str | None]:
    """
    Convert a field name and quantity to XML serialization parts.

    Extracts the display name, numeric value, and unit string for XML
    serialization. This enables clean XML output like:
    ``<meta name="Voltage" unit="kV">10.0</meta>``

    Parameters
    ----------
    field_name : str
        The internal field name (e.g., "acceleration_voltage")
    quantity : Any
        The quantity value (Pint Quantity, string, or numeric)

    Returns
    -------
    tuple[str, str, str | None]
        A 3-tuple of (display_name, value_string, unit_string)

        - display_name: Human-readable field name for XML
        - value_string: Numeric value as string. Non-zero magnitudes
          smaller than 1e-3 or larger than 1e6 (in absolute value) use
          scientific notation; everything else, including exact zero, uses
          up to six significant digits in plain notation
        - unit_string: Compact unit abbreviation, or None when the quantity
          carries no unit at all or the value is not a Quantity

    Examples
    --------
    >>> qty = ureg.Quantity(10.0, "kilovolt")
    >>> name, value, unit = quantity_to_xml_parts("acceleration_voltage", qty)
    >>> print(f"<meta name='{name}' unit='{unit}'>{value}</meta>")
    <meta name='Voltage' unit='kV'>10.0</meta>

    >>> qty = ureg.Quantity(5000, "dimensionless")
    >>> name, value, unit = quantity_to_xml_parts("magnification", qty)
    >>> print(f"<meta name='{name}'>{value}</meta>")  # No unit attr
    <meta name='Magnification'>5000</meta>

    Notes
    -----
    For non-Quantity values, the value is converted to string and unit is None.
    Display name mapping is handled by separate EM Glossary utilities.

    The unit is omitted only when the quantity's unit is literally empty.
    Pint's ``Quantity.dimensionless`` is deliberately not used for this
    decision: it tests *dimensionality*, which is also empty for angles
    (degree, milliradian) and for the custom ``kX`` unit, so relying on it
    would strip the unit from those values.
    """
    from nexusLIMS.schemas.em_glossary import (  # noqa: PLC0415
        get_display_name,
    )  # Import here to avoid circular imports

    display_name = get_display_name(field_name)

    # Non-Quantity value: stringify and report no unit
    if not isinstance(quantity, ureg.Quantity):
        return display_name, str(quantity), None

    # Format magnitude (scientific notation for very small/large values).
    # Exact zero is excluded: it is below the lower threshold numerically
    # but should read as a plain zero, not "0.000000e+00".
    magnitude = quantity.magnitude
    size = abs(magnitude)
    use_scientific = magnitude != 0 and (
        size < _MIN_MAGNITUDE_FOR_NORMAL_NOTATION
        or size > _MAX_MAGNITUDE_FOR_NORMAL_NOTATION
    )
    value_str = f"{magnitude:.6e}" if use_scientific else f"{magnitude:.6g}"

    # Report a unit unless the quantity has none at all. Comparing against
    # the registry's empty unit keeps named-but-dimensionless units such as
    # deg, mrad, and kX in the output.
    if quantity.units == ureg.dimensionless:
        unit_str = None
    else:
        unit_str = f"{quantity.units:~}"  # Compact format (kV, not kilovolt)

    return display_name, value_str, unit_str
[docs]
def get_qudt_uri(quantity: Any) -> str | None:
    """
    Look up the QUDT ontology URI for the unit of a Pint Quantity.

    QUDT (Quantities, Units, Dimensions and Data Types) URIs support the
    Tier 3 semantic web integration. The lookup table comes from the QUDT
    unit vocabulary file (qudt_unit.ttl), loaded on demand through RDFLib
    and cached.

    Parameters
    ----------
    quantity : Any
        A Pint Quantity object; any other value yields None

    Returns
    -------
    str or None
        The QUDT URI for the quantity's unit, or None when the argument is
        not a Quantity or the unit has no entry in the mapping

    Examples
    --------
    >>> get_qudt_uri(ureg.Quantity(10, "kilovolt"))
    'http://qudt.org/vocab/unit/KiloV'

    >>> get_qudt_uri("not a quantity") is None
    True
    """
    if isinstance(quantity, ureg.Quantity):
        # Same normalization as the mapping keys: full unit name,
        # lower-cased, with spaces stripped out
        lookup_key = str(quantity.units).lower().replace(" ", "")
        return _get_qudt_uri_mapping().get(lookup_key)
    return None
[docs]
def serialize_quantity(quantity: Any) -> dict[str, Any]:
    """
    Turn a value into a plain dictionary for storage or JSON export.

    A Pint Quantity becomes a dict holding its magnitude under ``'value'``
    and the string form of its units under ``'units'``. Any other value is
    wrapped as ``{'value': ...}``. For XML output use
    :func:`quantity_to_xml_parts` instead.

    Parameters
    ----------
    quantity : Any
        A Pint Quantity object, or any other value to wrap

    Returns
    -------
    dict[str, Any]
        ``{'value': magnitude, 'units': unit_name}`` for a Quantity,
        otherwise ``{'value': quantity}``. The magnitude is stored exactly
        as the Quantity holds it; no type conversion is applied.

    Examples
    --------
    >>> serialize_quantity(ureg.Quantity(10, "kilovolt"))
    {'value': 10.0, 'units': 'kilovolt'}

    >>> serialize_quantity("some string")
    {'value': 'some string'}
    """
    if not isinstance(quantity, ureg.Quantity):
        return {"value": quantity}

    payload: dict[str, Any] = {"value": quantity.magnitude}
    payload["units"] = str(quantity.units)
    return payload
[docs]
def deserialize_quantity(data: dict[str, Any]) -> Any:
    """
    Rebuild a value from the dictionary form made by :func:`serialize_quantity`.

    When the dict has a ``'units'`` key, a Pint Quantity is constructed from
    its ``'value'`` and ``'units'`` entries. Without ``'units'``, the bare
    ``'value'`` entry is returned (None if that key is absent too).

    Parameters
    ----------
    data : dict[str, Any]
        Dictionary with 'value' and 'units' keys, or just a 'value' key

    Returns
    -------
    Any
        Pint Quantity if the dict has units, otherwise the 'value' entry

    Examples
    --------
    >>> qty = deserialize_quantity({'value': 10.0, 'units': 'kilovolt'})
    >>> print(qty)
    10.0 kilovolt

    >>> deserialize_quantity({'value': 'some string'})
    'some string'
    """
    if "units" not in data:
        # Plain wrapped value; nothing to reconstruct
        return data.get("value")
    return ureg.Quantity(data["value"], data["units"])