Coverage for nexusLIMS/extractors/xml_serialization.py: 100%

49 statements  

« prev     ^ index     » next       coverage.py v7.11.3, created at 2026-03-24 05:23 +0000

1""" 

2XML serialization utilities for NexusLIMS metadata schemas. 

3 

4This module provides utilities for converting type-specific metadata schemas 

5(using Pint Quantities and EM Glossary terminology) into XML format compatible 

6with the Nexus Experiment schema. 

7 

8Key Functions 

9------------- 

10- :func:`serialize_quantity_to_xml`: Convert Pint Quantities to value/unit pairs for XML 

11- :func:`get_xml_field_name`: Map EM Glossary field name to human-friendly display name 

12- :func:`prepare_metadata_for_xml`: Convert rich metadata to XML-compatible flat dict 

13 

14Examples 

15-------- 

16Convert a Pint Quantity to XML: 

17 

18>>> from nexusLIMS.schemas.units import ureg 

19>>> qty = ureg.Quantity(10, "kilovolt") 

20>>> value, unit = serialize_quantity_to_xml(qty) 

21>>> value, unit 

22(10.0, 'kV') 

23 

24Get human-readable field name for XML: 

25 

26>>> get_xml_field_name("acceleration_voltage") 

27'Voltage' 

28>>> get_xml_field_name("working_distance") 

29'Working Distance' 

30""" 

31 

32from typing import Any 

33 

34from pint import Quantity 

35 

36from nexusLIMS.schemas import em_glossary 

37from nexusLIMS.schemas.units import get_qudt_uri as _get_qudt_uri 

38from nexusLIMS.schemas.units import ureg 

39 

EM_GLOSSARY_TO_XML_DISPLAY_NAMES = {
    # --- Imaging (common) ---
    "acceleration_voltage": "Voltage",
    "working_distance": "Working Distance",
    "beam_current": "Beam Current",
    "emission_current": "Emission Current",
    "magnification": "Magnification",
    "dwell_time": "Pixel Dwell Time",
    "horizontal_field_width": "Horizontal Field Width",
    "pixel_width": "Pixel Width",
    "scan_rotation": "Scan Rotation",
    "detector_type": "Detector",
    # --- Spectrum ---
    "acquisition_time": "Acquisition Time",
    "live_time": "Live Time",
    "detector_energy_resolution": "Energy Resolution",
    "channel_size": "Channel Size",
    "starting_energy": "Starting Energy",
    "azimuthal_angle": "Azimuthal Angle",
    "elevation_angle": "Elevation Angle",
    "elements": "Elements",
    # --- Diffraction ---
    "camera_length": "Camera Length",
    "convergence_angle": "Convergence Angle",
    "diffraction_mode": "Diffraction Mode",
    # --- Stage position ---
    "stage_position": "Stage Position",
    "stage_x": "Stage X",
    "stage_y": "Stage Y",
    "stage_z": "Stage Z",
    "stage_tilt": "Stage Tilt",
    # The primary tilt axis shares its display name with plain "stage_tilt"
    "stage_tilt_alpha": "Stage Tilt",
    # Secondary tilt axis
    "stage_tilt_beta": "Stage Tilt Beta",
    "stage_rotation": "Stage Rotation",
    # --- Core data fields ---
    "acquisition_timestamp": "Creation Time",
    "data_type": "Data Type",
    "dataset_type": "DatasetType",
    "data_dimensions": "Data Dimensions",
    "instrument_id": "Instrument ID",
    # --- Legacy/compatibility (old schema) ---
    # Old-schema keys are already display names, so each maps to itself.
    **{
        legacy_name: legacy_name
        for legacy_name in (
            "Voltage",
            "Working Distance",
            "Beam Current",
            "Magnification",
            "Detector",
            "Creation Time",
            "Data Type",
            "DatasetType",
            "Data Dimensions",
            "Instrument ID",
        )
    },
}
"""
Lookup table from EM Glossary field names to the display names used in XML.

Old-schema display names are included as identity entries so that existing
XML field names are preserved.
"""

96 

97 

def serialize_quantity_to_xml(qty: Quantity) -> tuple[float, str]:
    """
    Split a Pint Quantity into a ``(magnitude, unit symbol)`` pair.

    The magnitude is coerced to a plain ``float`` and the unit is rendered
    as a short symbol, giving the two pieces needed for an XML meta element
    that carries a ``unit`` attribute.

    Parameters
    ----------
    qty : :class:`pint.Quantity`
        The quantity to break apart

    Returns
    -------
    value : float
        ``qty.magnitude`` converted with ``float()``
    unit : str
        ``qty.units`` formatted with the ``~`` spec (e.g., "kV", "mm", "pA")

    Examples
    --------
    >>> from nexusLIMS.schemas.units import ureg
    >>> qty = ureg.Quantity(10, "kilovolt")
    >>> value, unit = serialize_quantity_to_xml(qty)
    >>> value
    10.0
    >>> unit
    'kV'

    >>> qty = ureg.Quantity(5.2, "millimeter")
    >>> value, unit = serialize_quantity_to_xml(qty)
    >>> value
    5.2
    >>> unit
    'mm'

    Notes
    -----
    ``~`` is Pint's abbreviated-unit format modifier; it yields symbols such
    as "kV" rather than the long name "kilovolt". No unit conversion or
    rescaling is performed here.
    """
    # format(obj, "~") is the function-call spelling of f"{obj:~}"
    return float(qty.magnitude), format(qty.units, "~")

146 

147 

def get_xml_field_name(field_name: str) -> str:
    """
    Translate an internal field name into the display name used in XML.

    Names listed in ``EM_GLOSSARY_TO_XML_DISPLAY_NAMES`` use the display name
    recorded there. Any other name is converted by replacing underscores with
    spaces and applying :meth:`str.title`.

    Parameters
    ----------
    field_name : str
        The internal EM Glossary field name (e.g., "acceleration_voltage")

    Returns
    -------
    display_name : str
        The human-readable display name for XML (e.g., "Voltage")

    Examples
    --------
    >>> get_xml_field_name("acceleration_voltage")
    'Voltage'
    >>> get_xml_field_name("working_distance")
    'Working Distance'
    >>> get_xml_field_name("detector_type")
    'Detector'

    A name without an explicit mapping falls back to title-cased words:

    >>> get_xml_field_name("some_custom_field")
    'Some Custom Field'

    Notes
    -----
    To control how a new field appears in XML, add it to
    ``EM_GLOSSARY_TO_XML_DISPLAY_NAMES`` rather than relying on the fallback.
    """
    try:
        # An explicit mapping always wins
        return EM_GLOSSARY_TO_XML_DISPLAY_NAMES[field_name]
    except KeyError:
        # Unmapped (e.g. instrument-specific) field: snake_case -> Title Case
        return field_name.replace("_", " ").title()

195 

196 

def prepare_metadata_for_xml(
    metadata: dict[str, Any],
) -> dict[str, str | float]:
    """
    Flatten a rich metadata dict into XML-ready name/value pairs.

    Each entry of `metadata` is processed as follows:

    1. ``None`` values and the ``"warnings"``, ``"schema_version"`` and
       ``"extensions"`` keys are dropped
    2. The key is translated with :func:`get_xml_field_name`
    3. A Pint Quantity becomes two entries (value and unit)
    4. A ``"stage_position"`` dict is expanded into one entry per axis,
       named after ``stage_<axis>`` (axis lower-cased); ``None`` axes are
       dropped and Quantity axes are split into value and unit
    5. A list is joined into a single ``", "``-separated string
    6. Anything else is copied through unchanged

    Parameters
    ----------
    metadata : dict[str, Any]
        Metadata dictionary from type-specific schema (ImageMetadata, etc.)
        May contain Pint Quantities, nested dicts, or simple values

    Returns
    -------
    xml_metadata : dict[str, str | float]
        Dictionary keyed by XML display name. For Quantity fields, two
        entries are created:
        - "<display_name>": numeric value
        - "<display_name>_unit": unit string

    Examples
    --------
    >>> from nexusLIMS.schemas.units import ureg
    >>> metadata = {
    ...     "acceleration_voltage": ureg.Quantity(10, "kilovolt"),
    ...     "magnification": 50000,
    ...     "detector_type": "ETD",
    ... }
    >>> xml_dict = prepare_metadata_for_xml(metadata)
    >>> xml_dict["Voltage"]
    10.0
    >>> xml_dict["Voltage_unit"]
    'kV'
    >>> xml_dict["Magnification"]
    50000
    >>> xml_dict["Detector"]
    'ETD'

    Notes
    -----
    Only the ``"stage_position"`` dict is expanded; a dict stored under any
    other key is copied through as-is. When two input keys translate to the
    same display name, the one processed later overwrites the earlier one.
    """
    internal_fields = {"warnings", "schema_version", "extensions"}
    flat = {}

    def _store(name, item):
        """Write *item* under *name*, splitting a Quantity into value/unit."""
        if isinstance(item, Quantity):
            number, symbol = serialize_quantity_to_xml(item)
            flat[name] = number
            flat[f"{name}_unit"] = symbol
        else:
            flat[name] = item

    for key, raw in metadata.items():
        # Nothing to emit for missing values or bookkeeping fields
        if raw is None or key in internal_fields:
            continue

        display_name = get_xml_field_name(key)

        if isinstance(raw, Quantity):
            _store(display_name, raw)
            continue

        if key == "stage_position" and isinstance(raw, dict):
            # One output entry per populated axis
            for axis, axis_value in raw.items():
                if axis_value is not None:
                    _store(get_xml_field_name(f"stage_{axis.lower()}"), axis_value)
            continue

        if isinstance(raw, list):
            # e.g. an elements list -> "Fe, O, Si"
            flat[display_name] = ", ".join(map(str, raw))
            continue

        # Strings, numbers and everything else pass through untouched
        flat[display_name] = raw

    return flat

290 

291 

def get_qudt_uri(field_name: str, unit: str) -> str | None:  # noqa: ARG001
    """
    Look up the QUDT URI that corresponds to a unit string.

    The unit string is parsed by building a unit-magnitude quantity with the
    shared ``ureg`` registry, and that quantity is handed to
    ``nexusLIMS.schemas.units.get_qudt_uri`` for the actual lookup in the
    QUDT (Quantities, Units, Dimensions and Types) mapping. Intended for
    Tier 3 semantic web integration.

    Parameters
    ----------
    field_name : str
        The field name (currently unused, reserved for future context-aware
        lookups)
    unit : str
        The unit string in compact form (e.g., "kV", "mm", "pA")

    Returns
    -------
    qudt_uri : str or None
        The result of the QUDT lookup, or None if `unit` could not be turned
        into a quantity

    Examples
    --------
    >>> get_qudt_uri("acceleration_voltage", "kV")  # doctest: +SKIP
    'http://qudt.org/vocab/unit/KiloV'
    >>> get_qudt_uri("working_distance", "mm")  # doctest: +SKIP
    'http://qudt.org/vocab/unit/MilliM'

    Notes
    -----
    Any exception raised while constructing the quantity is swallowed and
    reported as ``None``; errors raised by the lookup itself propagate.
    """
    try:
        # Magnitude is irrelevant; the quantity only carries the parsed unit
        unit_quantity = ureg.Quantity(1, unit)
    except Exception:
        # Best-effort lookup: an unparseable unit simply has no URI
        return None
    else:
        return _get_qudt_uri(unit_quantity)

334 

335 

def get_emg_id(field_name: str) -> str | None:
    """
    Return the EM Glossary term ID associated with a field name.

    This is a thin pass-through to ``em_glossary.get_emg_id`` so that XML
    serialization code can obtain term IDs from this module. Intended for
    Tier 3 semantic web integration, where the IDs are attached to XML
    elements as attributes for semantic traceability.

    Parameters
    ----------
    field_name : str
        The internal field name (e.g., "acceleration_voltage")

    Returns
    -------
    emg_id : str or None
        The EM Glossary ID (e.g., "EMG_00000004"), or None if no mapping
        exists

    Examples
    --------
    >>> get_emg_id("acceleration_voltage")
    'EMG_00000004'
    >>> get_emg_id("working_distance")
    'EMG_00000050'
    >>> get_emg_id("some_custom_field")
    """
    return em_glossary.get_emg_id(field_name)