Source code for nexusLIMS.extractors.registry
"""Extractor registry for plugin discovery and selection.
This module provides the central registry that discovers, manages, and selects
extractors based on file type and context. It implements auto-discovery by
walking the plugins directory and uses priority-based selection.
"""
from __future__ import annotations
import importlib
import inspect
import logging
import pkgutil
from collections import defaultdict
from pathlib import Path
from typing import TYPE_CHECKING, Any
from nexusLIMS.extractors.plugins.basic_metadata import BasicFileInfoExtractor
from nexusLIMS.extractors.plugins.profiles import register_all_profiles
if TYPE_CHECKING:
from nexusLIMS.extractors.base import (
BaseExtractor,
ExtractionContext,
PreviewGenerator,
)
# Module-level logger, named after this module's import path.
_logger = logging.getLogger(__name__)
# Public names exported by a star-import of this module.
__all__ = [
    "ExtractorRegistry",
    "get_registry",
]
[docs]
class ExtractorRegistry:
    """
    Central registry for extractor plugins.

    Manages auto-discovery, registration, and selection of metadata extractors
    and preview generators. Candidates are tried in priority order and the
    final decision is delegated to each candidate's ``supports()`` method.

    Intended to be used as a singleton - use :func:`get_registry` to access
    the shared instance.

    Features
    --------
    - Auto-discovers plugins by walking nexusLIMS/extractors/plugins/
    - Maintains priority-sorted lists per extension
    - Lazy instantiation for performance
    - Caches extractor instances
    - ``get_extractor`` never returns None (always has fallback extractor);
      ``get_preview_generator`` may return None

    Examples
    --------
    Get an extractor for a file:

    >>> from nexusLIMS.extractors.registry import get_registry
    >>> from nexusLIMS.extractors.base import ExtractionContext
    >>> from pathlib import Path
    >>>
    >>> registry = get_registry()
    >>> context = ExtractionContext(Path("data.dm3"), instrument=None)
    >>> extractor = registry.get_extractor(context)
    >>> metadata = extractor.extract(context)

    Manual registration (for testing). A class that does not declare
    ``supported_extensions`` is registered as a wildcard extractor (and a
    warning is logged):

    >>> class MyExtractor:
    ...     name = "my_extractor"
    ...     priority = 100
    ...     def supports(self, context): return True
    ...     def extract(self, context): return {"nx_meta": {}}
    >>>
    >>> registry = get_registry()
    >>> registry.register_extractor(MyExtractor)
    """
def __init__(self):
"""Initialize the extractor registry."""
# Maps extension -> list of extractor classes (sorted by priority)
self._extractors: dict[str, list[type[BaseExtractor]]] = defaultdict(list)
# Cache of instantiated extractors (name -> instance)
self._instances: dict[str, BaseExtractor] = {}
# Wildcard extractors that support any extension
self._wildcard_extractors: list[type[BaseExtractor]] = []
# Preview generators (maps extension -> list of generator classes)
self._preview_generators: dict[str, list[type[PreviewGenerator]]] = defaultdict(
list
)
# Cache of instantiated preview generators (name -> instance)
self._preview_instances: dict[str, PreviewGenerator] = {}
# Discovery state
self._discovered = False
_logger.debug("Initialized ExtractorRegistry")
@property
def extractors(self) -> dict[str, list[type[BaseExtractor]]]:
"""
Get the extractor list.
Returns a dictionary mapping file extensions to lists of extractor classes,
sorted by priority (descending).
Auto-discovers plugins if not already discovered.
Returns
-------
dict[str, list[type[BaseExtractor]]]
Maps extension (without dot) to list of extractor classes
Examples
--------
>>> registry = get_registry()
>>> extractors_by_ext = registry.extractors
>>> print(extractors_by_ext.get("dm3", []))
"""
if not self._discovered:
self.discover_plugins()
return dict(self._extractors)
@property
def extractor_names(self) -> list[str]:
"""
Get a deduplicated list of extractor names.
Returns extractor names sorted alphabetically, with duplicates removed.
Auto-discovers plugins if not already discovered.
Returns
-------
list[str]
Sorted list of unique extractor names
Examples
--------
>>> registry = get_registry()
>>> names = registry.extractor_names
>>> print(names)
['BasicFileInfoExtractor', 'DM3Extractor', 'QuantaTiffExtractor', ...]
"""
if not self._discovered:
self.discover_plugins()
# Collect all extractor names
extractor_names_set = set()
for extractor_classes in self._extractors.values():
for extractor_class in extractor_classes:
extractor_names_set.add(extractor_class.__name__)
# Also add wildcard extractors
for extractor_class in self._wildcard_extractors:
extractor_names_set.add(extractor_class.__name__)
return sorted(extractor_names_set)
@property
def all_extractors(self) -> list[BaseExtractor]:
"""
Get a deduplicated flat list of all registered extractor instances.
Returns one instance per unique extractor class (both extension-specific
and wildcard extractors), sorted by priority descending.
Auto-discovers plugins if not already discovered.
Returns
-------
list[BaseExtractor]
Unique extractor instances sorted by priority (descending)
Examples
--------
>>> registry = get_registry()
>>> for ext in registry.all_extractors:
... print(f"{ext.name}: priority {ext.priority}")
"""
if not self._discovered:
self.discover_plugins()
seen: set[type] = set()
unique_classes: list[type] = []
for extractor_classes in self._extractors.values():
for cls in extractor_classes:
if cls not in seen:
seen.add(cls)
unique_classes.append(cls)
for cls in self._wildcard_extractors:
if cls not in seen:
seen.add(cls)
unique_classes.append(cls)
instances = [self._get_instance(cls) for cls in unique_classes]
return sorted(instances, key=lambda e: e.priority, reverse=True)
[docs]
def discover_plugins(self) -> None:
"""
Auto-discover extractor plugins by walking the plugins directory.
Walks nexusLIMS/extractors/plugins/, imports all Python modules,
and registers any classes that implement the BaseExtractor protocol.
This is called automatically on first use, but can be called manually
to force re-discovery.
Examples
--------
>>> registry = get_registry()
>>> registry.discover_plugins()
>>> extractors = registry.get_extractors_for_extension("dm3")
>>> print(f"Found {len(extractors)} extractors for .dm3 files")
"""
if self._discovered:
_logger.debug("Plugins already discovered, skipping")
return
_logger.info("Discovering extractor plugins...")
# Find the plugins directory
plugins_package = "nexusLIMS.extractors.plugins"
try:
# Import the plugins package to get its path
plugins_module = importlib.import_module(plugins_package)
plugins_path = Path(plugins_module.__file__).parent
except (ImportError, AttributeError) as e:
_logger.warning(
"Could not import plugins package '%s': %s. Plugin discovery skipped.",
plugins_package,
e,
)
self._discovered = True
return
# Walk the plugins directory
discovered_count = 0
for _finder, name, _ispkg in pkgutil.walk_packages(
[str(plugins_path)],
prefix=f"{plugins_package}.",
):
# Skip __pycache__ and other special directories
if "__pycache__" in name:
continue # pragma: no cover
try:
module = importlib.import_module(name)
_logger.debug("Imported plugin module: %s", name)
# Look for classes implementing BaseExtractor/PreviewGenerator protocol
for _item_name, obj in inspect.getmembers(module, inspect.isclass):
# Skip imported classes (only use classes defined in this module)
if obj.__module__ != module.__name__:
continue
# Check if it looks like a BaseExtractor
if self._is_extractor(obj):
self.register_extractor(obj)
discovered_count += 1
_logger.debug(
"Discovered extractor: %s (priority: %d)",
obj.name,
obj.priority,
)
# Check if it looks like a PreviewGenerator
elif self._is_preview_generator(obj):
self.register_preview_generator(obj)
discovered_count += 1
_logger.debug(
"Discovered preview generator: %s (priority: %d)",
obj.name,
obj.priority,
)
except Exception as e:
_logger.warning(
"Failed to import plugin module '%s': %s",
name,
e,
exc_info=True,
)
_logger.info("Discovered %d extractor plugins", discovered_count)
# Register instrument profiles
self._register_instrument_profiles()
self._discovered = True
def _register_instrument_profiles(self) -> None:
    """
    Load and register every instrument profile.

    Delegates to ``register_all_profiles``. Exceptions are logged as
    warnings rather than propagated; an ``ImportError`` gets its own
    message, anything else is logged together with the traceback.
    """
    try:
        register_all_profiles()
    except Exception as e:
        if isinstance(e, ImportError):
            _logger.warning(
                "Could not import profiles package: %s. No profiles will be loaded.",
                e,
            )
        else:
            _logger.warning(
                "Error registering instrument profiles: %s",
                e,
                exc_info=True,
            )
def _is_extractor(self, obj: Any) -> bool:
"""
Check if an object implements the BaseExtractor protocol.
Parameters
----------
obj
Object to check
Returns
-------
bool
True if obj implements BaseExtractor protocol
"""
# Must be a class
if not inspect.isclass(obj):
return False
# Check for required attributes
if not hasattr(obj, "name") or not isinstance(obj.name, str):
return False
if not hasattr(obj, "priority") or not isinstance(obj.priority, int):
return False
# Check for required methods
if not hasattr(obj, "supports") or not callable(obj.supports):
return False
if not hasattr(obj, "extract") or not callable(obj.extract): # noqa: SIM103
return False
return True
def _is_preview_generator(self, obj: Any) -> bool:
"""
Check if an object implements the PreviewGenerator protocol.
Parameters
----------
obj
Object to check
Returns
-------
bool
True if obj implements PreviewGenerator protocol
"""
# Must be a class
if not inspect.isclass(obj):
return False
# Check for required attributes
if not hasattr(obj, "name") or not isinstance(obj.name, str):
return False
if not hasattr(obj, "priority") or not isinstance(obj.priority, int):
return False
# Check for required methods
if not hasattr(obj, "supports") or not callable(obj.supports):
return False
if not hasattr(obj, "generate") or not callable(obj.generate): # noqa: SIM103
return False
return True
[docs]
def register_extractor(self, extractor_class: type[BaseExtractor]) -> None:
"""
Manually register an extractor class.
This method is called automatically during plugin discovery, but can
also be used to manually register extractors (useful for testing).
Parameters
----------
extractor_class
The extractor class to register (not an instance)
Examples
--------
>>> class MyExtractor:
... name = "my_extractor"
... priority = 100
... def supports(self, context): return True
... def extract(self, context): return {"nx_meta": {}}
>>>
>>> registry = get_registry()
>>> registry.register_extractor(MyExtractor)
"""
# Determine which extensions this extractor supports
# We'll do this by creating a temporary instance and asking it
extensions = self._get_supported_extensions(extractor_class)
if not extensions:
# This is a wildcard extractor (supports any extension)
if extractor_class not in self._wildcard_extractors:
self._wildcard_extractors.append(extractor_class)
_logger.debug(
"Registered wildcard extractor: %s",
extractor_class.name,
)
else:
_logger.debug(
"Extractor %s already registered (skipping duplicate)",
extractor_class.name,
)
else:
# Register for specific extensions
for ext in extensions:
if extractor_class not in self._extractors[ext]:
self._extractors[ext].append(extractor_class)
_logger.debug(
"Registered %s for extension: .%s",
extractor_class.name,
ext,
)
else:
_logger.debug(
"Extractor %s already registered for .%s (skipping duplicate)",
extractor_class.name,
ext,
)
# Sort by priority (descending) for each extension
for ext in extensions:
self._extractors[ext].sort(key=lambda e: e.priority, reverse=True)
def _get_supported_extensions(
self,
extractor_class: type[BaseExtractor],
) -> set[str]:
"""
Get supported file extensions from an extractor class.
Uses the extractor's declared supported_extensions attribute.
Parameters
----------
extractor_class
The extractor class to check
Returns
-------
set[str]
Set of supported extensions (without dots), or empty set if
this is a wildcard extractor
"""
if not hasattr(extractor_class, "supported_extensions"):
_logger.warning(
"Extractor %s does not have supported_extensions attribute",
extractor_class.name if hasattr(extractor_class, "name") else "unknown",
)
return set()
extensions = extractor_class.supported_extensions
if extensions is None:
# Wildcard extractor
return set()
# Return the declared extensions
return extensions if isinstance(extensions, set) else set(extensions)
def _get_instance(self, extractor_class: type[BaseExtractor]) -> BaseExtractor:
"""
Get or create an instance of an extractor class.
Instances are cached for performance.
Parameters
----------
extractor_class
The extractor class
Returns
-------
BaseExtractor
Instance of the extractor
"""
name = extractor_class.name
if name not in self._instances:
self._instances[name] = extractor_class()
_logger.debug("Instantiated extractor: %s", name)
return self._instances[name]
[docs]
def get_extractor(self, context: ExtractionContext) -> BaseExtractor:
"""
Get the best extractor for a given file context.
Selection algorithm:
1. Auto-discover plugins if not already done
2. Get extractors registered for this file's extension
3. Try each in priority order (high to low) until one's supports() returns True
4. If none match, try wildcard extractors
5. If still none, return BasicMetadataExtractor fallback
This method NEVER returns None - there is always a fallback.
Parameters
----------
context
Extraction context containing file path, instrument, etc.
Returns
-------
BaseExtractor
The best extractor for this file (never None)
Examples
--------
>>> from nexusLIMS.extractors.base import ExtractionContext
>>> from pathlib import Path
>>>
>>> context = ExtractionContext(Path("data.dm3"), None)
>>> registry = get_registry()
>>> extractor = registry.get_extractor(context)
>>> print(f"Selected: {extractor.name}")
"""
# Auto-discover if needed
if not self._discovered:
self.discover_plugins()
# Get file extension
ext = context.file_path.suffix.lstrip(".").lower()
# Try extension-specific extractors
if ext in self._extractors:
for extractor_class in self._extractors[ext]:
instance = self._get_instance(extractor_class)
try:
if instance.supports(context):
_logger.debug(
"Selected extractor %s for %s",
instance.name,
context.file_path.name,
)
return instance
except Exception as e:
_logger.warning(
"Error in %s.supports(): %s",
instance.name,
e,
exc_info=True,
)
# Try wildcard extractors
for extractor_class in self._wildcard_extractors:
instance = self._get_instance(extractor_class)
try:
if instance.supports(context):
_logger.debug(
"Selected wildcard extractor %s for %s",
instance.name,
context.file_path.name,
)
return instance
except Exception as e:
_logger.warning(
"Error in wildcard %s.supports(): %s",
instance.name,
e,
exc_info=True,
)
# Fallback: use basic metadata extractor
_logger.debug(
"No extractor found for %s, using fallback",
context.file_path.name,
)
return self._get_fallback_extractor()
def _get_fallback_extractor(self) -> BaseExtractor:
    """
    Return the extractor used when no registered extractor matched.

    Returns
    -------
    BaseExtractor
        Cached ``BasicFileInfoExtractor`` instance
    """
    fallback_class = BasicFileInfoExtractor
    return self._get_instance(fallback_class)
[docs]
def get_extractors_for_extension(self, extension: str) -> list[BaseExtractor]:
"""
Get all extractors registered for a specific extension.
Parameters
----------
extension
File extension (with or without leading dot)
Returns
-------
list[BaseExtractor]
List of extractors, sorted by priority (descending)
Examples
--------
>>> registry = get_registry()
>>> extractors = registry.get_extractors_for_extension("dm3")
>>> for e in extractors:
... print(f"{e.name}: priority {e.priority}")
"""
# Auto-discover if needed
if not self._discovered:
self.discover_plugins()
ext = extension.lstrip(".").lower()
if ext not in self._extractors:
return []
return [
self._get_instance(extractor_class)
for extractor_class in self._extractors[ext]
]
[docs]
def get_supported_extensions(self, exclude_fallback: bool = False) -> set[str]: # noqa: FBT001, FBT002
"""
Get all file extensions that have registered extractors.
Parameters
----------
exclude_fallback
If True, exclude extensions that only have the fallback extractor
Returns
-------
set[str]
Set of extensions (without dots)
Examples
--------
>>> registry = get_registry()
>>> extensions = registry.get_supported_extensions()
>>> print(f"Supported: {', '.join(sorted(extensions))}")
>>> specialized = registry.get_supported_extensions(exclude_fallback=True)
>>> print(f"Specialized: {', '.join(sorted(specialized))}")
"""
# Auto-discover if needed
if not self._discovered:
self.discover_plugins()
if not exclude_fallback:
return set(self._extractors.keys())
# Only return extensions that have non-fallback extractors
specialized_extensions = set()
for ext, extractors in self._extractors.items():
# Check if any extractor for this extension is NOT the fallback
for extractor_class in extractors:
instance = self._get_instance(extractor_class)
# Basic file info extractor has priority 0 and is the fallback
if instance.priority > 0:
specialized_extensions.add(ext)
break
return specialized_extensions
[docs]
def clear(self) -> None:
"""
Clear all registered extractors and reset discovery state.
Primarily used for testing.
Examples
--------
>>> registry = get_registry()
>>> registry.clear()
>>> # Will re-discover on next use
"""
self._extractors.clear()
self._instances.clear()
self._wildcard_extractors.clear()
self._preview_generators.clear()
self._preview_instances.clear()
self._discovered = False
_logger.debug("Cleared extractor registry")
[docs]
def register_preview_generator(
self,
generator_class: type[PreviewGenerator],
) -> None:
"""
Manually register a preview generator class.
This method is called automatically during plugin discovery, but can
also be used to manually register generators (useful for testing).
Parameters
----------
generator_class
The preview generator class to register (not an instance)
Examples
--------
>>> class MyGenerator:
... name = "my_generator"
... priority = 100
... def supports(self, context): return True
... def generate(self, context, output_path): return True
>>>
>>> registry = get_registry()
>>> registry.register_preview_generator(MyGenerator)
"""
# Determine which extensions this generator supports
extensions = self._get_supported_extensions_for_generator(generator_class)
if extensions:
# Register for specific extensions
for ext in extensions:
self._preview_generators[ext].append(generator_class)
_logger.debug(
"Registered preview generator %s for extension: .%s",
generator_class.name,
ext,
)
# Sort by priority (descending) for each extension
for ext in extensions:
self._preview_generators[ext].sort(
key=lambda g: g.priority,
reverse=True,
)
def _get_supported_extensions_for_generator(
self,
generator_class: type[PreviewGenerator],
) -> set[str]:
"""
Get supported file extensions from a preview generator class.
Uses the generator's declared supported_extensions attribute.
Parameters
----------
generator_class
The preview generator class to check
Returns
-------
set[str]
Set of supported extensions (without dots)
"""
if not hasattr(generator_class, "supported_extensions"):
_logger.warning(
"Preview generator %s does not have supported_extensions attribute",
generator_class.name if hasattr(generator_class, "name") else "unknown",
)
return set()
extensions = generator_class.supported_extensions
if extensions is None:
# Wildcard generator
return set()
# Return the declared extensions
return extensions if isinstance(extensions, set) else set(extensions)
def _get_preview_instance(
self,
generator_class: type[PreviewGenerator],
) -> PreviewGenerator:
"""
Get or create an instance of a preview generator class.
Instances are cached for performance.
Parameters
----------
generator_class
The preview generator class
Returns
-------
PreviewGenerator
Instance of the preview generator
"""
name = generator_class.name
if name not in self._preview_instances:
self._preview_instances[name] = generator_class()
_logger.debug("Instantiated preview generator: %s", name)
return self._preview_instances[name]
[docs]
def get_preview_generator(
self,
context: ExtractionContext,
) -> PreviewGenerator | None:
"""
Get the best preview generator for a given file context.
Selection algorithm:
1. Auto-discover plugins if not already done
2. Get generators registered for this file's extension
3. Try each in priority order (high to low) until one's supports() returns True
4. If none match, return None
Parameters
----------
context
Extraction context containing file path, instrument, etc.
Returns
-------
PreviewGenerator | None
The best preview generator for this file, or None if no generator found
Examples
--------
>>> from nexusLIMS.extractors.base import ExtractionContext
>>> from pathlib import Path
>>>
>>> context = ExtractionContext(Path("data.dm3"), None)
>>> registry = get_registry()
>>> generator = registry.get_preview_generator(context)
>>> if generator:
... generator.generate(context, Path("preview.png"))
"""
# Auto-discover if needed
if not self._discovered:
self.discover_plugins()
# Get file extension
ext = context.file_path.suffix.lstrip(".").lower()
# Try extension-specific generators
if ext in self._preview_generators:
for generator_class in self._preview_generators[ext]:
instance = self._get_preview_instance(generator_class)
try:
if instance.supports(context):
_logger.debug(
"Selected preview generator %s for %s",
instance.name,
context.file_path.name,
)
return instance
except Exception as e:
_logger.warning(
"Error in %s.supports(): %s",
instance.name,
e,
exc_info=True,
)
# No generator found
_logger.debug(
"No preview generator found for %s",
context.file_path.name,
)
return None
# Singleton instance; stays None until get_registry() creates it on first call.
_registry: ExtractorRegistry | None = None
[docs]
def get_registry() -> ExtractorRegistry:
    """
    Return the shared, module-wide extractor registry.

    The registry is created on the first call and the same object is
    returned by every later call.

    Returns
    -------
    ExtractorRegistry
        The global registry instance

    Examples
    --------
    >>> from nexusLIMS.extractors.registry import get_registry
    >>> registry = get_registry()
    >>> # Always returns the same instance
    >>> assert get_registry() is registry
    """
    global _registry  # noqa: PLW0603
    registry = _registry
    if registry is None:
        # First call: build the registry and remember it for later callers.
        registry = _registry = ExtractorRegistry()
    return registry