# Source code for procfunc.util.manifest

"""Utilities for reading and filtering manifest CSV files."""

import importlib
import logging
from pathlib import Path
from typing import Any, Callable

import pandas as pd

logger = logging.getLogger(__name__)


def module_path() -> Path:
    """Return the directory two levels above this source file.

    This is the parent of the directory that contains this module, i.e. for a
    file at ``<root>/<subdir>/<this file>`` the result is ``<root>``
    (presumably the package root; confirm against the installed layout).

    Returns:
        The grandparent directory of ``__file__`` as a ``Path``.
    """
    containing_dir = Path(__file__).parent
    return containing_dir.parent
def filter_manifest( manifest: pd.DataFrame, filter: dict[str, str] | None = None, exclude: dict[str, list[str]] | None = None, require_nonempty: list[str] | None = None, min_entries: int | None = None, ) -> pd.DataFrame: if filter is None: filter = {} if exclude is None: exclude = {} if require_nonempty is None: require_nonempty = [] for k, v in filter.items(): assert k in manifest.columns, ( f"Filter {k}={v} did not match any columns in {manifest.columns}" ) before_count = len(manifest) manifest = manifest.dropna(subset=[k]) manifest = manifest[manifest[k] == v] after_count = len(manifest) logger.debug( f"Filter {k}={v}: {before_count} -> {after_count} (dropped {before_count - after_count})" ) for column, patterns in exclude.items(): if not isinstance(patterns, list): patterns = [patterns] before_count = len(manifest) if manifest[column].dtype == "object": mask = pd.Series([False] * len(manifest), index=manifest.index) for pattern in patterns: if isinstance(pattern, str): mask |= manifest[column].str.contains(pattern, na=False) else: mask |= manifest[column] == pattern else: mask = manifest[column].isin(patterns) manifest = manifest[~mask] after_count = len(manifest) logger.debug( f"Exclude {column} patterns {patterns}: {before_count} -> {after_count} (dropped {before_count - after_count})" ) if require_nonempty: before_count = len(manifest) manifest = manifest.dropna(subset=require_nonempty) after_count = len(manifest) logger.debug( f"Require nonempty {require_nonempty}: {before_count} -> {after_count} (dropped {before_count - after_count})" ) if min_entries is not None and len(manifest) < min_entries: raise ValueError( f"Expected at least {min_entries} entries, got {len(manifest)} " f"with {filter=} and {exclude=} and {require_nonempty=}" ) return manifest def import_item(name: str) -> Any: """ Find and import a function or class by its dotted module path. 
Args: name: Dotted module path like "mymodule.generators.objects.funcname" Returns: The imported function or class Raises: ModuleNotFoundError: If the module or attribute cannot be found """ *path_parts, item_name = name.split(".") try: return importlib.import_module("." + item_name, ".".join(path_parts)) except Exception as e: if not isinstance(e, ModuleNotFoundError): raise e mod = importlib.import_module(".".join(path_parts)) try: return getattr(mod, item_name) except AttributeError as e: raise AttributeError( f"Attribute {item_name} not found in module {mod.__name__}, {dir(mod)=}" ) from e def import_item_iterative(name: str, from_mod: Any | None = None) -> Callable: first, *rest = name.split(".") if len(rest) == 0: assert from_mod is not None, f"No module specified for {name}" return getattr(from_mod, name) elif from_mod is None: mod = importlib.import_module(first) return import_item_iterative(".".join(rest), mod) else: mod = getattr(from_mod, first) return import_item_iterative(".".join(rest), mod)