Skip to content

Preprocessing

AbstractNormalization

Bases: OneToOneFeatureMixin, TransformerMixin, BaseEstimator

Abstract base class for normalization techniques.

All normalization classes should inherit from this class and implement the transform and inverse_transform methods.

Source code in asf/preprocessing/performance_scaling.py
class AbstractNormalization(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """
    Common interface for normalization transformers.

    The base class supplies a no-op ``fit``. Concrete subclasses are expected
    to override ``transform`` and ``inverse_transform``; the versions defined
    here only raise ``NotImplementedError``.
    """

    def __init__(self) -> None:
        super().__init__()

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
    ) -> AbstractNormalization:
        """
        Fit the normalization to the data (no-op in the base class).

        Parameters
        ----------
        X : np.ndarray
            Input data. Not inspected here.
        y : np.ndarray or None, default=None
            Target values. Not inspected here.
        sample_weight : np.ndarray or None, default=None
            Sample weights. Not inspected here.

        Returns
        -------
        AbstractNormalization
            ``self``, unchanged.
        """
        # The base class has no state to learn.
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Map the input data to the normalized scale.

        Parameters
        ----------
        X : np.ndarray
            Input data.

        Returns
        -------
        np.ndarray
            Transformed data.

        Raises
        ------
        NotImplementedError
            Always; subclasses must provide the implementation.
        """
        raise NotImplementedError()

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Map normalized data back to the original scale.

        Parameters
        ----------
        X : np.ndarray
            Transformed data.

        Returns
        -------
        np.ndarray
            Original data.

        Raises
        ------
        NotImplementedError
            Always; subclasses must provide the implementation.
        """
        raise NotImplementedError()

fit(X, y=None, sample_weight=None)

Fit the normalization model to the data.

Parameters

- X : np.ndarray — Input data.
- y : np.ndarray or None, default=None — Target values.
- sample_weight : np.ndarray or None, default=None — Sample weights.

Returns

AbstractNormalization The fitted normalization instance.

Source code in asf/preprocessing/performance_scaling.py
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray | None = None,
    sample_weight: np.ndarray | None = None,
) -> AbstractNormalization:
    """
    Fit the normalization to the data (no-op in the base class).

    Parameters
    ----------
    X : np.ndarray
        Input data. Not inspected here.
    y : np.ndarray or None, default=None
        Target values. Not inspected here.
    sample_weight : np.ndarray or None, default=None
        Sample weights. Not inspected here.

    Returns
    -------
    AbstractNormalization
        ``self``, unchanged.
    """
    # The base class has no state to learn.
    fitted = self
    return fitted

inverse_transform(X)

Inverse transform the input data.

Parameters

X : np.ndarray Transformed data.

Returns

np.ndarray Original data.

Source code in asf/preprocessing/performance_scaling.py
def inverse_transform(self, X: np.ndarray) -> np.ndarray:
    """
    Map normalized data back to the original scale.

    Parameters
    ----------
    X : np.ndarray
        Transformed data.

    Returns
    -------
    np.ndarray
        Original data.

    Raises
    ------
    NotImplementedError
        Always; subclasses must provide the implementation.
    """
    raise NotImplementedError()

transform(X)

Transform the input data.

Parameters

X : np.ndarray Input data.

Returns

np.ndarray Transformed data.

Source code in asf/preprocessing/performance_scaling.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Map the input data to the normalized scale.

    Parameters
    ----------
    X : np.ndarray
        Input data.

    Returns
    -------
    np.ndarray
        Transformed data.

    Raises
    ------
    NotImplementedError
        Always; subclasses must provide the implementation.
    """
    raise NotImplementedError()

BoxCoxNormalization

Bases: AbstractNormalization

Normalization using Box-Cox transformation (Yeo-Johnson variant).

Source code in asf/preprocessing/performance_scaling.py
class BoxCoxNormalization(AbstractNormalization):
    """
    Power-transform normalization backed by ``PowerTransformer``.

    Despite the class name, the underlying transformer is configured with
    ``method="yeo-johnson"``. All inputs are flattened and treated as a single
    feature column; outputs are returned as 1-D arrays.
    """

    def __init__(self) -> None:
        super().__init__()

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
    ) -> BoxCoxNormalization:
        """
        Fit the power transformer to the data.

        Parameters
        ----------
        X : np.ndarray
            Input data. Any array-like is accepted; it is flattened into a
            single column before fitting.
        y : np.ndarray or None, default=None
            Target values. Unused.
        sample_weight : np.ndarray or None, default=None
            Sample weights. Unused.

        Returns
        -------
        BoxCoxNormalization
            The fitted normalization instance.
        """
        self.box_cox = PowerTransformer(method="yeo-johnson")
        # np.asarray lets lists / pandas objects through, which have no
        # usable ``reshape`` of their own.
        self.box_cox.fit(np.asarray(X).reshape(-1, 1))
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Transform the input data with the fitted power transformer.

        Parameters
        ----------
        X : np.ndarray
            Input data (any array-like; flattened to one column).

        Returns
        -------
        np.ndarray
            Transformed data as a 1-D array.
        """
        column = np.asarray(X).reshape(-1, 1)
        return self.box_cox.transform(column).reshape(-1)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Inverse transform the data back to the original scale.

        Parameters
        ----------
        X : np.ndarray
            Transformed data (any array-like; flattened to one column).

        Returns
        -------
        np.ndarray
            Data on the original scale as a 1-D array.
        """
        column = np.asarray(X).reshape(-1, 1)
        return self.box_cox.inverse_transform(column).reshape(-1)

fit(X, y=None, sample_weight=None)

Fit the Box-Cox transformer to the data.

Parameters

- X : np.ndarray — Input data.
- y : np.ndarray or None, default=None — Target values.
- sample_weight : np.ndarray or None, default=None — Sample weights.

Returns

BoxCoxNormalization The fitted normalization instance.

Source code in asf/preprocessing/performance_scaling.py
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray | None = None,
    sample_weight: np.ndarray | None = None,
) -> BoxCoxNormalization:
    """
    Fit the power transformer to the data.

    Parameters
    ----------
    X : np.ndarray
        Input data. Any array-like is accepted; it is flattened into a
        single column before fitting.
    y : np.ndarray or None, default=None
        Target values. Unused.
    sample_weight : np.ndarray or None, default=None
        Sample weights. Unused.

    Returns
    -------
    BoxCoxNormalization
        The fitted normalization instance.
    """
    self.box_cox = PowerTransformer(method="yeo-johnson")
    # np.asarray lets lists / pandas objects through, which have no usable
    # ``reshape`` of their own.
    self.box_cox.fit(np.asarray(X).reshape(-1, 1))
    return self

inverse_transform(X)

Inverse transform the data back to the original scale.

Parameters

X : np.ndarray Transformed data.

Returns

np.ndarray Original data.

Source code in asf/preprocessing/performance_scaling.py
def inverse_transform(self, X: np.ndarray) -> np.ndarray:
    """
    Inverse transform the data back to the original scale.

    Parameters
    ----------
    X : np.ndarray
        Transformed data (any array-like; flattened to one column).

    Returns
    -------
    np.ndarray
        Data on the original scale as a 1-D array.
    """
    # np.asarray lets lists / pandas objects through, which have no usable
    # ``reshape`` of their own.
    column = np.asarray(X).reshape(-1, 1)
    return self.box_cox.inverse_transform(column).reshape(-1)

transform(X)

Transform the input data using Box-Cox transformation.

Parameters

X : np.ndarray Input data.

Returns

np.ndarray Transformed data.

Source code in asf/preprocessing/performance_scaling.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Transform the input data with the fitted power transformer.

    Parameters
    ----------
    X : np.ndarray
        Input data (any array-like; flattened to one column).

    Returns
    -------
    np.ndarray
        Transformed data as a 1-D array.
    """
    # np.asarray lets lists / pandas objects through, which have no usable
    # ``reshape`` of their own.
    column = np.asarray(X).reshape(-1, 1)
    return self.box_cox.transform(column).reshape(-1)

DummyNormalization

Bases: AbstractNormalization

Normalization that does not change the data.

Source code in asf/preprocessing/performance_scaling.py
class DummyNormalization(AbstractNormalization):
    """
    Identity normalization: data passes through untouched in both directions.
    """

    def __init__(self) -> None:
        super().__init__()

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
    ) -> DummyNormalization:
        """
        Fit the identity normalization (nothing is learned).

        Parameters
        ----------
        X : np.ndarray
            Input data. Not inspected.
        y : np.ndarray or None, default=None
            Target values. Not inspected.
        sample_weight : np.ndarray or None, default=None
            Sample weights. Not inspected.

        Returns
        -------
        DummyNormalization
            ``self``, unchanged.
        """
        # The identity mapping has no state to learn.
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Return the input data unchanged.

        Parameters
        ----------
        X : np.ndarray
            Input data.

        Returns
        -------
        np.ndarray
            The very same object that was passed in (no copy is made).
        """
        unchanged = X
        return unchanged

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Return the transformed data unchanged.

        Parameters
        ----------
        X : np.ndarray
            Transformed data.

        Returns
        -------
        np.ndarray
            The very same object that was passed in (no copy is made).
        """
        unchanged = X
        return unchanged

fit(X, y=None, sample_weight=None)

Fit the DummyNormalization model to the data.

Parameters

- X : np.ndarray — Input data.
- y : np.ndarray or None, default=None — Target values.
- sample_weight : np.ndarray or None, default=None — Sample weights.

Returns

DummyNormalization The fitted normalization instance.

Source code in asf/preprocessing/performance_scaling.py
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray | None = None,
    sample_weight: np.ndarray | None = None,
) -> DummyNormalization:
    """
    Fit the identity normalization (nothing is learned).

    Parameters
    ----------
    X : np.ndarray
        Input data. Not inspected.
    y : np.ndarray or None, default=None
        Target values. Not inspected.
    sample_weight : np.ndarray or None, default=None
        Sample weights. Not inspected.

    Returns
    -------
    DummyNormalization
        ``self``, unchanged.
    """
    # The identity mapping has no state to learn.
    fitted = self
    return fitted

inverse_transform(X)

Inverse transform the data (no change).

Parameters

X : np.ndarray Transformed data.

Returns

np.ndarray Original data.

Source code in asf/preprocessing/performance_scaling.py
def inverse_transform(self, X: np.ndarray) -> np.ndarray:
    """
    Return the transformed data unchanged.

    Parameters
    ----------
    X : np.ndarray
        Transformed data.

    Returns
    -------
    np.ndarray
        The very same object that was passed in (no copy is made).
    """
    unchanged = X
    return unchanged

transform(X)

Transform the input data (no change).

Parameters

X : np.ndarray Input data.

Returns

np.ndarray Transformed data.

Source code in asf/preprocessing/performance_scaling.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Return the input data unchanged.

    Parameters
    ----------
    X : np.ndarray
        Input data.

    Returns
    -------
    np.ndarray
        The very same object that was passed in (no copy is made).
    """
    unchanged = X
    return unchanged

FeatureGroupSelector

Bases: BaseEstimator, TransformerMixin

A sklearn-compatible transformer that selects features based on feature groups.

This transformer filters input features to only include those belonging to the specified feature groups. It is designed to work with ASlib scenarios where features are organized into groups (feature steps).

Parameters

- feature_groups : dict[str, Any] — Dictionary mapping feature group names to their metadata. Each value should be a dict with a 'provides' key listing the feature names in that group, and optionally a 'requires' key listing prerequisite groups.
- selected_groups : list[str] | None, default=None — List of feature group names to include. If None, all groups are included.
- validate_requirements : bool, default=True — If True, validate that all required prerequisite groups are included when selecting a group.

Attributes

selected_features_ : list[str] List of feature names that will be selected after fitting.

Examples

    >>> feature_groups = {
    ...     'basic': {'provides': ['f1', 'f2']},
    ...     'advanced': {'provides': ['f3', 'f4']}
    ... }
    >>> selector = FeatureGroupSelector(feature_groups, selected_groups=['basic'])
    >>> X = pd.DataFrame({'f1': [1], 'f2': [2], 'f3': [3], 'f4': [4]})
    >>> selector.fit_transform(X)
       f1  f2
    0   1   2

Example with prerequisites

    >>> feature_groups = {
    ...     'Pre': {'provides': ['f1', 'f2']},
    ...     'Basic': {'provides': ['f3', 'f4'], 'requires': ['Pre']}
    ... }
    >>> selector = FeatureGroupSelector(feature_groups, selected_groups=['Basic'])

This will raise MissingPrerequisiteGroupError because 'Pre' is not selected

Source code in asf/preprocessing/feature_group_selector.py
class FeatureGroupSelector(BaseEstimator, TransformerMixin):
    """
    A sklearn-compatible transformer that selects features based on feature groups.

    This transformer filters input features to only include those belonging to
    the specified feature groups. It is designed to work with ASlib scenarios
    where features are organized into groups (feature steps).

    Parameters
    ----------
    feature_groups : dict[str, Any]
        Dictionary mapping feature group names to their metadata.
        Each value should be a dict with a 'provides' key listing the feature names
        in that group, and optionally a 'requires' key listing prerequisite groups.
    selected_groups : list[str] | None, default=None
        List of feature group names to include. If None, all groups are included.
    validate_requirements : bool, default=True
        If True, validate that all required prerequisite groups are included
        when selecting a group. The check runs in the constructor.

    Attributes
    ----------
    selected_features_ : list[str]
        List of feature names that will be selected after fitting. Each name
        appears at most once, in first-seen order.

    Examples
    --------
    >>> feature_groups = {
    ...     'basic': {'provides': ['f1', 'f2']},
    ...     'advanced': {'provides': ['f3', 'f4']}
    ... }
    >>> selector = FeatureGroupSelector(feature_groups, selected_groups=['basic'])
    >>> X = pd.DataFrame({'f1': [1], 'f2': [2], 'f3': [3], 'f4': [4]})
    >>> selector.fit_transform(X)
       f1  f2
    0   1   2

    >>> # Example with prerequisites
    >>> feature_groups = {
    ...     'Pre': {'provides': ['f1', 'f2']},
    ...     'Basic': {'provides': ['f3', 'f4'], 'requires': ['Pre']}
    ... }
    >>> selector = FeatureGroupSelector(feature_groups, selected_groups=['Basic'])
    >>> # This will raise MissingPrerequisiteGroupError because 'Pre' is not selected
    """

    def __init__(
        self,
        feature_groups: dict[str, Any],
        selected_groups: list[str] | None = None,
        validate_requirements: bool = True,
    ):
        self.feature_groups = feature_groups
        self.selected_groups = selected_groups
        self.validate_requirements = validate_requirements

        # Fail fast on an inconsistent selection instead of waiting for fit().
        if validate_requirements and selected_groups is not None:
            self._validate_prerequisites(selected_groups)

    def _validate_prerequisites(self, selected_groups: list[str]) -> None:
        """
        Validate that all required prerequisite groups are included.

        Delegates to :meth:`validate_feature_group_selection` using this
        instance's ``feature_groups`` so the rule lives in one place.

        Parameters
        ----------
        selected_groups : list[str]
            List of selected feature group names.

        Raises
        ------
        MissingPrerequisiteGroupError
            If a selected group requires another group that is not selected.
        """
        self.validate_feature_group_selection(self.feature_groups, selected_groups)

    @staticmethod
    def validate_feature_group_selection(
        feature_groups: dict[str, Any],
        selected_groups: list[str],
    ) -> None:
        """
        Validate that a list of selected groups satisfies all prerequisites.

        This is a static utility method that can be used to validate selections
        without creating a FeatureGroupSelector instance. Selected names that
        are not keys of ``feature_groups`` are ignored.

        Parameters
        ----------
        feature_groups : dict[str, Any]
            Dictionary of all feature groups with their metadata.
        selected_groups : list[str]
            List of selected feature group names.

        Raises
        ------
        MissingPrerequisiteGroupError
            If a selected group requires another group that is not selected.
        """
        selected_set = set(selected_groups)

        for group_name in selected_groups:
            if group_name not in feature_groups:
                continue

            required_groups = feature_groups[group_name].get("requires", [])

            for required_group in required_groups:
                if required_group not in selected_set:
                    raise MissingPrerequisiteGroupError(
                        f"Feature group '{group_name}' requires group '{required_group}' "
                        f"to be selected, but it is not included in the selected groups. "
                        f"Selected groups: {list(selected_groups)}"
                    )

    def fit(self, X: pd.DataFrame, y: Any = None) -> FeatureGroupSelector:
        """
        Fit the selector by determining which features to select.

        Features are gathered from the 'provides' lists of the groups in use,
        restricted to columns present in ``X`` and de-duplicated in first-seen
        order. If that leaves nothing, all columns of ``X`` are selected.

        Parameters
        ----------
        X : pd.DataFrame
            Input features.
        y : Any, default=None
            Not used, present for API compatibility.

        Returns
        -------
        FeatureGroupSelector
            The fitted selector instance.
        """
        if self.selected_groups is None:
            groups_to_use = list(self.feature_groups.keys())
        else:
            groups_to_use = self.selected_groups

        # Collect candidate features; unknown group names are skipped.
        candidates = []
        for fg_name in groups_to_use:
            if fg_name not in self.feature_groups:
                continue
            fg_info = self.feature_groups[fg_name]
            if "provides" in fg_info:
                candidates.extend(fg_info["provides"])

        # dict.fromkeys drops repeats while keeping order: a feature listed by
        # several groups would otherwise yield duplicate output columns.
        self.selected_features_ = list(
            dict.fromkeys(f for f in candidates if f in X.columns)
        )

        # If no features selected, use all columns
        if not self.selected_features_:
            self.selected_features_ = list(X.columns)

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Transform the input by selecting only the specified features.

        Parameters
        ----------
        X : pd.DataFrame
            Input features.

        Returns
        -------
        pd.DataFrame
            DataFrame with only the selected features that exist in ``X``.
            If none of them exist, ``X`` itself is returned unchanged.
        """
        available_features = [f for f in self.selected_features_ if f in X.columns]
        if not available_features:
            return X
        return X[available_features]

    def get_feature_names_out(self, input_features: Any = None) -> list[str]:
        """
        Get output feature names.

        Parameters
        ----------
        input_features : Any, default=None
            Not used, present for API compatibility.

        Returns
        -------
        list[str]
            List of selected feature names (the fitted ``selected_features_``).
        """
        return self.selected_features_

    @staticmethod
    def get_selected_groups_from_config(
        feature_groups: dict[str, Any],
        config: dict[str, Any] | Configuration,
        prefix: str = "feature_group_",
    ) -> dict[str, Any] | None:
        """
        Extract selected feature groups from a SMAC configuration.

        A group is kept when ``config.get(prefix + name, True)`` is truthy, so
        groups without an entry in the configuration are selected by default.

        Parameters
        ----------
        feature_groups : dict[str, Any]
            Dictionary of all feature groups.
        config : dict[str, Any] or Configuration
            SMAC configuration; only its ``get`` method is used.
        prefix : str, default="feature_group_"
            Prefix used for feature group parameters in the config.

        Returns
        -------
        dict[str, Any] or None
            Dictionary of selected feature groups, or None if no groups selected.
        """
        selected = {
            fg_name: fg_info
            for fg_name, fg_info in feature_groups.items()
            if config.get(f"{prefix}{fg_name}", True)
        }
        return selected if selected else None

fit(X, y=None)

Fit the selector by determining which features to select.

Parameters

- X : pd.DataFrame — Input features.
- y : Any, default=None — Not used, present for API compatibility.

Returns

FeatureGroupSelector The fitted selector instance.

Source code in asf/preprocessing/feature_group_selector.py
def fit(self, X: pd.DataFrame, y: Any = None) -> FeatureGroupSelector:
    """
    Fit the selector by determining which features to select.

    Features are gathered from the 'provides' lists of the groups in use,
    restricted to columns present in ``X`` and de-duplicated in first-seen
    order. If that leaves nothing, all columns of ``X`` are selected.

    Parameters
    ----------
    X : pd.DataFrame
        Input features.
    y : Any, default=None
        Not used, present for API compatibility.

    Returns
    -------
    FeatureGroupSelector
        The fitted selector instance.
    """
    if self.selected_groups is None:
        groups_to_use = list(self.feature_groups.keys())
    else:
        groups_to_use = self.selected_groups

    # Collect candidate features; unknown group names are skipped.
    candidates = []
    for fg_name in groups_to_use:
        if fg_name not in self.feature_groups:
            continue
        fg_info = self.feature_groups[fg_name]
        if "provides" in fg_info:
            candidates.extend(fg_info["provides"])

    # dict.fromkeys drops repeats while keeping order: a feature listed by
    # several groups would otherwise yield duplicate output columns.
    self.selected_features_ = list(
        dict.fromkeys(f for f in candidates if f in X.columns)
    )

    # If no features selected, use all columns
    if not self.selected_features_:
        self.selected_features_ = list(X.columns)

    return self

get_feature_names_out(input_features=None)

Get output feature names.

Parameters

input_features : Any, default=None Not used, present for API compatibility.

Returns

list[str] List of selected feature names.

Source code in asf/preprocessing/feature_group_selector.py
def get_feature_names_out(self, input_features: Any = None) -> list[str]:
    """
    Report the names of the features this selector outputs.

    Parameters
    ----------
    input_features : Any, default=None
        Ignored; accepted only for API compatibility.

    Returns
    -------
    list[str]
        The fitted ``selected_features_`` list (the same object, not a copy).
    """
    names = self.selected_features_
    return names

get_selected_groups_from_config(feature_groups, config, prefix='feature_group_') staticmethod

Extract selected feature groups from a SMAC configuration.

Parameters

- feature_groups : dict[str, Any] — Dictionary of all feature groups.
- config : dict[str, Any] — SMAC configuration dictionary.
- prefix : str, default="feature_group_" — Prefix used for feature group parameters in the config.

Returns

dict[str, Any] or None Dictionary of selected feature groups, or None if no groups selected.

Source code in asf/preprocessing/feature_group_selector.py
@staticmethod
def get_selected_groups_from_config(
    feature_groups: dict[str, Any],
    config: dict[str, Any] | Configuration,
    prefix: str = "feature_group_",
) -> dict[str, Any] | None:
    """
    Extract selected feature groups from a SMAC configuration.

    Parameters
    ----------
    feature_groups : dict[str, Any]
        Dictionary of all feature groups.
    config : dict[str, Any]
        SMAC configuration dictionary.
    prefix : str, default="feature_group_"
        Prefix used for feature group parameters in the config.

    Returns
    -------
    dict[str, Any] or None
        Dictionary of selected feature groups, or None if no groups selected.
    """
    selected = {}
    for fg_name, fg_info in feature_groups.items():
        if config.get(f"{prefix}{fg_name}", True):
            selected[fg_name] = fg_info
    return selected if selected else None

transform(X)

Transform the input by selecting only the specified features.

Parameters

X : pd.DataFrame Input features.

Returns

pd.DataFrame DataFrame with only the selected features.

Source code in asf/preprocessing/feature_group_selector.py
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
    """
    Restrict the input to the features chosen during fitting.

    Parameters
    ----------
    X : pd.DataFrame
        Input features.

    Returns
    -------
    pd.DataFrame
        The selected columns that exist in ``X``, in ``selected_features_``
        order. If none of them exist, ``X`` itself is returned unchanged.
    """
    # Ignore fitted names that this frame does not carry.
    present = [name for name in self.selected_features_ if name in X.columns]
    return X[present] if present else X

validate_feature_group_selection(feature_groups, selected_groups) staticmethod

Validate that a list of selected groups satisfies all prerequisites.

This is a static utility method that can be used to validate selections without creating a FeatureGroupSelector instance.

Parameters

- feature_groups : dict[str, Any] — Dictionary of all feature groups with their metadata.
- selected_groups : list[str] — List of selected feature group names.

Raises

MissingPrerequisiteGroupError If a selected group requires another group that is not selected.

Source code in asf/preprocessing/feature_group_selector.py
@staticmethod
def validate_feature_group_selection(
    feature_groups: dict[str, Any],
    selected_groups: list[str],
) -> None:
    """
    Validate that a list of selected groups satisfies all prerequisites.

    This is a static utility method that can be used to validate selections
    without creating a FeatureGroupSelector instance.

    Parameters
    ----------
    feature_groups : dict[str, Any]
        Dictionary of all feature groups with their metadata.
    selected_groups : list[str]
        List of selected feature group names.

    Raises
    ------
    MissingPrerequisiteGroupError
        If a selected group requires another group that is not selected.
    """
    selected_set = set(selected_groups)

    for group_name in selected_groups:
        if group_name not in feature_groups:
            continue

        group_info = feature_groups[group_name]
        required_groups = group_info.get("requires", [])

        for required_group in required_groups:
            if required_group not in selected_set:
                raise MissingPrerequisiteGroupError(
                    f"Feature group '{group_name}' requires group '{required_group}' "
                    f"to be selected, but it is not included in the selected groups. "
                    f"Selected groups: {list(selected_groups)}"
                )

InvSigmoidNormalization

Bases: AbstractNormalization

Normalization using inverse sigmoid scaling.

Source code in asf/preprocessing/performance_scaling.py
class InvSigmoidNormalization(AbstractNormalization):
    """
    Normalization using inverse sigmoid (logit) scaling.

    Data is first min-max scaled into ``(1e-6, 1 - 1e-6)`` and then mapped
    through ``log(p / (1 - p))``. All inputs are flattened and treated as a
    single feature column; outputs are returned as 1-D arrays.
    """

    def __init__(self) -> None:
        super().__init__()

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
    ) -> InvSigmoidNormalization:
        """
        Fit the internal min-max scaler to the data.

        Parameters
        ----------
        X : np.ndarray
            Input data (any array-like; flattened to one column).
        y : np.ndarray or None, default=None
            Target values. Unused.
        sample_weight : np.ndarray or None, default=None
            Sample weights. Unused.

        Returns
        -------
        InvSigmoidNormalization
            The fitted normalization instance.
        """
        # The open interval keeps the logit finite for the fitted min and max.
        self.min_max_scale = MinMaxScaler(feature_range=(1e-6, 1 - 1e-6))
        self.min_max_scale.fit(np.asarray(X).reshape(-1, 1))
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Transform the input data using inverse sigmoid scaling.

        Parameters
        ----------
        X : np.ndarray
            Input data (any array-like, matching what ``fit`` accepts).

        Returns
        -------
        np.ndarray
            Logit of the min-max scaled data, as a 1-D array.
        """
        # np.asarray mirrors fit(), so lists / pandas objects work here too.
        column = np.asarray(X).reshape(-1, 1)
        X_scaled = self.min_max_scale.transform(column).reshape(-1)
        # NOTE(review): values outside the fitted range presumably scale
        # outside (0, 1) and would then produce nan/inf here; confirm.
        return np.log(X_scaled / (1 - X_scaled))

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Inverse transform the data back to the original scale.

        Parameters
        ----------
        X : np.ndarray
            Transformed data.

        Returns
        -------
        np.ndarray
            Data on the original scale as a 1-D array.
        """
        # expit (the sigmoid) undoes the logit; the scaler undoes the rest.
        X_unit = scipy.special.expit(X)
        return self.min_max_scale.inverse_transform(X_unit.reshape(-1, 1)).reshape(-1)

fit(X, y=None, sample_weight=None)

Fit the InvSigmoidNormalization model to the data.

Parameters

- X : np.ndarray — Input data.
- y : np.ndarray or None, default=None — Target values.
- sample_weight : np.ndarray or None, default=None — Sample weights.

Returns

InvSigmoidNormalization The fitted normalization instance.

Source code in asf/preprocessing/performance_scaling.py
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray | None = None,
    sample_weight: np.ndarray | None = None,
) -> InvSigmoidNormalization:
    """
    Fit the internal min-max scaler to the data.

    Parameters
    ----------
    X : np.ndarray
        Input data; converted with ``np.asarray`` and flattened to one column.
    y : np.ndarray or None, default=None
        Target values. Unused.
    sample_weight : np.ndarray or None, default=None
        Sample weights. Unused.

    Returns
    -------
    InvSigmoidNormalization
        The fitted normalization instance.
    """
    # Scale into an open interval slightly inside (0, 1).
    self.min_max_scale = MinMaxScaler(feature_range=(1e-6, 1 - 1e-6))
    column = np.asarray(X).reshape(-1, 1)
    self.min_max_scale.fit(column)
    return self

inverse_transform(X)

Inverse transform the data back to the original scale.

Parameters

X : np.ndarray Transformed data.

Returns

np.ndarray Original data.

Source code in asf/preprocessing/performance_scaling.py
def inverse_transform(self, X: np.ndarray) -> np.ndarray:
    """
    Map inverse-sigmoid values back to the original scale.

    Parameters
    ----------
    X : np.ndarray
        Transformed data.

    Returns
    -------
    np.ndarray
        Data on the original scale as a 1-D array.
    """
    # expit (the sigmoid) is applied first, then the min-max scaling is undone.
    squashed = scipy.special.expit(X)
    column = squashed.reshape(-1, 1)
    restored = self.min_max_scale.inverse_transform(column)
    return restored.reshape(-1)

transform(X)

Transform the input data using inverse sigmoid scaling.

Parameters

X : np.ndarray Input data.

Returns

np.ndarray Transformed data.

Source code in asf/preprocessing/performance_scaling.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Transform the input data using inverse sigmoid (logit) scaling.

    The data is rescaled by the fitted min-max scaler and then mapped
    through ``log(p / (1 - p))``.

    Parameters
    ----------
    X : np.ndarray
        Input data. Any array-like is accepted, matching ``fit``.

    Returns
    -------
    np.ndarray
        One-dimensional array of transformed data.
    """
    # Coerce first so lists and other array-likes work here as they do in fit.
    X_scaled = self.min_max_scale.transform(np.asarray(X).reshape(-1, 1)).reshape(-1)
    return np.log(X_scaled / (1 - X_scaled))

LogNormalization

Bases: AbstractNormalization

Normalization using logarithmic scaling.

Source code in asf/preprocessing/performance_scaling.py
class LogNormalization(AbstractNormalization):
    """
    Normalization using logarithmic scaling.

    When the fitted data has a non-positive minimum, inputs are shifted by
    that minimum plus ``eps`` before the logarithm is taken. Strictly
    positive data is transformed without any shift.
    """

    def __init__(self, base: float = 10.0, eps: float = 1e-6) -> None:
        """
        Initialize LogNormalization.

        Parameters
        ----------
        base : float, default=10.0
            Base of the logarithm.
        eps : float, default=1e-6
            Small constant to avoid log(0).
        """
        super().__init__()
        self.base = base
        self.eps = eps

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
    ) -> LogNormalization:
        """
        Fit the LogNormalization model to the data.

        Sets ``min_val`` (the shift removed from the data) and ``eps_``
        (the offset actually applied). The ``eps`` hyperparameter itself is
        left untouched so that refitting on different data keeps working.

        Parameters
        ----------
        X : np.ndarray
            Input data.
        y : np.ndarray or None, default=None
            Target values (unused).
        sample_weight : np.ndarray or None, default=None
            Sample weights (unused).

        Returns
        -------
        LogNormalization
            The fitted normalization instance.
        """
        x_min = np.min(np.asarray(X))
        if x_min <= 0:
            self.min_val = x_min
            self.eps_ = self.eps
        else:
            # Strictly positive data needs neither a shift nor an offset.
            self.min_val = 0.0
            self.eps_ = 0.0

        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Transform the input data using logarithmic scaling.

        Parameters
        ----------
        X : np.ndarray
            Input data.

        Returns
        -------
        np.ndarray
            Transformed data.
        """
        X_shifted = X - self.min_val + self.eps_
        # Change of base: log_b(x) = ln(x) / ln(b).
        return np.log(X_shifted) / np.log(self.base)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Inverse transform the data back to the original scale.

        Parameters
        ----------
        X : np.ndarray
            Transformed data.

        Returns
        -------
        np.ndarray
            Original data.
        """
        # Undo exactly the shift that transform applied; this also covers a
        # fitted minimum of exactly 0, where only the eps offset was added.
        return np.power(self.base, X) + self.min_val - self.eps_

__init__(base=10.0, eps=1e-06)

Initialize LogNormalization.

Parameters

base : float, default=10.0 Base of the logarithm. eps : float, default=1e-6 Small constant to avoid log(0).

Source code in asf/preprocessing/performance_scaling.py
def __init__(self, base: float = 10.0, eps: float = 1e-6) -> None:
    """
    Store the settings of the logarithmic normalization.

    Parameters
    ----------
    base : float, default=10.0
        Base of the logarithm.
    eps : float, default=1e-6
        Small constant to avoid log(0).
    """
    super().__init__()
    self.base, self.eps = base, eps

fit(X, y=None, sample_weight=None)

Fit the LogNormalization model to the data.

Parameters

X : np.ndarray Input data. y : np.ndarray or None, default=None Target values. sample_weight : np.ndarray or None, default=None Sample weights.

Returns

LogNormalization The fitted normalization instance.

Source code in asf/preprocessing/performance_scaling.py
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray | None = None,
    sample_weight: np.ndarray | None = None,
) -> LogNormalization:
    """
    Determine the shift needed before taking the logarithm.

    Parameters
    ----------
    X : np.ndarray
        Input data.
    y : np.ndarray or None, default=None
        Not used by this method.
    sample_weight : np.ndarray or None, default=None
        Not used by this method.

    Returns
    -------
    LogNormalization
        This instance, with ``min_val`` set.
    """
    smallest = np.min(np.asarray(X))
    if smallest <= 0:
        # Non-positive data: remember the minimum so it can be shifted away.
        self.min_val = smallest
        return self

    # Strictly positive data: no shift and no offset.
    # NOTE(review): this overwrites the ``eps`` hyperparameter, so a later
    # refit on non-positive data would reuse eps == 0.0.
    self.min_val = 0.0
    self.eps = 0.0
    return self

inverse_transform(X)

Inverse transform the data back to the original scale.

Parameters

X : np.ndarray Transformed data.

Returns

np.ndarray Original data.

Source code in asf/preprocessing/performance_scaling.py
def inverse_transform(self, X: np.ndarray) -> np.ndarray:
    """
    Inverse transform the data back to the original scale.

    Parameters
    ----------
    X : np.ndarray
        Transformed data.

    Returns
    -------
    np.ndarray
        Original data.
    """
    X_orig = np.power(self.base, X)
    # Always undo the shift. The previous ``min_val != 0`` guard skipped the
    # eps correction when the fitted minimum was exactly 0, although
    # transform still adds eps in that case. For strictly positive data
    # both terms are 0.0, so this is a no-op.
    return X_orig + self.min_val - self.eps

transform(X)

Transform the input data using logarithmic scaling.

Parameters

X : np.ndarray Input data.

Returns

np.ndarray Transformed data.

Source code in asf/preprocessing/performance_scaling.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Apply the shifted logarithm in the configured base.

    Parameters
    ----------
    X : np.ndarray
        Input data.

    Returns
    -------
    np.ndarray
        Transformed data.
    """
    shifted = X - self.min_val + self.eps
    natural_log = np.log(shifted)
    # Change of base: log_b(x) = ln(x) / ln(b).
    return natural_log / np.log(self.base)

MinMaxNormalization

Bases: AbstractNormalization

Normalization using Min-Max scaling.

Source code in asf/preprocessing/performance_scaling.py
class MinMaxNormalization(AbstractNormalization):
    """
    Normalization using Min-Max scaling.

    All values are flattened and treated as a single feature, so one
    minimum and one maximum are fitted for the whole input.
    """

    def __init__(self, feature_range: tuple[float, float] = (0, 1)) -> None:
        """
        Initialize MinMaxNormalization.

        Parameters
        ----------
        feature_range : tuple[float, float], default=(0, 1)
            Desired range of transformed data.
        """
        super().__init__()
        self.feature_range = feature_range

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
    ) -> MinMaxNormalization:
        """
        Fit the Min-Max scaler to the data.

        Parameters
        ----------
        X : np.ndarray
            Input data. Any array-like is accepted.
        y : np.ndarray or None, default=None
            Target values (unused).
        sample_weight : np.ndarray or None, default=None
            Sample weights (unused).

        Returns
        -------
        MinMaxNormalization
            The fitted normalization instance.
        """
        self.min_max_scale = MinMaxScaler(feature_range=self.feature_range)
        # np.asarray lets lists and other array-likes through; they have no
        # ``reshape`` method of their own.
        self.min_max_scale.fit(np.asarray(X).reshape(-1, 1))
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Transform the input data using Min-Max scaling.

        Parameters
        ----------
        X : np.ndarray
            Input data. Any array-like is accepted.

        Returns
        -------
        np.ndarray
            One-dimensional array of transformed data.
        """
        column = np.asarray(X).reshape(-1, 1)
        return self.min_max_scale.transform(column).reshape(-1)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Inverse transform the data back to the original scale.

        Parameters
        ----------
        X : np.ndarray
            Transformed data. Any array-like is accepted.

        Returns
        -------
        np.ndarray
            One-dimensional array of data on the original scale.
        """
        column = np.asarray(X).reshape(-1, 1)
        return self.min_max_scale.inverse_transform(column).reshape(-1)

__init__(feature_range=(0, 1))

Initialize MinMaxNormalization.

Parameters

feature_range : tuple[float, float], default=(0, 1) Desired range of transformed data.

Source code in asf/preprocessing/performance_scaling.py
def __init__(self, feature_range: tuple[float, float] = (0, 1)) -> None:
    """
    Store the target range for Min-Max scaling.

    Parameters
    ----------
    feature_range : tuple[float, float], default=(0, 1)
        Desired range of transformed data; kept on the instance as
        ``feature_range``.
    """
    super().__init__()
    self.feature_range = feature_range

fit(X, y=None, sample_weight=None)

Fit the Min-Max scaler to the data.

Parameters

X : np.ndarray Input data. y : np.ndarray or None, default=None Target values. sample_weight : np.ndarray or None, default=None Sample weights.

Returns

MinMaxNormalization The fitted normalization instance.

Source code in asf/preprocessing/performance_scaling.py
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray | None = None,
    sample_weight: np.ndarray | None = None,
) -> MinMaxNormalization:
    """
    Fit the Min-Max scaler to the data.

    Parameters
    ----------
    X : np.ndarray
        Input data. Any array-like is accepted; it is flattened into a
        single column before fitting.
    y : np.ndarray or None, default=None
        Target values (unused).
    sample_weight : np.ndarray or None, default=None
        Sample weights (unused).

    Returns
    -------
    MinMaxNormalization
        The fitted normalization instance.
    """
    self.min_max_scale = MinMaxScaler(feature_range=self.feature_range)
    # np.asarray lets lists and other array-likes through; they have no
    # ``reshape`` method of their own.
    self.min_max_scale.fit(np.asarray(X).reshape(-1, 1))
    return self

inverse_transform(X)

Inverse transform the data back to the original scale.

Parameters

X : np.ndarray Transformed data.

Returns

np.ndarray Original data.

Source code in asf/preprocessing/performance_scaling.py
def inverse_transform(self, X: np.ndarray) -> np.ndarray:
    """
    Inverse transform the data back to the original scale.

    Parameters
    ----------
    X : np.ndarray
        Transformed data. Any array-like is accepted.

    Returns
    -------
    np.ndarray
        One-dimensional array of data on the original scale.
    """
    # Coerce first: lists and other array-likes have no ``reshape`` method.
    column = np.asarray(X).reshape(-1, 1)
    return self.min_max_scale.inverse_transform(column).reshape(-1)

transform(X)

Transform the input data using Min-Max scaling.

Parameters

X : np.ndarray Input data.

Returns

np.ndarray Transformed data.

Source code in asf/preprocessing/performance_scaling.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Transform the input data using Min-Max scaling.

    Parameters
    ----------
    X : np.ndarray
        Input data. Any array-like is accepted.

    Returns
    -------
    np.ndarray
        One-dimensional array of transformed data.
    """
    # Coerce first: lists and other array-likes have no ``reshape`` method.
    column = np.asarray(X).reshape(-1, 1)
    return self.min_max_scale.transform(column).reshape(-1)

MissingPrerequisiteGroupError

Bases: ValueError

Raised when a feature group is selected without its required prerequisite groups.

Source code in asf/preprocessing/feature_group_selector.py
class MissingPrerequisiteGroupError(ValueError):
    """
    Error for a feature group chosen without the groups it requires.

    Subclasses ``ValueError``, so handlers written for ``ValueError``
    also catch it.
    """

NegExpNormalization

Bases: AbstractNormalization

Normalization using negative exponential scaling.

Source code in asf/preprocessing/performance_scaling.py
class NegExpNormalization(AbstractNormalization):
    """
    Normalization that maps values through ``exp(-x)``.

    No parameters are learned: ``fit`` is a no-op.
    """

    def __init__(self) -> None:
        super().__init__()

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
    ) -> NegExpNormalization:
        """
        Return the instance unchanged; nothing is estimated from the data.

        Parameters
        ----------
        X : np.ndarray
            Input data (ignored).
        y : np.ndarray or None, default=None
            Target values (ignored).
        sample_weight : np.ndarray or None, default=None
            Sample weights (ignored).

        Returns
        -------
        NegExpNormalization
            This instance.
        """
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Apply negative exponential scaling, ``exp(-X)``.

        Parameters
        ----------
        X : np.ndarray
            Input data.

        Returns
        -------
        np.ndarray
            Transformed data.
        """
        negated = -X
        return np.exp(negated)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Undo the negative exponential scaling, ``-log(X)``.

        Parameters
        ----------
        X : np.ndarray
            Transformed data.

        Returns
        -------
        np.ndarray
            Original data.
        """
        log_values = np.log(X)
        return -log_values

fit(X, y=None, sample_weight=None)

Fit the NegExpNormalization model to the data.

Parameters

X : np.ndarray Input data. y : np.ndarray or None, default=None Target values. sample_weight : np.ndarray or None, default=None Sample weights.

Returns

NegExpNormalization The fitted normalization instance.

Source code in asf/preprocessing/performance_scaling.py
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray | None = None,
    sample_weight: np.ndarray | None = None,
) -> NegExpNormalization:
    """
    Return the instance unchanged; nothing is estimated from the data.

    Parameters
    ----------
    X : np.ndarray
        Input data (ignored).
    y : np.ndarray or None, default=None
        Target values (ignored).
    sample_weight : np.ndarray or None, default=None
        Sample weights (ignored).

    Returns
    -------
    NegExpNormalization
        This instance.
    """
    return self

inverse_transform(X)

Inverse transform the data back to the original scale.

Parameters

X : np.ndarray Transformed data.

Returns

np.ndarray Original data.

Source code in asf/preprocessing/performance_scaling.py
def inverse_transform(self, X: np.ndarray) -> np.ndarray:
    """
    Undo the negative exponential scaling, ``-log(X)``.

    Parameters
    ----------
    X : np.ndarray
        Transformed data.

    Returns
    -------
    np.ndarray
        Original data.
    """
    log_values = np.log(X)
    return -log_values

transform(X)

Transform the input data using negative exponential scaling.

Parameters

X : np.ndarray Input data.

Returns

np.ndarray Transformed data.

Source code in asf/preprocessing/performance_scaling.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Apply negative exponential scaling, ``exp(-X)``.

    Parameters
    ----------
    X : np.ndarray
        Input data.

    Returns
    -------
    np.ndarray
        Transformed data.
    """
    negated = -X
    return np.exp(negated)

SqrtNormalization

Bases: AbstractNormalization

Normalization using square root scaling.

Source code in asf/preprocessing/performance_scaling.py
class SqrtNormalization(AbstractNormalization):
    """
    Normalization using square root scaling.

    When the fitted data contains negative values, inputs are shifted so
    that the fitted minimum maps to ``eps`` before the square root is taken.
    """

    def __init__(self, eps: float = 1e-6) -> None:
        """
        Initialize SqrtNormalization.

        Parameters
        ----------
        eps : float, default=1e-6
            Small constant to avoid sqrt(0).
        """
        super().__init__()
        self.eps = eps

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
    ) -> SqrtNormalization:
        """
        Fit the SqrtNormalization model to the data.

        Parameters
        ----------
        X : np.ndarray
            Input data.
        y : np.ndarray or None, default=None
            Target values (unused).
        sample_weight : np.ndarray or None, default=None
            Sample weights (unused).

        Returns
        -------
        SqrtNormalization
            The fitted normalization instance, with ``min_val`` set to the
            data minimum when it is negative and to 0.0 otherwise.
        """
        x_min = np.min(np.asarray(X))
        self.min_val = x_min if x_min < 0 else 0.0
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Transform the input data using square root scaling.

        Parameters
        ----------
        X : np.ndarray
            Input data.

        Returns
        -------
        np.ndarray
            Transformed data.
        """
        # Subtract the (negative) minimum so the data becomes non-negative.
        # Adding it, as before, pushed values further below zero and made
        # the square root return NaN.
        X_shifted = X - self.min_val + self.eps
        return np.sqrt(X_shifted)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Inverse transform the data back to the original scale.

        Parameters
        ----------
        X : np.ndarray
            Transformed data.

        Returns
        -------
        np.ndarray
            Original data.
        """
        # Exact inverse of transform: square, then undo the shift. The eps
        # offset is always removed because transform always adds it.
        return np.power(X, 2) + self.min_val - self.eps

__init__(eps=1e-06)

Initialize SqrtNormalization.

Parameters

eps : float, default=1e-6 Small constant to avoid sqrt(0).

Source code in asf/preprocessing/performance_scaling.py
def __init__(self, eps: float = 1e-6) -> None:
    """
    Store the offset used by the square root normalization.

    Parameters
    ----------
    eps : float, default=1e-6
        Small constant to avoid sqrt(0); kept on the instance as ``eps``.
    """
    super().__init__()
    self.eps = eps

fit(X, y=None, sample_weight=None)

Fit the SqrtNormalization model to the data.

Parameters

X : np.ndarray Input data. y : np.ndarray or None, default=None Target values. sample_weight : np.ndarray or None, default=None Sample weights.

Returns

SqrtNormalization The fitted normalization instance.

Source code in asf/preprocessing/performance_scaling.py
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray | None = None,
    sample_weight: np.ndarray | None = None,
) -> SqrtNormalization:
    """
    Record the data minimum when it is negative.

    Parameters
    ----------
    X : np.ndarray
        Input data.
    y : np.ndarray or None, default=None
        Not used by this method.
    sample_weight : np.ndarray or None, default=None
        Not used by this method.

    Returns
    -------
    SqrtNormalization
        This instance, with ``min_val`` set to the minimum of ``X`` when
        that minimum is below zero and to 0.0 otherwise.
    """
    lowest = np.min(np.asarray(X))
    self.min_val = lowest if lowest < 0 else 0.0
    return self

inverse_transform(X)

Inverse transform the data back to the original scale.

Parameters

X : np.ndarray Transformed data.

Returns

np.ndarray Original data.

Source code in asf/preprocessing/performance_scaling.py
def inverse_transform(self, X: np.ndarray) -> np.ndarray:
    """
    Inverse transform the data back to the original scale.

    Parameters
    ----------
    X : np.ndarray
        Transformed data.

    Returns
    -------
    np.ndarray
        Original data.
    """
    X_orig = np.power(X, 2)
    # Add the stored (negative) minimum back; the previous code subtracted
    # it and moved the result the wrong way. The eps offset is always
    # removed because transform adds it regardless of ``min_val``.
    return X_orig + self.min_val - self.eps

transform(X)

Transform the input data using square root scaling.

Parameters

X : np.ndarray Input data.

Returns

np.ndarray Transformed data.

Source code in asf/preprocessing/performance_scaling.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Transform the input data using square root scaling.

    Parameters
    ----------
    X : np.ndarray
        Input data.

    Returns
    -------
    np.ndarray
        Transformed data.
    """
    # Subtract the (negative) minimum so the data becomes non-negative.
    # Adding it, as before, pushed values further below zero and made the
    # square root return NaN.
    X_shifted = X - self.min_val + self.eps
    return np.sqrt(X_shifted)

ZScoreNormalization

Bases: AbstractNormalization

Normalization using Z-Score scaling.

Source code in asf/preprocessing/performance_scaling.py
class ZScoreNormalization(AbstractNormalization):
    """
    Normalization using Z-Score scaling.

    All values are flattened and treated as a single feature, so one mean
    and one standard deviation are fitted for the whole input.
    """

    def __init__(self) -> None:
        super().__init__()

    def fit(
        self,
        X: np.ndarray,
        y: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
    ) -> ZScoreNormalization:
        """
        Fit the Z-Score scaler to the data.

        Parameters
        ----------
        X : np.ndarray
            Input data. Any array-like is accepted.
        y : np.ndarray or None, default=None
            Target values (unused).
        sample_weight : np.ndarray or None, default=None
            Sample weights (unused).

        Returns
        -------
        ZScoreNormalization
            The fitted normalization instance.
        """
        self.scaler = StandardScaler()
        # np.asarray lets lists and other array-likes through; they have no
        # ``reshape`` method of their own.
        self.scaler.fit(np.asarray(X).reshape(-1, 1))
        return self

    def transform(self, X: np.ndarray) -> np.ndarray:
        """
        Transform the input data using Z-Score scaling.

        Parameters
        ----------
        X : np.ndarray
            Input data. Any array-like is accepted.

        Returns
        -------
        np.ndarray
            One-dimensional array of transformed data.
        """
        column = np.asarray(X).reshape(-1, 1)
        return self.scaler.transform(column).reshape(-1)

    def inverse_transform(self, X: np.ndarray) -> np.ndarray:
        """
        Inverse transform the data back to the original scale.

        Parameters
        ----------
        X : np.ndarray
            Transformed data. Any array-like is accepted.

        Returns
        -------
        np.ndarray
            One-dimensional array of data on the original scale.
        """
        column = np.asarray(X).reshape(-1, 1)
        return self.scaler.inverse_transform(column).reshape(-1)

fit(X, y=None, sample_weight=None)

Fit the Z-Score scaler to the data.

Parameters

X : np.ndarray Input data. y : np.ndarray or None, default=None Target values. sample_weight : np.ndarray or None, default=None Sample weights.

Returns

ZScoreNormalization The fitted normalization instance.

Source code in asf/preprocessing/performance_scaling.py
def fit(
    self,
    X: np.ndarray,
    y: np.ndarray | None = None,
    sample_weight: np.ndarray | None = None,
) -> ZScoreNormalization:
    """
    Fit the Z-Score scaler to the data.

    Parameters
    ----------
    X : np.ndarray
        Input data. Any array-like is accepted; it is flattened into a
        single column before fitting.
    y : np.ndarray or None, default=None
        Target values (unused).
    sample_weight : np.ndarray or None, default=None
        Sample weights (unused).

    Returns
    -------
    ZScoreNormalization
        The fitted normalization instance.
    """
    self.scaler = StandardScaler()
    # np.asarray lets lists and other array-likes through; they have no
    # ``reshape`` method of their own.
    self.scaler.fit(np.asarray(X).reshape(-1, 1))
    return self

inverse_transform(X)

Inverse transform the data back to the original scale.

Parameters

X : np.ndarray Transformed data.

Returns

np.ndarray Original data.

Source code in asf/preprocessing/performance_scaling.py
def inverse_transform(self, X: np.ndarray) -> np.ndarray:
    """
    Inverse transform the data back to the original scale.

    Parameters
    ----------
    X : np.ndarray
        Transformed data. Any array-like is accepted.

    Returns
    -------
    np.ndarray
        One-dimensional array of data on the original scale.
    """
    # Coerce first: lists and other array-likes have no ``reshape`` method.
    column = np.asarray(X).reshape(-1, 1)
    return self.scaler.inverse_transform(column).reshape(-1)

transform(X)

Transform the input data using Z-Score scaling.

Parameters

X : np.ndarray Input data.

Returns

np.ndarray Transformed data.

Source code in asf/preprocessing/performance_scaling.py
def transform(self, X: np.ndarray) -> np.ndarray:
    """
    Transform the input data using Z-Score scaling.

    Parameters
    ----------
    X : np.ndarray
        Input data. Any array-like is accepted.

    Returns
    -------
    np.ndarray
        One-dimensional array of transformed data.
    """
    # Coerce first: lists and other array-likes have no ``reshape`` method.
    column = np.asarray(X).reshape(-1, 1)
    return self.scaler.transform(column).reshape(-1)

get_default_preprocessor(categorical_features=None, numerical_features=None)

Creates a default preprocessor for handling categorical and numerical features.

Parameters:

Name Type Description Default
categorical_features list[str] | Callable | None

List of categorical feature names or a callable selector. Defaults to selecting object dtype columns.

None
numerical_features list[str] | Callable | None

List of numerical feature names or a callable selector. Defaults to selecting numeric dtype columns.

None

Returns:

Name Type Description
ColumnTransformer ColumnTransformer

A transformer that applies preprocessing pipelines to categorical and numerical features.

Source code in asf/preprocessing/sklearn_preprocessor.py
def get_default_preprocessor(
    categorical_features: list[str] | Callable | None = None,
    numerical_features: list[str] | Callable | None = None,
) -> ColumnTransformer:
    """
    Build the default ColumnTransformer for categorical and numerical columns.

    Categorical columns are imputed with the most frequent value and then
    one-hot encoded; numerical columns are imputed with the median and then
    standardized. The transformer is configured to emit pandas output.

    Args:
        categorical_features (list[str] | Callable | None):
            Categorical column names or a callable selector. When None, a
            selector for object dtype columns is used.
        numerical_features (list[str] | Callable | None):
            Numerical column names or a callable selector. When None, a
            selector for numeric dtype columns is used.

    Returns:
        ColumnTransformer: The configured, unfitted preprocessor.
    """
    cat_columns = (
        make_column_selector(dtype_include=object)
        if categorical_features is None
        else categorical_features
    )
    num_columns = (
        make_column_selector(dtype_include="number")
        if numerical_features is None
        else numerical_features
    )

    cat_pipeline = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
    )
    num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

    preprocessor = ColumnTransformer(
        [
            ("cat", cat_pipeline, cat_columns),
            ("cont", num_pipeline, num_columns),
        ]
    )
    preprocessor.set_output(transform="pandas")
    return preprocessor