EPM

`EPM`

Empirical Performance Model wrapper.

The EPM (Empirical Performance Model) class is a wrapper for machine learning models that includes preprocessing, normalization, and optional inverse transformation of predictions.

Parameters

- `predictor_class` : `type[AbstractPredictor]` or `type[RegressorMixin]`, default=`RandomForestRegressorWrapper` — The class of the predictor to use.
- `normalization_class` : `type[AbstractNormalization]`, default=`LogNormalization` — The normalization class to apply to the target variable.
- `transform_back` : `bool`, default=`True` — Whether to apply inverse transformation to predictions.
- `features_preprocessing` : `str` or `TransformerMixin`, default=`"default"` — Preprocessing pipeline for features.
- `categorical_features` : `list` or `None`, default=`None` — List of categorical feature names.
- `numerical_features` : `list` or `None`, default=`None` — List of numerical feature names.
- `predictor_config` : `dict` or `None`, default=`None` — Configuration for the predictor.
- `predictor_kwargs` : `dict` or `None`, default=`None` — Additional keyword arguments for the predictor.
- `imputer` : `Callable` or `None`, default=`None` — Optional imputer function for target variables.

Source code in asf/epm/epm.py

class EPM:
    """
    Empirical Performance Model wrapper.

    The EPM (Empirical Performance Model) class is a wrapper for machine learning models
    that includes preprocessing, normalization, and optional inverse transformation of predictions.

    Parameters
    ----------
    predictor_class : type[AbstractPredictor] or type[RegressorMixin], default=RandomForestRegressorWrapper
        The class of the predictor to use. A class deriving from ``RegressorMixin``
        that is not itself an ``AbstractPredictor`` is instantiated through
        ``SklearnWrapper``.
    normalization_class : type[AbstractNormalization], default=LogNormalization
        The normalization class to apply to the target variable.
    transform_back : bool, default=True
        Whether to apply inverse transformation to predictions.
    features_preprocessing : str, TransformerMixin or None, default="default"
        Preprocessing pipeline for features. The string ``"default"`` builds the
        pipeline returned by ``get_default_preprocessor``; ``None`` (or any other
        string) disables feature preprocessing.
    categorical_features : list or None, default=None
        List of categorical feature names.
    numerical_features : list or None, default=None
        List of numerical feature names.
    predictor_config : dict or None, default=None
        Configuration for the predictor.
    predictor_kwargs : dict or None, default=None
        Additional keyword arguments for the predictor.
    imputer : Callable or None, default=None
        Optional imputer function for target variables. It is called as
        ``imputer(y, X)`` after normalization and feature preprocessing.
    """

    def __init__(
        self,
        predictor_class: type[AbstractPredictor]
        | type[RegressorMixin] = RandomForestRegressorWrapper,
        normalization_class: type[AbstractNormalization] = LogNormalization,
        transform_back: bool = True,
        features_preprocessing: str | TransformerMixin | None = "default",
        categorical_features: list[str] | None = None,
        numerical_features: list[str] | None = None,
        predictor_config: dict[str, Any] | None = None,
        predictor_kwargs: dict[str, Any] | None = None,
        imputer: Callable[[pd.Series, pd.DataFrame], pd.Series] | None = None,
    ) -> None:
        if isinstance(predictor_class, type) and issubclass(
            predictor_class, RegressorMixin
        ):
            # Factory that builds a SklearnWrapper around the given regressor class.
            self.model_class: Any = partial(SklearnWrapper, predictor_class)
        else:
            self.model_class = predictor_class

        self.predictor_class = predictor_class
        self.normalization_class = normalization_class
        self.transform_back = transform_back
        self.predictor_config = predictor_config
        self.predictor_kwargs = predictor_kwargs or {}
        self.imputer = imputer
        # True only while the most recent fit() received numpy arrays for X and y.
        self.numpy = False

        # Check the type first so a transformer object is never compared to a string.
        if (
            isinstance(features_preprocessing, str)
            and features_preprocessing == "default"
        ):
            self.features_preprocessing = get_default_preprocessor(
                categorical_features=categorical_features,
                numerical_features=numerical_features,
            )
        else:
            self.features_preprocessing = features_preprocessing

    def fit(
        self,
        X: pd.DataFrame | pd.Series | np.ndarray | list[Any],
        y: pd.Series | np.ndarray | list[Any],
        sample_weight: list[float] | np.ndarray | None = None,
    ) -> EPM:
        """
        Fit the EPM model.

        The features are preprocessed, the target is normalized (and optionally
        imputed), and a fresh predictor is created and fitted on the result.

        Parameters
        ----------
        X : pd.DataFrame, pd.Series, np.ndarray, or list
            Input features.
        y : pd.Series, np.ndarray, or list
            Target values.
        sample_weight : list, np.ndarray, or None, default=None
            Sample weights.

        Returns
        -------
        EPM
            The fitted model.
        """
        if isinstance(X, np.ndarray) and isinstance(y, np.ndarray):
            # Arrays carry no column labels, so synthesize f_0, f_1, ...
            X_df = pd.DataFrame(
                X,
                index=range(len(X)),
                columns=pd.Index([f"f_{i}" for i in range(X.shape[1])]),
            )
            y_ser = pd.Series(
                y,
                index=range(len(y)),
            )
            self.numpy = True
        else:
            X_df = pd.DataFrame(X) if not isinstance(X, pd.DataFrame) else X
            y_ser = pd.Series(y) if not isinstance(y, pd.Series) else y
            # Reset the flag so a refit on labelled data does not keep the
            # f_<i> column naming of an earlier array-based fit in predict().
            self.numpy = False

        if self.features_preprocessing is not None and not isinstance(
            self.features_preprocessing, str
        ):
            X_df = self.features_preprocessing.fit_transform(X_df)  # type: ignore

        self.normalization = self.normalization_class()
        self.normalization.fit(np.asarray(y_ser))
        y_ser_scaled = self.normalization.transform(np.asarray(y_ser))
        # Re-attach the original index to the normalized target.
        y_ser = pd.Series(y_ser_scaled, index=y_ser.index)

        if self.imputer is not None:
            y_ser = self.imputer(y_ser, X_df)

        self.predictor = self._get_predictor()

        self.predictor.fit(X_df, y_ser, sample_weight=sample_weight)
        return self

    def _get_predictor(self) -> AbstractPredictor:
        """
        Get the predictor instance.

        Returns
        -------
        AbstractPredictor
            A new, unfitted predictor.

        Raises
        ------
        TypeError
            If the created object is not an ``AbstractPredictor``.
        """
        if self.predictor_config is None:
            factory: Any = self.predictor_class
            if not (isinstance(factory, type) and issubclass(factory, AbstractPredictor)):
                # Raw scikit-learn regressors must go through the SklearnWrapper
                # factory prepared in __init__; instantiating them directly can
                # never satisfy the AbstractPredictor check below. For every
                # other input model_class is predictor_class itself.
                factory = self.model_class
            predictor = factory(**self.predictor_kwargs)
        else:
            # Assume get_from_configuration returns a partial or a class
            predictor_factory = self.predictor_class.get_from_configuration(  # type: ignore
                self.predictor_config, **self.predictor_kwargs
            )
            if callable(predictor_factory):
                predictor = predictor_factory()
            else:
                predictor = predictor_factory

        if not isinstance(predictor, AbstractPredictor):
            raise TypeError(f"Predictor {predictor} is not an AbstractPredictor")
        return predictor

    def predict(
        self, X: pd.DataFrame | pd.Series | np.ndarray | list[Any]
    ) -> np.ndarray:
        """
        Predict targets.

        Parameters
        ----------
        X : pd.DataFrame, pd.Series, np.ndarray, or list
            Input features.

        Returns
        -------
        np.ndarray
            Predicted values, mapped back through the inverse normalization
            when ``transform_back`` is true.
        """
        if self.numpy and isinstance(X, np.ndarray):
            # Mirror the f_<i> column labels generated in fit() for array input.
            X_df = pd.DataFrame(
                X,
                index=range(len(X)),
                columns=pd.Index([f"f_{i}" for i in range(X.shape[1])]),
            )
        elif isinstance(X, pd.DataFrame):
            X_df = X
        else:
            X_df = pd.DataFrame(X)

        if self.features_preprocessing is not None and not isinstance(
            self.features_preprocessing, str
        ):
            X_df = self.features_preprocessing.transform(X_df)  # type: ignore

        y_pred = self.predictor.predict(X_df)

        if self.transform_back:
            y_pred = self.normalization.inverse_transform(y_pred)

        return np.asarray(y_pred)

`fit(X, y, sample_weight=None)`

Fit the EPM model.

Parameters

- `X` : `pd.DataFrame`, `pd.Series`, `np.ndarray`, or `list` — Input features.
- `y` : `pd.Series`, `np.ndarray`, or `list` — Target values.
- `sample_weight` : `list`, `np.ndarray`, or `None`, default=`None` — Sample weights.

Returns

EPM The fitted model.

Source code in asf/epm/epm.py

def fit(
    self,
    X: pd.DataFrame | pd.Series | np.ndarray | list[Any],
    y: pd.Series | np.ndarray | list[Any],
    sample_weight: list[float] | np.ndarray | None = None,
) -> EPM:
    """
    Fit the EPM model.

    The features are preprocessed, the target is normalized (and optionally
    imputed), and a fresh predictor is created and fitted on the result.

    Parameters
    ----------
    X : pd.DataFrame, pd.Series, np.ndarray, or list
        Input features.
    y : pd.Series, np.ndarray, or list
        Target values.
    sample_weight : list, np.ndarray, or None, default=None
        Sample weights.

    Returns
    -------
    EPM
        The fitted model.
    """
    if isinstance(X, np.ndarray) and isinstance(y, np.ndarray):
        # Arrays carry no column labels, so synthesize f_0, f_1, ...
        X_df = pd.DataFrame(
            X,
            index=range(len(X)),
            columns=pd.Index([f"f_{i}" for i in range(X.shape[1])]),
        )
        y_ser = pd.Series(
            y,
            index=range(len(y)),
        )
        self.numpy = True
    else:
        X_df = pd.DataFrame(X) if not isinstance(X, pd.DataFrame) else X
        y_ser = pd.Series(y) if not isinstance(y, pd.Series) else y
        # Reset the flag so a refit on labelled data does not keep the
        # f_<i> column naming of an earlier array-based fit in predict().
        self.numpy = False

    if self.features_preprocessing is not None and not isinstance(
        self.features_preprocessing, str
    ):
        X_df = self.features_preprocessing.fit_transform(X_df)  # type: ignore

    self.normalization = self.normalization_class()
    self.normalization.fit(np.asarray(y_ser))
    y_ser_scaled = self.normalization.transform(np.asarray(y_ser))
    # Re-attach the original index to the normalized target.
    y_ser = pd.Series(y_ser_scaled, index=y_ser.index)

    if self.imputer is not None:
        y_ser = self.imputer(y_ser, X_df)

    self.predictor = self._get_predictor()

    self.predictor.fit(X_df, y_ser, sample_weight=sample_weight)
    return self

`predict(X)`

Predict targets.

Parameters

- `X` : `pd.DataFrame`, `pd.Series`, `np.ndarray`, or `list` — Input features.

Returns

np.ndarray Predicted values.

Source code in asf/epm/epm.py

def predict(
    self, X: pd.DataFrame | pd.Series | np.ndarray | list[Any]
) -> np.ndarray:
    """
    Predict targets for the given features.

    Parameters
    ----------
    X : pd.DataFrame, pd.Series, np.ndarray, or list
        Input features.

    Returns
    -------
    np.ndarray
        Predicted values; passed through the inverse normalization first
        when ``transform_back`` is set.
    """
    # Coerce the input to a DataFrame. Arrays get f_<i> column labels only
    # when the numpy flag is set; DataFrames are used untouched.
    if isinstance(X, pd.DataFrame):
        features = X
    elif self.numpy and isinstance(X, np.ndarray):
        labels = pd.Index([f"f_{i}" for i in range(X.shape[1])])
        features = pd.DataFrame(X, index=range(len(X)), columns=labels)
    else:
        features = pd.DataFrame(X)

    # A string or None means there is no fitted transformer to apply.
    pipeline = self.features_preprocessing
    if not (pipeline is None or isinstance(pipeline, str)):
        features = pipeline.transform(features)  # type: ignore

    raw = self.predictor.predict(features)
    result = self.normalization.inverse_transform(raw) if self.transform_back else raw
    return np.asarray(result)

`tune_epm(X, y, model_class, normalization_class=LogNormalization, features_preprocessing='default', categorical_features=None, numerical_features=None, groups=None, cv=5, timeout=3600, runcount_limit=100, output_dir='./smac_output', seed=0, smac_metric=mean_squared_error, smac_scenario_kwargs=None, smac_kwargs=None, predictor_kwargs=None)`

Tune the Empirical Performance Model (EPM) using SMAC.

Parameters

- `X` : `np.ndarray` or `pd.DataFrame` — Feature matrix for training and validation.
- `y` : `np.ndarray` or `pd.Series` — Target values corresponding to the feature matrix.
- `model_class` : `type[AbstractPredictor]` — The predictor class to be tuned.
- `normalization_class` : `type[AbstractNormalization]`, default=`LogNormalization` — The normalization class to be applied to the data.
- `features_preprocessing` : `str` or `TransformerMixin`, default=`"default"` — Preprocessing method for features.
- `categorical_features` : `list` or `None`, default=`None` — List of categorical feature names.
- `numerical_features` : `list` or `None`, default=`None` — List of numerical feature names.
- `groups` : `np.ndarray` or `None`, default=`None` — Group labels for cross-validation.
- `cv` : `int`, default=`5` — Number of cross-validation folds.
- `timeout` : `int`, default=`3600` — Time limit for the tuning process in seconds.
- `runcount_limit` : `int`, default=`100` — Maximum number of configurations to evaluate.
- `output_dir` : `str`, default=`"./smac_output"` — Directory to store SMAC output.
- `seed` : `int`, default=`0` — Random seed for reproducibility.
- `smac_metric` : `Callable`, default=`mean_squared_error` — Metric function to evaluate model performance.
- `smac_scenario_kwargs` : `dict` or `None`, default=`None` — Additional keyword arguments for the SMAC scenario.
- `smac_kwargs` : `dict` or `None`, default=`None` — Additional keyword arguments for SMAC optimization.
- `predictor_kwargs` : `dict` or `None`, default=`None` — Additional keyword arguments for the predictor.

Returns

EPM The tuned Empirical Performance Model instance.

Raises

RuntimeError If SMAC is not installed.

Source code in asf/epm/epm_tuner.py

def tune_epm(
    X: np.ndarray | pd.DataFrame,
    y: np.ndarray | pd.Series,
    model_class: type[AbstractPredictor],
    normalization_class: type[AbstractNormalization] = LogNormalization,
    features_preprocessing: str | TransformerMixin | None = "default",
    categorical_features: list[str] | None = None,
    numerical_features: list[str] | None = None,
    groups: np.ndarray | None = None,
    cv: int = 5,
    timeout: int = 3600,
    runcount_limit: int = 100,
    output_dir: str = "./smac_output",
    seed: int = 0,
    smac_metric: Callable[[np.ndarray, np.ndarray], float] = mean_squared_error,
    smac_scenario_kwargs: dict[str, Any] | None = None,
    smac_kwargs: dict[str, Any] | None = None,
    predictor_kwargs: dict[str, Any] | None = None,
) -> EPM:
    """
    Tune the Empirical Performance Model (EPM) using SMAC.

    Each candidate configuration is scored by the mean of ``smac_metric`` over
    ``cv`` shuffled cross-validation folds. The returned EPM is constructed
    with the best configuration found; this function does not fit it.

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame
        Feature matrix for training and validation.
    y : np.ndarray or pd.Series
        Target values corresponding to the feature matrix.
    model_class : type[AbstractPredictor]
        The predictor class to be tuned.
    normalization_class : type[AbstractNormalization], default=LogNormalization
        The normalization class to be applied to the data.
    features_preprocessing : str or TransformerMixin, default="default"
        Preprocessing method for features.
    categorical_features : list or None, default=None
        List of categorical feature names.
    numerical_features : list or None, default=None
        List of numerical feature names.
    groups : np.ndarray or None, default=None
        Group labels for cross-validation. When given, folds are built with
        ``GroupKFoldShuffle`` instead of ``KFold``.
    cv : int, default=5
        Number of cross-validation folds.
    timeout : int, default=3600
        Time limit for the tuning process in seconds.
    runcount_limit : int, default=100
        Maximum number of configurations to evaluate.
    output_dir : str, default="./smac_output"
        Directory to store SMAC output.
    seed : int, default=0
        Random seed for reproducibility.
    smac_metric : Callable, default=mean_squared_error
        Metric function to evaluate model performance; called as
        ``smac_metric(y_true, y_pred)``.
    smac_scenario_kwargs : dict or None, default=None
        Additional keyword arguments for the SMAC scenario.
    smac_kwargs : dict or None, default=None
        Additional keyword arguments for SMAC optimization.
    predictor_kwargs : dict or None, default=None
        Additional keyword arguments for the predictor. They are applied both
        to the models evaluated during tuning and to the returned EPM.

    Returns
    -------
    EPM
        The tuned Empirical Performance Model instance.

    Raises
    ------
    RuntimeError
        If SMAC is not installed.
    """
    if not SMAC_AVAILABLE:
        raise RuntimeError("SMAC is not installed. Install it with: pip install smac")

    smac_scenario_kwargs = smac_scenario_kwargs or {}
    smac_kwargs = smac_kwargs or {}
    predictor_kwargs = predictor_kwargs or {}

    if isinstance(X, np.ndarray) and isinstance(y, np.ndarray):
        # Arrays carry no column labels, so synthesize f_0, f_1, ...
        X_df = pd.DataFrame(
            X,
            index=range(len(X)),
            columns=[f"f_{i}" for i in range(X.shape[1])],  # type: ignore[arg-type]
        )
        y_ser = pd.Series(
            y,
            index=range(len(y)),
        )
    else:
        X_df = pd.DataFrame(X) if not isinstance(X, pd.DataFrame) else X
        y_ser = pd.Series(y) if not isinstance(y, pd.Series) else y

    scenario = Scenario(
        configspace=model_class.get_configuration_space(),
        n_trials=runcount_limit,
        walltime_limit=timeout,
        deterministic=True,
        output_directory=Path(output_dir),
        seed=seed,
        **smac_scenario_kwargs,
    )

    def target_function(config: Any, seed: int) -> float:
        """Return the mean cross-validated ``smac_metric`` for ``config``."""
        # ``seed`` here is the per-call argument and shadows the outer one.
        if groups is not None:
            kfold = GroupKFoldShuffle(n_splits=cv, shuffle=True, random_state=seed)
        else:
            kfold = KFold(n_splits=cv, shuffle=True, random_state=seed)

        scores = []
        for train_idx, test_idx in kfold.split(X_df, y_ser, groups):
            X_train, X_test = X_df.iloc[train_idx], X_df.iloc[test_idx]
            y_train, y_test = y_ser.iloc[train_idx], y_ser.iloc[test_idx]

            epm = EPM(
                predictor_class=model_class,
                normalization_class=normalization_class,
                transform_back=True,
                predictor_config=config,
                predictor_kwargs=predictor_kwargs,
                features_preprocessing=features_preprocessing,
                categorical_features=categorical_features,
                numerical_features=numerical_features,
            )
            epm.fit(X_train, y_train)

            y_pred = epm.predict(X_test)
            score = smac_metric(y_test.values, y_pred)
            scores.append(score)

        return float(np.mean(scores))

    smac = HyperparameterOptimizationFacade(scenario, target_function, **smac_kwargs)
    best_config = smac.optimize()

    # handle Union[Configuration, list]
    if isinstance(best_config, list):
        best_config = best_config[0]

    return EPM(
        predictor_class=model_class,
        normalization_class=normalization_class,
        transform_back=True,
        predictor_config=dict(best_config),
        # Forward the same kwargs used during tuning so the returned model
        # matches the configuration that was actually evaluated.
        predictor_kwargs=predictor_kwargs,
        features_preprocessing=features_preprocessing,
        categorical_features=categorical_features,
        numerical_features=numerical_features,
    )