def read_aslib_scenario(
path: str,
add_running_time_features: bool = True,
training_par_factor: float | None = 10.0,
) -> tuple[
pd.DataFrame,
pd.DataFrame,
pd.DataFrame,
pd.DataFrame,
dict[str, Any],
bool,
float,
pd.DataFrame | None,
]:
"""
Read an ASlib scenario from a directory.
Parameters
----------
path : str
The path to the ASlib scenario directory.
add_running_time_features : bool, default=True
Whether to include running time features (feature costs).
training_par_factor : float or None, default=10.0
PAR factor to apply to training performance data. Timeouts (values > budget)
are replaced with budget * training_par_factor. Set to None to disable.
Returns
-------
tuple
A tuple containing (features, performance, features_running_time, cv,
feature_groups, maximize, budget, algorithm_features) where:
- features: pd.DataFrame of feature values per instance.
- performance: pd.DataFrame of algorithm performance per instance.
- features_running_time: pd.DataFrame of feature costs per instance.
- cv: pd.DataFrame of cross-validation fold assignments.
- feature_groups: dict of feature group definitions.
- maximize: bool, True if higher performance values are better.
- budget: float, the algorithm cutoff time.
- algorithm_features: pd.DataFrame of algorithm features, or None.
Raises
------
ImportError
If the required libraries (pyyaml, liac-arff) are not available.
"""
if not ASLIB_AVAILABLE:
raise ImportError(
"The aslib reader requires 'pyyaml' and 'liac-arff'. "
"Install them via 'pip install asf[aslib]'."
)
description_path = os.path.join(path, "description.txt")
performance_path = os.path.join(path, "algorithm_runs.arff")
features_path = os.path.join(path, "feature_values.arff")
features_runstatus_path = os.path.join(path, "feature_runstatus.arff")
features_running_time_path = os.path.join(path, "feature_costs.arff")
cv_path = os.path.join(path, "cv.arff")
algorithm_features_path = os.path.join(path, "algorithm_feature_values.arff")
algorithm_features_runstatus_path = os.path.join(
path, "algorithm_feature_runstatus.arff"
)
# Load description file
with open(description_path, "r") as f:
description: dict[str, Any] = yaml.load(f, Loader=Loader)
feature_groups: dict[str, Any] = description["feature_steps"]
algorithm_feature_groups: dict[str, Any] = description.get(
"algorithm_feature_steps", {}
)
maximize: bool = description["maximize"]
if not isinstance(maximize, bool):
maximize = bool(maximize[0])
budget: float = description["algorithm_cutoff_time"]
# Load performance data
with open(performance_path, "r") as f:
performance_data: dict[str, Any] = load(f)
performance = pd.DataFrame(
performance_data["data"],
columns=[a[0] for a in performance_data["attributes"]], # type: ignore[arg-type]
)
runtime_col = description["performance_measures"][0]
if "runstatus" in performance.columns:
performance.loc[performance["runstatus"] != "ok", runtime_col] = budget + 1
group_cols = ["instance_id", "algorithm"]
performance = performance.groupby(group_cols, as_index=False)[runtime_col].mean()
performance = performance.pivot(
index="instance_id", columns="algorithm", values=runtime_col
)
# Load feature values
with open(features_path, "r") as f:
features_data: dict[str, Any] = load(f)
features = pd.DataFrame(
features_data["data"],
columns=[a[0] for a in features_data["attributes"]], # type: ignore[arg-type]
)
if os.path.exists(features_runstatus_path):
with open(features_runstatus_path, "r") as f:
feature_runstatus_data: dict[str, Any] = load(f)
feature_runstatus = pd.DataFrame(
feature_runstatus_data["data"],
columns=[a[0] for a in feature_runstatus_data["attributes"]], # type: ignore[arg-type]
)
for step_name, step_info in feature_groups.items():
if step_name in feature_runstatus.columns:
failed_mask = feature_runstatus[step_name] != "ok"
failed_instances = feature_runstatus.loc[
failed_mask, "instance_id"
].values
step_features = step_info.get("provides", [])
for feat in step_features:
if feat in features.columns:
features.loc[
features["instance_id"].isin(failed_instances), feat
] = float("nan")
features = features.groupby("instance_id").mean()
if "repetition" in features.columns:
features = features.drop(columns=["repetition"])
features_running_time = pd.DataFrame(
0.0,
index=performance.index,
columns=["feature_time"], # type: ignore[arg-type]
)
if add_running_time_features and os.path.exists(features_running_time_path):
with open(features_running_time_path, "r") as f:
ft_data: dict[str, Any] = load(f)
features_running_time = pd.DataFrame(
ft_data["data"],
columns=[a[0] for a in ft_data["attributes"]], # type: ignore[arg-type]
)
features_running_time = features_running_time.groupby("instance_id").mean()
if "repetition" in features_running_time.columns:
features_running_time = features_running_time.drop(columns=["repetition"])
# Apply PAR penalization to training data
if training_par_factor is not None:
performance = apply_par(performance, budget, training_par_factor)
# Load cross-validation data
with open(cv_path, "r") as f:
cv_data: dict[str, Any] = load(f)
cv = pd.DataFrame(cv_data["data"], columns=[a[0] for a in cv_data["attributes"]]) # type: ignore[arg-type]
cv = cv.set_index("instance_id")
if "repetition" in cv.columns:
cv = cv.drop(columns=["repetition"])
# Sort indices for consistency
features = features.sort_index() # type: ignore[attr-defined]
performance = performance.sort_index() # type: ignore[attr-defined]
cv = cv.sort_index() # type: ignore[attr-defined]
features_running_time = features_running_time.sort_index() # type: ignore[attr-defined]
# Load algorithm features if available
algorithm_features = None
if os.path.exists(algorithm_features_path):
with open(algorithm_features_path, "r") as f:
af_data: dict[str, Any] = load(f)
algorithm_features = pd.DataFrame(
af_data["data"],
columns=[a[0] for a in af_data["attributes"]], # type: ignore[arg-type]
)
if os.path.exists(algorithm_features_runstatus_path):
with open(algorithm_features_runstatus_path, "r") as f:
af_rs_data: dict[str, Any] = load(f)
af_runstatus = pd.DataFrame(
af_rs_data["data"],
columns=[a[0] for a in af_rs_data["attributes"]], # type: ignore[arg-type]
)
for step_name, step_info in algorithm_feature_groups.items():
if step_name in af_runstatus.columns:
failed_mask = af_runstatus[step_name] != "ok"
failed_algos = af_runstatus.loc[failed_mask, "algorithm"].values
step_features = step_info.get("provides", [])
for feat in step_features:
if feat in algorithm_features.columns:
algorithm_features.loc[
algorithm_features["algorithm"].isin(failed_algos), feat
] = float("nan")
algorithm_features = algorithm_features.groupby("algorithm").mean()
algorithm_features = algorithm_features.drop(
columns=["repetition"], errors="ignore"
)
algorithm_features = algorithm_features.sort_index()
return (
features,
performance,
features_running_time,
cv,
feature_groups,
maximize,
budget,
algorithm_features,
)