Source code for hundred_hammers.hyperoptimizer

from __future__ import annotations
from typing import Tuple
from abc import ABC, abstractmethod
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.base import BaseEstimator
from .config import hh_logger
from .hyperparameters import find_hyperparam_grid, find_hyperparam_random
from .metric_alias import process_metric



[docs]
class HyperOptimizer(ABC):
    """
    Hyperparameter Optimizer interface.

    :param metric: function that calculates the error of the predictions of a model compared with the real dataset.
        Either an alias/callable that is resolved through ``process_metric`` or an already resolved
        ``(name, function, parameters)`` triple (given as a tuple or a list).
    :type metric: str or callable or Tuple[str, callable, dict]
    """

    def __init__(self, metric: str | callable | Tuple[str, callable, dict] = "MSE"):
        # A tuple of types is used instead of ``tuple | list`` so that the check also works on
        # Python versions older than 3.10, where isinstance() does not accept union types.
        if isinstance(metric, (tuple, list)):
            _, metric_fn, metric_params = metric
        else:
            _, metric_fn, metric_params = process_metric(metric)

        # The metric is wrapped as a scikit-learn scorer, with its parameters forwarded to make_scorer.
        self.metric_fn = make_scorer(metric_fn, **metric_params)

    @abstractmethod
    def best_params(self, X: np.ndarray, y: np.ndarray, model: BaseEstimator, param_grid: dict | None = None) -> dict:
        """
        Obtains the best set parameters for the given model and dataset.

        :param X: input dataset.
        :type X: ndarray
        :param y: target dataset.
        :type y: ndarray
        :param model: machine learning model to evaluate.
        :type model: BaseEstimator
        :param param_grid: grid of parameters to search over.
        :type param_grid: dict
        :return: the best set of parameters found for the model.
        :rtype: dict
        """





[docs]
class HyperOptimizerGridSearch(HyperOptimizer):
    """
    Grid Search Hyperparameter Optimizer.

    :param metric: function that calculates the error of the predictions of a model compared with the real dataset.
    :type metric: str or callable or Tuple[str, callable, dict]
    :param n_folds_tune: number of splits in cross validation for grid search.
    :type n_folds_tune: int
    :param n_grid_points: amount of points to choose per parameter when the grid is constructed.
    :type n_grid_points: int
    """

    def __init__(self, metric: str | callable | Tuple[str, callable, dict] = "MSE", n_folds_tune: int = 5, n_grid_points: int = 10):
        super().__init__(metric)
        self.n_folds_tune = n_folds_tune
        self.n_grid_points = n_grid_points

    def best_params(self, X: np.ndarray, y: np.ndarray, model: BaseEstimator, param_grid: dict | None = None) -> dict:
        """
        Obtains the best set of parameters for the given model and dataset using an exhaustive grid search.

        :param X: input dataset.
        :type X: ndarray
        :param y: target dataset.
        :type y: ndarray
        :param model: machine learning model to evaluate.
        :type model: BaseEstimator
        :param param_grid: grid of parameters to search over. If it is empty or None, a grid is generated
            with ``find_hyperparam_grid`` using ``n_grid_points`` points.
        :type param_grid: dict
        :return: parameters of the best ranked candidate (the first one in case of ties).
        :rtype: dict
        """
        if not param_grid:
            hh_logger.info(f"No specified hyperparameter grid for {type(model).__name__}. Generating hyperparameter grid.")
            param_grid = find_hyperparam_grid(model, self.n_grid_points)

        # Warnings raised while fitting the candidates are silenced on purpose.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            grid_search_model = GridSearchCV(model, param_grid, scoring=self.metric_fn, n_jobs=-1, cv=self.n_folds_tune)
            grid_search_model.fit(X, y)

        # Only candidates without a score are discarded. A plain dropna() would also remove
        # every candidate having a hyperparameter set to None (or masked in a list-of-dicts grid),
        # since those show up as missing values in the "param_*" columns.
        results = pd.DataFrame(grid_search_model.cv_results_).dropna(subset=["mean_test_score"])

        best_rank = results["rank_test_score"].min()
        best_candidates = results[results["rank_test_score"] == best_rank]

        # Ties are resolved by taking the first candidate in evaluation order.
        return best_candidates["params"].iloc[0]





[docs]
class HyperOptimizerRandomSearch(HyperOptimizer):
    """
    Random Search Hyperparameter Optimizer.

    :param metric: function that calculates the error of the predictions of a model compared with the real dataset.
    :type metric: str or callable or Tuple[str, callable, dict]
    :param n_folds_tune: number of splits in cross validation for random search.
    :type n_folds_tune: int
    :param n_iter: number of parameter settings that are sampled by the random search.
    :type n_iter: int
    """

    def __init__(self, metric: str | callable | Tuple[str, callable, dict] = "MSE", n_folds_tune: int = 5, n_iter: int = 10):
        super().__init__(metric)
        self.n_folds_tune = n_folds_tune
        self.n_iter = n_iter

    def best_params(self, X: np.ndarray, y: np.ndarray, model: BaseEstimator, param_grid: dict | None = None) -> dict:
        """
        Obtains the best set of parameters for the given model and dataset using a randomized search.

        :param X: input dataset.
        :type X: ndarray
        :param y: target dataset.
        :type y: ndarray
        :param model: machine learning model to evaluate.
        :type model: BaseEstimator
        :param param_grid: parameter distributions (or lists of values) to sample from. If it is empty or None,
            they are generated with ``find_hyperparam_random``.
        :type param_grid: dict
        :return: parameters of the best ranked candidate (the first one in case of ties).
        :rtype: dict
        """
        if not param_grid:
            hh_logger.info(f"No specified hyperparameters for {type(model).__name__}. Generating hyperparameter distributions.")
            param_grid = find_hyperparam_random(model)

        # Warnings raised while fitting the candidates are silenced on purpose.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            random_search_model = RandomizedSearchCV(
                model, param_grid, scoring=self.metric_fn, n_jobs=-1, cv=self.n_folds_tune, n_iter=self.n_iter
            )
            random_search_model.fit(X, y)

        # Only candidates without a score are discarded. A plain dropna() would also remove
        # every candidate having a hyperparameter set to None (or masked because it was not sampled),
        # since those show up as missing values in the "param_*" columns.
        results = pd.DataFrame(random_search_model.cv_results_).dropna(subset=["mean_test_score"])

        best_rank = results["rank_test_score"].min()
        best_candidates = results[results["rank_test_score"] == best_rank]

        # Ties are resolved by taking the first candidate in evaluation order.
        return best_candidates["params"].iloc[0]