
"""
Optimizer
"""

import numpy as np
from sklearn.model_selection import train_test_split
from typing import Any, Dict, List, Optional, Tuple, Union
from .model import Model
from .base_optimizer import BaseOptimizer
from .fit_methods import fit
from .tools import ScatterData


class Optimizer(BaseOptimizer):
    r"""
    This optimizer finds a solution to the linear
    :math:`\boldsymbol{A}\boldsymbol{x}=\boldsymbol{y}` problem.

    One has to specify either `train_size`/`test_size` or
    `train_set`/`test_set`. If either `train_set` or `test_set` (or both)
    is specified, the fractions will be ignored.

    Warning
    -------
    Repeatedly setting up an :class:`Optimizer` object and training
    *without* changing the seed for the random number generator will yield
    identical or correlated results. To avoid this, please specify a
    different seed when setting up multiple :class:`Optimizer` instances.

    Parameters
    ----------
    fit_data : tuple(numpy.ndarray, numpy.ndarray)
        the first element of the tuple represents the fit matrix `A`
        (`N, M` array) while the second element represents the vector of
        target values `y` (`N` array); here `N` (=rows of `A`, elements of
        `y`) equals the number of target values and `M` (=columns of `A`)
        equals the number of parameters
    fit_method : str
        method to be used for training; possible choices are "ardr",
        "bayesian-ridge", "elasticnet", "lasso", "least-squares", "omp",
        "rfe", "ridge", "split-bregman"
    standardize : bool
        if True the fit matrix and target values are standardized before
        fitting, meaning columns in the fit matrix and the target values
        are rescaled to have a standard deviation of 1.0
    train_size : float or int
        if float, represents the fraction of `fit_data` (rows) to be used
        for training; if int, represents the absolute number of rows to be
        used for training
    test_size : float or int
        if float, represents the fraction of `fit_data` (rows) to be used
        for testing; if int, represents the absolute number of rows to be
        used for testing
    train_set : tuple or list(int)
        indices of rows of `A`/`y` to be used for training
    test_set : tuple or list(int)
        indices of rows of `A`/`y` to be used for testing
    check_condition : bool
        if True the condition number will be checked (this can be slightly
        more time consuming for larger matrices)
    seed : int
        seed for pseudo random number generator

    Attributes
    ----------
    scatter_data_train : ScatterData
        target and predicted values for each row in the training set
    scatter_data_test : ScatterData
        target and predicted values for each row in the test set
    """

    def __init__(self,
                 fit_data: Tuple[np.ndarray, np.ndarray],
                 fit_method: str = 'least-squares',
                 standardize: bool = True,
                 train_size: Union[int, float] = 0.9,
                 test_size: Union[int, float] = None,
                 train_set: Union[Tuple[int], List[int]] = None,
                 test_set: Union[Tuple[int], List[int]] = None,
                 check_condition: bool = True,
                 seed: int = 42,
                 **kwargs) -> None:

        super().__init__(fit_data, fit_method, standardize, check_condition, seed)

        self._kwargs = kwargs

        # set up train and test sets
        self._setup_rows(train_size, test_size, train_set, test_set)

        # will be populated once train() is run
        self.scatter_data_train = None  # type: Optional[ScatterData]
        self.scatter_data_test = None  # type: Optional[ScatterData]
    def train(self) -> None:
        """ Carries out training. """

        # select training data
        A_train = self._A[self.train_set, :]
        y_train = self._y[self.train_set]

        # perform training
        fit_results = fit(A_train, y_train, self.fit_method, self.standardize,
                          self._check_condition, **self._kwargs)
        parameters = fit_results.pop('parameters')
        y_train_predicted = np.dot(A_train, parameters)
        self.scatter_data_train = ScatterData(y_train, y_train_predicted)

        # perform testing
        if self.test_set is not None:
            A_test = self._A[self.test_set, :]
            y_test = self._y[self.test_set]
            y_test_predicted = np.dot(A_test, parameters)
            self.scatter_data_test = ScatterData(y_test, y_test_predicted)
        else:
            y_test = None
            y_test_predicted = None
            self.scatter_data_test = None

        # collect model
        self.model = Model.from_fit_data(y_train, y_train_predicted,
                                         parameters, y_test, y_test_predicted,
                                         **fit_results)
    def _setup_rows(self,
                    train_size: Union[int, float],
                    test_size: Optional[Union[int, float]],
                    train_set: Optional[Union[Tuple[int], List[int]]],
                    test_set: Optional[Union[Tuple[int], List[int]]]) -> None:
        """
        Sets up train and test rows depending on which arguments are
        specified. If `train_set` and `test_set` are `None`, then
        `train_size` and `test_size` are used.
        """

        if train_set is None and test_set is None:
            train_set, test_set = self._get_rows_via_sizes(train_size, test_size)
        else:
            train_set, test_set = self._get_rows_from_indices(train_set, test_set)

        if len(train_set) == 0:
            raise ValueError('No training rows selected from fit_data')

        if test_set is not None:
            # then check overlap between train and test
            if len(np.intersect1d(train_set, test_set)):
                raise ValueError('Overlap between training and test set')
            if len(test_set) == 0:
                test_set = None

        self._train_set = train_set
        self._test_set = test_set

    def _get_rows_via_sizes(self,
                            train_size: Optional[Union[int, float]],
                            test_size: Optional[Union[int, float]]) \
            -> Tuple[List[int], List[int]]:
        """ Returns train and test rows via sizes. """

        # handle special cases
        if test_size is None and train_size is None:
            raise ValueError('Training and test set sizes are None (empty).')
        elif train_size is None and abs(test_size - 1.0) < 1e-10:
            raise ValueError('Training set is empty.')
        elif test_size is None:
            if train_size == self._n_rows or abs(train_size - 1.0) < 1e-10:
                train_set = np.arange(self._n_rows)
                test_set = None
                return train_set, test_set

        # split
        train_set, test_set = train_test_split(np.arange(self._n_rows),
                                               train_size=train_size,
                                               test_size=test_size,
                                               random_state=self.seed)
        return train_set, test_set

    def _get_rows_from_indices(self,
                               train_set: Optional[Union[Tuple[int], List[int]]],
                               test_set: Optional[Union[Tuple[int], List[int]]]) \
            -> Tuple[np.ndarray, np.ndarray]:
        """ Returns rows via indices. """
""" if train_set is None and test_set is None: raise ValueError('Training and test sets are None (empty)') elif test_set is None: test_set = [i for i in range(self._n_rows) if i not in train_set] elif train_set is None: train_set = [i for i in range(self._n_rows) if i not in test_set] return np.array(train_set), np.array(test_set) @property def summary(self) -> Dict[str, Any]: """ Comprehensive information about the optimizer """ info = super().summary # add model metrics info = {**info, **self.model.to_dict()} # Add class specific data info['train_size'] = self.train_size info['train_set'] = self.train_set info['test_size'] = self.test_size info['test_set'] = self.test_set info['scatter_data_train'] = self.scatter_data_train info['scatter_data_test'] = self.scatter_data_test # add kwargs used for fitting info = {**info, **self._kwargs} return info def __repr__(self) -> str: kwargs = dict() kwargs['fit_method'] = self.fit_method kwargs['traininig_size'] = self.train_size kwargs['test_size'] = self.test_size kwargs['train_set'] = self.train_set kwargs['test_set'] = self.test_set kwargs['seed'] = self.seed kwargs = {**kwargs, **self._kwargs} args = ', '.join('{}={}'.format(*kwarg) for kwarg in kwargs.items()) return f'Optimizer((A, y), {args})' @property def rmse_train(self) -> float: """ Root mean squared error for training set """ return self.model.rmse_train @property def rmse_test(self) -> float: """ Root mean squared error for test set """ return self.model.rmse_test @property def AIC(self) -> float: """ Akaike information criterion (AIC) for the model """ return self.model.AIC @property def BIC(self) -> float: """ Bayesian information criterion (BIC) for the model """ return self.model.BIC @property def train_set(self) -> List[int]: """ Indices of rows included in the training set """ return self._train_set @property def test_set(self) -> List[int]: """ Indices of rows included in the test set """ return self._test_set @property def train_size(self) -> int: """ Number of rows included in training set """ return len(self.train_set) @property def train_fraction(self) -> float: """ Fraction of rows included in training set """ return self.train_size / self._n_rows @property def test_size(self) -> int: """ Number of rows included in test set """ if self.test_set is None: return 0 return len(self.test_set) @property def test_fraction(self) -> float: """ Fraction of rows included in test set """ if self.test_set is None: return 0.0 return self.test_size / self._n_rows