Source code for feature_encoders.models.linear

# -*- coding: utf-8 -*-
# Copyright (c) Hebes Intelligence Private Company

# This source code is licensed under the Apache License, Version 2.0 found in the
# LICENSE file in the root directory of this source tree.

from typing import Union

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.utils.validation import check_is_fitted

from ..compose import FeatureComposer, ModelStructure
from ..utils import check_X, check_y


[docs]class LinearPredictor(RegressorMixin, BaseEstimator): """A linear regression model with flexible parameterization. Args: model_structure (ModelStructure): The structure of a linear regression model. alpha (float, optional): Regularization strength of the underlying ridge regression; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Defaults to 0.01. fit_intercept (bool, optional): Whether to fit the intercept for this model. If set to false, no intercept will be used in calculations. Defaults to False. """ def __init__( self, *, model_structure: ModelStructure, alpha=0.01, fit_intercept=False ): self.model_structure = model_structure self.alpha = alpha self.fit_intercept = fit_intercept self.composer_ = FeatureComposer(model_structure) @property def n_parameters(self): try: self.n_parameters_ except AttributeError as exc: raise ValueError( "The number of parameters is acceccible only after " "the model has been fitted" ) from exc else: return self.n_parameters_ @property def dof(self): try: self.dof_ except AttributeError as exc: raise ValueError( "The degrees of freedom are acceccible only after " "the model has been fitted" ) from exc else: return self.dof_
[docs] def fit(self, X: pd.DataFrame, y: Union[pd.DataFrame, pd.Series]): """Fit the estimator with the available data. Args: X (pandas.DataFrame): Input data. y (pandas.Series or pandas.DataFrame): Target data. Raises: Exception: If the estimator is re-fitted. An estimator object can only be fitted once. ValueError: If the input data does not pass the checks of `utils.check_X`. ValueError: If the target data does not pass the checks of `utils.check_y`. Returns: LinearPredictor: Fitted estimator. """ try: check_is_fitted(self, "fitted_") except NotFittedError: pass else: raise Exception( "Estimator object can only be fit once. Instantiate a new object." ) X = check_X(X) y = check_y(y, index=X.index) self.target_name_ = y.columns[0] design_matrix = self.composer_.fit_transform(X, y) self.n_parameters_ = design_matrix.shape[1] self.dof_ = np.linalg.matrix_rank(design_matrix) if self.alpha is None: self.base_estimator_ = LinearRegression(fit_intercept=self.fit_intercept) else: self.base_estimator_ = Ridge( alpha=self.alpha, fit_intercept=self.fit_intercept ) self.base_estimator_ = self.base_estimator_.fit(design_matrix, y) self.fitted_ = True return self
[docs] def predict(self, X: pd.DataFrame, include_components=False): """Predict using the given input data. Args: X (pandas.DataFrame): Input data. include_components (bool, optional): If True, the prediction dataframe will include also the individual components' contribution to the predicted values. Defaults to False. Returns: pandas.DataFrame: The prediction. """ check_is_fitted(self, "fitted_") X = check_X(X) design_matrix = self.composer_.transform(X) prediction = pd.DataFrame( data=self.base_estimator_.predict(design_matrix), columns=[self.target_name_], index=X.index, ) if include_components: components = pd.DataFrame( 0, index=X.index, columns=self.composer_.component_names_ ) feature_cols = self.composer_.component_matrix for col in components.columns: subset = feature_cols[feature_cols[col] == 1].index.to_list() coef = self.base_estimator_.coef_.squeeze() pred = np.matmul(design_matrix[:, subset], coef[subset]) components[col] = components[col] + pred prediction = pd.concat((prediction, components), axis=1) return prediction