# Source code for feature_encoders.encode._encoders

# -*- coding: utf-8 -*-
# Copyright (c) Hebes Intelligence Private Company

# This source code is licensed under the Apache License, Version 2.0 found in the
# LICENSE file in the root directory of this source tree.

import logging
import warnings

import numpy as np
import pandas as pd
from pandas.api.types import is_bool_dtype as is_bool
from pandas.api.types import is_categorical_dtype as is_category
from pandas.api.types import is_integer_dtype as is_integer
from pandas.api.types import is_object_dtype as is_object
from scipy.stats import skew, wasserstein_distance
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    OneHotEncoder,
    OrdinalEncoder,
    SplineTransformer,
    StandardScaler,
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils.validation import check_is_fitted

from ..utils import add_constant, as_list, check_X, check_y, maybe_reshape_2d

# Module-level logger; the encoders below use it to report warnings that are
# captured while clustering.
logger = logging.getLogger("feature-encoding")

# Fallback integer code for unknown categories. The encoders use it whenever
# their `unknown_value` parameter is None (or otherwise falsy).
UNKNOWN_VALUE = -1


#####################################################################################
# Encode features
#
# All encoders generate numpy arrays
#####################################################################################


class IdentityEncoder(TransformerMixin, BaseEstimator):
    """Create an encoder that returns what it is fed.

    This encoder can act as a linear feature encoder.

    Args:
        feature (str or list of str, optional): The name(s) of the input dataframe's
            column(s) to return. If None, the whole input dataframe will be returned.
            Defaults to None.
        as_filter (bool, optional): If True, the encoder will return all feature labels
            for which "feature in label == True". Defaults to False.
        include_bias (bool, optional): If True, a column of ones is added to the output.
            Defaults to False.

    Raises:
        ValueError: If `as_filter` is True, `feature` cannot include multiple feature names.
    """

    def __init__(self, feature=None, as_filter=False, include_bias=False):
        self._check_filter_args(feature, as_filter)

        self.feature = feature
        self.as_filter = as_filter
        self.include_bias = include_bias
        # Kept for backward compatibility; `fit` re-derives it from `feature`.
        self.features_ = as_list(feature)

    @staticmethod
    def _check_filter_args(feature, as_filter):
        """Reject a list of features when the encoder is used as a filter."""
        if as_filter and isinstance(feature, list):
            raise ValueError(
                "If `as_filter` is True, `feature` cannot include multiple feature names"
            )

    def fit(self, X: pd.DataFrame, y=None):
        """Fit the encoder on the available data.

        Args:
            X (pandas.DataFrame of shape (n_samples, n_features)): The input dataframe.
            y (None, optional): Ignored.
                Defaults to None.

        Raises:
            ValueError: If the input data does not pass the checks of `utils.check_X`.
            ValueError: If `as_filter` is True and `feature` is a list.

        Returns:
            IdentityEncoder: Fitted encoder.
        """
        X = check_X(X)

        # The parameters may have been changed through `set_params` after
        # construction, so validate them and rebuild the derived list here.
        self._check_filter_args(self.feature, self.as_filter)
        self.features_ = as_list(self.feature)

        if self.feature is None:
            n_selected = X.shape[1]
        elif not self.as_filter:
            n_selected = len(self.features_)
        else:
            # Count the columns whose label contains `feature`.
            n_selected = X.filter(like=self.feature, axis=1).shape[1]

        # One extra output column when a constant (bias) column is requested.
        self.n_features_out_ = int(self.include_bias) + n_selected
        self.fitted_ = True
        return self

    def transform(self, X: pd.DataFrame):
        """Apply the encoder.

        Args:
            X (pandas.DataFrame of shape (n_samples, n_features)): The input
                dataframe.

        Raises:
            ValueError: If the input data does not pass the checks of `utils.check_X`.
            ValueError: If `include_bias` is True and a column with constant values
                already exists in the returned columns.

        Returns:
            numpy.ndarray: The selected column subset as a numpy array.
        """
        check_is_fitted(self, "fitted_")
        X = check_X(X)

        if self.feature is not None:
            if self.as_filter:
                X = X.filter(like=self.feature, axis=1)
            else:
                X = X[self.features_]

        if self.include_bias:
            X = add_constant(X, has_constant="raise")

        return np.array(X)


class SafeOrdinalEncoder(TransformerMixin, BaseEstimator):
    """Encode categorical features as an integer array.

    The encoder converts the features into ordinal integers. This results
    in a single column of integers (0 to n_categories - 1) per feature.

    Args:
        feature (str or list of str, optional): The names of the columns to
            encode. If None, all categorical columns will be encoded. Defaults
            to None.
        unknown_value (int, optional): This parameter will set the encoded value
            for unknown categories. It has to be distinct from the values used to
            encode any of the categories in `fit`. If None, the value `-1` is used.
            During `transform`, unknown categories will be replaced using the most
            frequent value along each column. Defaults to None.
    """

    def __init__(self, feature=None, unknown_value=None):
        self.feature = feature
        self.unknown_value = unknown_value
        # Kept for backward compatibility; `fit` re-derives it from `feature`.
        self.features_ = as_list(feature)

    def fit(self, X: pd.DataFrame, y=None):
        """Fit the encoder on the available data.

        Args:
            X (pandas.DataFrame of shape (n_samples, n_features)): The input dataframe.
            y (None, optional): Ignored. Defaults to None.

        Returns:
            SafeOrdinalEncoder: Fitted encoder.

        Raises:
            ValueError: If the input data does not pass the checks of `utils.check_X`.
            ValueError: If the encoder is applied on numerical (float) data.
        """
        # Always start from the constructor parameter. Reusing the `features_`
        # of a previous fit would make a re-fit with `feature=None` stick to the
        # categorical columns detected in the first dataframe.
        features = as_list(self.feature)
        X, categorical_cols, _ = check_X(X, exists=features, return_col_info=True)

        if not features:
            features = categorical_cols
        else:
            for name in features:
                if pd.api.types.is_float_dtype(X[name]):
                    raise ValueError("The encoder is applied on numerical data")
        self.features_ = features

        # NOTE: a falsy `unknown_value` (None, but also 0) falls back to the default.
        unknown = self.unknown_value or UNKNOWN_VALUE

        select_step = ColumnTransformer(
            [("select", "passthrough", self.features_)], remainder="drop"
        )
        ordinal_step = OrdinalEncoder(
            handle_unknown="use_encoded_value",
            unknown_value=unknown,
            dtype=np.int16,
        )
        # Categories unseen during `fit` are first encoded as `unknown` and then
        # replaced by the most frequent code of their column.
        impute_step = SimpleImputer(missing_values=unknown, strategy="most_frequent")

        self.feature_pipeline_ = Pipeline(
            [
                ("select", select_step),
                ("encode_ordinal", ordinal_step),
                ("impute_unknown", impute_step),
            ]
        )
        self.feature_pipeline_.fit(X)
        self.n_features_out_ = len(self.features_)
        self.fitted_ = True
        return self

    def transform(self, X: pd.DataFrame):
        """Apply the encoder.

        Args:
            X (pandas.DataFrame of shape (n_samples, n_features)): The input
                dataframe.

        Raises:
            ValueError: If the input data does not pass the checks of `utils.check_X`.

        Returns:
            numpy.ndarray: The encoded column subset as a numpy array.
        """
        check_is_fitted(self, "fitted_")
        X = check_X(X, exists=self.features_)
        return self.feature_pipeline_.transform(X)


class SafeOneHotEncoder(TransformerMixin, BaseEstimator):
    """Encode categorical features in a one-hot form.

    The encoder uses a `SafeOrdinalEncoder` to first encode the feature as an
    integer array and then a `sklearn.preprocessing.OneHotEncoder` to encode
    the features as a one-hot array.

    Args:
        feature (str or list of str, optional): The names of the columns to
            encode. If None, all categorical columns will be encoded. Defaults
            to None.
        unknown_value (int, optional): This parameter will set the encoded value
            of unknown categories. It has to be distinct from the values used to
            encode any of the categories in `fit`. If None, the value `-1` is used.
            During `transform`, unknown categories will be replaced using the most
            frequent value along each column. Defaults to None.
    """

    def __init__(self, feature=None, unknown_value=None):
        self.feature = feature
        self.unknown_value = unknown_value
        # Kept for backward compatibility; `fit` re-derives it from `feature`.
        self.features_ = as_list(feature)

    @staticmethod
    def _dense_one_hot_encoder():
        """Build a dense-output `OneHotEncoder` for old and new scikit-learn.

        Newer scikit-learn releases spell the option `sparse_output` and no
        longer accept `sparse`; older releases only know `sparse`.
        """
        try:
            return OneHotEncoder(drop=None, sparse_output=False)
        except TypeError:
            return OneHotEncoder(drop=None, sparse=False)

    def fit(self, X: pd.DataFrame, y=None):
        """Fit the encoder on the available data.

        Args:
            X (pandas.DataFrame of shape (n_samples, n_features)): The input dataframe.
            y (None, optional): Ignored. Defaults to None.

        Returns:
            SafeOneHotEncoder: Fitted encoder.

        Raises:
            ValueError: If the input data does not pass the checks of `utils.check_X`.
            ValueError: If the encoder is applied on numerical (float) data.
        """
        # Always start from the constructor parameter. Reusing the `features_`
        # of a previous fit would make a re-fit with `feature=None` stick to the
        # categorical columns detected in the first dataframe.
        features = as_list(self.feature)
        X, categorical_cols, _ = check_X(X, exists=features, return_col_info=True)

        if not features:
            features = categorical_cols
        else:
            for name in features:
                if pd.api.types.is_float_dtype(X[name]):
                    raise ValueError("The encoder is applied on numerical data")
        self.features_ = features

        ordinal_step = SafeOrdinalEncoder(
            feature=self.features_,
            # NOTE: a falsy `unknown_value` (None, but also 0) falls back to the default.
            unknown_value=self.unknown_value or UNKNOWN_VALUE,
        )
        self.feature_pipeline_ = Pipeline(
            [
                ("encode_ordinal", ordinal_step),
                ("one_hot", self._dense_one_hot_encoder()),
            ]
        )
        self.feature_pipeline_.fit(X)

        # One output column per category learned for each encoded feature.
        self.n_features_out_ = sum(
            len(categories)
            for categories in self.feature_pipeline_["one_hot"].categories_
        )
        self.fitted_ = True
        return self

    def transform(self, X: pd.DataFrame):
        """Apply the encoder.

        Args:
            X (pandas.DataFrame of shape (n_samples, n_features)): The input
                dataframe.

        Raises:
            ValueError: If the input data does not pass the checks of `utils.check_X`.

        Returns:
            numpy.ndarray: The encoded column subset as a numpy array.
        """
        check_is_fitted(self, "fitted_")
        X = check_X(X, exists=self.features_)
        return self.feature_pipeline_.transform(X)


class TargetClusterEncoder(TransformerMixin, BaseEstimator):
    """Encode a categorical feature as clusters of the target's values.

    The purpose of this encoder is to reduce the cardinality of a categorical
    feature. This encoder does not replace unknown values with the most frequent
    one during `transform`. It just assigns them the value of `unknown_value`.

    Args:
        feature (str): The name of the categorical feature to transform. This
            encoder operates on a single feature.
        max_n_categories (int): The maximum number of categories to produce.
        stratify_by (str or list of str, optional): If not None, the encoder
            will first stratify the categorical feature into groups that have
            similar values of the features in `stratify_by`, and then cluster
            based on the relationship between the categorical feature and the
            target. Defaults to None.
        excluded_categories (str or list of str, optional): The names of the
            categories to be excluded from the clustering process. These categories
            will stay intact by the encoding process, so they cannot have the
            same values as the encoder's results (the encoder acts as an
            ``OrdinalEncoder`` in the sense that the feature is converted into
            a column of integers 0 to n_categories - 1). Defaults to None.
        unknown_value (int, optional): This parameter will set the encoded value of
            unknown categories. It has to be distinct from the values used to encode
            any of the categories in `fit`. If None, the value `-1` is used. Defaults
            to None.
        min_samples_leaf (int, optional): The minimum number of samples required to be
            at a leaf node of the decision tree model that is used for stratifying the
            categorical feature if `stratify_by` is not None. The actual number that will
            be passed to the tree model is `min_samples_leaf` multiplied by the number of
            unique values in the categorical feature to transform. Defaults to 5.
        max_features (int, float or {"auto", "sqrt", "log2"}, optional): The number of
            features that the decision tree considers when looking for the best split:

                - If int, then consider `max_features` features at each split of the decision
                  tree

                - If float, then `max_features` is a fraction and `int(max_features * n_features)`
                  features are considered at each split

                - If "auto", then `max_features=n_features`

                - If "sqrt", then `max_features=sqrt(n_features)`

                - If "log2", then `max_features=log2(n_features)`

                - If None, then `max_features=n_features`

            Defaults to "auto".
        random_state (int or RandomState instance, optional): Controls the randomness of
            the decision tree estimator and of the k-means clustering. To obtain a
            deterministic behaviour during fitting, ``random_state`` has to be fixed to
            an integer. Defaults to None.
    """

    def __init__(
        self,
        *,
        feature,
        max_n_categories,
        stratify_by=None,
        excluded_categories=None,
        unknown_value=None,
        min_samples_leaf=5,
        max_features="auto",
        random_state=None,
    ):
        self.feature = feature
        self.max_n_categories = max_n_categories
        self.stratify_by = stratify_by
        self.excluded_categories = excluded_categories
        self.unknown_value = unknown_value
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.random_state = random_state
        # Kept for backward compatibility; `fit` re-derives both lists.
        self.stratify_by_ = as_list(stratify_by)
        self.excluded_categories_ = as_list(excluded_categories)

    def fit(self, X: pd.DataFrame, y: pd.DataFrame):
        """Fit the encoder on the available data.

        Args:
            X (pandas.DataFrame of shape (n_samples, n_features)): The input dataframe.
            y (pandas.DataFrame of shape (n_samples, 1)): The target dataframe.

        Returns:
            TargetClusterEncoder: Fitted encoder.

        Raises:
            ValueError: If the input data does not pass the checks of `utils.check_X`.
            ValueError: If the encoder is applied on numerical (float) data.
            ValueError: If any of the values in `excluded_categories` is not found in
                the input data.
            ValueError: If the number of categories left after removing all in
                `excluded_categories` is not larger than `max_n_categories`.
        """
        # The parameters may have been changed through `set_params` after
        # construction, so rebuild the derived lists from them.
        self.stratify_by_ = as_list(self.stratify_by)
        self.excluded_categories_ = as_list(self.excluded_categories)

        X = check_X(X, exists=[self.feature] + self.stratify_by_)
        if pd.api.types.is_float_dtype(X[self.feature]):
            raise ValueError("The encoder is applied on numerical data")

        y = check_y(y, index=X.index)
        self.target_name_ = y.columns[0]

        # Put the target next to the features so it can be grouped by category.
        X = X.merge(y, left_index=True, right_index=True)

        if self.excluded_categories_:
            observed = X[self.feature].unique()
            for value in self.excluded_categories_:
                if value not in observed:
                    raise ValueError(
                        f"Value {value} of `excluded_categories` not found "
                        f"in the {self.feature} data."
                    )

            # Excluded categories take no part in the clustering.
            excluded_rows = X[self.feature].isin(self.excluded_categories_)
            X = X.loc[~excluded_rows]
            if len(X) == 0:
                raise ValueError(
                    "No categories left after removing all in `excluded_categories`."
                )
            if X[self.feature].nunique() <= self.max_n_categories:
                raise ValueError(
                    "The number of categories left after removing all in `excluded_categories` "
                    "must be larger than `max_n_categories`."
                )

        if self.stratify_by_:
            self.mapping_ = self._cluster_with_stratify(X)
        else:
            self.mapping_ = self._cluster_without_stratify(X)

        # Excluded categories get their own codes right after the codes that
        # are reserved for the clusters.
        for offset, category in enumerate(self.excluded_categories_):
            self.mapping_[category] = self.max_n_categories + offset

        self.n_features_out_ = 1
        self.fitted_ = True
        return self

    def _fit_kmeans(self, X_to_cluster, index):
        """Cluster the rows of `X_to_cluster` and map each index label to a cluster."""
        # There cannot be more clusters than rows (categories) to cluster.
        n_clusters = min(X_to_cluster.shape[0], self.max_n_categories)
        clusterer = KMeans(n_clusters=n_clusters, random_state=self.random_state)

        # Warnings raised while clustering are forwarded to the module logger.
        with warnings.catch_warnings(record=True) as caught:
            cluster_labels = pd.Series(
                data=clusterer.fit_predict(X_to_cluster), index=index
            )
            for w in caught:
                logger.warning(str(w))
        return cluster_labels.to_dict()

    def _cluster_without_stratify(self, X):
        """Cluster the categories on summary statistics of their target values."""
        reference = np.array(X[self.target_name_])
        # Describe every category by the mean, spread and skewness of its target
        # values, plus the distance of its target distribution from the overall one.
        stats = X.groupby(self.feature)[self.target_name_].agg(
            ["mean", "std", skew, lambda x: wasserstein_distance(x, reference)]
        )
        # Statistics that could not be computed are replaced by 1.
        stats = stats.fillna(value=1)

        X_to_cluster = StandardScaler().fit_transform(stats)
        return self._fit_kmeans(X_to_cluster, stats.index)

    def _cluster_with_stratify(self, X):
        """Cluster the categories on their mean target value within each stratum.

        The strata are the leaves of a decision tree that predicts the target
        from the `stratify_by` features.
        """
        parts = []
        for col in self.stratify_by_:
            column = X[col]
            if (
                is_bool(column)
                or is_object(column)
                or is_category(column)
                or is_integer(column)
            ):
                # Discrete columns are expanded into indicator columns.
                parts.append(pd.get_dummies(column))
            else:
                parts.append(column)
        X_train = pd.concat(parts, axis=1)
        X_train.columns = X_train.columns.astype(str)

        y_train = X[self.target_name_]
        n_categories = X[self.feature].nunique()

        # "auto" means "all features" for a regression tree, which is what None
        # requests too; newer scikit-learn releases no longer accept "auto".
        max_features = None if self.max_features == "auto" else self.max_features
        model = DecisionTreeRegressor(
            min_samples_leaf=n_categories * int(self.min_samples_leaf),
            max_features=max_features,
            random_state=self.random_state,
        )
        model = model.fit(X_train, y_train)

        # Positional row indices of the samples that fall in each leaf.
        leaf_ids = model.apply(X_train)
        leaf_samples = [
            np.where(leaf_ids == leaf_id)[0] for leaf_id in np.unique(leaf_ids)
        ]

        # One row per category and one column per leaf, holding the mean target
        # of the category inside that leaf (NaN if it never occurs there).
        categories = pd.Index(X[self.feature].unique())
        leaf_means = {}
        for i, idx in enumerate(leaf_samples):
            subset = X.iloc[idx][[self.feature, self.target_name_]]
            means = subset.groupby(self.feature)[self.target_name_].mean()
            leaf_means[i] = means.reindex(categories)
        X_to_cluster = pd.DataFrame(leaf_means, index=categories)

        # Fill the gaps with the median of the corresponding leaf.
        X_to_cluster = X_to_cluster.fillna(X_to_cluster.median())
        return self._fit_kmeans(X_to_cluster, X_to_cluster.index)

    def transform(self, X: pd.DataFrame):
        """Apply the encoder.

        Args:
            X (pandas.DataFrame of shape (n_samples, n_features)): The input dataframe.

        Returns:
            numpy.ndarray: The encoded column subset as a numpy array.

        Raises:
            ValueError: If the input data does not pass the checks of `utils.check_X`.
        """
        check_is_fitted(self, "fitted_")
        X = check_X(X, exists=self.feature)

        # NOTE: a falsy `unknown_value` (None, but also 0) falls back to the default.
        unknown = self.unknown_value or UNKNOWN_VALUE
        codes = X[self.feature].map(
            lambda value: int(self.mapping_.get(value, unknown))
        )
        return maybe_reshape_2d(np.array(codes))


class CategoricalEncoder(TransformerMixin, BaseEstimator):
    """Encode categorical features.

    If `max_n_categories` is not `None` and the number of unique values of the
    categorical feature minus the number of `excluded_categories` is larger than
    `max_n_categories`, the `TargetClusterEncoder` will be called first to reduce
    the number of categories.

    In that case, if `encode_as = 'onehot'`, the result comes from a
    `TargetClusterEncoder` + `SafeOneHotEncoder` pipeline, otherwise from a
    `TargetClusterEncoder` + `SafeOrdinalEncoder` one. When no reduction is needed,
    the `SafeOneHotEncoder` or the `SafeOrdinalEncoder` is applied directly.

    Args:
        feature (str): The name of the categorical feature to transform. This
            encoder operates on a single feature.
        max_n_categories (int, optional): The maximum number of categories to
            produce. Defaults to None.
        stratify_by (str or list of str, optional): If not None, the encoder
            will first stratify the categorical feature into groups that have
            similar values of the features in `stratify_by`, and then cluster
            based on the relationship between the categorical feature and the
            target. It is used only if the number of unique categories minus
            the `excluded_categories` is larger than `max_n_categories`.
            Defaults to None.
        excluded_categories (str or list of str, optional): The names of the
            categories to be excluded from the clustering process. These categories
            will stay intact by the encoding process, so they cannot have the
            same values as the encoder's results (the encoder acts as an
            ``OrdinalEncoder`` in the sense that the feature is converted into
            a column of integers 0 to n_categories - 1). Defaults to None.
        unknown_value (int, optional): This parameter will set the encoded value of
            unknown categories. It has to be distinct from the values used to encode
            any of the categories in `fit`. If None, the value `-1` is used. Defaults
            to None.
        min_samples_leaf (int, optional): The minimum number of samples required to be
            at a leaf node of the decision tree model that is used for stratifying the
            categorical feature if `stratify_by` is not None. The actual number that will
            be passed to the tree model is `min_samples_leaf` multiplied by the number of
            unique values in the categorical feature to transform. Defaults to 1.
        max_features (int, float or {"auto", "sqrt", "log2"}, optional): The number of
            features that the decision tree considers when looking for the best split:

                - If int, then consider `max_features` features at each split of the decision
                  tree

                - If float, then `max_features` is a fraction and `int(max_features * n_features)`
                  features are considered at each split

                - If "auto", then `max_features=n_features`

                - If "sqrt", then `max_features=sqrt(n_features)`

                - If "log2", then `max_features=log2(n_features)`

                - If None, then `max_features=n_features`

            Defaults to "auto".
        random_state (int or RandomState instance, optional): Forwarded to the
            `TargetClusterEncoder`. To obtain a deterministic behaviour during
            fitting, ``random_state`` has to be fixed to an integer. Defaults to None.
        encode_as ({'onehot', 'ordinal'}, optional): Method used to encode the transformed
            result.

                - If "onehot", encode the transformed result with one-hot encoding and return a
                  dense array

                - If "ordinal", encode the transformed result as integer values

            Defaults to "onehot".
    """

    def __init__(
        self,
        *,
        feature,
        max_n_categories=None,
        stratify_by=None,
        excluded_categories=None,
        unknown_value=None,
        min_samples_leaf=1,
        max_features="auto",
        random_state=None,
        encode_as="onehot",
    ):
        self.feature = feature
        self.max_n_categories = max_n_categories
        self.stratify_by = stratify_by
        self.excluded_categories = excluded_categories
        self.unknown_value = unknown_value
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.random_state = random_state
        self.encode_as = encode_as
        # Kept for backward compatibility; `fit` re-derives it.
        self.excluded_categories_ = as_list(excluded_categories)

    def _to_pandas(self, arr: np.ndarray):
        """Wrap the cluster codes in a dataframe with the feature as its only column."""
        return pd.DataFrame(arr, columns=[self.feature])

    def _final_encoder_step(self):
        """Build the last pipeline step according to `encode_as`.

        Any value of `encode_as` other than "onehot" selects the ordinal encoder.
        """
        if self.encode_as == "onehot":
            encoder_cls = SafeOneHotEncoder
        else:
            encoder_cls = SafeOrdinalEncoder
        return (
            "encode_features",
            encoder_cls(feature=self.feature, unknown_value=self.unknown_value),
        )

    def fit(self, X: pd.DataFrame, y: pd.DataFrame = None):
        """Fit the encoder on the available data.

        Args:
            X (pandas.DataFrame of shape (n_samples, n_features)): The input dataframe.
            y (pandas.DataFrame of shape (n_samples, 1), optional): The target dataframe.
                Defaults to None.

        Raises:
            ValueError: If the input data does not pass the checks of `utils.check_X`.
            ValueError: If the encoder is applied on numerical (float) data.
            ValueError: If the number of categories minus the `excluded_categories`
                is larger than `max_n_categories` but target values (y) are not
                provided.
            ValueError: If any of the values in `excluded_categories` is not found in
                the input data.

        Returns:
            CategoricalEncoder: Fitted encoder.
        """
        X = check_X(X, exists=self.feature)
        if pd.api.types.is_float_dtype(X[self.feature]):
            raise ValueError("The encoder is applied on numerical data")

        # `excluded_categories` may have been changed through `set_params` after
        # construction, so rebuild the derived list before it is used below.
        self.excluded_categories_ = as_list(self.excluded_categories)

        n_categories = X[self.feature].nunique()
        n_to_encode = n_categories - len(self.excluded_categories_)
        # Target-based reduction is needed only when there are too many categories.
        use_target = (self.max_n_categories is not None) and (
            n_to_encode > self.max_n_categories
        )

        if use_target and (y is None):
            raise ValueError(
                f"The number of categories to encode: {n_to_encode}"
                f" is larger than `max_n_categories`: {self.max_n_categories}. In this case, "
                "the target values must be provided for target-based encoding."
            )

        steps = []
        if use_target:
            reducer = TargetClusterEncoder(
                feature=self.feature,
                stratify_by=self.stratify_by,
                max_n_categories=self.max_n_categories,
                excluded_categories=self.excluded_categories,
                unknown_value=self.unknown_value,
                min_samples_leaf=self.min_samples_leaf,
                max_features=self.max_features,
                random_state=self.random_state,
            )
            steps.append(("reduce_dimension", reducer))
            # The reducer returns a numpy array, while the final encoder selects
            # its input column by name, so convert back to a dataframe in between.
            steps.append(("to_pandas", FunctionTransformer(self._to_pandas)))
        steps.append(self._final_encoder_step())

        self.feature_pipeline_ = Pipeline(steps)
        self.feature_pipeline_.fit(X, y)
        self.n_features_out_ = self.feature_pipeline_["encode_features"].n_features_out_
        self.fitted_ = True
        return self

    def transform(self, X: pd.DataFrame):
        """Apply the encoder.

        Args:
            X (pandas.DataFrame of shape (n_samples, n_features)): The input dataframe.

        Raises:
            ValueError: If the input data does not pass the checks of `utils.check_X`.

        Returns:
            numpy.ndarray: The encoded features as a numpy array.
        """
        check_is_fitted(self, "fitted_")
        X = check_X(X, exists=self.feature)
        return self.feature_pipeline_.transform(X)


class SplineEncoder(TransformerMixin, BaseEstimator):
    """Generate univariate B-spline bases for features.

    The encoder generates a matrix consisting of `n_splines=n_knots + degree - 1`
    spline basis functions (B-splines) of polynomial order=`degree` for the given
    feature.

    Args:
        feature (str): The name of the column to encode.
        n_knots (int, optional): Number of knots of the splines if `strategy` equals one
            of {'uniform', 'quantile'}. Must be larger or equal 2. Ignored if `strategy`
            is array-like. Defaults to 5.
        degree (int, optional): The polynomial degree of the spline basis. Must be a
            non-negative integer. Defaults to 3.
        strategy ({'uniform', 'quantile'} or array-like of shape (n_knots, n_features),
            optional): Set knot positions such that first knot <= features <= last knot.

                - If 'uniform', `n_knots` number of knots are distributed uniformly
                  from min to max values of the features (each bin has the same width)

                - If 'quantile', they are distributed uniformly along the quantiles of
                  the features (each bin has the same number of observations)

                - If an array-like is given, it directly specifies the sorted knot
                  positions including the boundary knots. Note that, internally,
                  `degree` number of knots are added before the first knot, the same
                  after the last knot

            Defaults to "uniform".
        extrapolation ({'error', 'constant', 'linear', 'continue'}, optional): If 'error',
            values outside the min and max values of the training features raises a `ValueError`.
            If 'constant', the value of the splines at minimum and maximum value of the features
            is used as constant extrapolation. If 'linear', a linear extrapolation is used. If
            'continue', the splines are extrapolated as is, option `extrapolate=True` in
            `scipy.interpolate.BSpline`.
            Defaults to "constant".
        include_bias (bool, optional): If False, then the last spline element inside the data
            range of a feature is dropped. As B-splines sum to one over the spline basis functions
            for each data point, they implicitly include a bias term. Defaults to True.
        order ({'C', 'F'}, optional): Order of output array. 'F' order is faster to compute, but
            may slow down subsequent estimators. Defaults to "C".
    """

    def __init__(
        self,
        *,
        feature,
        n_knots=5,
        degree=3,
        strategy="uniform",
        extrapolation="constant",
        include_bias=True,
        order="C",
    ):
        self.feature = feature
        self.n_knots = n_knots
        self.degree = degree
        self.strategy = strategy
        self.extrapolation = extrapolation
        self.include_bias = include_bias
        self.order = order

    def fit(self, X: pd.DataFrame, y=None, sample_weight=None):
        """Fit the encoder.

        Args:
            X (pandas.DataFrame of shape (n_samples, n_features)): The data to fit.
            y (None, optional): Ignored. Defaults to None.
            sample_weight (array-like of shape (n_samples,), optional): Individual
                weights for each sample. Used to calculate quantiles if `strategy="quantile"`.
                For `strategy="uniform"`, zero weighted observations are ignored for finding
                the min and max of `X`. Defaults to None.

        Raises:
            ValueError: If the input data does not pass the checks of `utils.check_X`.

        Returns:
            SplineEncoder: Fitted encoder.
        """
        X = check_X(X, exists=self.feature)
        # This encoder's `strategy` is the underlying transformer's `knots` option.
        self.encoder_ = SplineTransformer(
            n_knots=self.n_knots,
            degree=self.degree,
            knots=self.strategy,
            extrapolation=self.extrapolation,
            include_bias=self.include_bias,
            order=self.order,
        )

        # Double brackets keep the selected column two-dimensional. The weights
        # must be forwarded, otherwise they have no effect on the knot positions.
        self.encoder_.fit(X[[self.feature]], sample_weight=sample_weight)
        self.n_features_out_ = self.encoder_.n_features_out_
        self.fitted_ = True
        return self

    def transform(self, X):
        """Transform the feature data to B-splines.

        Args:
            X (pandas.DataFrame of shape (n_samples, n_features)): The data to transform.

        Raises:
            ValueError: If the input data does not pass the checks of `utils.check_X`.

        Returns:
            numpy.ndarray: The B-splines matrix.
        """
        check_is_fitted(self, "fitted_")
        X = check_X(X, exists=self.feature)
        return self.encoder_.transform(X[[self.feature]])