Source code for maxframe.learn.preprocessing._label._label_encoder

# Copyright 1999-2025 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

from .... import tensor as mt
from ....udf import builtin_function
from ...core import BaseEstimator, TransformerMixin
from ...utils._encode import _encode, _unique
from ...utils.validation import _num_samples, column_or_1d

try:
    from sklearn.utils.validation import check_is_fitted
except ImportError:
    check_is_fitted = lambda *_: None



[docs]
class LabelEncoder(TransformerMixin, BaseEstimator):
    """Encode target labels with value between 0 and n_classes-1.

    This transformer should be used to encode target values, *i.e.* `y`, and
    not the input `X`.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    See Also
    --------
    OrdinalEncoder : Encode categorical features using an ordinal encoding
        scheme.
    OneHotEncoder : Encode categorical features as a one-hot numeric array.

    Examples
    --------
    `LabelEncoder` can be used to normalize labels.

    >>> from maxframe.learn import preprocessing
    >>> le = preprocessing.LabelEncoder()
    >>> le.fit([1, 2, 2, 6]).execute()
    LabelEncoder()
    >>> le.classes_.to_numpy()
    array([1, 2, 6])
    >>> le.transform([1, 1, 2, 6]).to_numpy()
    array([0, 0, 1, 2]...)
    >>> le.inverse_transform([0, 0, 1, 2]).to_numpy()
    array([1, 1, 2, 6])

    It can also be used to transform non-numerical labels (as long as they are
    hashable and comparable) to numerical labels.

    >>> le = preprocessing.LabelEncoder()
    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"]).execute()
    LabelEncoder()
    >>> list(le.classes_.to_numpy())
    ['amsterdam', 'paris', 'tokyo']
    >>> le.transform(["tokyo", "tokyo", "paris"]).to_numpy()
    array([2, 2, 1]...)
    >>> list(le.inverse_transform([2, 2, 1]).to_numpy())
    ['tokyo', 'tokyo', 'paris']
    """

    def fit(self, y, execute=False, session=None, run_kwargs=None):
        """Fit label encoder.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
            Fitted label encoder.
        """
        y = column_or_1d(y, warn=True)
        self.classes_ = _unique(y)
        if execute:
            self.execute(session=session, **(run_kwargs or dict()))
        return self

    def fit_transform(self, y, execute=False, session=None, run_kwargs=None):
        """Fit label encoder and return encoded labels.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Encoded labels.
        """
        y = column_or_1d(y, warn=True)
        self.classes_, y = _unique(y, return_inverse=True)
        if execute:
            self.execute(session=session, extra_tileables=y, **(run_kwargs or dict()))
        return y

    def transform(self, y, execute=False, session=None, run_kwargs=None):
        """Transform labels to normalized encoding.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Labels as normalized encodings.
        """
        check_is_fitted(self)
        y = column_or_1d(y, warn=True)
        # transform of empty array is empty array
        if _num_samples(y) == 0:
            return mt.array([])

        t = _encode(y, uniques=self.classes_)
        if execute:
            t = t.execute(session=session, **(run_kwargs or dict()))
        return t

    @staticmethod
    @builtin_function
    def _class_checker(chunk_data, classes_data):
        diff = np.setdiff1d(chunk_data, np.arange(len(classes_data)))
        if len(diff):
            raise ValueError("y contains previously unseen labels: %s" % str(diff))
        return classes_data[chunk_data]

    def inverse_transform(self, y, execute=False, session=None, run_kwargs=None):
        """Transform labels back to original encoding.

        Parameters
        ----------
        y : ndarray of shape (n_samples,)
            Target values.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            Original encoding.
        """
        check_is_fitted(self)
        y = column_or_1d(y, warn=True)
        # inverse transform of empty array is empty array
        if _num_samples(y) == 0:
            return mt.array([])

        labels = mt.asarray(y).mf.apply_chunk(
            self._class_checker, args=(self.classes_,), dtype=self.classes_.dtype
        )
        if execute:
            labels.execute(session=session, **(run_kwargs or dict()))
        return labels

    def _more_tags(self):
        return {"X_types": ["1dlabels"]}