# Copyright 1999-2025 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union
import pandas as pd
from pandas.api.types import is_list_like
from pandas.core.dtypes.common import pandas_dtype
from ..core import ENTITY_TYPE
from ..serialization.serializables import SerializableMeta
from ..tensor import stack
from ..tensor import tensor as astensor
from ..tensor.array_utils import is_cupy
from ..tensor.core import TENSOR_TYPE
from ..utils import ceildiv, lazy_import
from .core import DATAFRAME_TYPE, INDEX_TYPE, SERIES_TYPE
from .core import DataFrame as _Frame
from .core import Index as _Index
from .core import Series as _Series
from .datasource.dataframe import from_pandas as from_pandas_df
from .datasource.from_tensor import (
dataframe_from_1d_tileables,
dataframe_from_tensor,
series_from_tensor,
)
from .datasource.index import from_pandas as from_pandas_index
from .datasource.index import from_tileable as from_tileable_index
from .datasource.series import from_pandas as from_pandas_series
from .utils import is_cudf, is_index
cudf = lazy_import("cudf")
class InitializerMeta(SerializableMeta):
    """
    Metaclass that customizes ``isinstance`` checks for initializer classes.

    Classes using this metaclass must expose a ``_allow_data_type_`` tuple;
    it is concatenated with the class' direct base to form the set of types
    accepted by ``isinstance``.
    """

    def __instancecheck__(cls, instance):
        # an object counts as an instance when it is an instance of the direct
        # base class or of any type listed in ``_allow_data_type_``
        accepted_types = (cls.__base__,) + cls._allow_data_type_
        return isinstance(instance, accepted_types)
[docs]
class DataFrame(_Frame, metaclass=InitializerMeta):
    """
    DataFrame initializer accepting both tileable and local inputs.

    Parameters
    ----------
    data : optional
        Source of the frame. The following cases are tried in order:

        1. a ``TENSOR_TYPE`` object,
        2. a ``SERIES_TYPE`` object,
        3. a ``DATAFRAME_TYPE`` object,
        4. a dict where some value is an ``ENTITY_TYPE`` object, or a
           list / tuple containing one,
        5. a list containing at least one ``ENTITY_TYPE`` object,
        6. any data combined with an ``index`` that is an ``INDEX_TYPE`` or
           ``SERIES_TYPE`` object,
        7. anything else, which is handed to ``pandas.DataFrame``
           (``cudf.DataFrame`` for cudf / cupy data).
    index : optional
        Index of the result. It is not forwarded when ``data`` is a
        ``SERIES_TYPE`` or ``DATAFRAME_TYPE`` object.
    columns : list-like, optional
        Column labels of the result.
    dtype : optional
        Only forwarded to the pandas / cudf constructor (case 7).
    copy : bool, default False
        Only used when building a pandas / cudf frame (case 7).
    chunk_size : optional
        Used to rechunk tensor input, to convert local values into tensors,
        and to split local pandas / cudf data. For local data it is replaced
        by a value derived from ``num_partitions`` when that is given.
    gpu, sparse : optional
        Forwarded to the datasource function building the frame, when one
        is used.
    num_partitions : int, optional
        For local data, the chunk size becomes
        ``ceildiv(len(data), num_partitions)``. For every other case the
        result is passed through ``rebalance(num_partitions=...)``.

    Raises
    ------
    ValueError
        If ``columns`` is not list-like, or if its length does not fit the
        series / dataframe passed as ``data``.
    """

    def __init__(
        self,
        data=None,
        index=None,
        columns=None,
        dtype=None,
        copy=False,
        chunk_size=None,
        gpu=None,
        sparse=None,
        num_partitions=None,
    ):
        if columns is not None and not is_list_like(columns):
            raise ValueError("columns must be a list-like object")

        # all tileable-based branches rebalance afterwards when asked to;
        # only the local-data branch switches this off because it translates
        # num_partitions into a chunk size instead
        need_repart = num_partitions is not None

        if isinstance(data, TENSOR_TYPE):
            if chunk_size is not None:
                data = data.rechunk(chunk_size)
            df = dataframe_from_tensor(
                data, index=index, columns=columns, gpu=gpu, sparse=sparse
            )
        elif isinstance(data, SERIES_TYPE):
            if columns is not None and len(columns) != 1:
                raise ValueError("columns' length must be 1 when data is Series")
            # test against None instead of relying on truthiness: a pandas
            # Index raises on bool() and a one-element ndarray holding a
            # falsy label (e.g. 0) would silently lose the column name
            col_name = columns[0] if columns is not None else None
            df = data.to_frame(name=col_name)
        elif isinstance(data, DATAFRAME_TYPE):
            # objects without a ``data`` attribute are DataFrameData and
            # need wrapping first
            df = data if hasattr(data, "data") else _Frame(data)
            if columns is not None:
                if len(df.columns) != len(columns):
                    raise ValueError("columns' length must be equal to the data's")
                # NOTE(review): when ``data`` is used as-is this assignment
                # targets the caller's object -- confirm this is intended
                df.columns = columns
        elif isinstance(data, dict) and self._can_process_by_1d_tileables(data):
            # data is a dict and some value is a tileable
            df = dataframe_from_1d_tileables(
                data, index=index, columns=columns, gpu=gpu, sparse=sparse
            )
        elif isinstance(data, list) and any(isinstance(v, ENTITY_TYPE) for v in data):
            # stack the elements into a single tensor first
            data = stack(data)
            df = dataframe_from_tensor(
                data, index=index, columns=columns, gpu=gpu, sparse=sparse
            )
        elif isinstance(index, (INDEX_TYPE, SERIES_TYPE)):
            # local data combined with a tileable index: turn the data into
            # tensors so that everything is built lazily
            if isinstance(data, dict):
                data = {
                    key: astensor(value, chunk_size=chunk_size)
                    for key, value in data.items()
                }
                df = dataframe_from_1d_tileables(
                    data, index=index, columns=columns, gpu=gpu, sparse=sparse
                )
            else:
                if data is not None:
                    data = astensor(data, chunk_size=chunk_size)
                df = dataframe_from_tensor(
                    data, index=index, columns=columns, gpu=gpu, sparse=sparse
                )
        else:
            # purely local data: build a pandas / cudf frame and split it
            if is_cudf(data) or is_cupy(data):  # pragma: no cover
                pdf = cudf.DataFrame(data, index=index, columns=columns, dtype=dtype)
                if copy:
                    pdf = pdf.copy()
            else:
                pdf = pd.DataFrame(
                    data, index=index, columns=columns, dtype=dtype, copy=copy
                )
            if num_partitions is not None:
                chunk_size = ceildiv(len(pdf), num_partitions)
            df = from_pandas_df(pdf, chunk_size=chunk_size, gpu=gpu, sparse=sparse)
            need_repart = False

        if need_repart:
            df = df.rebalance(num_partitions=num_partitions)
        super().__init__(df.data)

    @classmethod
    def _can_process_by_1d_tileables(cls, data: dict):
        """
        Tell whether a dict holds tileable values.

        Returns True when some value of ``data`` is an ``ENTITY_TYPE`` object
        or a list / tuple containing at least one, False otherwise.
        """

        def _holds_entity(value):
            if isinstance(value, ENTITY_TYPE):
                return True
            return isinstance(value, (list, tuple)) and any(
                isinstance(item, ENTITY_TYPE) for item in value
            )

        return any(_holds_entity(value) for value in data.values())
[docs]
class Series(_Series, metaclass=InitializerMeta):
    """
    Series initializer accepting both tileable and local inputs.

    Parameters
    ----------
    data : optional
        Source of the series. The following cases are tried in order:

        1. a ``TENSOR_TYPE`` or ``INDEX_TYPE`` object,
        2. any data combined with an ``index`` that is an ``INDEX_TYPE``
           object,
        3. a ``SERIES_TYPE`` object,
        4. anything else, which is handed to ``pandas.Series``
           (``cudf.Series`` for cudf / cupy data).
    index : optional
        Index of the result. It is not forwarded when ``data`` is a
        ``SERIES_TYPE`` object.
    dtype : optional
        Normalized with ``pandas_dtype`` when given. It is forwarded in
        cases 2 and 4 only.
    name : optional
        Name of the series. In case 1 the ``name`` attribute of ``data`` is
        used when ``name`` is None. It is not used in case 3.
    copy : bool, default False
        Only used when building a pandas / cudf series (case 4).
    chunk_size : optional
        Used to rechunk tensor / index input, to convert local values into
        a tensor, and to split local pandas / cudf data. For local data it
        is replaced by a value derived from ``num_partitions`` when that is
        given.
    gpu, sparse : optional
        Forwarded to the datasource function building the series, when one
        is used.
    num_partitions : int, optional
        For local data, the chunk size becomes
        ``ceildiv(len(data), num_partitions)``. For every other case the
        result is passed through ``rebalance(num_partitions=...)``.
    """

    def __init__(
        self,
        data=None,
        index=None,
        dtype=None,
        name=None,
        copy=False,
        chunk_size=None,
        gpu=None,
        sparse=None,
        num_partitions=None,
    ):
        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # all tileable-based branches rebalance afterwards when asked to;
        # only the local-data branch switches this off because it translates
        # num_partitions into a chunk size instead
        need_repart = num_partitions is not None

        if isinstance(data, (TENSOR_TYPE, INDEX_TYPE)):
            if chunk_size is not None:
                data = data.rechunk(chunk_size)
            # only fall back to the name carried by ``data`` when no name was
            # given at all; falsy but valid names such as 0 or "" are kept
            if name is None:
                name = getattr(data, "name", None)
            # NOTE(review): ``dtype`` is not forwarded here although it is in
            # the tileable-index branch below -- confirm this is intended
            series = series_from_tensor(
                data, index=index, name=name, gpu=gpu, sparse=sparse
            )
        elif isinstance(index, INDEX_TYPE):
            # local data combined with a tileable index: turn the data into
            # a tensor so that everything is built lazily
            if data is not None:
                data = astensor(data, chunk_size=chunk_size)
            series = series_from_tensor(
                data, index=index, name=name, dtype=dtype, gpu=gpu, sparse=sparse
            )
        elif isinstance(data, SERIES_TYPE):
            # objects without a ``data`` attribute are SeriesData and need
            # wrapping first
            series = data if hasattr(data, "data") else _Series(data)
        else:
            # purely local data: build a pandas / cudf series and split it
            if is_cudf(data) or is_cupy(data):  # pragma: no cover
                pd_series = cudf.Series(data, index=index, dtype=dtype, name=name)
                if copy:
                    pd_series = pd_series.copy()
            else:
                pd_series = pd.Series(
                    data, index=index, dtype=dtype, name=name, copy=copy
                )
            if num_partitions is not None:
                chunk_size = ceildiv(len(pd_series), num_partitions)
            series = from_pandas_series(
                pd_series, chunk_size=chunk_size, gpu=gpu, sparse=sparse
            )
            need_repart = False

        if need_repart:
            series = series.rebalance(num_partitions=num_partitions)
        super().__init__(series.data)
[docs]
class Index(_Index, metaclass=InitializerMeta):
    """
    Index initializer accepting both tileable and local inputs.

    Parameters
    ----------
    data : optional
        Source of the index. The following cases are tried in order:

        1. an ``INDEX_TYPE`` object,
        2. any other ``ENTITY_TYPE`` object,
        3. anything else, which is used as-is when ``is_index`` accepts it
           and is otherwise handed to ``pandas.Index`` (``cudf.Index`` for
           cudf / cupy data).
    dtype : optional
        Forwarded in cases 2 and 3.
    copy : bool, default False
        Only forwarded to the pandas / cudf constructor (case 3).
    name : optional
        Name of the index. In cases 2 and 3 the ``name`` attribute of
        ``data`` is used when ``name`` is None.
    tupleize_cols : bool, default True
        Forwarded to the pandas / cudf constructor. The construction is
        retried without it when the constructor raises ``TypeError``.
    chunk_size : optional
        Used to split local data. It is replaced by a value derived from
        ``num_partitions`` when that is given.
    gpu, sparse, store_data : optional
        Forwarded to ``from_pandas_index`` (case 3).
    names : optional
        Forwarded to ``from_tileable_index`` (case 2).
    num_partitions : int, optional
        For local data, the chunk size becomes
        ``ceildiv(len(data), num_partitions)``. For every other case the
        result is passed through ``rebalance(num_partitions=...)``.
    """

    def __new__(cls, data=None, *_, **__):
        # just return cls always until we support other Index's initializers.
        # The signature tolerates everything ``__init__`` accepts (no data,
        # extra positional arguments) so that valid calls are not rejected
        # before ``__init__`` runs.
        return object.__new__(cls)

    def __init__(
        self,
        data=None,
        dtype=None,
        copy=False,
        name=None,
        tupleize_cols=True,
        chunk_size=None,
        gpu=None,
        sparse=None,
        names=None,
        num_partitions=None,
        store_data=False,
    ):
        # all tileable-based branches rebalance afterwards when asked to;
        # only the local-data branch switches this off because it translates
        # num_partitions into a chunk size instead
        need_repart = num_partitions is not None

        if isinstance(data, INDEX_TYPE):
            # objects without a ``data`` attribute are IndexData and need
            # wrapping first
            index = data if hasattr(data, "data") else _Index(data)
        elif isinstance(data, ENTITY_TYPE):
            if name is None:
                name = getattr(data, "name", None)
            index = from_tileable_index(data, dtype=dtype, name=name, names=names)
        else:
            if is_index(data):
                # presumably already a pandas / cudf index: use it directly
                pd_index = data
            else:
                if name is None:
                    name = getattr(data, "name", None)
                xdf = cudf if is_cudf(data) or is_cupy(data) else pd
                try:
                    pd_index = xdf.Index(
                        data=data,
                        dtype=dtype,
                        copy=copy,
                        name=name,
                        tupleize_cols=tupleize_cols,
                    )
                except TypeError:  # pragma: no cover
                    # retry without ``tupleize_cols``
                    pd_index = xdf.Index(
                        data=data, dtype=dtype, copy=copy, name=name
                    )
            if num_partitions is not None:
                chunk_size = ceildiv(len(pd_index), num_partitions)
            index = from_pandas_index(
                pd_index,
                chunk_size=chunk_size,
                gpu=gpu,
                sparse=sparse,
                store_data=store_data,
            )
            need_repart = False

        if need_repart:
            index = index.rebalance(num_partitions=num_partitions)
        super().__init__(index.data)
# Maps each supported pandas type to the initializer class that wraps it.
# ``read_pandas`` iterates over this mapping in order and uses the first
# entry whose pandas type matches the input.
_pd_type_mapping = {
pd.DataFrame: DataFrame,
pd.Series: Series,
pd.Index: Index,
}
[docs]
def read_pandas(
    data: Union[pd.DataFrame, pd.Series, pd.Index], **kwargs
) -> Union[DataFrame, Series, Index]:
    """
    Wrap a pandas object into the matching MaxFrame object.

    Parameters
    ----------
    data : Union[pd.DataFrame, pd.Series, pd.Index]
        pandas object to convert.
    **kwargs
        Extra keyword arguments forwarded to the selected initializer.

    Returns
    -------
    Union[DataFrame, Series, Index]
        MaxFrame object built from ``data``.

    Raises
    ------
    ValueError
        If ``data`` is not an instance of any type registered in
        ``_pd_type_mapping``.
    """
    # pick the initializer of the first registered pandas type that matches
    initializer = next(
        (
            wrapper
            for pandas_type, wrapper in _pd_type_mapping.items()
            if isinstance(data, pandas_type)
        ),
        None,
    )
    if initializer is None:
        raise ValueError(f"Type {type(data)} not supported")
    return initializer(data, **kwargs)