Source code for maxframe.dataframe.datasource.read_json

# Copyright 1999-2026 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import MutableMapping, Union
from urllib.parse import urlparse

import numpy as np
import pandas as pd

from maxframe.protocol import DefaultIndexType

try:
    from pyarrow import NativeFile
except ImportError:  # pragma: no cover
    NativeFile = None

from maxframe import opcodes
from maxframe.config import options
from maxframe.core import OutputType
from maxframe.dataframe.datasource.core import (
    ColumnPruneSupportedDataSourceMixin,
    DtypeBackendCompatibleMixin,
    LakeDataSource,
)
from maxframe.dataframe.datasource.utils import get_lake_output_info, iter_local_files
from maxframe.dataframe.utils import (
    parse_index,
    to_arrow_dtypes,
    validate_default_index_type,
    validate_dtype_backend,
)
from maxframe.serialization.serializables import (
    AnyField,
    BoolField,
    DictField,
    Int32Field,
    Int64Field,
    StringField,
)
from maxframe.utils import lazy_import, no_default

# Handle for the optional ``cudf`` package obtained through ``lazy_import``.
# NOTE(review): presumably this defers the real import until first attribute
# access so that cudf is not required at module import time -- confirm against
# ``maxframe.utils.lazy_import``. The name is not referenced in the visible
# part of this module.
cudf = lazy_import("cudf")


class DataFrameReadJSON(
    LakeDataSource,
    ColumnPruneSupportedDataSourceMixin,
    DtypeBackendCompatibleMixin,
):
    """
    Data source operator that describes reading JSON data.

    Instances are built by ``read_json`` for paths that are not read locally;
    the reader options given to ``read_json`` are stored in the fields below.
    """

    _op_type_ = opcodes.READ_JSON

    # reader options mirroring the keyword arguments of ``read_json``
    orient = StringField("orient")
    typ = StringField("typ")
    dtype = AnyField("dtype")
    convert_axes = BoolField("convert_axes")
    lines = BoolField("lines")
    chunksize = Int64Field("chunksize")
    compression = StringField("compression")
    index_col = Int32Field("index_col")
    # column selection, also updated through ``set_pruned_columns``
    usecols = AnyField("usecols")
    keep_usecols_order = BoolField("keep_usecols_order", default=None)
    chunk_bytes = StringField("chunk_bytes", default=None)
    # extra keyword arguments collected by ``read_json``
    read_kwargs = DictField("read_kwargs", default=None)
    head_bytes = StringField("head_bytes", default=None)
    head_lines = Int64Field("head_lines", default=None)

    def __init__(self, output_type=None, **kwargs):
        """Create the operator, recording ``output_type`` as its only output type."""
        if output_type is not None:
            kwargs.update(_output_types=[output_type])
        super().__init__(**kwargs)

    def get_columns(self):
        """Return the currently selected columns (``usecols``)."""
        return self.usecols

    def set_pruned_columns(self, columns, *, keep_order=None):
        """Replace the selected columns and remember the ``keep_order`` flag."""
        self.keep_usecols_order = keep_order
        self.usecols = columns

    def __call__(self, chunk_bytes=None, **kwargs):
        """
        Create the output tileable of this operator.

        When ``read_stage`` is set, a scalar object-typed tileable is produced
        instead of a DataFrame / Series. Otherwise the result has an unknown
        (NaN) number of rows; a DataFrame output takes its column count from
        ``kwargs["dtypes"]``.
        """
        if self.read_stage is not None:
            # output for planning or meta fetching
            self._output_types = [OutputType.scalar]
            return self.new_tileable(None, shape=(), dtype=np.dtype("O"))

        if self.output_types[0] == OutputType.dataframe:
            out_shape = (np.nan, len(kwargs["dtypes"]))
        else:
            out_shape = (np.nan,)
        return self.new_tileable(
            None, shape=out_shape, chunk_bytes=chunk_bytes, **kwargs
        )

    @classmethod
    def estimate_size(
        cls, ctx: MutableMapping[str, Union[int, float]], op: "DataFrameReadJSON"
    ):  # pragma: no cover
        """Record an infinite size estimate for the first output of ``op``."""
        # todo implement this to facilitate local computation
        output_key = op.outputs[0].key
        ctx[output_key] = float("inf")



[docs]
def read_json(
    path,
    *,
    orient=None,
    typ="frame",
    dtype=None,
    convert_axes=None,
    lines=False,
    chunksize=None,
    compression="infer",
    index_col=None,
    usecols=None,
    chunk_bytes="64M",
    gpu=None,
    head_bytes="100k",
    head_lines=None,
    default_index_type: Union[DefaultIndexType, str] = None,
    use_nullable_dtypes: bool = no_default,
    dtype_backend: str = no_default,
    storage_options: dict = None,
    memory_scale: int = None,
    merge_small_files: bool = True,
    merge_small_file_options: dict = None,
    session=None,
    run_kwargs: dict = None,
    **kwargs,
):
    r"""
    Read a JSON file into a DataFrame.

    Parameters
    ----------
    path : str, path object, or list of them
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: file://localhost/path/to/table.json,
        you can also read from external resources using a URL like:
        hdfs://localhost:8020/test.json.
        ``os.PathLike`` objects are converted with ``os.fspath`` before the
        URL scheme is inspected. When a list is given, the scheme of its first
        element decides whether data is read locally.
        File-like objects are not supported, because the path is parsed as a
        URL to decide where to read from.
    orient : str, optional
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like ``{'index' -> [index], 'columns' -> [columns], 'data' -> [values]}``
        - ``'records'`` : list like ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index','columns','values'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``, ``'columns'``,
            and ``'records'``.
    typ : {'frame', 'series'}, default 'frame'
        The type of object to recover.
    dtype : bool or dict, default None
        If True, infer dtypes; if a dict of column to dtype, then use those;
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : bool, default None
        Try to convert the axes to the proper dtypes.
    convert_dates : bool or list of str, default True
        Accepted through ``**kwargs``.
        List of columns to parse for dates. If True, then try to parse datelike columns.
        A column label is datelike if

        * it ends with ``'_at'``,
        * it ends with ``'_time'``,
        * it begins with ``'date'``, or
        * it is ``'datetime'``, ``'timestamp'``, ``'modified'``, or ``'created'``.
    keep_default_dates : bool, default True
        Accepted through ``**kwargs``.
        If parsing dates, then parse the default datelike columns.
    precise_float : bool, default False
        Accepted through ``**kwargs``.
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality.
    date_unit : str, default None
        Accepted through ``**kwargs``.
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.
    encoding : str, default is 'utf-8'
        Accepted through ``**kwargs``.
        The encoding to use to decode py3 bytes.
    lines : bool, default False
        Read the file as a json object per line.
    chunksize : int, optional
        Return JsonReader object for iteration.
        See the `IO Tools docs
        <https://pandas.pydata.org/pandas-docs/stable/io.html#io-jsonl>`_
        for more information on ``chunksize``.
        This can only be passed if `lines=True`.
        If this is None, the file will be read into memory all at once.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer' and
        `path` is path-like, then detect compression from the
        following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
        decompression). If using 'zip', the ZIP file must contain only one data
        file to be read in. Set to None for no decompression.
    index_col : int, str, sequence of int / str, or False, default ``None``
        Column(s) to use as the row labels of the ``DataFrame``, either given as
        string name or column index. If a sequence of int / str is given, a
        MultiIndex is used.
    usecols : list-like or callable, optional
        Return a subset of the columns. If list-like, all elements must either
        be positional (i.e. integer indices into the document columns) or strings
        that correspond to column names. For example, a valid list-like
        `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
        Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
        If callable, the callable function will be evaluated against the column
        names, returning names where the callable function evaluates to True. An
        example of a valid callable argument would be ``lambda x: x.upper() in
        ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
        parsing time and lower memory usage.
    chunk_bytes : int, float or str, optional
        Number of chunk bytes. When a falsy value is passed,
        ``options.chunk_store_limit`` is used instead.
    gpu : bool, default None
        If read into cudf DataFrame.
    head_bytes : int, float or str, optional
        Number of bytes to use in the head of file, mainly for data inference.
    head_lines : int, optional
        Number of lines to use in the head of file, mainly for data inference.
    default_index_type : {None, 'range', 'incremental'}, default None
        If index_col not specified, specify type of index to generate.
        If not specified, `options.dataframe.default_index_type` will be used.
    use_nullable_dtypes : bool, optional
        Forwarded to the reader. For local reads it is only passed on when
        ``dtype_backend`` is not given.
    dtype_backend : {'numpy', 'pyarrow'}, default 'numpy'
        Back-end data type applied to the resultant DataFrame (still experimental).
    storage_options : dict, optional
        Options for storage connection.
    memory_scale : int, optional
        Passed to the read operator.
    merge_small_files : bool, default True
        Whether to merge small files.
    merge_small_file_options : dict, optional
        Options for merging small files.
    session : optional
        Passed to the metadata inference helper used for non-local paths.
    run_kwargs : dict, optional
        Passed to the metadata inference helper used for non-local paths.
    **kwargs
        Additional reader arguments, stored as ``read_kwargs`` of the read
        operator.

    Returns
    -------
    DataFrame or Series
        A JSON file is returned as two-dimensional data structure with labeled axes.

    Notes
    -----
    When ``path`` has no URL scheme or uses the ``file`` scheme, the data is
    read directly with ``pandas.read_json`` and wrapped afterwards. In that
    case only ``orient``, ``typ``, ``dtype``, ``convert_axes``, ``lines``,
    ``chunksize``, ``compression`` and ``dtype_backend`` /
    ``use_nullable_dtypes`` are forwarded; ``index_col``, ``usecols`` and
    extra keyword arguments are not applied.

    See Also
    --------
    to_json : Convert DataFrame to JSON string.
    json_normalize : Normalize semi-structured JSON data into a flat table.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> md.read_json('data.json')  # doctest: +SKIP
    >>> # read from HDFS
    >>> md.read_json('hdfs://localhost:8020/test.json')  # doctest: +SKIP
    >>> # read from OSS
    >>> md.read_json('oss://oss-cn-hangzhou.aliyuncs.com/bucket/test.json',
    ...              storage_options={'role_arn': 'acs:ram::xxxxxx:role/aliyunodpsdefaultrole'})  # doctest: +SKIP
    """
    import os

    from maxframe.dataframe.datasource.dataframe import from_pandas as from_pandas_df
    from maxframe.dataframe.datasource.series import from_pandas as from_pandas_series

    def _as_fs_path(item):
        # urlparse only understands str / bytes, so path objects are converted
        # up front; every other value is passed through untouched.
        return os.fspath(item) if isinstance(item, os.PathLike) else item

    if isinstance(path, list):
        path = [_as_fs_path(item) for item in path]
    else:
        path = _as_fs_path(path)

    default_index_type = validate_default_index_type(default_index_type, **kwargs)
    local_test_mode = kwargs.pop("_local_test_mode", False)
    # for a list of paths, the first entry decides where data is read from
    single_path = path[0] if isinstance(path, list) else path
    parsed_path = urlparse(single_path)
    is_local = not parsed_path.scheme or parsed_path.scheme.lower() == "file"
    if not local_test_mode and is_local:
        # just read locally when path is not remote

        # dtype_backend takes precedence over use_nullable_dtypes when both
        # are supplied; at most one of them is handed to pandas.
        if dtype_backend is not no_default:
            pandas_kw = {"dtype_backend": dtype_backend}
        elif use_nullable_dtypes is not no_default:
            pandas_kw = {"use_nullable_dtypes": use_nullable_dtypes}
        else:
            pandas_kw = {}

        local_dfs = []
        for local_path, part_keys in iter_local_files(path):
            sub_df = pd.read_json(
                local_path,
                orient=orient,
                typ=typ,
                dtype=dtype,
                convert_axes=convert_axes,
                lines=lines,
                chunksize=chunksize,
                compression=compression,
                **pandas_kw,
            )
            # attach the key / value pairs reported alongside each file
            for k, v in part_keys or ():
                sub_df[k] = v
            local_dfs.append(sub_df)
        df = pd.concat(local_dfs) if len(local_dfs) > 1 else local_dfs[0]
        return from_pandas_df(df) if typ == "frame" else from_pandas_series(df)

    common_kwargs = dict(
        orient=orient,
        typ=typ,
        convert_axes=convert_axes,
        lines=lines,
        chunksize=chunksize,
        compression=compression,
        index_col=index_col,
        usecols=usecols,
        use_nullable_dtypes=use_nullable_dtypes,
        dtype_backend=dtype_backend,
        storage_options=storage_options,
        read_kwargs=kwargs,
    )
    # Get dtypes, index_dtypes and index_value using the common utility function
    result = get_lake_output_info(
        DataFrameReadJSON,
        path=path,
        dtype=dtype,
        head_bytes=head_bytes,
        head_lines=head_lines,
        default_index_type=default_index_type,
        session=session,
        run_kwargs=run_kwargs,
        **common_kwargs,
    )

    dtypes = result.dtypes
    index_dtypes = result.index_dtypes
    index_value = result.index_value
    is_partitioned = result.is_partitioned
    output_type = result.output_type

    # Handle series case
    ser_dtype = name = None
    if output_type == OutputType.series:
        ser_dtype = dtypes.iloc[0] if len(dtypes) > 0 else None
        name = dtypes.index[0] if len(dtypes.index) > 0 else None

    # For JSON, we need to combine index_dtypes with dtypes for the full_dtypes
    full_dtypes = (
        pd.concat([index_dtypes, dtypes]) if index_dtypes is not None else dtypes
    )
    # a generated default index is only relevant when no index columns exist
    default_index_type = None if index_dtypes is not None else default_index_type

    if output_type == OutputType.series:
        columns_value = None
    else:
        columns_value = parse_index(dtypes.index, store_data=True)

    chunk_bytes = chunk_bytes or options.chunk_store_limit
    op = DataFrameReadJSON(
        path=path,
        dtype=full_dtypes,
        gpu=gpu,
        default_index_type=default_index_type,
        is_partitioned=is_partitioned,
        memory_scale=memory_scale,
        merge_small_files=merge_small_files,
        merge_small_file_options=merge_small_file_options,
        chunk_bytes=chunk_bytes,
        output_type=output_type,
        **common_kwargs,
    )
    dtype_backend = validate_dtype_backend(
        dtype_backend or options.dataframe.dtype_backend
    )

    # output dtypes are only converted to arrow dtypes for non-GPU reads
    if not gpu and dtype_backend == "pyarrow":
        dtypes = to_arrow_dtypes(dtypes)
    if output_type == OutputType.dataframe:
        ret = op(index_value=index_value, columns_value=columns_value, dtypes=dtypes)
    else:
        ret = op(index_value=index_value, name=name, dtype=ser_dtype)
    return ret