Source code for maxframe.dataframe.datasource.read_json

# Copyright 1999-2026 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import MutableMapping, Union
from urllib.parse import urlparse

import numpy as np
import pandas as pd

from maxframe.protocol import DefaultIndexType

try:
    from pyarrow import NativeFile
except ImportError:  # pragma: no cover
    NativeFile = None

from maxframe import opcodes
from maxframe.config import options
from maxframe.core import OutputType
from maxframe.dataframe.datasource.core import (
    ColumnPruneSupportedDataSourceMixin,
    DtypeBackendCompatibleMixin,
    LakeDataSource,
)
from maxframe.dataframe.datasource.utils import get_lake_output_info, iter_local_files
from maxframe.dataframe.utils import (
    parse_index,
    to_arrow_dtypes,
    validate_default_index_type,
    validate_dtype_backend,
)
from maxframe.serialization.serializables import (
    AnyField,
    BoolField,
    DictField,
    Int32Field,
    Int64Field,
    StringField,
)
from maxframe.utils import lazy_import, no_default

cudf = lazy_import("cudf")


class DataFrameReadJSON(
    LakeDataSource,
    ColumnPruneSupportedDataSourceMixin,
    DtypeBackendCompatibleMixin,
):
    """Data source operator that describes reading JSON data.

    Instances are built by ``read_json`` in this module, which fills the
    fields below from its keyword arguments. Calling an instance creates
    the tileable that represents the (not yet materialized) result.
    """

    _op_type_ = opcodes.READ_JSON

    # Options mirroring the keyword arguments that ``read_json`` forwards.
    orient = StringField("orient")
    typ = StringField("typ")
    dtype = AnyField("dtype")
    convert_axes = BoolField("convert_axes")
    lines = BoolField("lines")
    chunksize = Int64Field("chunksize")
    compression = StringField("compression")
    index_col = Int32Field("index_col")
    # Column selection; also rewritten by ``set_pruned_columns``.
    usecols = AnyField("usecols")
    keep_usecols_order = BoolField("keep_usecols_order", default=None)
    chunk_bytes = StringField("chunk_bytes", default=None)
    # Extra keyword arguments collected by ``read_json`` via ``**kwargs``.
    read_kwargs = DictField("read_kwargs", default=None)
    head_bytes = StringField("head_bytes", default=None)
    head_lines = Int64Field("head_lines", default=None)

    def __init__(self, output_type=None, **kwargs):
        """Create the operator, optionally fixing its single output type.

        Parameters
        ----------
        output_type : OutputType, optional
            When given, stored as the one-element ``_output_types`` list.
        **kwargs
            Field values passed on to the parent constructor.
        """
        output_override = (
            {} if output_type is None else {"_output_types": [output_type]}
        )
        super().__init__(**{**kwargs, **output_override})

    def get_columns(self):
        """Return the current column selection (``usecols``)."""
        return self.usecols

    def set_pruned_columns(self, columns, *, keep_order=None):
        """Replace the column selection with a pruned one.

        Parameters
        ----------
        columns
            New value for ``usecols``.
        keep_order : bool, optional
            New value for ``keep_usecols_order``.
        """
        self.keep_usecols_order = keep_order
        self.usecols = columns

    def __call__(self, chunk_bytes=None, **kwargs):
        """Build the output tileable for this operator.

        When ``read_stage`` is set, a scalar object-dtype tileable is
        returned (output for planning or meta fetching). Otherwise the
        result has an unknown row count; for DataFrame output the column
        count is taken from ``kwargs["dtypes"]``.
        """
        if self.read_stage is None:
            produces_frame = self.output_types[0] == OutputType.dataframe
            if produces_frame:
                shape = (np.nan, len(kwargs["dtypes"]))
            else:
                shape = (np.nan,)
            return self.new_tileable(
                None, shape=shape, chunk_bytes=chunk_bytes, **kwargs
            )

        # output for planning or meta fetching
        self._output_types = [OutputType.scalar]
        return self.new_tileable(None, shape=(), dtype=np.dtype("O"))

    @classmethod
    def estimate_size(
        cls, ctx: MutableMapping[str, Union[int, float]], op: "DataFrameReadJSON"
    ):  # pragma: no cover
        """Record an unbounded size estimate for the first output of ``op``."""
        # todo implement this to facilitate local computation
        output_key = op.outputs[0].key
        ctx[output_key] = float("inf")


def read_json(
    path,
    *,
    orient=None,
    typ="frame",
    dtype=None,
    convert_axes=None,
    lines=False,
    chunksize=None,
    compression="infer",
    index_col=None,
    usecols=None,
    chunk_bytes="64M",
    gpu=None,
    head_bytes="100k",
    head_lines=None,
    default_index_type: Union[DefaultIndexType, str] = None,
    use_nullable_dtypes: bool = no_default,
    dtype_backend: str = no_default,
    storage_options: dict = None,
    memory_scale: int = None,
    merge_small_files: bool = True,
    merge_small_file_options: dict = None,
    session=None,
    run_kwargs: dict = None,
    **kwargs,
):
    r"""
    Read a JSON file into a DataFrame.

    Parameters
    ----------
    path : str, path object, or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: file://localhost/path/to/table.json,
        you can also read from external resources using a URL like:
        hdfs://localhost:8020/test.json.
        If you want to pass in a path object, pandas accepts any ``os.PathLike``.
        By file-like object, we refer to objects with a ``read()`` method, such as
        a file handler (e.g. via builtin ``open`` function) or ``StringIO``.
    orient : str, optional
        Indication of expected JSON string format.
        Compatible JSON strings can be produced by ``to_json()`` with a
        corresponding orient value.
        The set of possible orients is:

        - ``'split'`` : dict like
          ``{'index' -> [index], 'columns' -> [columns], 'data' -> [values]}``
        - ``'records'`` : list like
          ``[{column -> value}, ... , {column -> value}]``
        - ``'index'`` : dict like ``{index -> {column -> value}}``
        - ``'columns'`` : dict like ``{column -> {index -> value}}``
        - ``'values'`` : just the values array

        The allowed and default values depend on the value
        of the `typ` parameter.

        * when ``typ == 'series'``,

          - allowed orients are ``{'split','records','index'}``
          - default is ``'index'``
          - The Series index must be unique for orient ``'index'``.

        * when ``typ == 'frame'``,

          - allowed orients are ``{'split','records','index',
            'columns','values'}``
          - default is ``'columns'``
          - The DataFrame index must be unique for orients ``'index'`` and
            ``'columns'``.
          - The DataFrame columns must be unique for orients ``'index'``,
            ``'columns'``, and ``'records'``.
    typ : {'frame', 'series'}, default 'frame'
        The type of object to recover.
    dtype : bool or dict, default None
        If True, infer dtypes; if a dict of column to dtype, then use those;
        if False, then don't infer dtypes at all, applies only to the data.
    convert_axes : bool, default None
        Try to convert the axes to the proper dtypes.
    convert_dates : bool or list of str, default True
        List of columns to parse for dates. If True, then try to parse
        datelike columns. A column label is datelike if

        * it ends with ``'_at'``,
        * it ends with ``'_time'``,
        * it begins with ``'date'``, or
        * it is ``'datetime'``, ``'timestamp'``, ``'modified'``, or ``'created'``.
    keep_default_dates : bool, default True
        If parsing dates, then parse the default datelike columns.
    precise_float : bool, default False
        Set to enable usage of higher precision (strtod) function when
        decoding string to double values. Default (False) is to use fast but
        less precise builtin functionality.
    date_unit : str, default None
        The timestamp unit to detect if converting dates. The default behaviour
        is to try and detect the correct precision, but if this is not desired
        then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds,
        milliseconds, microseconds or nanoseconds respectively.
    encoding : str, default is 'utf-8'
        The encoding to use to decode py3 bytes.
    lines : bool, default False
        Read the file as a json object per line.
    chunksize : int, optional
        Return JsonReader object for iteration.
        See the `IO Tools docs
        <https://pandas.pydata.org/pandas-docs/stable/io.html#io-jsonl>`_
        for more information on ``chunksize``.
        This can only be passed if `lines=True`.
        If this is None, the file will be read into memory all at once.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
        For on-the-fly decompression of on-disk data. If 'infer' and
        `path` is path-like, then detect compression from the following
        extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
        decompression). If using 'zip', the ZIP file must contain only one
        data file to be read in. Set to None for no decompression.
    index_col : int, str, sequence of int / str, or False, default ``None``
        Column(s) to use as the row labels of the ``DataFrame``, either given as
        string name or column index. If a sequence of int / str is given, a
        MultiIndex is used.
        Note: ``index_col=False`` can be used to force pandas to *not* use the first
        column as the index, e.g. when you have a malformed file with delimiters at
        the end of each line.
    usecols : list-like or callable, optional
        Return a subset of the columns. If list-like, all elements must either
        be positional (i.e. integer indices into the document columns) or strings
        that correspond to column names provided either by the user in `names` or
        inferred from the document header row(s). For example, a valid list-like
        `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
        Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
        To instantiate a DataFrame from ``data`` with element order preserved use
        ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
        in ``['foo', 'bar']`` order or
        ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
        for ``['bar', 'foo']`` order.
        If callable, the callable function will be evaluated against the column
        names, returning names where the callable function evaluates to True. An
        example of a valid callable argument would be ``lambda x: x.upper() in
        ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
        parsing time and lower memory usage.
    chunk_bytes : int, float or str, optional
        Number of chunk bytes. Falls back to ``options.chunk_store_limit``
        when a falsy value is given.
    gpu : bool, optional
        If read into cudf DataFrame.
    head_bytes : int, float or str, optional
        Number of bytes to use in the head of file, mainly for data inference.
    head_lines : int, optional
        Number of lines to use in the head of file, mainly for data inference.
    default_index_type : {None, 'range', 'incremental'}, default None
        If index_col not specified, specify type of index to generate.
        If not specified, `options.dataframe.default_index_type` will be used.
    use_nullable_dtypes : bool, optional
        Forwarded to the reader. For local reads it is only passed to
        ``pandas.read_json`` when `dtype_backend` is not given.
    dtype_backend : {'numpy', 'pyarrow'}, default 'numpy'
        Back-end data type applied to the resultant DataFrame
        (still experimental).
    storage_options : dict, optional
        Options for storage connection.
    memory_scale : int, optional
        Passed through unchanged to the read operator.
    merge_small_files : bool, default True
        Merge small files whose size is small.
    merge_small_file_options : dict
        Options for merging small files
    session : optional
        Session forwarded to the metadata inference step for remote paths.
    run_kwargs : dict, optional
        Extra run arguments forwarded to the metadata inference step for
        remote paths.

    Returns
    -------
    DataFrame or Series
        A JSON file is returned as two-dimensional data structure with labeled axes.

    Raises
    ------
    ValueError
        If `path` is resolved locally and no file is found to read.

    See Also
    --------
    to_json : Convert DataFrame to JSON string.
    json_normalize : Normalize semi-structured JSON data into a flat table.

    Notes
    -----
    When `path` has no URL scheme or uses the ``file`` scheme, the data is
    read eagerly with ``pandas.read_json``. In that case only `orient`,
    `typ`, `dtype`, `convert_axes`, `lines`, `chunksize`, `compression` and
    one of `dtype_backend` / `use_nullable_dtypes` are passed to pandas;
    `index_col`, `usecols` and additional keyword arguments are not applied.

    Examples
    --------
    >>> import maxframe.dataframe as md
    >>> md.read_json('data.json')  # doctest: +SKIP
    >>> # read from HDFS
    >>> md.read_json('hdfs://localhost:8020/test.json')  # doctest: +SKIP
    >>> # read from OSS
    >>> md.read_json(  # doctest: +SKIP
    ...     'oss://oss-cn-hangzhou.aliyuncs.com/bucket/test.json',
    ...     storage_options={'role_arn': 'acs:ram::xxxxxx:role/aliyunodpsdefaultrole'},
    ... )
    """
    from maxframe.dataframe.datasource.dataframe import from_pandas as from_pandas_df
    from maxframe.dataframe.datasource.series import from_pandas as from_pandas_series

    default_index_type = validate_default_index_type(default_index_type, **kwargs)
    local_test_mode = kwargs.pop("_local_test_mode", False)

    single_path = path[0] if isinstance(path, list) else path
    parsed_path = urlparse(single_path)
    if not local_test_mode and (
        not parsed_path.scheme or parsed_path.scheme.lower() == "file"
    ):
        # just read locally when path is not remote

        # Built once, outside the loop. Only one of the two options is
        # forwarded to pandas; dtype_backend wins when both are supplied.
        pandas_kw = {}
        if dtype_backend is not no_default:
            pandas_kw["dtype_backend"] = dtype_backend
        elif use_nullable_dtypes is not no_default:
            pandas_kw["use_nullable_dtypes"] = use_nullable_dtypes

        local_dfs = []
        for local_path, part_keys in iter_local_files(path):
            sub_df = pd.read_json(
                local_path,
                orient=orient,
                typ=typ,
                dtype=dtype,
                convert_axes=convert_axes,
                lines=lines,
                chunksize=chunksize,
                compression=compression,
                **pandas_kw,
            )
            # Attach the key/value pairs yielded alongside each file, if any.
            for k, v in part_keys or ():
                sub_df[k] = v
            local_dfs.append(sub_df)

        if not local_dfs:
            # Fail with a clear message instead of an IndexError below.
            raise ValueError(f"No JSON files found to read for path {path!r}")
        df = pd.concat(local_dfs) if len(local_dfs) > 1 else local_dfs[0]
        return from_pandas_df(df) if typ == "frame" else from_pandas_series(df)

    # Options shared by metadata inference and the read operator itself.
    common_kwargs = dict(
        orient=orient,
        typ=typ,
        convert_axes=convert_axes,
        lines=lines,
        chunksize=chunksize,
        compression=compression,
        index_col=index_col,
        usecols=usecols,
        use_nullable_dtypes=use_nullable_dtypes,
        dtype_backend=dtype_backend,
        storage_options=storage_options,
        read_kwargs=kwargs,
    )

    # Get dtypes, index_dtypes and index_value using the common utility function
    result = get_lake_output_info(
        DataFrameReadJSON,
        path=path,
        dtype=dtype,
        head_bytes=head_bytes,
        head_lines=head_lines,
        default_index_type=default_index_type,
        session=session,
        run_kwargs=run_kwargs,
        **common_kwargs,
    )
    dtypes = result.dtypes
    index_dtypes = result.index_dtypes
    index_value = result.index_value
    is_partitioned = result.is_partitioned
    output_type = result.output_type

    # Handle series case
    ser_dtype = name = None
    if output_type == OutputType.series:
        ser_dtype = dtypes.iloc[0] if len(dtypes) > 0 else None
        name = dtypes.index[0] if len(dtypes.index) > 0 else None

    # For JSON, we need to combine index_dtypes with dtypes for the full_dtypes
    full_dtypes = (
        pd.concat([index_dtypes, dtypes]) if index_dtypes is not None else dtypes
    )
    # No default index is generated when index dtypes were inferred.
    default_index_type = None if index_dtypes is not None else default_index_type

    if output_type == OutputType.series:
        columns_value = None
    else:
        columns_value = parse_index(dtypes.index, store_data=True)

    chunk_bytes = chunk_bytes or options.chunk_store_limit
    op = DataFrameReadJSON(
        path=path,
        dtype=full_dtypes,
        gpu=gpu,
        default_index_type=default_index_type,
        is_partitioned=is_partitioned,
        memory_scale=memory_scale,
        merge_small_files=merge_small_files,
        merge_small_file_options=merge_small_file_options,
        chunk_bytes=chunk_bytes,
        output_type=output_type,
        **common_kwargs,
    )
    dtype_backend = validate_dtype_backend(
        dtype_backend or options.dataframe.dtype_backend
    )
    # Arrow dtypes are only substituted for non-GPU reads.
    if not gpu and dtype_backend == "pyarrow":
        dtypes = to_arrow_dtypes(dtypes)
    if output_type == OutputType.dataframe:
        ret = op(index_value=index_value, columns_value=columns_value, dtypes=dtypes)
    else:
        ret = op(index_value=index_value, name=name, dtype=ser_dtype)
    return ret