Source code for maxframe.dataframe.datasource.date_range

# Copyright 1999-2026 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings
from datetime import date, datetime, time
from typing import MutableMapping, Union

import numpy as np
import pandas as pd
from pandas import NaT, Timestamp
from pandas._libs.tslibs import timezones
from pandas.tseries.frequencies import to_offset
from pandas.tseries.offsets import Tick

from maxframe import opcodes
from maxframe.core import OutputType
from maxframe.dataframe.operators import DataFrameOperator, DataFrameOperatorMixin
from maxframe.dataframe.utils import parse_index
from maxframe.serialization.serializables import (
    AnyField,
    BoolField,
    Int64Field,
    StringField,
)
from maxframe.utils import no_default, pd_release_version

try:
    from pandas._libs.tslib import normalize_date
except ImportError:  # pragma: no cover

    def normalize_date(dt):  # from pandas/_libs/tslibs/conversion.pyx
        if isinstance(dt, datetime):
            if isinstance(dt, pd.Timestamp):
                return dt.replace(
                    hour=0, minute=0, second=0, microsecond=0, nanosecond=0
                )
            else:
                return dt.replace(hour=0, minute=0, second=0, microsecond=0)
        elif isinstance(dt, date):
            return datetime(dt.year, dt.month, dt.day)
        else:
            raise TypeError(f"Unrecognized type: {type(dt)}")


_date_range_use_inclusive = pd_release_version[:2] >= (1, 4)


# adapted from pandas.core.arrays.datetimes.generate_range
def generate_range_count(
    start=None, end=None, periods=None, offset=None
):  # pragma: no cover
    offset = to_offset(offset)

    start = Timestamp(start)
    start = start if start is not NaT else None
    end = Timestamp(end)
    end = end if end is not NaT else None

    if start and not offset.is_on_offset(start):
        start = offset.rollforward(start)

    elif end and not offset.is_on_offset(end):
        end = offset.rollback(end)

    if periods is None and end < start and offset.n >= 0:
        end = None
        periods = 0

    if end is None:
        end = start + (periods - 1) * offset

    if start is None:
        start = end - (periods - 1) * offset

    cur = start
    count = 0
    if offset.n >= 0:
        while cur <= end:
            count += 1

            if cur == end:
                # GH#24252 avoid overflows by not performing the addition
                # in offset.apply unless we have to
                break

            # faster than cur + offset
            try:
                next_date = offset._apply(cur)
            except AttributeError:
                next_date = cur + offset
            if next_date <= cur:
                raise ValueError(f"Offset {offset} did not increment date")
            cur = next_date
    else:
        while cur >= end:
            count += 1

            if cur == end:
                # GH#24252 avoid overflows by not performing the addition
                # in offset.apply unless we have to
                break

            # faster than cur + offset
            try:
                next_date = offset._apply(cur)
            except AttributeError:
                next_date = cur + offset
            if next_date >= cur:
                raise ValueError(f"Offset {offset} did not decrement date")
            cur = next_date
    return count


class DataFrameDateRange(DataFrameOperator, DataFrameOperatorMixin):
    _op_type_ = opcodes.DATE_RANGE

    start = AnyField("start")
    end = AnyField("end")
    periods = Int64Field("periods")
    freq = AnyField("freq")
    tz = AnyField("tz")
    normalize = BoolField("normalize")
    name = StringField("name")
    inclusive = StringField("inclusive")

    def __init__(
        self,
        output_types=None,
        **kw,
    ):
        super().__init__(_output_types=output_types, **kw)
        if self.output_types is None:
            self.output_types = [OutputType.index]
        if getattr(self, "inclusive", None) is None:
            self.inclusive = "both"

    def __call__(self, shape, chunk_size=None):
        dtype = pd.Index([self.start]).dtype
        index_value = parse_index(
            pd.Index([], dtype=dtype), self.start, self.end, self.periods, self.tz
        )
        # gen index value info
        index_value.value._min_val = self.start
        index_value.value._min_val_close = True
        index_value.value._max_val = self.end
        index_value.value._max_val_close = True
        index_value.value._is_unique = True
        index_value.value._is_monotonic_increasing = True
        index_value.value._freq = self.freq
        return self.new_index(
            None,
            shape=shape,
            dtype=dtype,
            index_value=index_value,
            name=self.name,
            raw_chunk_size=chunk_size,
            freq=self.freq,
        )

    @classmethod
    def estimate_size(
        cls, ctx: MutableMapping[str, Union[int, float]], op: "DataFrameDateRange"
    ):  # pragma: no cover
        # todo implement this to facilitate local computation
        ctx[op.outputs[0].key] = float("inf")


_midnight = time(0, 0)


def _maybe_normalize_endpoints(start, end, normalize):  # pragma: no cover
    _normalized = True

    if start is not None:
        if normalize:
            start = normalize_date(start)
            _normalized = True
        else:
            _normalized = _normalized and start.time() == _midnight

    if end is not None:
        if normalize:
            end = normalize_date(end)
            _normalized = True
        else:
            _normalized = _normalized and end.time() == _midnight

    return start, end, _normalized


def _infer_tz_from_endpoints(start, end, tz):  # pragma: no cover
    """
    If a timezone is not explicitly given via `tz`, see if one can
    be inferred from the `start` and `end` endpoints.  If more than one
    of these inputs provides a timezone, require that they all agree.

    Parameters
    ----------
    start : Timestamp
    end : Timestamp
    tz : tzinfo or None

    Returns
    -------
    tz : tzinfo or None

    Raises
    ------
    TypeError : if start and end timezones do not agree
    """
    try:
        inferred_tz = timezones.infer_tzinfo(start, end)
    except AssertionError:
        # infer_tzinfo raises AssertionError if passed mismatched timezones
        raise TypeError(
            "Start and end cannot both be tz-aware with different timezones"
        )

    inferred_tz = timezones.maybe_get_tz(inferred_tz)
    tz = timezones.maybe_get_tz(tz)

    if tz is not None and inferred_tz is not None:
        if not timezones.tz_compare(inferred_tz, tz):
            raise AssertionError("Inferred time zone not equal to passed time zone")

    elif inferred_tz is not None:
        tz = inferred_tz

    return tz


def _maybe_localize_point(
    ts, is_none, is_not_none, freq, tz, ambiguous, nonexistent
):  # pragma: no cover
    """
    Localize a start or end Timestamp to the timezone of the corresponding
    start or end Timestamp

    Parameters
    ----------
    ts : start or end Timestamp to potentially localize
    is_none : argument that should be None
    is_not_none : argument that should not be None
    freq : Tick, DateOffset, or None
    tz : str, timezone object or None
    ambiguous: str, localization behavior for ambiguous times
    nonexistent: str, localization behavior for nonexistent times

    Returns
    -------
    ts : Timestamp
    """
    # Make sure start and end are timezone localized if:
    # 1) freq = a Timedelta-like frequency (Tick)
    # 2) freq = None i.e. generating a linspaced range
    if is_none is None and is_not_none is not None:
        # Note: We can't ambiguous='infer' a singular ambiguous time; however,
        # we have historically defaulted ambiguous=False
        ambiguous = ambiguous if ambiguous != "infer" else False
        localize_args = {"ambiguous": ambiguous, "nonexistent": nonexistent, "tz": None}
        if isinstance(freq, Tick) or freq is None:
            localize_args["tz"] = tz
        ts = ts.tz_localize(**localize_args)
    return ts


[docs] def date_range( start=None, end=None, periods=None, freq=None, tz=None, normalize=False, name=None, closed=no_default, inclusive=None, chunk_size=None, **kwargs, ): """ Return a fixed frequency DatetimeIndex. Parameters ---------- start : str or datetime-like, optional Left bound for generating dates. end : str or datetime-like, optional Right bound for generating dates. periods : int, optional Number of periods to generate. freq : str or DateOffset, default 'D' Frequency strings can have multiples, e.g. '5H'. See :ref:`here <timeseries.offset_aliases>` for a list of frequency aliases. tz : str or tzinfo, optional Time zone name for returning localized DatetimeIndex, for example 'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is timezone-naive. normalize : bool, default False Normalize start/end dates to midnight before generating date range. name : str, default None Name of the resulting DatetimeIndex. inclusive : {“both”, “neither”, “left”, “right”}, default “both” Include boundaries; Whether to set each bound as closed or open. **kwargs For compatibility. Has no effect on the result. Returns ------- rng : DatetimeIndex See Also -------- DatetimeIndex : An immutable container for datetimes. timedelta_range : Return a fixed frequency TimedeltaIndex. period_range : Return a fixed frequency PeriodIndex. interval_range : Return a fixed frequency IntervalIndex. Notes ----- Of the four parameters ``start``, ``end``, ``periods``, and ``freq``, exactly three must be specified. If ``freq`` is omitted, the resulting ``DatetimeIndex`` will have ``periods`` linearly spaced elements between ``start`` and ``end`` (closed on both sides). To learn more about the frequency strings, please see `this link <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__. Examples -------- **Specifying the values** The next four examples generate the same `DatetimeIndex`, but vary the combination of `start`, `end` and `periods`. Specify `start` and `end`, with the default daily frequency. >>> import maxframe.dataframe as md >>> md.date_range(start='1/1/2018', end='1/08/2018').execute() DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], dtype='datetime64[ns]', freq='D') Specify `start` and `periods`, the number of periods (days). >>> md.date_range(start='1/1/2018', periods=8).execute() DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'], dtype='datetime64[ns]', freq='D') Specify `end` and `periods`, the number of periods (days). >>> md.date_range(end='1/1/2018', periods=8).execute() DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28', '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'], dtype='datetime64[ns]', freq='D') Specify `start`, `end`, and `periods`; the frequency is generated automatically (linearly spaced). >>> md.date_range(start='2018-04-24', end='2018-04-27', periods=3).execute() DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00', '2018-04-27 00:00:00'], dtype='datetime64[ns]', freq=None) **Other Parameters** Changed the `freq` (frequency) to ``'M'`` (month end frequency). >>> md.date_range(start='1/1/2018', periods=5, freq='M').execute() DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30', '2018-05-31'], dtype='datetime64[ns]', freq='M') Multiples are allowed >>> md.date_range(start='1/1/2018', periods=5, freq='3M').execute() DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', '2019-01-31'], dtype='datetime64[ns]', freq='3M') `freq` can also be specified as an Offset object. >>> md.date_range(start='1/1/2018', periods=5, freq=md.offsets.MonthEnd(3)).execute() DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31', '2019-01-31'], dtype='datetime64[ns]', freq='3M') Specify `tz` to set the timezone. >>> md.date_range(start='1/1/2018', periods=5, tz='Asia/Tokyo').execute() DatetimeIndex(['2018-01-01 00:00:00+09:00', '2018-01-02 00:00:00+09:00', '2018-01-03 00:00:00+09:00', '2018-01-04 00:00:00+09:00', '2018-01-05 00:00:00+09:00'], dtype='datetime64[ns, Asia/Tokyo]', freq='D') `inclusive` controls whether to include `start` and `end` that are on the boundary. The default, "both", includes boundary points on either end. >>> md.date_range(start='2017-01-01', end='2017-01-04', inclusive='both').execute() DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq='D') Use ``inclusive='left'`` to exclude `end` if it falls on the boundary. >>> md.date_range(start='2017-01-01', end='2017-01-04', closed='left').execute() DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'], dtype='datetime64[ns]', freq='D') Use ``inclusive='right'`` to exclude `start` if it falls on the boundary, and similarly inclusive='neither' will exclude both `start` and `end`. >>> md.date_range(start='2017-01-01', end='2017-01-04', closed='right').execute() DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'], dtype='datetime64[ns]', freq='D') .. note:: Pandas 1.4.0 or later is required to use ``inclusive='neither'``. Otherwise an error may be raised. """ # validate periods if isinstance(periods, (float, np.floating)): periods = int(periods) if periods is not None and not isinstance(periods, (int, np.integer)): raise TypeError(f"periods must be a number, got {periods}") if freq is None and any(arg is None for arg in [periods, start, end]): freq = "D" if sum(arg is not None for arg in [start, end, periods, freq]) != 3: raise ValueError( "Of the four parameters: start, end, periods, " "and freq, exactly three must be specified" ) if isinstance(freq, str) and any(ch.isdigit() for ch in freq): freq = freq.lower() freq = to_offset(freq) if _date_range_use_inclusive and closed is not no_default: warnings.warn( "Argument `closed` is deprecated in favor of `inclusive`.", FutureWarning ) elif closed is no_default: closed = None if inclusive is None and closed is not no_default: inclusive = closed if start is not None: start = pd.Timestamp(start) if end is not None: end = pd.Timestamp(end) if start is pd.NaT or end is pd.NaT: raise ValueError("Neither `start` nor `end` can be NaT") start, end, _ = _maybe_normalize_endpoints(start, end, normalize) tz = _infer_tz_from_endpoints(start, end, tz) if start is None and end is not None: # start is None and end is not None # adjust end first end = pd.date_range(end=end, periods=1, freq=freq)[0] if inclusive == "neither": end -= freq size = periods start = end - (periods - 1) * freq if inclusive in ("neither", "left"): size -= 1 elif inclusive == "right": # when start is None, closed == 'left' would not take effect # thus just ignore inclusive = "both" elif end is None: # end is None # adjust start first start = pd.date_range(start=start, periods=1, freq=freq)[0] size = periods if inclusive == "neither": end = start + periods * freq else: end = start + (periods - 1) * freq if inclusive in ("neither", "right"): size -= 1 elif inclusive == "left": # when end is None, closed == 'left' would not take effect # thus just ignore inclusive = "both" else: if periods is None: periods = size = generate_range_count(start, end, periods, freq) else: size = periods if inclusive in ("left", "right"): size -= 1 elif inclusive == "neither": size -= 2 shape = (size,) op = DataFrameDateRange( start=start, end=end, periods=periods, freq=freq, tz=tz, normalize=normalize, name=name, inclusive=inclusive, **kwargs, ) return op(shape, chunk_size=chunk_size)