# Source code for pandas_ts.packer

"""Module for converting between "flat" and "list" and "ts" representations

TODO: mask support
TODO: multi-index support
"""

# "|" for python 3.9
from __future__ import annotations

from collections.abc import Sequence

import numpy as np
import pandas as pd
import pyarrow as pa

from pandas_ts.ts_dtype import TsDtype
from pandas_ts.ts_ext_array import TsExtensionArray

# Names exported by `from pandas_ts.packer import *`.
__all__ = ["pack_flat", "pack_lists", "pack_dfs"]


# Upper bound on the number of leading rows of each column of the first
# dataframe that `pack_dfs` converts to a pyarrow array to infer field types.
N_ROWS_INFER_DTYPE = 1000


def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame:
    """Convert a "flat" dataframe into a dataframe of list-valued columns.

    The input is first packed into a single struct-of-lists series with
    `pack_flat`, and that series is then exploded into one column per
    struct field. When `name` is given, the packed series itself is
    appended to the result as an extra column.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe, with repeated indexes.

    name : str, optional
        Name of the extra column holding the packed structure. With the
        default, None, no such column is added.

    Returns
    -------
    pd.DataFrame
        Dataframe with one column per packed field and, if `name` is
        given, one additional column with the packed series.
    """
    # TODO: we can optimize name=None case a bit
    nested = pack_flat(df, name=name)
    exploded = nested.struct.explode()
    if name is None:
        return exploded
    exploded[name] = nested
    return exploded


def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
    """Make a structure of lists representation of a "flat" dataframe.

    For the input dataframe with repeated indexes, make a pandas.Series,
    where each original column is replaced by a structure of lists.
    The dtype of the column is `pandas_ts.TsDtype` with the corresponding
    pyarrow type. The index of the output series is the unique index of the
    input dataframe. The Series has `.ts` accessor, see
    `pandas_ts.ts_accessor.TsAccessor` for details.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe, with repeated indexes.
    name : str, optional
        Name of the pd.Series.

    Returns
    -------
    pd.Series
        Output series, with unique indexes.

    See Also
    --------
    pandas_ts.ts_accessor.TsAccessor : The accessor for the output series.
    pandas_ts.TsDtype : The dtype of the output series.
    pandas_ts.packer.pack_lists : Pack a dataframe of nested arrays.
    """
    # TODO: think about the case when the data is pre-sorted and we don't need a data copy.
    # A stable sort keeps the relative order of rows that share an index value.
    return pack_sorted_df_into_struct(df.sort_index(kind="stable"), name=name)


def pack_dfs(dfs: Sequence[pd.DataFrame], index: object = None, name: str | None = None) -> pd.Series:
    """Pack a sequence of "flat" dataframes into a "nested" series.

    The field types of the output dtype are inferred from the first
    dataframe only, looking at no more than `N_ROWS_INFER_DTYPE` leading
    rows of each of its columns.

    Parameters
    ----------
    dfs : Sequence[pd.DataFrame]
        Input sequence of dataframes.
    index : pd.Index, optional
        Index of the output series. If it is None and `dfs` is a pd.Series,
        the index of `dfs` is used.
    name : str, optional
        Name of the output series.

    Returns
    -------
    pd.Series
        Output series.
    """
    # A series of dataframes already carries an index; reuse it unless overridden.
    if index is None and isinstance(dfs, pd.Series):
        index = dfs.index

    # Positional access to the first element, for both pandas and plain sequences.
    template = dfs.iloc[0] if hasattr(dfs, "iloc") else dfs[0]
    columns = template.columns

    field_types = {}
    for col in columns:
        head = template[col].iloc[:N_ROWS_INFER_DTYPE]
        field_types[col] = pa.array(head).type
    dtype = TsDtype.from_fields(field_types)

    # Allocate a series of the target dtype filled with empty placeholders,
    # then overwrite every element with the actual dataframes.
    placeholder: dict[str, list] = {col: [] for col in columns}
    packed = pd.Series([placeholder] * len(dfs), dtype=dtype, index=index, name=name)
    packed[:] = dfs
    return packed


def pack_sorted_df_into_struct(df: pd.DataFrame, name: str | None = None) -> pd.Series:
    """Make a structure of lists representation of a "flat" dataframe.

    Input dataframe must be sorted and all the columns must have pyarrow
    dtypes.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe, with repeated indexes. It must be sorted and
        all the columns must have pyarrow dtypes.
    name : str, optional
        Name of the pd.Series.

    Returns
    -------
    pd.Series
        Output series, with unique indexes.
    """
    # Validation is skipped on purpose: view_sorted_df_as_list_arrays builds
    # every list column from the same offsets, so the nested lengths of each
    # row already agree.
    return pack_lists(
        view_sorted_df_as_list_arrays(df),
        name=name,
        validate=False,
    )


def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = True) -> pd.Series:
    """Make a series of arrow structures from a dataframe with nested arrays.

    Every column of the input dataframe becomes a field of a pyarrow
    struct array, which is wrapped into a `TsExtensionArray` and returned
    as a pandas.Series sharing the index of the input dataframe. The Series
    has `.ts` accessor, see `pandas_ts.ts_accessor.TsAccessor` for details.

    For every row, all the nested array (aka pyarrow list) lengths must be
    the same.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe, with pyarrow list-arrays.
    name : str, optional
        Name of the pd.Series.
    validate : bool, default True
        Whether to validate the input dataframe.

    Returns
    -------
    pd.Series
        Output series, with the same index as the input dataframe.

    See Also
    --------
    pandas_ts.ts_accessor.TsAccessor : The accessor for the output series.
    pandas_ts.TsDtype : The dtype of the output series.
    pandas_ts.packer.pack_flat : Pack a "flat" dataframe with repeated indexes.
    """
    columns = df.columns
    field_arrays = [df[label] for label in columns]
    struct_array = pa.StructArray.from_arrays(field_arrays, names=columns)
    packed = TsExtensionArray(struct_array, validate=validate)
    return pd.Series(packed, index=df.index, copy=False, name=name)


def view_sorted_df_as_list_arrays(df: pd.DataFrame) -> pd.DataFrame:
    """Make a nested array representation of a "flat" dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe, with repeated indexes. It must be sorted by its index.

    Returns
    -------
    pd.DataFrame
        Output dataframe, with unique indexes. It is intended to be a view
        over the input dataframe, so modifying it may mutate the input
        dataframe.
    """
    # Offsets and unique index are computed once and shared by all columns,
    # which also guarantees that every row has equal nested lengths.
    offset_array = calculate_sorted_index_offsets(df.index)
    unique_index = df.index.values[offset_array[:-1]]

    list_columns = {
        column: view_sorted_series_as_list_array(df[column], offset_array, unique_index)
        for column in df.columns
    }
    return pd.DataFrame(list_columns)


def view_sorted_series_as_list_array(
    series: pd.Series, offset: np.ndarray | None = None, unique_index: np.ndarray | None = None
) -> pd.Series:
    """Make a nested array representation of a "flat" series.

    Parameters
    ----------
    series : pd.Series
        Input series, with repeated indexes. It must be sorted by its index.
    offset : np.ndarray or None, optional
        Pre-calculated offsets of the input series index.
    unique_index : np.ndarray or None, optional
        Pre-calculated unique index of the input series. If given it must be
        equal to `series.index.unique()` and `series.index.values[offset[:-1]]`.

    Returns
    -------
    pd.Series
        Output series, with unique indexes. It is intended to be a view over
        the input series, so modifying it may mutate the input series.
    """
    if offset is None:
        offset = calculate_sorted_index_offsets(series.index)
    if unique_index is None:
        # The first row of every group carries the group's index value.
        unique_index = series.index.values[offset[:-1]]

    list_array = pa.ListArray.from_arrays(
        offset,
        pa.array(series),
    )
    return pd.Series(
        list_array,
        dtype=pd.ArrowDtype(list_array.type),
        index=unique_index,
        copy=False,
    )


def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray:
    """Calculate the offsets of the pre-sorted index values.

    Neighbouring index values are compared directly, without any arithmetic
    on them, so the function works for any index whose values support `<`
    and `!=` (integers of any width and signedness, floats, strings, ...)
    and cannot be fooled by integer overflow.

    Parameters
    ----------
    index : pd.Index
        Input index, must be sorted in non-decreasing order. Repeated values
        are expected, they form the groups.

    Returns
    -------
    np.ndarray
        Output array of offsets, one element more than the number of unique
        index values. The first element is 0 and the last one is
        `len(index)`. For an empty index it is a single 0.

    Raises
    ------
    ValueError
        If any index value is smaller than the value preceding it.
    """
    # TODO: implement multi-index support
    values = np.asarray(index.values)
    n_rows = len(values)
    if n_rows == 0:
        # No groups: a single zero offset is the valid empty list-array layout.
        return np.zeros(1, dtype=np.intp)

    previous, current = values[:-1], values[1:]
    if np.any(current < previous):
        raise ValueError("Table index must be strictly sorted.")

    # A new group starts at every position whose value differs from the
    # previous one. `!=` is True for NaN neighbours, so each NaN forms its
    # own group.
    group_starts = np.flatnonzero(current != previous) + 1
    return np.concatenate(([0], group_starts, [n_rows])).astype(np.intp, copy=False)