"""Module for converting between "flat" and "list" and "ts" representations
TODO: mask support
TODO: multi-index support
"""
# "|" for python 3.9
from __future__ import annotations
from collections.abc import Sequence
import numpy as np
import pandas as pd
import pyarrow as pa
from pandas_ts.ts_dtype import TsDtype
from pandas_ts.ts_ext_array import TsExtensionArray
# Names exported by ``from <this module> import *``.
__all__ = ["pack_flat", "pack_lists", "pack_dfs"]
# Upper bound on the number of leading rows of the first dataframe that
# `pack_dfs` hands to pyarrow when inferring the type of each field.
N_ROWS_INFER_DTYPE = 1000
def pack_flat_into_df(df: pd.DataFrame, name=None) -> pd.DataFrame:
    """Turn a "flat" dataframe into a "nested" dataframe of list columns.

    The flat dataframe is first packed into a single struct-of-lists series
    with `pack_flat`; the struct fields are then exploded back into separate
    columns, so every original column becomes a column of lists. When `name`
    is given, the struct series itself is attached as an extra column.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe, with repeated indexes.
    name : str, optional
        Name of the extra structure column. With the default, None, no
        structure column is added.

    Returns
    -------
    pd.DataFrame
        Dataframe of list columns, plus the structure column if requested.
    """
    # TODO: we can optimize name=None case a bit
    nested = pack_flat(df, name=name)
    exploded = nested.struct.explode()
    if name is None:
        return exploded
    exploded[name] = nested
    return exploded
[docs]
def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
    """Pack a "flat" dataframe into a series of structures of lists.

    The input is stably sorted by its index and all rows sharing an index
    value are gathered into one element of the output series; every original
    column becomes a list-valued field of that element. According to the
    package documentation the resulting series has `pandas_ts.TsDtype` dtype
    and exposes the `.ts` accessor
    (see `pandas_ts.ts_accessor.TsAccessor`).

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe, with repeated indexes.
    name : str, optional
        Name of the pd.Series.

    Returns
    -------
    pd.Series
        Output series, with unique indexes.

    See Also
    --------
    pandas_ts.ts_accessor.TsAccessor : The accessor for the output series.
    pandas_ts.TsDtype : The dtype of the output series.
    pandas_ts.packer.pack_lists : Pack a dataframe of nested arrays.
    """
    # TODO: think about the case when the data is pre-sorted and we don't need a data copy.
    # A stable sort keeps the original relative order of rows within each index value.
    return pack_sorted_df_into_struct(df.sort_index(kind="stable"), name=name)
[docs]
def pack_dfs(dfs: Sequence[pd.DataFrame], index: object = None, name: str | None = None) -> pd.Series:
    """Build a "nested" series whose elements are the given "flat" dataframes.

    The field types are inferred from the first dataframe only, looking at
    no more than `N_ROWS_INFER_DTYPE` of its leading rows per column.

    Parameters
    ----------
    dfs : Sequence[pd.DataFrame]
        Input sequence of dataframes. If it is a pd.Series and `index` is
        not given, its index is reused for the output.
    index : pd.Index, optional
        Index of the output series.
    name : str, optional
        Name of the output series.

    Returns
    -------
    pd.Series
        Output series.
    """
    if index is None and isinstance(dfs, pd.Series):
        index = dfs.index

    # Series-like containers need positional access; plain sequences are indexed directly.
    if hasattr(dfs, "iloc"):
        first_df = dfs.iloc[0]
    else:
        first_df = dfs[0]

    field_types = {}
    placeholder: dict[str, list] = {}
    for column in first_df.columns:
        sample = first_df[column].iloc[:N_ROWS_INFER_DTYPE]
        field_types[column] = pa.array(sample).type
        placeholder[column] = []

    # Allocate a correctly typed series of empty elements first, then fill it
    # with the actual dataframes through item assignment.
    packed = pd.Series(
        [placeholder] * len(dfs),
        dtype=TsDtype.from_fields(field_types),
        index=index,
        name=name,
    )
    packed[:] = dfs
    return packed
def pack_sorted_df_into_struct(df: pd.DataFrame, name: str | None = None) -> pd.Series:
    """Pack an already sorted "flat" dataframe into a series of structures of lists.

    The caller is responsible for the preconditions: the dataframe must be
    sorted by its index and all the columns must have pyarrow dtypes.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe, with repeated indexes. It must be sorted and
        all the columns must have pyarrow dtypes.
    name : str, optional
        Name of the pd.Series.

    Returns
    -------
    pd.Series
        Output series, with unique indexes.
    """
    # Validation is skipped: every list column produced by
    # view_sorted_df_as_list_arrays is built from the same offsets, so the
    # nested arrays of a row necessarily have equal lengths.
    return pack_lists(view_sorted_df_as_list_arrays(df), name=name, validate=False)
[docs]
def pack_lists(df: pd.DataFrame, name: str | None = None, *, validate: bool = True) -> pd.Series:
    """Combine the list-array columns of a dataframe into one series of structures.

    Every column of the input becomes a field of a pyarrow struct array,
    which is wrapped into a `TsExtensionArray` and returned as a pandas.Series
    sharing the index of the input dataframe. According to the package
    documentation the series has `pandas_ts.TsDtype` dtype and exposes the
    `.ts` accessor (see `pandas_ts.ts_accessor.TsAccessor`).

    For every row, all the nested array (aka pyarrow list) lengths must be
    the same.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe, with pyarrow list-arrays.
    name : str, optional
        Name of the pd.Series.
    validate : bool, default True
        Whether to validate the input dataframe; forwarded to
        `TsExtensionArray`.

    Returns
    -------
    pd.Series
        Output series, indexed like the input dataframe.

    See Also
    --------
    pandas_ts.ts_accessor.TsAccessor : The accessor for the output series.
    pandas_ts.TsDtype : The dtype of the output series.
    pandas_ts.packer.pack_flat : Pack a "flat" dataframe with repeated indexes.
    """
    columns = df.columns
    field_arrays = []
    for column in columns:
        field_arrays.append(df[column])

    struct_array = pa.StructArray.from_arrays(field_arrays, names=columns)
    nested = TsExtensionArray(struct_array, validate=validate)
    return pd.Series(nested, index=df.index, copy=False, name=name)
def view_sorted_df_as_list_arrays(df: pd.DataFrame) -> pd.DataFrame:
    """Represent a sorted "flat" dataframe as a dataframe of nested arrays.

    The group offsets and the unique index labels are computed once from the
    dataframe index and shared by all the columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe, with repeated indexes. It must be sorted by its index.

    Returns
    -------
    pd.DataFrame
        Output dataframe, with unique indexes. It is meant to be a view over
        the input dataframe, so modifying it may modify the input dataframe.
    """
    offsets = calculate_sorted_index_offsets(df.index)
    # The first row of every group carries the label of that group.
    labels = df.index.values[offsets[:-1]]

    list_columns = {}
    for column in df.columns:
        list_columns[column] = view_sorted_series_as_list_array(df[column], offsets, labels)
    return pd.DataFrame(list_columns)
def view_sorted_series_as_list_array(
    series: pd.Series, offset: np.ndarray | None = None, unique_index: np.ndarray | None = None
) -> pd.Series:
    """Represent a sorted "flat" series as a series of nested arrays.

    Parameters
    ----------
    series : pd.Series
        Input series, with repeated indexes. It must be sorted by its index.
    offset: np.ndarray or None, optional
        Pre-calculated offsets of the input series index. Computed from
        `series.index` when omitted.
    unique_index: np.ndarray or None, optional
        Pre-calculated unique index of the input series. If given it must be
        equal to `series.index.unique()` and `series.index.values[offset[:-1]]`.
        Derived from the offsets when omitted.

    Returns
    -------
    pd.Series
        Output series, with unique indexes. It is meant to be a view over the
        input series, so modifying it may modify the input series.
    """
    offsets = calculate_sorted_index_offsets(series.index) if offset is None else offset
    labels = series.index.values[offsets[:-1]] if unique_index is None else unique_index

    # Slice the flat values into one list per group, delimited by the offsets.
    flat_values = pa.array(series)
    nested = pa.ListArray.from_arrays(offsets, flat_values)
    nested_dtype = pd.ArrowDtype(nested.type)
    return pd.Series(nested, dtype=nested_dtype, index=labels, copy=False)
def calculate_sorted_index_offsets(index: pd.Index) -> np.ndarray:
    """Calculate the offsets of the pre-sorted index values.

    Rows sharing an index label form a contiguous group; the returned array
    holds the position where each group starts, followed by the total number
    of rows. Adjacent labels are compared directly (no arithmetic on the
    labels), so unsigned integers, values at the integer bounds, infinities
    and orderable non-numeric labels such as strings are handled.

    Parameters
    ----------
    index : pd.Index
        Input index, must be sorted in non-decreasing order (repeated values
        are expected).

    Returns
    -------
    np.ndarray
        Output array of offsets, one element more than the number of unique
        index values. For an empty index it is the single offset ``[0]``.

    Raises
    ------
    ValueError
        If any index value is smaller than the one preceding it.
    """
    # TODO: implement multi-index support
    values = np.asarray(index.values)
    n_rows = len(values)
    if n_rows == 0:
        # No rows means no groups: only the terminating offset remains.
        return np.zeros(1, dtype=np.intp)

    previous, current = values[:-1], values[1:]
    if np.any(current < previous):
        raise ValueError("Table index must be strictly sorted.")

    # A new group starts right after every position where the label changes.
    group_starts = np.flatnonzero(current != previous) + 1
    offset = np.concatenate(([0], group_starts, [n_rows])).astype(np.intp, copy=False)
    return offset