Source code for openoa.utils.timeseries

"""
This module provides useful functions for processing timeseries data
"""

from __future__ import annotations

import datetime

import numpy as np
import pandas as pd
from pytz import utc, timezone
from dateutil.parser import parse

from openoa.utils._converters import series_method


def offset_to_seconds(offset: int | float | str | np.datetime64) -> int | float:
    """Translate a pandas offset alias, or a timedelta-like value, into seconds.

    Args:
        offset(:obj:`int` | :obj:`float` | :obj:`str` | :obj:`numpy.datetime64`): The pandas
            offset alias or numpy time value to be converted. A bare number (int or float) is
            treated as nanoseconds, which is the pandas default unit.

    Returns:
        :obj:`int` | `float`: The number of seconds spanned by :py:attr:`offset`.
    """
    try:
        return pd.to_timedelta(offset).total_seconds()
    except ValueError:
        # An alias with no leading count (e.g. "D") is rejected, so retry it as a single unit
        return pd.to_timedelta(f"1{offset}").total_seconds()
def determine_frequency_seconds(data: pd.DataFrame, index_col: str | None = None) -> int | float:
    """Find the most frequent spacing between the unique timestamps of :py:attr:`data`'s index
    and report it in seconds.

    Args:
        data(:obj:`pandas.DataFrame`): The pandas DataFrame whose index holds the timestamps.
        index_col(:obj:`str` | `None`, optional): The name of the timestamp level when
            :py:attr:`data` uses a MultiIndex, otherwise leave as None. Defaults to None.

    Returns:
        :obj:`int` | `float`: The most common difference between consecutive unique timestamps,
        in seconds.
    """
    # Take the timestamps from the plain index, or from the requested level of a MultiIndex
    if index_col is None:
        timestamps = data.index
    else:
        timestamps = data.index.get_level_values(index_col)

    # Tally each distinct step between consecutive de-duplicated timestamps
    steps, occurrences = np.unique(np.diff(timestamps.unique()), return_counts=True)
    most_common_step = steps[np.argmax(occurrences)]
    return offset_to_seconds(most_common_step)
def determine_frequency(data: pd.DataFrame, index_col: str | None = None) -> str | int | float:
    """Gets the offset alias from the datetime index of :py:attr:`data`, or calculates the most
    common time difference between all non-duplicate timestamps.

    The lookup proceeds in three steps, stopping at the first that yields a value:

    1. the ``freqstr`` already attached to the selected index,
    2. ``pandas.infer_freq`` applied to the selected index,
    3. :py:func:`determine_frequency_seconds`, which returns a number of seconds.

    Args:
        data(:obj:`pandas.DataFrame`): The pandas DataFrame to determine the DatetimeIndex
            frequency.
        index_col(:obj:`str` | `None`, optional): The name of the index column if
            :py:attr:`data` uses a MultiIndex, otherwise leave as None. Defaults to None.

    Returns:
        :obj:`str` | :obj:`int` | :obj:`float`: The offset string or number of seconds between
        timestamps.
    """
    # Get the timestamp index values
    index = data.index if index_col is None else data.index.get_level_values(index_col)

    # Check for an offset string being available
    freq = index.freqstr
    if freq is None:
        # Infer from the index selected above; looking up a hard-coded "time" level would raise
        # a KeyError for any index that is named differently
        freq = pd.infer_freq(index)

    # If there is at least one missing data point, or timestamp misalignment, the inference
    # above yields None, so fall back to the most common timestamp difference in seconds
    if freq is None:
        freq = determine_frequency_seconds(data, index_col)
    return freq
def convert_local_to_utc(d: str | datetime.datetime, tz_string: str) -> datetime.datetime:
    """Convert a single local timestamp to UTC.

    Only one timestamp is handled per call, so apply it element-wise to a pandas column, e.g.:

        date_utc = df['time'].apply(convert_local_to_utc, args = ('US/Pacific',))

    The repeated 1:00-2:00 hour at the end of DST (November) is not disambiguated: those
    timestamps stay repeated in UTC and must be shifted manually. The missing 2:00-3:00 hour at
    the start of DST (March) is handled.

    Args:
        d(:obj:`str` | :obj:`datetime.datetime`): The local date, or a string that can be parsed
            into one. If ``tzinfo`` is already set it is used as-is instead of
            :py:attr:`tz_string`.
        tz_string(:obj:`str`): The name of the local timezone.

    Raises:
        TypeError: If :py:attr:`d` is neither a string nor a ``datetime.datetime`` object.

    Returns:
        :obj:`datetime.datetime`: The local date converted to UTC time.
    """
    # TODO: Make a second copy of this method that aligns with the QA.convert_datetime_column method
    stamp = parse(d) if isinstance(d, str) else d
    if not isinstance(stamp, datetime.datetime):
        raise TypeError(
            "The input to `d` must be a `datetime.datetime` object or a string that can be converted to one."
        )

    # The timezone is always built, so an invalid name is reported even for aware timestamps
    local_zone = timezone(tz_string)

    # TODO: Figure out why a datetime object with tzinfo encoded is different than localizing with pytz
    if stamp.tzinfo:
        localized = stamp
    else:
        localized = local_zone.localize(stamp, is_dst=True)

    # Shift the localized timestamp onto UTC
    return localized.astimezone(utc)
@series_method(data_cols=["dt_col"])
def convert_dt_to_utc(
    dt_col: pd.Series | str, tz_string: str, data: pd.DataFrame = None
) -> pd.Series:
    """Converts a pandas ``Series`` of timestamps, string-formatted or ``datetime.datetime``
    objects that are in a local timezone ``tz_string`` to a UTC encoded pandas ``Series``.

    The first element, taken by position, decides how the whole column is treated: strings are
    parsed first, timezone-aware values are converted directly, and naive values are localized
    to :py:attr:`tz_string` before conversion.

    Args:
        dt_col (:obj:`pandas.Series` | `str`): A pandas ``Series`` of datetime objects or
            string-encoded timestamps, or the name of the column in :py:attr:`data`.
        tz_string (str): The string name for the expected timezone of the provided timestamps in
            :py:attr:`dt_col`.
        data (:obj:`pandas.DataFrame`, optional): The pandas ``DataFrame`` containing the
            timestamp column: :py:attr:`dt_col`. Defaults to None.

    Returns:
        :obj:`pandas.Series`: The timestamps of :py:attr:`dt_col` converted to UTC.
    """
    # Inspect the first element by position; ``dt_col[0]`` is a label lookup on a Series and
    # fails whenever the index has no label 0 (e.g. filtered or datetime-indexed data)
    first = dt_col.iloc[0] if isinstance(dt_col, pd.Series) else dt_col[0]
    if isinstance(first, str):
        dt_col = dt_col.apply(parse)
        first = dt_col.iloc[0]

    # If the timezone information is already encoded, then convert it to a UTC-converted
    # pandas datetime object automatically, otherwise, localize it, then convert it
    if first.tzinfo is not None:
        return pd.to_datetime(dt_col, utc=True)
    return dt_col.dt.tz_localize(tz_string, ambiguous=True).dt.tz_convert(utc)
@series_method(data_cols=["dt_col"])
def find_time_gaps(dt_col: pd.Series | str, freq: str, data: pd.DataFrame = None) -> pd.Series:
    """Report the timestamps that are absent from :py:attr:`dt_col` given the expected
    frequency, :py:attr:`freq`.

    Args:
        dt_col(:obj:`pandas.Series`): Pandas ``Series`` of ``datetime.datetime`` objects or the
            name of the column in :py:attr:`data`.
        freq(:obj:`string`): The expected frequency of the timestamps, which should align with
            the pandas timestamp conventions
            (https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases).
        data (:obj:`pandas.DataFrame`, optional): The pandas ``DataFrame`` containing the
            timestamp column: :py:attr:`dt_col`. Defaults to None.

    Returns:
        :obj:`pandas.Series`: Series of the missing timestamps, named after :py:attr:`dt_col`;
        empty when no gaps are found.
    """
    if isinstance(dt_col, pd.DatetimeIndex):
        dt_col = dt_col.to_series()

    # Steps that do not indicate a gap: the expected frequency, 0 (a duplicate), or NaT (the
    # first element of `diff`)
    acceptable_steps = [pd.Timedelta(freq), pd.Timedelta("0"), pd.NaT]
    step_is_acceptable = dt_col.diff().isin(acceptable_steps)
    if np.all(step_is_acceptable):
        return pd.Series([], name=dt_col.name, dtype="object")

    # Build the complete range of expected timestamps and keep those not present in the input
    expected = pd.Series(data=pd.date_range(dt_col.min(), end=dt_col.max(), freq=freq))
    missing = set(expected).difference(dt_col)
    return pd.Series(tuple(missing), name=dt_col.name)
@series_method(data_cols=["dt_col"])
def find_duplicate_times(dt_col: pd.Series | str, data: pd.DataFrame = None):
    """Report the repeated timestamps in :py:attr:`dt_col`. The first occurrence of a value is
    not reported, only the subsequent repeats.

    Args:
        dt_col(:obj:`pandas.Series` | `str`): Pandas series of ``datetime.datetime`` objects or
            the name of the column in :py:attr:`data`.
        data (:obj:`pandas.DataFrame`, optional): The pandas `DataFrame` containing the
            timestamp column: :py:attr:`dt_col`. Defaults to None.

    Returns:
        :obj:`pandas.Series`: Duplicates from input data
    """
    if isinstance(dt_col, pd.DatetimeIndex):
        dt_col = dt_col.to_series()

    # Boolean mask that is True for every repeat after a value's first appearance
    is_repeat = dt_col.duplicated()
    return dt_col[is_repeat]
def gap_fill_data_frame(data: pd.DataFrame, dt_col: str, freq: str) -> pd.DataFrame:
    """Add a row for every timestamp missing from :py:attr:`data`, leaving the data columns of
    the added rows as NaN, and return the result sorted by timestamp.

    Args:
        data(:obj:`pandas.DataFrame`): The dataframe with potentially missing timestamps.
        dt_col(:obj:`str`): Name of the column in :py:attr:`data` with timestamps.
        freq(:obj:`str`): The expected frequency of the timestamps.

    Returns:
        :obj:`pandas.DataFrame`: Output data frame with NaN data for the data gaps. An input with
        no rows is returned unchanged.
    """
    # Nothing to fill when there are no rows
    if data.shape[0] == 0:
        return data

    # Rows holding only the missing timestamps; every other column is left unfilled
    missing_rows = pd.DataFrame(columns=data.columns)
    missing_rows[dt_col] = find_time_gaps(data[dt_col], freq)

    filled = pd.concat([data, missing_rows], axis=0) if missing_rows.size > 0 else data

    try:
        return filled.sort_values(dt_col)
    except ValueError:
        # Catches when dt_col and the index name are the same, and temporarily renames the column
        # to perform the sort, and puts it back for the returned data
        placeholder = f"_{dt_col}"
        renamed = filled.rename(columns={dt_col: placeholder})
        ordered = renamed.sort_values(placeholder)
        return ordered.rename(columns={placeholder: dt_col})
@series_method(data_cols=["col"])
def percent_nan(col: pd.Series | str, data: pd.DataFrame = None):
    """Compute the share of values in :py:attr:`col` that are NaN, expressed as a fraction of
    the series size, or 1 if the series is empty.

    Args:
        col(:obj:`pandas.Series`): The pandas `Series` to be checked for NaNs, or the name of the
            column in :py:attr:`data`.
        data (:obj:`pandas.DataFrame`, optional): The pandas ``DataFrame`` containing the
            column: :py:attr:`col`. Defaults to None.

    Returns:
        :obj:`float`: The number of NaN values divided by the number of values in the series.
    """
    total = float(col.size)

    # An empty series is reported as entirely NaN, which also avoids dividing by zero
    if total == 0:
        return 1
    return np.isnan(col.values).sum() / total
@series_method(data_cols=["dt_col"])
def num_days(dt_col: pd.Series | str, data: pd.DataFrame = None) -> int:
    """Count the daily bins produced by resampling the de-duplicated index of :py:attr:`dt_col`.

    Args:
        dt_col(:obj:`pandas.Series` | str): A pandas ``Series`` with a timeseries index to be
            checked for the number of days contained in the data.
        data (:obj:`pandas.DataFrame`, optional): The pandas ``DataFrame`` containing the
            timestamp column: :py:attr:`dt_col` and having a timeseries index. Defaults to None.

    Returns:
        :obj:`int`: Number of days in the data
    """
    # Keep only the first row for each repeated index value
    first_occurrences = dt_col[~dt_col.index.duplicated()]

    # Resample onto a daily grid and count its entries
    daily = first_occurrences.resample("D").asfreq()
    return daily.index.size
@series_method(data_cols=["dt_col"])
def num_hours(dt_col: pd.Series | str, *, data: pd.DataFrame = None) -> int:
    """Count the hourly bins produced by resampling the de-duplicated index of :py:attr:`dt_col`.

    Args:
        dt_col(:obj:`pandas.Series` | str): A pandas ``Series`` of timeseries data to be checked
            for the number of hours contained in the data
        data (:obj:`pandas.DataFrame`, optional): The pandas `DataFrame` containing the
            timestamp column: :py:attr:`dt_col`. Defaults to None.

    Returns:
        :obj:`int`: Number of hours in the data
    """
    # Keep only the first row for each repeated index value
    first_occurrences = dt_col[~dt_col.index.duplicated()]

    # Resample onto an hourly grid and count its entries
    hourly = first_occurrences.resample("h").asfreq()
    return hourly.index.size