Source code for etna.transforms.timestamp.time_flags
from copy import deepcopy
from typing import Dict
from typing import List
from typing import Optional
import numpy as np
import pandas as pd
from etna.distributions import BaseDistribution
from etna.distributions import CategoricalDistribution
from etna.transforms.base import FutureMixin
from etna.transforms.base import IrreversibleTransform
[docs]class TimeFlagsTransform(IrreversibleTransform, FutureMixin):
"""TimeFlagsTransform is a class that implements extraction of the main time-based features from datetime column."""
def __init__(
self,
minute_in_hour_number: bool = True,
fifteen_minutes_in_hour_number: bool = False,
hour_number: bool = True,
half_hour_number: bool = False,
half_day_number: bool = False,
one_third_day_number: bool = False,
out_column: Optional[str] = None,
):
"""Initialise class attributes.
Parameters
----------
minute_in_hour_number:
if True: add column with minute number to feature dataframe in transform
fifteen_minutes_in_hour_number:
if True: add column with number of fifteen-minute interval within hour with numeration from 0
to feature dataframe in transform
hour_number:
if True: add column with hour number to feature dataframe in transform
half_hour_number:
if True: add column with 0 for the first half of the hour and 1 for the second
to feature dataframe in transform
half_day_number:
if True: add column with 0 for the first half of the day and 1 for the second
to feature dataframe in transform
one_third_day_number:
if True: add column with number of 8-hour interval within day with numeration from 0
to feature dataframe in transform
out_column:
base for the name of created columns;
* if set the final name is '{out_column}_{feature_name}';
* if don't set, name will be ``transform.__repr__()``,
repr will be made for transform that creates exactly this column
Raises
------
ValueError: if feature has invalid initial params
"""
if not any(
[
minute_in_hour_number,
fifteen_minutes_in_hour_number,
hour_number,
half_hour_number,
half_day_number,
one_third_day_number,
]
):
raise ValueError(
f"{type(self).__name__} feature does nothing with given init args configuration, "
f"at least one of minute_in_hour_number, fifteen_minutes_in_hour_number, hour_number, "
f"half_hour_number, half_day_number, one_third_day_number should be True."
)
super().__init__(required_features=["target"])
self.date_column_name = None
self.minute_in_hour_number: bool = minute_in_hour_number
self.fifteen_minutes_in_hour_number: bool = fifteen_minutes_in_hour_number
self.hour_number: bool = hour_number
self.half_hour_number: bool = half_hour_number
self.half_day_number: bool = half_day_number
self.one_third_day_number: bool = one_third_day_number
self.out_column = out_column
# create empty init parameters
self._empty_parameters = dict(
minute_in_hour_number=False,
fifteen_minutes_in_hour_number=False,
hour_number=False,
half_hour_number=False,
half_day_number=False,
one_third_day_number=False,
)
def _get_column_name(self, feature_name: str) -> str:
if self.out_column is None:
init_parameters = deepcopy(self._empty_parameters)
init_parameters[feature_name] = getattr(self, feature_name)
temp_transform = TimeFlagsTransform(**init_parameters, out_column=self.out_column) # type: ignore
return repr(temp_transform)
else:
return f"{self.out_column}_{feature_name}"
[docs] def get_regressors_info(self) -> List[str]:
"""Return the list with regressors created by the transform."""
features = [
"minute_in_hour_number",
"fifteen_minutes_in_hour_number",
"hour_number",
"half_hour_number",
"half_day_number",
"one_third_day_number",
]
output_columns = [
self._get_column_name(feature_name=feature_name) for feature_name in features if getattr(self, feature_name)
]
return output_columns
def _fit(self, *args, **kwargs) -> "TimeFlagsTransform":
"""Fit datetime model."""
return self
def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform method for features based on time.
Parameters
----------
df:
Features dataframe with time
Returns
-------
result: pd.DataFrame
Dataframe with extracted features
"""
features = pd.DataFrame(index=df.index)
timestamp_series = pd.Series(df.index)
if self.minute_in_hour_number:
minute_in_hour_number = self._get_minute_number(timestamp_series=timestamp_series)
features[self._get_column_name("minute_in_hour_number")] = minute_in_hour_number
if self.fifteen_minutes_in_hour_number:
fifteen_minutes_in_hour_number = self._get_period_in_hour(
timestamp_series=timestamp_series, period_in_minutes=15
)
features[self._get_column_name("fifteen_minutes_in_hour_number")] = fifteen_minutes_in_hour_number
if self.hour_number:
hour_number = self._get_hour_number(timestamp_series=timestamp_series)
features[self._get_column_name("hour_number")] = hour_number
if self.half_hour_number:
half_hour_number = self._get_period_in_hour(timestamp_series=timestamp_series, period_in_minutes=30)
features[self._get_column_name("half_hour_number")] = half_hour_number
if self.half_day_number:
half_day_number = self._get_period_in_day(timestamp_series=timestamp_series, period_in_hours=12)
features[self._get_column_name("half_day_number")] = half_day_number
if self.one_third_day_number:
one_third_day_number = self._get_period_in_day(timestamp_series=timestamp_series, period_in_hours=8)
features[self._get_column_name("one_third_day_number")] = one_third_day_number
for feature in features.columns:
features[feature] = features[feature].astype("category")
dataframes = []
for seg in df.columns.get_level_values("segment").unique():
tmp = df[seg].join(features)
_idx = tmp.columns.to_frame()
_idx.insert(0, "segment", seg)
tmp.columns = pd.MultiIndex.from_frame(_idx)
dataframes.append(tmp)
result = pd.concat(dataframes, axis=1).sort_index(axis=1)
result.columns.names = ["segment", "feature"]
return result
@staticmethod
def _get_minute_number(timestamp_series: pd.Series) -> np.ndarray:
"""Generate array with the minute number in the hour."""
return timestamp_series.apply(lambda x: x.minute).values
@staticmethod
def _get_period_in_hour(timestamp_series: pd.Series, period_in_minutes: int = 15) -> np.ndarray:
"""Generate an array with the period number in the hour.
Accepts a period length in minutes as input and returns array where timestamps marked by period number.
"""
return timestamp_series.apply(lambda x: x.minute // period_in_minutes).values
@staticmethod
def _get_hour_number(timestamp_series: pd.Series) -> np.ndarray:
"""Generate an array with the hour number in the day."""
return timestamp_series.apply(lambda x: x.hour).values
@staticmethod
def _get_period_in_day(timestamp_series: pd.Series, period_in_hours: int = 12) -> np.ndarray:
"""Generate an array with the period number in the day.
Accepts a period length in hours as input and returns array where timestamps marked by period number.
"""
return timestamp_series.apply(lambda x: x.hour // period_in_hours).values
[docs] def params_to_tune(self) -> Dict[str, BaseDistribution]:
"""Get default grid for tuning hyperparameters.
This grid tunes parameters: ``minute_in_hour_number``, ``fifteen_minutes_in_hour_number``, ``hour_number``,
``half_hour_number``, ``half_day_number``, ``one_third_day_number``.
Other parameters are expected to be set by the user.
There are no restrictions on all ``False`` values for the flags.
Returns
-------
:
Grid to tune.
"""
return {
"minute_in_hour_number": CategoricalDistribution([False, True]),
"fifteen_minutes_in_hour_number": CategoricalDistribution([False, True]),
"hour_number": CategoricalDistribution([False, True]),
"half_hour_number": CategoricalDistribution([False, True]),
"half_day_number": CategoricalDistribution([False, True]),
"one_third_day_number": CategoricalDistribution([False, True]),
}
__all__ = ["TimeFlagsTransform"]