Source code for etna.models.prophet

import warnings
from copy import deepcopy
from datetime import datetime
from typing import Dict
from typing import Iterable
from typing import List
from typing import Optional
from typing import Sequence
from typing import Set
from typing import Union

import pandas as pd

from etna import SETTINGS
from etna.distributions import BaseDistribution
from etna.distributions import CategoricalDistribution
from etna.distributions import FloatDistribution
from etna.models.base import BaseAdapter
from etna.models.base import PredictionIntervalContextIgnorantAbstractModel
from etna.models.mixins import PerSegmentModelMixin
from etna.models.mixins import PredictionIntervalContextIgnorantModelMixin

if SETTINGS.prophet_required:
    from prophet import Prophet
    from prophet.serialize import model_from_dict
    from prophet.serialize import model_to_dict


class _ProphetAdapter(BaseAdapter):
    """Class for holding Prophet model.

    The adapter stores the constructor arguments, builds the underlying
    ``prophet.Prophet`` object from them and converts dataframes with
    ``timestamp``/``target`` columns into the ``ds``/``y`` layout that is
    handed to Prophet.
    """

    # Regressor names that ``fit`` never registers through ``add_regressor``.
    predefined_regressors_names = ("floor", "cap")

    def __init__(
        self,
        growth: str = "linear",
        changepoints: Optional[List[datetime]] = None,
        n_changepoints: int = 25,
        changepoint_range: float = 0.8,
        yearly_seasonality: Union[str, bool] = "auto",
        weekly_seasonality: Union[str, bool] = "auto",
        daily_seasonality: Union[str, bool] = "auto",
        holidays: Optional[pd.DataFrame] = None,
        seasonality_mode: str = "additive",
        seasonality_prior_scale: float = 10.0,
        holidays_prior_scale: float = 10.0,
        changepoint_prior_scale: float = 0.05,
        mcmc_samples: int = 0,
        interval_width: float = 0.8,
        uncertainty_samples: Union[int, bool] = 1000,
        stan_backend: Optional[str] = None,
        additional_seasonality_params: Iterable[Dict[str, Union[str, float, int]]] = (),
    ):
        """Store the hyperparameters and create a fresh Prophet model.

        Every argument except ``additional_seasonality_params`` is forwarded
        unchanged to the ``Prophet`` constructor. Each dict of
        ``additional_seasonality_params`` is unpacked into its own
        ``add_seasonality`` call on the created model.
        """
        self.growth = growth
        self.n_changepoints = n_changepoints
        self.changepoints = changepoints
        self.changepoint_range = changepoint_range
        self.yearly_seasonality = yearly_seasonality
        self.weekly_seasonality = weekly_seasonality
        self.daily_seasonality = daily_seasonality
        self.holidays = holidays
        self.seasonality_mode = seasonality_mode
        self.seasonality_prior_scale = seasonality_prior_scale
        self.holidays_prior_scale = holidays_prior_scale
        self.changepoint_prior_scale = changepoint_prior_scale
        self.mcmc_samples = mcmc_samples
        self.interval_width = interval_width
        self.uncertainty_samples = uncertainty_samples
        self.stan_backend = stan_backend
        self.additional_seasonality_params = additional_seasonality_params

        self.model = self._create_model()

        # Set by ``fit``; ``None`` means the adapter has not seen regressors yet.
        self.regressor_columns: Optional[List[str]] = None

    def _create_model(self) -> "Prophet":
        """Build a new Prophet instance from the stored hyperparameters."""
        model = Prophet(
            growth=self.growth,
            changepoints=self.changepoints,
            n_changepoints=self.n_changepoints,
            changepoint_range=self.changepoint_range,
            yearly_seasonality=self.yearly_seasonality,
            weekly_seasonality=self.weekly_seasonality,
            daily_seasonality=self.daily_seasonality,
            holidays=self.holidays,
            seasonality_mode=self.seasonality_mode,
            seasonality_prior_scale=self.seasonality_prior_scale,
            holidays_prior_scale=self.holidays_prior_scale,
            changepoint_prior_scale=self.changepoint_prior_scale,
            mcmc_samples=self.mcmc_samples,
            interval_width=self.interval_width,
            uncertainty_samples=self.uncertainty_samples,
            stan_backend=self.stan_backend,
        )

        for seasonality_params in self.additional_seasonality_params:
            model.add_seasonality(**seasonality_params)

        return model

    def _check_not_used_columns(self, df: pd.DataFrame):
        """Warn about columns of ``df`` that are neither target, timestamp nor regressors.

        Raises
        ------
        ValueError:
            If ``regressor_columns`` has not been set yet.
        """
        if self.regressor_columns is None:
            raise ValueError("Something went wrong, regressor_columns is None!")

        # Build the lookup once instead of concatenating lists for every column.
        used_columns = {"target", "timestamp", *self.regressor_columns}
        columns_not_used = [col for col in df.columns if col not in used_columns]
        if columns_not_used:
            warnings.warn(
                message="This model doesn't work with exogenous features unknown in future. "
                f"Columns {columns_not_used} won't be used."
            )

    def _select_regressors(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
        """Select data with regressors.

        During fit there can't be regressors with NaNs, they are removed at higher level.
        Look at the issue: https://github.com/tinkoff-ai/etna/issues/557

        During prediction without validation NaNs in regressors lead to exception from the underlying model.

        This model requires data to be in numeric dtype.

        Returns
        -------
        :
            Regressor columns converted with ``pd.to_numeric``, or ``None`` when there are no regressors.

        Raises
        ------
        ValueError:
            If ``regressor_columns`` is not set, if a regressor contains NaNs,
            or if a regressor can't be converted to a numeric dtype.
        """
        if self.regressor_columns is None:
            raise ValueError("Something went wrong, regressor_columns is None!")

        regressors_with_nans = [regressor for regressor in self.regressor_columns if df[regressor].isna().any()]
        if regressors_with_nans:
            raise ValueError(
                f"Regressors {regressors_with_nans} contain NaN values. "
                "Try to lower horizon value, or drop these regressors."
            )

        if not self.regressor_columns:
            return None

        try:
            return df[self.regressor_columns].apply(pd.to_numeric)
        except ValueError as e:
            # Chain the original error so the failing value stays visible in the traceback.
            raise ValueError(f"Only convertible to numeric features are allowed! Error: {str(e)}") from e

    def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_ProphetAdapter":
        """
        Fits a Prophet model.

        Parameters
        ----------
        df:
            Features dataframe
        regressors:
            List of the columns with regressors

        Returns
        -------
        :
            The adapter itself.
        """
        self.regressor_columns = regressors
        self._check_not_used_columns(df)

        prophet_df = self._prepare_prophet_df(df=df)
        for regressor in self.regressor_columns:
            # Names from ``predefined_regressors_names`` are not registered as extra regressors.
            if regressor not in self.predefined_regressors_names:
                self.model.add_regressor(regressor)
        self.model.fit(prophet_df)
        return self

    def predict(self, df: pd.DataFrame, prediction_interval: bool, quantiles: Sequence[float]) -> pd.DataFrame:
        """
        Compute predictions from a Prophet model.

        Parameters
        ----------
        df:
            Features dataframe
        prediction_interval:
            If True returns prediction interval for forecast
        quantiles:
            Levels of prediction distribution

        Returns
        -------
        :
            DataFrame with predictions: column ``target`` and, when intervals
            are requested, one ``target_<quantile>`` column per quantile.
        """
        prophet_df = self._prepare_prophet_df(df=df)
        forecast = self.model.predict(prophet_df)
        y_pred = pd.DataFrame(forecast["yhat"])
        if prediction_interval:
            # Quantiles are estimated from simulated draws, one percentile per requested level.
            sim_values = self.model.predictive_samples(prophet_df)
            for quantile in quantiles:
                percentile = quantile * 100
                y_pred[f"yhat_{quantile:.4g}"] = self.model.percentile(sim_values["yhat"], percentile, axis=1)
        rename_dict = {
            column: column.replace("yhat", "target") for column in y_pred.columns if column.startswith("yhat")
        }
        y_pred = y_pred.rename(rename_dict, axis=1)
        return y_pred

    def _prepare_prophet_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """Prepare dataframe for fit and predict.

        Copies ``target`` to ``y`` and ``timestamp`` to ``ds`` and appends the
        numeric regressor columns, if any.

        Raises
        ------
        ValueError:
            If ``regressor_columns`` has not been set yet.
        """
        if self.regressor_columns is None:
            raise ValueError("List of regressor is not set!")

        df = df.reset_index()

        prophet_df = pd.DataFrame()
        prophet_df["y"] = df["target"]
        prophet_df["ds"] = df["timestamp"]

        regressors_data = self._select_regressors(df)
        if regressors_data is not None:
            prophet_df[self.regressor_columns] = regressors_data[self.regressor_columns]

        return prophet_df

    @staticmethod
    def _filter_aggregated_components(components: Iterable[str]) -> Set[str]:
        """Filter out aggregated components."""
        # aggregation of corresponding model terms, e.g. sum
        aggregated_components = {
            "additive_terms",
            "multiplicative_terms",
            "extra_regressors_additive",
            "extra_regressors_multiplicative",
        }

        return set(components) - aggregated_components

    def _check_mul_components(self):
        """Raise error if model contains multiplicative components.

        Raises
        ------
        ValueError:
            If the model has no ``component_modes`` (treated as not fitted)
            or has any non-aggregated multiplicative component.
        """
        components_modes = self.model.component_modes
        if components_modes is None:
            raise ValueError("This model is not fitted!")

        mul_components = self._filter_aggregated_components(components_modes["multiplicative"])
        if mul_components:
            raise ValueError("Forecast decomposition is only supported for additive components!")

    def _predict_seasonal_components(self, df: pd.DataFrame) -> pd.DataFrame:
        """Estimate seasonal, holidays and exogenous components.

        Individual holiday columns and aggregated terms are excluded from the result.
        """
        model = self.model

        seasonal_features, _, component_cols, _ = model.make_all_seasonality_features(df)

        holiday_names = set(model.train_holiday_names) if model.train_holiday_names is not None else set()

        # Follow the order of ``component_cols.columns`` rather than iterating a set,
        # so the output column order does not vary between interpreter runs.
        non_aggregated = self._filter_aggregated_components(component_cols.columns)
        components_names = [
            name for name in component_cols.columns if name in non_aggregated and name not in holiday_names
        ]

        beta_c = model.params["beta"].T * component_cols[components_names].values
        comp = seasonal_features.values @ beta_c

        # apply rescaling for additive components
        comp *= model.y_scale

        return pd.DataFrame(data=comp, columns=components_names)

    def predict_components(self, df: pd.DataFrame) -> pd.DataFrame:
        """Estimate prediction components.

        Parameters
        ----------
        df:
            features dataframe

        Returns
        -------
        :
            dataframe with prediction components, every column prefixed with ``target_component_``

        Raises
        ------
        ValueError:
            If the model is not fitted or contains multiplicative components.
        """
        self._check_mul_components()

        prophet_df = self._prepare_prophet_df(df=df)

        prophet_df = self.model.setup_dataframe(prophet_df)

        components = self._predict_seasonal_components(df=prophet_df)
        components["trend"] = self.model.predict_trend(df=prophet_df)

        return components.add_prefix("target_component_")

    def get_model(self) -> "Prophet":
        """Get internal prophet.Prophet model that is used inside etna class.

        Returns
        -------
        result:
           Internal model
        """
        # The annotation is a string because ``Prophet`` is imported only under a settings guard.
        return self.model

    def __getstate__(self):
        """Return picklable state with the Prophet model replaced by its dict form."""
        state = self.__dict__.copy()
        try:
            model_dict = model_to_dict(self.model)
            is_fitted = True
        except ValueError:
            # A ValueError from ``model_to_dict`` is treated as "model is not fitted yet".
            is_fitted = False
            model_dict = {}
        del state["model"]
        state["_is_fitted"] = is_fitted
        state["_model_dict"] = model_dict
        return state

    def __setstate__(self, state):
        """Restore attributes and rebuild the Prophet model from the saved state."""
        # Work on a copy so the caller's state dict is left untouched.
        local_state = deepcopy(state)
        is_fitted = local_state["_is_fitted"]
        model_dict = local_state["_model_dict"]
        del local_state["_is_fitted"]
        del local_state["_model_dict"]

        self.__dict__.update(local_state)

        if is_fitted:
            self.model = model_from_dict(model_dict)
        else:
            # Nothing was serialized; recreate an unfitted model from the hyperparameters.
            self.model = self._create_model()


class ProphetModel(
    PerSegmentModelMixin, PredictionIntervalContextIgnorantModelMixin, PredictionIntervalContextIgnorantAbstractModel
):
    """Class for holding Prophet model.

    Notes
    -----
    Original Prophet can use features 'cap' and 'floor',
    they should be added to the known_future list on dataset initialization.

    This model supports in-sample and out-of-sample forecast decomposition. The number
    of components in the decomposition depends on model parameters. Main components are:
    trend, seasonality, holiday and exogenous effects. Seasonal components will be decomposed
    down to individual periods if fitted. Holiday and exogenous will be present in decomposition
    if fitted. Corresponding components are obtained directly from the model.

    Examples
    --------
    >>> from etna.datasets import generate_periodic_df
    >>> from etna.datasets import TSDataset
    >>> from etna.models import ProphetModel
    >>> classic_df = generate_periodic_df(
    ...     periods=100,
    ...     start_time="2020-01-01",
    ...     n_segments=4,
    ...     period=7,
    ...     sigma=3
    ... )
    >>> df = TSDataset.to_dataset(df=classic_df)
    >>> ts = TSDataset(df, freq="D")
    >>> future = ts.make_future(7)
    >>> model = ProphetModel(growth="flat")
    >>> model.fit(ts=ts)
    ProphetModel(growth = 'flat', changepoints = None, n_changepoints = 25,
    changepoint_range = 0.8, yearly_seasonality = 'auto', weekly_seasonality = 'auto',
    daily_seasonality = 'auto', holidays = None, seasonality_mode = 'additive',
    seasonality_prior_scale = 10.0, holidays_prior_scale = 10.0, changepoint_prior_scale = 0.05,
    mcmc_samples = 0, interval_width = 0.8, uncertainty_samples = 1000, stan_backend = None,
    additional_seasonality_params = (), )
    >>> forecast = model.forecast(future)
    >>> forecast
    segment    segment_0 segment_1 segment_2 segment_3
    feature       target    target    target    target
    timestamp
    2020-04-10      9.00      9.00      4.00      6.00
    2020-04-11      5.00      2.00      7.00      9.00
    2020-04-12      0.00      4.00      7.00      9.00
    2020-04-13      0.00      5.00      9.00      7.00
    2020-04-14      1.00      2.00      1.00      6.00
    2020-04-15      5.00      7.00      4.00      7.00
    2020-04-16      8.00      6.00      2.00      0.00
    """

    def __init__(
        self,
        growth: str = "linear",
        changepoints: Optional[List[datetime]] = None,
        n_changepoints: int = 25,
        changepoint_range: float = 0.8,
        yearly_seasonality: Union[str, bool] = "auto",
        weekly_seasonality: Union[str, bool] = "auto",
        daily_seasonality: Union[str, bool] = "auto",
        holidays: Optional[pd.DataFrame] = None,
        seasonality_mode: str = "additive",
        seasonality_prior_scale: float = 10.0,
        holidays_prior_scale: float = 10.0,
        changepoint_prior_scale: float = 0.05,
        mcmc_samples: int = 0,
        interval_width: float = 0.8,
        uncertainty_samples: Union[int, bool] = 1000,
        stan_backend: Optional[str] = None,
        additional_seasonality_params: Iterable[Dict[str, Union[str, float, int]]] = (),
    ):
        """
        Create instance of Prophet model.

        Parameters
        ----------
        growth:
            Trend type, e.g. 'linear' or 'logistic' (the class example also uses 'flat').
            This likely will not be tuned; if there is a known saturating point and
            growth towards that point it will be included and the logistic trend
            will be used, otherwise it will be linear.
        changepoints:
            List of dates at which to include potential changepoints. If
            not specified, potential changepoints are selected automatically.
        n_changepoints:
            Number of potential changepoints to include. Not used
            if input ``changepoints`` is supplied. If ``changepoints`` is not supplied,
            then ``n_changepoints`` potential changepoints are selected uniformly from
            the first ``changepoint_range`` proportion of the history.
        changepoint_range:
            Proportion of history in which trend changepoints will
            be estimated. Defaults to 0.8 for the first 80%. Not used if
            ``changepoints`` is specified.
        yearly_seasonality:
            By default ('auto') this will turn yearly seasonality on if there is
            a year of data, and off otherwise. Options are ['auto', True, False].
            If there is more than a year of data, rather than trying to turn this
            off during HPO, it will likely be more effective to leave it on and
            turn down seasonal effects by tuning ``seasonality_prior_scale``.
        weekly_seasonality:
            Same as for ``yearly_seasonality``.
        daily_seasonality:
            Same as for ``yearly_seasonality``.
        holidays:
            ``pd.DataFrame`` with columns holiday (string) and ds (date type)
            and optionally columns lower_window and upper_window which specify a
            range of days around the date to be included as holidays.
            ``lower_window=-2`` will include 2 days prior to the date as holidays. Also
            optionally can have a column ``prior_scale`` specifying the prior scale for
            that holiday.
        seasonality_mode:
            'additive' (default) or 'multiplicative'.
        seasonality_prior_scale:
            Parameter modulating the strength of the
            seasonality model. Larger values allow the model to fit larger seasonal
            fluctuations, smaller values dampen the seasonality. Can be specified
            for individual seasonalities using ``add_seasonality``.
        holidays_prior_scale:
            Parameter modulating the strength of the holiday components model, unless overridden
            in the holidays input.
        changepoint_prior_scale:
            Parameter modulating the flexibility of the
            automatic changepoint selection. Large values will allow many
            changepoints, small values will allow few changepoints.
        mcmc_samples:
            Integer, if greater than 0, will do full Bayesian inference
            with the specified number of MCMC samples. If 0, will do MAP
            estimation.
        interval_width:
            Float, width of the uncertainty intervals provided
            for the forecast. If ``mcmc_samples=0``, this will be only the uncertainty
            in the trend using the MAP estimate of the extrapolated generative
            model. If ``mcmc_samples>0``, this will be integrated over all model
            parameters, which will include uncertainty in seasonality.
        uncertainty_samples:
            Number of simulated draws used to estimate
            uncertainty intervals. Setting this value to 0 or False will disable
            uncertainty estimation and speed up the calculation.
        stan_backend:
            as defined in StanBackendEnum default: None - will try to
            iterate over all available backends and find the working one
        additional_seasonality_params: Iterable[Dict[str, Union[int, float, str]]]
            parameters that describe additional (not 'daily', 'weekly', 'yearly') seasonality that should be
            added to model; dict with required keys 'name', 'period', 'fourier_order' and optional ones 'prior_scale',
            'mode', 'condition_name' will be used for :py:meth:`prophet.Prophet.add_seasonality` method call.
        """
        # Mirror every constructor argument on the instance.
        self.growth = growth
        self.n_changepoints = n_changepoints
        self.changepoints = changepoints
        self.changepoint_range = changepoint_range
        self.yearly_seasonality = yearly_seasonality
        self.weekly_seasonality = weekly_seasonality
        self.daily_seasonality = daily_seasonality
        self.holidays = holidays
        self.seasonality_mode = seasonality_mode
        self.seasonality_prior_scale = seasonality_prior_scale
        self.holidays_prior_scale = holidays_prior_scale
        self.changepoint_prior_scale = changepoint_prior_scale
        self.mcmc_samples = mcmc_samples
        self.interval_width = interval_width
        self.uncertainty_samples = uncertainty_samples
        self.stan_backend = stan_backend
        self.additional_seasonality_params = additional_seasonality_params

        # The adapter receives exactly the same values that were just stored above.
        adapter = _ProphetAdapter(
            growth=growth,
            n_changepoints=n_changepoints,
            changepoints=changepoints,
            changepoint_range=changepoint_range,
            yearly_seasonality=yearly_seasonality,
            weekly_seasonality=weekly_seasonality,
            daily_seasonality=daily_seasonality,
            holidays=holidays,
            seasonality_mode=seasonality_mode,
            seasonality_prior_scale=seasonality_prior_scale,
            holidays_prior_scale=holidays_prior_scale,
            changepoint_prior_scale=changepoint_prior_scale,
            mcmc_samples=mcmc_samples,
            interval_width=interval_width,
            uncertainty_samples=uncertainty_samples,
            stan_backend=stan_backend,
            additional_seasonality_params=additional_seasonality_params,
        )
        super().__init__(base_model=adapter)

    def params_to_tune(self) -> Dict[str, BaseDistribution]:
        """Get default grid for tuning hyperparameters.

        This grid tunes parameters: ``seasonality_mode``, ``seasonality_prior_scale``, ``changepoint_prior_scale``,
        ``changepoint_range``, ``holidays_prior_scale``.
        Other parameters are expected to be set by the user.

        Returns
        -------
        :
            Grid to tune.
        """
        grid: Dict[str, BaseDistribution] = {}
        grid["seasonality_mode"] = CategoricalDistribution(["additive", "multiplicative"])
        # Prior scales are searched on a logarithmic scale.
        grid["seasonality_prior_scale"] = FloatDistribution(low=1e-2, high=10, log=True)
        grid["changepoint_prior_scale"] = FloatDistribution(low=1e-3, high=0.5, log=True)
        grid["changepoint_range"] = FloatDistribution(low=0.8, high=0.95)
        grid["holidays_prior_scale"] = FloatDistribution(low=1e-2, high=10, log=True)
        return grid