Source code for muarch.uarch

from typing import List, Optional, Union

import numpy as np
import pandas as pd

from ._base import PARAMS, RNG_GEN, _ArchBase
from .distributions import GeneralizedError, Normal, SkewStudent, StudentsT
from .exceptions import NotFittedError
from .mean import ARX, ConstantMean, HARX, LS, ZeroMean
from .summary import Summary
from .volatility import ARCH, ConstantVariance, EGARCH, FIGARCH, GARCH, HARCH

__all__ = ['UArch']


[docs]class UArch(_ArchBase):
    """
    Univariate ARCH model that wraps on top of Mean, Volatility and Distribution classes defined in the `arch` package.
    Mainly, this class combines the original model and fitted model in the `arch` package for convenience. It has also
    some additional methods such as :code:`simulate_mc` for Monte Carlo simulations.
    """

[docs]    def __init__(self, mean='Constant', lags=0, vol='GARCH', p=1, o=0, q=1, power=2.0, dist='Normal', hold_back=None,
                 scale=1):
        """
        Creates the wrapping arch model

        Parameters
        ----------
        mean: { 'zero', 'constant', 'harx', 'har', 'ar', 'arx', 'ls' }, optional
            Name of the mean model.  Currently supported options are:

            * **Constant** (default) - Constant mean model
            * **Zero** - Zero mean model
            * **AR** - Autoregression model
            * **ARX** - Autoregression model with exogenous regressors. Falls back to **AR** if no exogenous regressors
            * **HAR** - Heterogeneous Autoregression model
            * **HARX** - Heterogeneous Autoregressions with exogenous regressors
            * **LS** - Least squares model

            For more information on the different models, check out the documentation at
            https://arch.readthedocs.io/en/latest/univariate/mean.html

        lags: int or list (int), optional
            Either a scalar integer value indicating lag length or a list of integers specifying lag locations.

        vol: { 'GARCH', 'ARCH', 'CONSTANT' 'EGARCH', 'FIGARCH' and 'HARCH', 'CONSTANT' }, optional
            Name of the volatility model.  Currently supported options are:

             * **GARCH** (default) - Standard GARCH process which can be used to specify the following models:

                * ARCH(p)
                * GARCH(p,q)
                * GJR-GARCH(p,o,q)
                * AVARCH(p)
                * AVGARCH(p,q)
                * TARCH(p,o,q)
                * Models with arbitrary, pre-specified powers

            * **ARCH** - ARCH process
            * **EGARCH** - EGARCH process
            * **FIGARCH** - Fractionally Integrated (FI) GARCH process
            * **HARCH** - Heterogeneous ARCH process
            * **Constant** (default) - Constant volatility process

        p: int, optional
            Lag order of the symmetric innovation

        o: int, optional
            Lag order of the asymmetric innovation

        q: int, optional
            Lag order of lagged volatility or equivalent

        power: float, optional
            Power to use with the innovations. Default is 2.0, which produces ARCH and related models. Using 1.0
            produces AVARCH and related models. Other powers can be specified, although these should be strictly
            positive, and usually larger than 0.25.

        dist: { 'normal', 'gaussian', 'studentst', 't', 'skewstudent', 'skewt', 'ged', 'generalized error' }, optional
            Name of the distribution for the innovations.  Currently supported options are:

            * **normal**, **gaussian** (default) - Standard Normal distribution
            * **t**, **studentst** - Standardized Student’s distribution
            * **skewstudent**, **skewt** - Standardized Skewed Student’s distribution.
            * **ged**, **generalized error" - Generalized Error Distribution

        hold_back: int, optional
            Number of observations at the start of the sample to exclude when estimating model parameters. Used when
            comparing models with different lag lengths to estimate on the common sample.

        scale: float
            Factor to scale data up or down by. This is useful when your data is too small leading to numerical errors
            when fitting. It will be used to scale simulation data
        """

        super(UArch, self).__init__()

        known_mean = ('zero', 'constant', 'harx', 'har', 'ar', 'arx', 'ls')
        known_vol = ('arch', 'figarch', 'garch', 'harch', 'constant', 'egarch')
        known_dist = ('normal', 'gaussian', 'studentst', 't', 'skewstudent',
                      'skewt', 'ged', 'generalized error')

        mean = mean.lower()
        vol = vol.lower()
        dist = dist.lower()
        if mean not in known_mean:
            raise ValueError(f"Unknown model type '{mean}' in mean")
        if vol.lower() not in known_vol:
            raise ValueError(f"Unknown model type '{vol}' in vol")
        if dist.lower() not in known_dist:
            raise ValueError(f"Unknown model type '{dist}' in dist")

        self._scale = scale

        self._model_setup = {
            'mean': mean,
            'lags': lags,
            'vol': vol,
            'p': p,
            'o': o,
            'q': q,
            'power': power,
            'dist': dist,
            'hold_back': hold_back
        }
        self._form_model(None)

[docs]    def fit(self, y, x=None, update_freq=1, disp='off', starting_values=None, cov_type='robust', show_warning=True,
            first_obs=None, last_obs=None, tol=None, options=None, backcast=None):
        r"""
        Fits the model given a nobs by 1 vector of sigma2 values

        Parameters
        ----------
        y: {ndarray, Series}
            The dependent variable

        x: {ndarray, DataFrame}, optional
            Exogenous regressors.  Ignored if model does not permit exogenous regressors.

        update_freq: int, optional
            Frequency of iteration updates.  Output is generated every `update_freq` iterations. Set to 0 to disable
            iterative output

        disp: 'final' or 'off' (default)
            Either 'final' to print optimization result or 'off' to display nothing

        starting_values: ndarray, optional
            Array of starting values to use.  If not provided, starting values are constructed by the model components

        cov_type:  str, optional
            Estimation method of parameter covariance.  Supported options are 'robust', which does not assume the
            Information Matrix Equality holds and 'classic' which does.  In the ARCH literature, 'robust' corresponds
            to Bollerslev-Wooldridge covariance estimator.

        show_warning: bool, optional
            Flag indicating whether convergence warnings should be shown.

        first_obs:  {int, str, datetime, Timestamp}
            First observation to use when estimating model

        last_obs: {int, str, datetime, Timestamp}
            Last observation to use when estimating model

        tol: float, optional
            Tolerance for termination

        options: dict, optional
            Options to pass to `scipy.optimize.minimize`.  Valid entries include 'ftol', 'eps', 'disp', and 'maxiter'

        backcast: float, optional
            Value to use as backcast. Should be measure :math:`\sigma^2_0`  since model-specific non-linear
            transformations are applied to value before computing the variance recursions.

        Returns
        -------
        UArch
            Fitted UArch instance
        """
        self._form_model(y, x)

        self._fit_model = self._model.fit(update_freq, disp, starting_values, cov_type, show_warning, first_obs,
                                          last_obs, tol, options, backcast)
        return self

[docs]    def forecast(self, params=None, horizon=1, start=None, align='origin', method='analytic',
                 simulations=1000, rng=None):
        """
        Construct forecasts from estimated model

        Parameters
        ----------
        params : ndarray, optional
            Alternative parameters to use.  If not provided, the parameters estimated when fitting the model are used.
            Must be identical in shape to the parameters computed by fitting the model.

        horizon : int, optional
           Number of steps to forecast

        start : {int, datetime, Timestamp, str}, optional
            An integer, datetime or str indicating the first observation to produce the forecast for. Datetimes can
            only be used with pandas inputs that have a datetime index. Strings must be convertible to a date time,
            such as in '1945-01-01'.

        align : {'origin', 'target'}, optional
            When set to 'origin', the t-th row of forecasts contains the forecasts for t+1, t+2, ..., t+h.

            When set to 'target', the t-th row contains the 1-step ahead forecast from time t-1, the 2 step from
            time t-2, ..., and the h-step from time t-h. 'target' simplifies computing forecast errors since the
            realization and h-step forecast are aligned.

        method : {'analytic', 'simulation', 'bootstrap'}, optional
            Method to use when producing the forecast. The default is 'analytic'. The method only affects the
            variance forecast generation.  Not all volatility models support all methods. In particular, volatility
            models that do not evolve in squares such as EGARCH or TARCH do not support the 'analytic' method
            for horizons > 1.

        simulations : int, optional
            Number of simulations to run when computing the forecast using either simulation or bootstrap.

        rng : {callable, ndarray}, optional
            If using a custom random number generator to for simulation-based forecasts, function must produce random
            samples using the syntax `rng(size)` where size is a 2-element tuple (simulations, horizon).

            Else, if a numpy array is passed in, array must have shape (simulation x horizon).

        Returns
        -------
        forecasts : ARCHModelForecast
            t by h data frame containing the forecasts.  The alignment of the forecasts is controlled by `align`.

        Notes
        -----
        The most basic 1-step ahead forecast will return a vector with the same length as the original data, where
        the t-th value will be the time-t forecast for time t + 1.  When the horizon is > 1, and when using the
        default value for `align`, the forecast value in position [t, h] is the time-t, h+1 step ahead forecast.

        If model contains exogenous variables (`model.x is not None`), then only 1-step ahead forecasts are available.
        Using horizon > 1 will produce a warning and all columns, except the first, will be nan-filled.

        If `align` is 'origin', forecast[t,h] contains the forecast made using y[:t] (that is, up to but not
        including t) for horizon h + 1.  For example, y[100,2] contains the 3-step ahead forecast using the first
        100 data points, which will correspond to the realization y[100 + 2]. If `align` is 'target', then the same
        forecast is in location [102, 2], so that it is aligned with the observation to use when evaluating,
        but still in the same column.
        """
        if params is None:
            params = self.params
        else:
            if params.size != np.array(self.params).size or params.ndim != self.params.ndim:
                raise ValueError('`params` have incorrect dimensions')

        if rng is not None and not callable(rng):
            rng = np.asarray(rng)
            if rng.shape != (simulations, horizon):
                raise ValueError(f"`rng` array should have dimensions ({(simulations, horizon)})")
            rng = lambda _: rng

        return self._fit_model.forecast(params, horizon, start, align, method, simulations, rng)

[docs]    def hedgehog_plot(self, params=None, horizon=10, step=10, start=None, type_='volatility', method='analytic',
                      simulations=1000):
        """
        Plot forecasts from estimated model

        Parameters
        ----------
        params: {Series, ndarray}, optional
            Alternative parameters to use.  If not provided, the parameters
            computed by fitting the model are used.  Must be 1-d and identical
            in shape to the parameters computed by fitting the model

        horizon: int, optional
            Number of steps to forecast

        step: int, optional
            Non-negative number of forecasts to skip between spines

        start: int, datetime or str, optional
            An integer, datetime or str indicating the first observation to produce the forecast for. Datetimes can only
            be used with pandas inputs that have a datetime index. Strings must be convertible to a date time, such as
            in '1945-01-01'. If not provided, the start is set to the earliest forecastable date

        type_: {'volatility', 'mean'}
            Quantity to plot, the forecast volatility or the forecast mean

        method: {'analytic', 'simulation', 'bootstrap'}
            Method to use when producing the forecast. The default is analytic. The method only affects the variance
            forecast generation.  Not all volatility models support all methods. In particular, volatility models that
            do not evolve in squares such as EGARCH or TARCH do not support the 'analytic' method for horizons > 1

        simulations: int
            Number of simulations to run when computing the forecast using either simulation or bootstrap

        Returns
        -------
        figure
            Handle to the figure
        """
        if params is None:
            params = self.params
        return self._fit_model.hedgehog_plot(params, horizon, step, start, type_, method, simulations)

    @property
    def params(self) -> pd.Series:
        """Model Parameters"""
        self._ensure_is_fitted()
        return self._fit_model.params

[docs]    def residual_plot(self, annualize=None, scale=None):
        """
        Plot standardized residuals and conditional volatility

        Parameters
        ----------
        annualize: str, optional
            String containing frequency of data that indicates plot should contain annualized volatility. Supported
            values are 'D' (daily), 'W' (weekly) and 'M' (monthly), which scale variance by 252, 52, and 12 respectively

        scale: float, optional
            Value to use when scaling returns to annualize.  If scale is provides, annualize is ignored and the value
            in scale is used.

        Returns
        -------
        figure
            Handle to the figure
        """
        return self._fit_model.plot(annualize, scale)

[docs]    def residuals(self, standardize=True) -> np.ndarray:
        """
        Model residuals

        Parameters
        ----------
        standardize: bool, optional
            Whether to standardize residuals. Residuals are standardized by dividing it with the conditional volatility

        Returns
        -------
        ndarray
            Residuals
        """
        model = self._fit_model
        lags = self._model_setup['lags']
        r = np.asarray(model.resid) / self._scale

        if standardize:
            r /= model.conditional_volatility
        return r[lags:]

[docs]    def simulate(self,
                 nobs: int,
                 burn=500,
                 initial_value: Union[np.ndarray, float] = None,
                 x: Optional[Union[np.ndarray, pd.DataFrame]] = None,
                 initial_value_vol: Union[np.ndarray, float] = None,
                 data_only=False,
                 params: PARAMS = None,
                 custom_dist: Optional[Union[RNG_GEN, np.ndarray]] = None) -> Union[pd.DataFrame, np.ndarray]:
        """
        Simulates data from a ARMA-GARCH model

        Parameters
        ----------
        nobs: int
            Length of series to simulate

        burn: int, optional
            Number of values to simulate to initialize the model and remove dependence on initial values

        initial_value: {ndarray, float}, optional
            Either a scalar value or `max(lags)` array set of initial values to use when initializing the model.
            If omitted, 0.0 is used

        x: {ndarray, DataFrame}, optional
            nobs + burn by k array of exogenous variables to include in the simulation. This should be a 2D matrix

        initial_value_vol: {ndarray, float}, optional
            An array or scalar to use when initializing the volatility process.

        data_only: bool, default True
            If True, this returns only the simulated data, omits the volatility and error. In this case, it will return
            as a numpy array. Otherwise, it returns a data frame with the data, volatility and error

        params: ndarray, optional
            If not None, model will use the parameters supplied to generate simulations. Otherwise, it will use the
            fitted parameters.

        custom_dist: {ndarray, Callable}, optional
            Optional density from which to simulate the innovations (Distribution) in the GARCH models. This is useful
            when working with the copula-GARCH model where each univariate model innovations has dependence on others.
            It is assumed that the values supplied are standardized [0, 1] innovations instead of the unstandardized
            residuals.

            The shape of the array must be at least as long as the simulation size required after accounting for burn
            and type of innovation process. If unsure, use :code:`simulation_size_required` to check.

            If a random number generator function is passed in, ensure that it only takes only argument and returns
            a numpy array. The argument can be an integer or a tuple of integers. In this case, the size will be
            automatically derived to save the user the trouble.

        Returns
        -------
        DataFrame or ndarray
            DataFrame with columns data containing the simulated values, volatility, containing the conditional
            volatility and errors containing the errors used in the simulation.
            If data_only, it returns the 'data' column as a numpy array
        """
        self._set_custom_dist(custom_dist, nobs, burn)
        if params is None:
            params = self.params

        return self._model.simulate(params, nobs, burn, initial_value, x, initial_value_vol, data_only) / self._scale

[docs]    def simulate_mc(self,
                    nobs: int,
                    reps: int,
                    burn=500,
                    initial_value: Union[np.ndarray, float] = None,
                    x: Optional[Union[np.ndarray, pd.DataFrame]] = None,
                    initial_value_vol: Union[np.ndarray, float] = None,
                    params: PARAMS = None,
                    custom_dist: Optional[Union[RNG_GEN, np.ndarray]] = None) -> Union[pd.DataFrame, np.ndarray]:
        """
        Simulates data from a ARMA-GARCH model with multiple repetitions.

        This is used for Monte Carlo simulations.

        Parameters
        ----------
        nobs: int
            Length of series to simulate

        reps: int
            Number of repetitions in Monte Carlo simulation

        burn: int, optional
            Number of values to simulate to initialize the model and remove dependence on initial values

        initial_value: {ndarray, float}, optional
            Either a scalar value or `max(lags)` array set of initial values to use when initializing the model.
            If omitted, 0.0 is used

        x: {ndarray, DataFrame}, optional
            nobs + burn by k array of exogenous variables to include in the simulation. This should be a 2D matrix

        initial_value_vol: {ndarray, float}, optional
            An array or scalar to use when initializing the volatility process.

        params: {Series, ndarray}, optional
            If not None, model will use the parameters supplied to generate simulations. Otherwise, it will use the
            fitted parameters.

        custom_dist: {ndarray, Callable}, optional
            Optional density from which to simulate the innovations (Distribution) in the GARCH models. This is useful
            when working with the copula-GARCH model where each univariate model innovations has dependence on others.
            It is assumed that the values supplied are standardized [0, 1] innovations instead of the unstandardized
            residuals.

            The shape of the array must be at least as long as the simulation size required after accounting for burn
            and type of innovation process. If unsure, use :code:`simulation_size_required` to check.

            If a random number generator function is passed in, he size will be automatically derived to save the user
            the trouble. However, the function must:

            * take as it first argument an integer or a tuple of integer
            * have other parameters that are optional
            * return a numpy array

        Returns
        -------
        simulated_data: ndarray
            Array containing the simulated values

        See Also
        --------
        UArch.simulation_horizon_required: Calculates the simulation size required
        """
        if not isinstance(reps, int) or reps < 1:
            raise ValueError('reps must be an integer greater than 0')

        self._set_custom_dist(custom_dist, nobs, burn, reps)
        if params is None:
            params = self.params

        return self._model.simulate_mc(params, nobs, reps, burn, initial_value, x, initial_value_vol) / self._scale

[docs]    def summary(self, short=False, dp=4) -> Union[pd.Series, Summary]:
        """
        Summary of fitted model

        Parameters
        ----------
        short: bool, optional
            Whether to show short summary or full summary.

        dp: int, optional
            Number of decimal places to show in short summary

        Returns
        -------
        Summary
            Model Summary
        """
        self._ensure_is_fitted()
        if short:
            params = self._fit_model.params.round(dp)
            se = self._fit_model.std_err.round(dp)
            return pd.Series(['{p:.{dp}f} ± {s:.{dp}f}'.format(p=p, s=se[i], dp=dp) for i, p in enumerate(params)],
                             index=params.index)
        else:
            return Summary(self._fit_model.summary())

    def _ensure_is_fitted(self):
        """Ensures model is fitted. Else raises error"""
        if self._fit_model is None:
            raise NotFittedError(f"This ArchModel instance is not fitted yet")

    def _form_model(self, y, x=None):
        """
        Forms the actual ARCH model. This is a convenience method so that users can specify data at run time
        instead of at class instantiation
        """
        mean: str = self._model_setup['mean']
        lags: Union[int, List[int]] = self._model_setup['lags']
        vol: str = self._model_setup['vol']
        p: int = self._model_setup['p']
        o: int = self._model_setup['o']
        q: int = self._model_setup['q']
        power: float = self._model_setup['power']
        dist: str = self._model_setup['dist']
        hold_back: Optional[int] = self._model_setup['hold_back']

        y = y * self._scale if y is not None else None
        if x is not None:
            x = np.asarray(x) * self._scale
            if x.ndim == 1:
                x = x.reshape(-1, 1)

        if mean == 'zero':
            am = ZeroMean(y, hold_back=hold_back)
        elif mean == 'constant':
            am = ConstantMean(y, hold_back=hold_back)

        elif mean == 'arx':
            am = ARX(y, x, lags, hold_back=hold_back)
        elif mean == 'ar':
            am = ARX(y, None, lags, hold_back=hold_back)
        elif mean == 'harx':
            am = HARX(y, x, lags, hold_back=hold_back)
        elif mean == 'har':
            am = HARX(y, None, lags, hold_back=hold_back)
        else:  # mean == 'ls'
            am = LS(y, x, hold_back=hold_back)

        if vol == 'constant':
            v = ConstantVariance()
        elif vol == 'arch':
            v = ARCH(p=p)
        elif vol == 'figarch':
            v = FIGARCH(p=p, q=q)
        elif vol == 'garch':
            v = GARCH(p=p, o=o, q=q, power=power)
        elif vol == 'egarch':
            v = EGARCH(p=p, o=o, q=q)
        else:  # vol == 'harch'
            v = HARCH(lags=p)

        if dist in ('skewstudent', 'skewt'):
            d = SkewStudent()
        elif dist in ('studentst', 't'):
            d = StudentsT()
        elif dist in ('ged', 'generalized error'):
            d = GeneralizedError()
        else:  # ('gaussian', 'normal')
            d = Normal()

        am.volatility = v
        am.distribution = d
        self._model = am

    def __model_description__(self, include_lags=True, include_fitted_stats=False):
        """Generates the model description for use by __str__ and related functions"""
        desc = self._model._model_description(include_lags)

        if self._fit_model is not None and include_fitted_stats:
            pass

        return desc

    def _repr_html_(self):
        """HTML representation for IPython Notebook"""
        desc = self.__model_description__(include_fitted_stats=True)

        html = f"<strong>{self._model.name}</strong>"

        fragment = ",\n".join(f"<strong>{key}: </strong> {val}" for key, val in desc.items())
        html = f"{html}(\n{fragment}\n<strong>ID: </strong>: {hex(id(self))})"

        return html

    def _set_custom_dist(self, custom_dist, nobs, burn, reps=None):
        horizon = self.simulation_horizon_required(nobs, burn)
        if isinstance(reps, int):  # for monte carlo
            size = horizon, reps
        else:
            size = horizon

        if custom_dist is not None:
            if callable(custom_dist):
                self._model.distribution.custom_dist = custom_dist(size)
            else:
                custom_dist = np.asarray(custom_dist)
                if len(custom_dist) < horizon:
                    raise ValueError(f"`custom_dist` array size should at least be {horizon} long")
                if reps is not None and reps != custom_dist.shape[1]:
                    raise ValueError(f"number of columns `custom_dist` array should equals {reps}")

                self._model.distribution.custom_dist = custom_dist

    def __repr__(self):
        txt = self.__str__()
        return txt + f', {hex(id(self))}'

    def __str__(self):
        desc = self.__model_description__()
        return f"{self._model.name}({', '.join(f'{k}: {v}' for k, v in desc.items() if k and v)})"