Source code for muarch.muarch

import os
from collections import abc
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from typing import Iterable, List, Optional, Union

import numpy as np
import pandas as pd

from ._base import RNG_GEN
from .summary import SummaryList
from .uarch import UArch

Exog = Optional[Union[List[Union[None, np.ndarray]], np.ndarray]]


[docs]class MUArch:
    """
    Multi-univariate ARCH model. Unlike a multivariate ARCH model, this fits each univariate time series individually.
    Any simulations returns simulations of each univariate series column bound together.
    """

    __n: int
    __models: List[UArch]

[docs]    def __init__(self, n: Union[int, Iterable[UArch]], mean='Constant', lags=0, vol='GARCH', p=1, o=0, q=1, power=2.0,
                 dist='Normal', hold_back=None, scale=1):
        r"""
        Initializes the MUArch model.

        The MUArch model holds multiple univariate models which are determined during fitting. If the models are not
        specified, the global default options will be used. Models can be individually specified after initializing
        the MUArch instance.

        Parameters
        ----------
        n:  int or list of UArch models
            Number of univariate models to fit. Alternatively, a list of UArch (univariate) models can be specified.

        mean: { 'zero', 'constant', 'harx', 'har', 'ar', 'arx', 'ls' }, optional
            Name of the global default mean model.  Currently supported options are:

            * **Constant** (default) - Constant mean model
            * **Zero** - Zero mean model
            * **AR** - Autoregression model
            * **ARX** - Autoregression model with exogenous regressors. Falls back to **AR** if no exogenous regressors
            * **HAR** - Heterogeneous Autoregression model
            * **HARX** - Heterogeneous Autoregressions with exogenous regressors
            * **LS** - Least squares model

            For more information on the different models, check out the documentation at
            https://arch.readthedocs.io/en/latest/univariate/mean.html

        lags: int or list (int), optional
            Global default lag. Either a scalar integer value indicating lag length or a list of integers specifying
            lag locations.

        vol: { 'GARCH', 'ARCH', 'EGARCH', 'FIGARCH' and 'HARCH', 'CONSTANT' }, optional
            Name of the global default volatility model.  Currently supported options are:

            * **GARCH** (default) - Standard GARCH process which can be used to specify the following models:

                * ARCH(p)
                * GARCH(p,q)
                * GJR-GARCH(p,o,q)
                * AVARCH(p)
                * AVGARCH(p,q)
                * TARCH(p,o,q)
                * Models with arbitrary, pre-specified powers

            * **ARCH** - ARCH process
            * **EGARCH** - EGARCH process
            * **FIGARCH** - Fractionally Integrated (FI) GARCH process
            * **HARCH** - Heterogeneous ARCH process
            * **Constant** (default) - Constant volatility process

        p: int, optional
            Global default lag order of the symmetric innovation

        o: int, optional
            Global default lag order of the asymmetric innovation

        q: int, optional
            Global default lag order of lagged volatility or equivalent

        power: float, optional
            Global default power to use with the innovations, Default is 2.0, which produces ARCH and related
            models. Using 1.0 produces AVARCH and related models. Other powers can be specified, although these should
            be strictly positive, and usually larger than 0.25.

        dist:  { 'normal', 'gaussian', 'studentst', 't', 'skewstudent', 'skewt', 'ged', 'generalized error' }, optional
            Name of the global default distribution for the innovations.  Currently supported options are:

            * **normal**, **gaussian** (default) - Standard Normal distribution
            * **t**, **studentst** - Standardized Student’s distribution
            * **skewstudent**, **skewt** - Standardized Skewed Student’s distribution.
            * **ged**, **generalized error" - Generalized Error Distribution

        hold_back: int
            Global default. Number of observations at the start of the sample to exclude when estimating model
            parameters. Used when comparing models with different lag lengths to estimate on the common sample.

        scale: float
            Global default factor to scale data up or down by. This is useful when your data is too small leading to
            numerical errors when fitting. It will be used to scale simulation data
        """
        super(MUArch, self).__init__()

        if isinstance(n, abc.Iterable):
            self.__models = list(n)
            self.__n = len(self.__models)

            assert self.__n != 0, "If passing in a list of UArch models, list cannot be empty!"

            for i, m in enumerate(self.__models):
                assert isinstance(m, UArch), f'The model in index {i} must be an instance of UArch'

            self._model_names = [str(i) for i in range(self.__n)]

        elif type(n) is int:
            self.__n = n
            self.__models = [UArch(mean, lags, vol, p, o, q, power, dist, hold_back, scale) for _ in range(n)]
            self._model_names = [str(i) for i in range(n)]
        else:
            raise TypeError('`n` must either be an integer specifying the number of models or a list of UArch models')

[docs]    def fit(self, y: Union[pd.DataFrame, np.ndarray], x: Exog = None, update_freq=1, disp='off', cov_type='robust',
            show_warning=True, tol=None, options=None):
        """
        Fits the MUArch model.

        If finer control over the MUArch models is required, set the UArch models separately. Otherwise, method will
        set the default parameters.

        Parameters
        ----------
        y: {ndarray, Series}
            The dependent variable. If a vector is passed in, it is assumed that the same vector (endog) is used for
            all models. Otherwise, the last value of the shape must match the number of models

        x: list of {ndarray, None}, optional
            Exogenous regressors.  Ignored if model does not permit exogenous regressors. If passed in, the first shape
            must match the number of models.

        update_freq: int, optional
            Frequency of iteration updates.  Output is generated every `update_freq` iterations. Set to 0 to disable
            iterative output

        disp: 'final' or 'off' (default)
            Either 'final' to print optimization result or 'off' to display nothing

        cov_type: str, optional
            Estimation method of parameter covariance.  Supported options are 'robust', which does not assume the
            Information Matrix Equality holds and 'classic' which does. In the ARCH literature, 'robust' corresponds
            to Bollerslev-Wooldridge covariance estimator.

        show_warning: bool, optional
            Flag indicating whether convergence warnings should be shown.

        tol: float, optional
            Tolerance for termination

        options: dict, optional
            Options to pass to `scipy.optimize.minimize`.  Valid entries include 'ftol', 'eps', 'disp', and 'maxiter'

        Returns
        -------
        MUArch
            Fitted self instance
        """

        if y.shape[1] != self.__n:  # checks on input data
            raise ValueError("number of columns in data 'y' does not match expected dimension of MUArch object")

        if isinstance(y, pd.DataFrame):  # setting names for params later
            self._model_names = list(y.columns)
            y = y.values
        else:
            self._model_names = [str(i) for i in range(self.__n)]

        if y.ndim == 1:
            y = np.tile(y, self.__n)
        elif y.ndim > 2:
            raise ValueError("Dependent variable `y` must either be 1 or 2 dimensional")

        for i in range(self.__n):
            yy = y[:, i]
            xx = x[i] if x is not None else None
            if xx is not None:
                xx = np.asarray(xx)
                if xx.ndim == 1:
                    xx = xx.reshape(len(xx), 1)

            self[i].fit(yy, xx, update_freq=update_freq, disp=disp, cov_type=cov_type, show_warning=show_warning,
                        tol=tol,
                        options=options)

    @property
    def params(self):
        return pd.DataFrame([m.params for m in self], self._model_names)

[docs]    def residuals(self, standardize=True) -> np.ndarray:
        """
        Model residuals

        The residuals will be burnt by the maximum lag of the underlying models. For example, given 3 models - AR(1),
        AR(10), Constant with 400 data points each, the residuals will be 399, 390 and 400 long. The function will
        cut off the first 10 data points in this instance.

        Parameters
        ----------
        standardize: bool, optional
            Whether to standardize residuals. Residuals are standardized by dividing it with the conditional volatility

        Returns
        -------
        ndarray
            Residuals
        """
        residuals = [mm.residuals(standardize) for mm in self]
        length = min([len(r) for r in residuals])

        return np.asarray([r[-length:] for r in residuals]).T

[docs]    def simulate(self,
                 nobs,
                 burn=500,
                 initial_value=None,
                 x=None,
                 initial_value_vol=None,
                 data_only=True,
                 custom_dist: Optional[Union[RNG_GEN, np.ndarray]] = None):
        """
        Simulates data from the multiple ARMA-GARCH models

        Parameters
        ----------
        nobs: int
            Length of series to simulate

        burn: int, optional
            Number of values to simulate to initialize the model and remove dependence on initial values

        initial_value: {ndarray, float}, optional
            Either a scalar value or `max(lags)` array set of initial values to use when initializing the model.
            If omitted, 0.0 is used. If array, the last column must be of the same size as the number of models

        x: {ndarray, list of ndarray}, optional
            If supplied as a list, this list should have the same number of elements as the number of models in the
            MUArch model. Each array inside is the specified exogenous variable for that  particular model and
            this must be a nobs + burn by k matrix of exogenous variables to include in the simulation. Otherwise,
            leave the value as `None` to indicate no exogenous variables are used for simulation in the model.

            If an array is supplied directly, it means every model has an exogenous variable associated with it. In
            this case, it should be a 3 dimensional tensor where the first dimension represents the number of models.

        initial_value_vol: {ndarray, float}, optional
            An array or scalar to use when initializing the volatility process. If array, the last column must be of the
            same size as the number of models

        data_only: bool, default True
            If True, this returns only the simulated data, omits the volatility and error. In this case, it will return
            as a numpy array. Otherwise, it returns a data frame with the data, volatility and error

        custom_dist: {ndarray, Callable}, optional
            Optional density from which to simulate the innovations (Distribution) in the GARCH models. This is useful
            when working with the copula-GARCH model where each univariate model innovations has dependence on others.
            It is assumed that the values supplied are standardized [0, 1] innovations instead of the unstandardized
            residuals.

            The shape of the array must be at least as long as the simulation size required after accounting for burn
            and type of innovation process. If unsure, use :code:`simulation_size_required` to check. It must also
            account for the number of dimensions of the MUArch model. For example, if MUArch model is simulating a
            horizon of 120 time steps, 10000 trials and has 5 UArch models, the shape of the numpy array should be
            (120, 10000, 5).

            If a random number generator function is passed in, ensure that it only takes only argument and returns
            a numpy array. The argument can be an integer or a tuple of integers. In this case, the size will be
            automatically derived to save the user the trouble.

        Returns
        -------
        simulated_data : {List[DataFrame], ndarray}
            List of DataFrame with columns data containing the simulated values, volatility, containing the conditional
            volatility and errors containing the errors used in the simulation

        See Also
        --------
        UArch.simulation_horizon_required: Calculates the simulation size required
        """

        initial_value, initial_value_vol, x, custom_dist = self._format_simulation_parameters(nobs,
                                                                                              burn,
                                                                                              None,
                                                                                              initial_value,
                                                                                              initial_value_vol,
                                                                                              x,
                                                                                              custom_dist)

        sims = []
        for i in range(self.__n):
            r = custom_dist[..., i] if custom_dist is not None else None
            xx = x[i]
            iv = initial_value[i]
            ivv = initial_value_vol[i]

            sims.append(self[i].simulate(nobs, burn, iv, xx, ivv, data_only, None, r))

        if data_only:
            sims = np.asarray(sims).T

        return sims

[docs]    def simulate_mc(self,
                    nobs,
                    reps,
                    burn=500,
                    initial_value=None,
                    x=None,
                    initial_value_vol=None,
                    custom_dist: Optional[Union[RNG_GEN, np.ndarray]] = None,
                    n_jobs: Optional[int] = None):
        """
        Simulates data from the multiple ARCH-GARCH models.

        This function is specially crafted for Monte-Carlo simulations.

        Parameters
        ----------
        nobs: int
            Length of series to simulate

        reps: int
            Number of repetitions

        burn: int, optional
            Number of values to simulate to initialize the model and remove dependence on initial values

        initial_value: {ndarray, float}, optional
            Either a scalar value or `max(lags)` array set of initial values to use when initializing the model.
            If omitted, 0.0 is used. If array, the last column must be of the same size as the number of models

        x: {ndarray, list of ndarray}, optional
            If supplied as a list, this list should have the same number of elements as the number of models in the
            MUArch model. Each array inside is the specified exogenous variable for that  particular model and
            this must be a nobs + burn by k matrix of exogenous variables to include in the simulation. Otherwise,
            leave the value as `None` to indicate no exogenous variables are used for simulation in the model.

            If an array is supplied directly, it means every model has an exogenous variable associated with it. In
            this case, it should be a 3 dimensional tensor where the first dimension represents the number of models.

        initial_value_vol: {ndarray, float}, optional
            An array or scalar to use when initializing the volatility process. If array, the last column must be of the
            same size as the number of models

        custom_dist: {ndarray, Callable}, optional
            Optional density from which to simulate the innovations (Distribution) in the GARCH models. This is useful
            when working with the copula-GARCH model where each univariate model innovations has dependence on others.
            It is assumed that the values supplied are standardized [0, 1] innovations instead of the unstandardized
            residuals.

            The shape of the array must be at least as long as the simulation size required after accounting for burn
            and type of innovation process. If unsure, use :code:`simulation_size_required` to check.

            If a random number generator function is passed in, he size will be automatically derived to save the user
            the trouble. However, the function must:

            * take as it first argument an integer or a tuple of integer
            * have other parameters that are optional
            * return a numpy array

        n_jobs: int or None, optional
            The number of jobs to run in parallel for simulation. This is particularly useful when simulating large
            number of repetitions with more than 1 dimension. None defaults to using 1 processor. Any numbers less or
            equal to 0 means to use all processors. Even if a large number is used, it will be capped at the
            maximum number of processors available.


        Returns
        -------
        simulated_data : numpy array
            Array containing simulated data from the Monte Carlo Simulation
        """

        if not isinstance(reps, int) or reps < 1:
            raise ValueError('reps must be an integer greater than 0')

        # setting up job processes
        if n_jobs is None:
            n_jobs = 1
        elif not isinstance(n_jobs, int):
            raise ValueError('`n_jobs` must be an integer')
        else:
            n_jobs = min(os.cpu_count(), os.cpu_count() if n_jobs <= 0 else n_jobs)

        initial_value, initial_value_vol, x, custom_dist = self._format_simulation_parameters(nobs,
                                                                                              burn,
                                                                                              reps,
                                                                                              initial_value,
                                                                                              initial_value_vol,
                                                                                              x,
                                                                                              custom_dist)

        # formulate function calls to get simulated data
        functions = []
        for i in range(self.__n):
            r = custom_dist[..., i] if custom_dist is not None else None
            xx = x[i]
            iv = initial_value[i]
            ivv = initial_value_vol[i]

            func = partial(self[i].simulate_mc, nobs, reps, burn, iv, xx, ivv, None, r)
            functions.append(func)

        # switch between sequential and multiprocessing
        if n_jobs == 1:  # sequential
            sims = np.zeros((nobs, reps, self.__n))
            for i, func in enumerate(functions):
                sims[:, :, i] = func()

            return sims
        else:  # multiprocessing
            results = []

            with ProcessPoolExecutor(n_jobs) as P:
                for func in functions:
                    results.append(P.submit(func))

            results = np.asarray([r.result() for r in results])
            return np.moveaxis(results, 0, -1)

[docs]    def summary(self, short=False, dp=4):
        """
        Summary of fitted models

        :param short: bool, default False
            Whether to show short summary or full summary.
        :param dp: int, default 4
            Number of decimal places to show in short summary
        :return: SummaryList
            summary of fitted models
        """
        if short:
            return pd.DataFrame([m.summary(short, dp) for m in self], self._model_names)
        else:
            smry = SummaryList()
            for i, name in enumerate(self._model_names):
                s = self[i].summary(short, dp)
                s.add_header(name)
                smry.append(s)
            return smry

    def _format_simulation_parameters(self,
                                      nobs,
                                      burn,
                                      reps,
                                      initial_value,
                                      initial_value_vol,
                                      x,
                                      custom_dist):
        """Ensures the simulation are okay and return the formatted parameters"""

        # Format initial value
        if initial_value is None or isinstance(initial_value, (float, int)):
            initial_value = np.asarray([initial_value] * self.__n)

        # Format initial volatility value
        if initial_value_vol is None or isinstance(initial_value, (float, int)):
            initial_value_vol = np.asarray([initial_value_vol] * self.__n)

        # Format exogenous regressors
        if x is not None:
            if isinstance(x, np.ndarray):
                assert x.ndim == 3, ("If numpy array passed in as MUArch `x` (exog), make sure it is a 3 "
                                     "dimensional array where the last dimension is the number of elements")
                x = [x[i] for i in range(self.__n)]
            else:
                x = list(x)

                assert len(x) == self.__n, ("Exogenous variable's list should have the same number of elements as the "
                                            "number of models")

                for i, xx in enumerate(x):
                    if xx is None:
                        continue
                    elif isinstance(x[0], pd.DataFrame):
                        # convert all data frames to numpy array then move the model axis (first) to last
                        xx = np.moveaxis(np.array([df.values for df in x]), 0, -1)
                    else:
                        xx = np.asarray(xx)

                    if xx.ndim == 1:
                        xx = xx.reshape(-1, 1)
                    x[i] = xx
        else:
            x = [None] * self.__n

        # Format custom distribution
        if custom_dist is not None:
            horizon = max(m.simulation_horizon_required(nobs, burn) for m in self)
            if isinstance(reps, int):  # for monte carlo
                size = horizon, reps
            else:
                size = horizon

            if callable(custom_dist):
                custom_dist = np.asarray(custom_dist(size))
            else:
                custom_dist = np.asarray(custom_dist)

            if custom_dist.ndim == 1:
                custom_dist = custom_dist.reshape(1, -1)

            dim = custom_dist.shape[-1]
            assert dim == self.__n, (f"expected generator to return array with {self.__n} vectors in the last axis "
                                     f"(of shape) but got {dim}")

        return initial_value, initial_value_vol, x, custom_dist

    def _repr_html_(self):
        header = '<h2>Arch Models</h2>'
        models = '<br/>'.join(m._repr_html_() for m in self)
        return header + models

    def __getitem__(self, i: int):
        return self.__models[i]

    def __setitem__(self, i: int, model: UArch):
        assert isinstance(model, UArch), 'only UArch models are allowed to be set as a component of the MUArch object'
        self.__models[i] = model

    def __repr__(self):
        return self.__str__() + f', {hex(id(self))}'

    def __str__(self):
        txt = f'{"Arch Models":^80}\n' + '=' * 80 + '\n\n'
        txt += '\n\n\n'.join(str(m) for m in self)
        return txt

    def __len__(self):
        return self.__n