Source code for muarch.muarch

import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from typing import Collection, Optional, Union

import numpy as np
import pandas as pd

from .exceptions import EndogInputError, InvalidModelError
from .summary import SummaryList
from .typings import Endog, Exog, RngGen
from .uarch import UArch


[docs]class MUArch: """ Multi-univariate ARCH model. Unlike a multivariate ARCH model, this fits each univariate time series individually. Any simulations returns simulations of each univariate series column bound together. """
[docs] def __init__(self, n: Union[int, Collection[UArch]], mean='Constant', lags=0, vol='GARCH', p=1, o=0, q=1, power=2.0, dist='Normal', hold_back=None, scale=1): r""" Initializes the MUArch model. The MUArch model holds multiple univariate models which are determined during fitting. If the models are not specified, the global default options will be used. Models can be individually specified after initializing the MUArch instance. Parameters ---------- n: int or list of UArch models Number of univariate models to fit. Alternatively, a list of UArch (univariate) models can be specified. mean: { 'zero', 'constant', 'harx', 'har', 'ar', 'arx', 'ls' }, optional Name of the global default mean model. Currently supported options are: * **Constant** (default) - Constant mean model * **Zero** - Zero mean model * **AR** - Autoregression model * **ARX** - Autoregression model with exogenous regressors. Falls back to **AR** if no exogenous regressors * **HAR** - Heterogeneous Autoregression model * **HARX** - Heterogeneous Autoregressions with exogenous regressors * **LS** - Least squares model For more information on the different models, check out the documentation at https://arch.readthedocs.io/en/latest/univariate/mean.html lags: int or list (int), optional Global default lag. Either a scalar integer value indicating lag length or a list of integers specifying lag locations. vol: { 'GARCH', 'ARCH', 'EGARCH', 'FIGARCH' and 'HARCH', 'CONSTANT' }, optional Name of the global default volatility model. Currently supported options are: * **GARCH** (default) - Standard GARCH process which can be used to specify the following models: * ARCH(p) * GARCH(p,q) * GJR-GARCH(p,o,q) * AVARCH(p) * AVGARCH(p,q) * TARCH(p,o,q) * Models with arbitrary, pre-specified powers * **ARCH** - ARCH process * **EGARCH** - EGARCH process * **FIGARCH** - Fractionally Integrated (FI) GARCH process * **HARCH** - Heterogeneous ARCH process * **Constant** (default) - Constant volatility process p: int, optional Global default lag order of the symmetric innovation o: int, optional Global default lag order of the asymmetric innovation q: int, optional Global default lag order of lagged volatility or equivalent power: float, optional Global default power to use with the innovations, Default is 2.0, which produces ARCH and related models. Using 1.0 produces AVARCH and related models. Other powers can be specified, although these should be strictly positive, and usually larger than 0.25. dist: { 'normal', 'gaussian', 'studentst', 't', 'skewstudent', 'skewt', 'ged', 'generalized error' }, optional Name of the global default distribution for the innovations. Currently supported options are: * **normal**, **gaussian** (default) - Standard Normal distribution * **t**, **studentst** - Standardized Student’s distribution * **skewstudent**, **skewt** - Standardized Skewed Student’s distribution. * **ged**, **generalized error" - Generalized Error Distribution hold_back: int Global default. Number of observations at the start of the sample to exclude when estimating model parameters. Used when comparing models with different lag lengths to estimate on the common sample. scale: float Global default factor to scale data up or down by. This is useful when your data is too small leading to numerical errors when fitting. It will be used to scale simulation data """ if isinstance(n, Collection): self._models = list(n) assert len(self) != 0, "If passing in a list of UArch models, list cannot be empty!" for i, m in enumerate(self._models): if not isinstance(m, UArch): raise InvalidModelError(i) self._model_names = [str(i) for i in range(len(self))] elif isinstance(n, int): self._models = [UArch(mean, lags, vol, p, o, q, power, dist, hold_back, scale) for _ in range(n)] self._model_names = [str(i) for i in range(n)] else: raise TypeError('`n` must either be an integer specifying the number of models or a list of UArch models')
[docs] def fit(self, y: Endog, x: Exog = None, update_freq=1, disp='off', cov_type='robust', show_warning=True, tol: float = None, options=None): """ Fits the MUArch model. If finer control over the MUArch models is required, set the UArch models separately. Otherwise, method will set the default parameters. Parameters ---------- y: {ndarray, Series} The dependent variable. If a vector is passed in, it is assumed that the same vector (endog) is used for all models. Otherwise, the last value of the shape must match the number of models x: list of {ndarray, None}, optional Exogenous regressors. Ignored if model does not permit exogenous regressors. If passed in, the first shape must match the number of models. update_freq: int, optional Frequency of iteration updates. Output is generated every `update_freq` iterations. Set to 0 to disable iterative output disp: 'final' or 'off' (default) Either 'final' to print optimization result or 'off' to display nothing cov_type: str, optional Estimation method of parameter covariance. Supported options are 'robust', which does not assume the Information Matrix Equality holds and 'classic' which does. In the ARCH literature, 'robust' corresponds to Bollerslev-Wooldridge covariance estimator. show_warning: bool, optional Flag indicating whether convergence warnings should be shown. tol: float, optional Tolerance for termination options: dict, optional Options to pass to `scipy.optimize.minimize`. Valid entries include 'ftol', 'eps', 'disp', and 'maxiter' Returns ------- MUArch Fitted self instance """ if y.ndim == 1: y = np.tile(y, len(self)) if y.shape[1] != len(self) or y.ndim > 2: # checks on input data raise EndogInputError if isinstance(y, pd.DataFrame): # setting names for params later self._model_names = list(y.columns) y = y.values else: self._model_names = [str(i) for i in range(len(self))] for i in range(len(self)): yy = y[:, i] xx = x[i] if x is not None else None if xx is not None: xx = np.asarray(xx) if xx.ndim == 1: xx = xx.reshape(len(xx), 1) self[i].fit(yy, xx, update_freq=update_freq, disp=disp, cov_type=cov_type, show_warning=show_warning, tol=tol, options=options)
@property def params(self): return pd.DataFrame([m.params for m in self], self._model_names)
[docs] def residuals(self, standardize=True) -> np.ndarray: """ Model residuals The residuals will be burnt by the maximum lag of the underlying models. For example, given 3 models - AR(1), AR(10), Constant with 400 data points each, the residuals will be 399, 390 and 400 long. The function will cut off the first 10 data points in this instance. Parameters ---------- standardize: bool, optional Whether to standardize residuals. Residuals are standardized by dividing it with the conditional volatility Returns ------- ndarray Residuals """ residuals = [mm.residuals(standardize) for mm in self] length = min([len(r) for r in residuals]) return np.asarray([r[-length:] for r in residuals]).T
[docs] def simulate(self, nobs, burn=500, initial_value=None, x=None, initial_value_vol=None, data_only=True, custom_dist: Optional[Union[RngGen, np.ndarray]] = None): """ Simulates data from the multiple ARMA-GARCH models Parameters ---------- nobs: int Length of series to simulate burn: int, optional Number of values to simulate to initialize the model and remove dependence on initial values initial_value: {ndarray, float}, optional Either a scalar value or `max(lags)` array set of initial values to use when initializing the model. If omitted, 0.0 is used. If array, the last column must be of the same size as the number of models x: {ndarray, list of ndarray}, optional If supplied as a list, this list should have the same number of elements as the number of models in the MUArch model. Each array inside is the specified exogenous variable for that particular model and this must be a nobs + burn by k matrix of exogenous variables to include in the simulation. Otherwise, leave the value as `None` to indicate no exogenous variables are used for simulation in the model. If an array is supplied directly, it means every model has an exogenous variable associated with it. In this case, it should be a 3 dimensional tensor where the first dimension represents the number of models. initial_value_vol: {ndarray, float}, optional An array or scalar to use when initializing the volatility process. If array, the last column must be of the same size as the number of models data_only: bool, default True If True, this returns only the simulated data, omits the volatility and error. In this case, it will return as a numpy array. Otherwise, it returns a data frame with the data, volatility and error custom_dist: {ndarray, Callable}, optional Optional density from which to simulate the innovations (Distribution) in the GARCH models. This is useful when working with the copula-GARCH model where each univariate model innovations has dependence on others. It is assumed that the values supplied are standardized [0, 1] innovations instead of the unstandardized residuals. The shape of the array must be at least as long as the simulation size required after accounting for burn and type of innovation process. If unsure, use :code:`simulation_size_required` to check. It must also account for the number of dimensions of the MUArch model. For example, if MUArch model is simulating a horizon of 120 time steps, 10000 trials and has 5 UArch models, the shape of the numpy array should be (120, 10000, 5). If a random number generator function is passed in, ensure that it only takes only argument and returns a numpy array. The argument can be an integer or a tuple of integers. In this case, the size will be automatically derived to save the user the trouble. Returns ------- simulated_data : {List[DataFrame], ndarray} List of DataFrame with columns data containing the simulated values, volatility, containing the conditional volatility and errors containing the errors used in the simulation See Also -------- UArch.simulation_horizon_required: Calculates the simulation size required """ initial_value, initial_value_vol, x, custom_dist = self._format_simulation_parameters(nobs, burn, None, initial_value, initial_value_vol, x, custom_dist) sims = [] for i in range(len(self)): r = custom_dist[..., i] if custom_dist is not None else None xx = x[i] iv = initial_value[i] ivv = initial_value_vol[i] sims.append(self[i].simulate(nobs, burn, iv, xx, ivv, data_only, None, r)) if data_only: sims = np.asarray(sims).T return sims
[docs] def simulate_mc(self, nobs, reps, burn=500, initial_value=None, x=None, initial_value_vol=None, custom_dist: Optional[Union[RngGen, np.ndarray]] = None, n_jobs: Optional[int] = None): """ Simulates data from the multiple ARCH-GARCH models. This function is specially crafted for Monte-Carlo simulations. Parameters ---------- nobs: int Length of series to simulate reps: int Number of repetitions burn: int, optional Number of values to simulate to initialize the model and remove dependence on initial values initial_value: {ndarray, float}, optional Either a scalar value or `max(lags)` array set of initial values to use when initializing the model. If omitted, 0.0 is used. If array, the last column must be of the same size as the number of models x: {ndarray, list of ndarray}, optional If supplied as a list, this list should have the same number of elements as the number of models in the MUArch model. Each array inside is the specified exogenous variable for that particular model and this must be a nobs + burn by k matrix of exogenous variables to include in the simulation. Otherwise, leave the value as `None` to indicate no exogenous variables are used for simulation in the model. If an array is supplied directly, it means every model has an exogenous variable associated with it. In this case, it should be a 3 dimensional tensor where the first dimension represents the number of models. initial_value_vol: {ndarray, float}, optional An array or scalar to use when initializing the volatility process. If array, the last column must be of the same size as the number of models custom_dist: {ndarray, Callable}, optional Optional density from which to simulate the innovations (Distribution) in the GARCH models. This is useful when working with the copula-GARCH model where each univariate model innovations has dependence on others. It is assumed that the values supplied are standardized [0, 1] innovations instead of the unstandardized residuals. The shape of the array must be at least as long as the simulation size required after accounting for burn and type of innovation process. If unsure, use :code:`simulation_size_required` to check. If a random number generator function is passed in, he size will be automatically derived to save the user the trouble. However, the function must: * take as it first argument an integer or a tuple of integer * have other parameters that are optional * return a numpy array n_jobs: int or None, optional The number of jobs to run in parallel for simulation. This is particularly useful when simulating large number of repetitions with more than 1 dimension. None defaults to using 1 processor. Any numbers less or equal to 0 means to use all processors. Even if a large number is used, it will be capped at the maximum number of processors available. Returns ------- simulated_data : numpy array Array containing simulated data from the Monte Carlo Simulation """ if not isinstance(reps, int) or reps < 1: raise ValueError('reps must be an integer greater than 0') # setting up job processes if n_jobs is None: n_jobs = 1 elif not isinstance(n_jobs, int): raise ValueError('`n_jobs` must be an integer or None') else: n_jobs = min(os.cpu_count(), os.cpu_count() if n_jobs <= 0 else n_jobs) initial_value, initial_value_vol, x, custom_dist = self._format_simulation_parameters(nobs, burn, reps, initial_value, initial_value_vol, x, custom_dist) # formulate function calls to get simulated data functions = [] for i in range(len(self)): r = custom_dist[..., i] if custom_dist is not None else None xx = x[i] iv = initial_value[i] ivv = initial_value_vol[i] func = partial(self[i].simulate_mc, nobs, reps, burn, iv, xx, ivv, None, r) functions.append(func) # switch between sequential and multiprocessing if n_jobs == 1: # sequential sims = np.zeros((nobs, reps, len(self))) for i, func in enumerate(functions): sims[:, :, i] = func() return sims else: # multiprocessing results = [] with ProcessPoolExecutor(n_jobs) as P: for func in functions: results.append(P.submit(func)) results = np.asarray([r.result() for r in results]) return np.moveaxis(results, 0, -1)
[docs] def summary(self, short=False, dp=4): """ Summary of fitted models :param short: bool, default False Whether to show short summary or full summary. :param dp: int, default 4 Number of decimal places to show in short summary :return: SummaryList summary of fitted models """ if short: return pd.DataFrame([m.summary(short, dp) for m in self], self._model_names) else: smry = SummaryList() for i, name in enumerate(self._model_names): s = self[i].summary(short, dp) s.add_header(name) smry.append(s) return smry
def _format_simulation_parameters(self, nobs, burn, reps, initial_value, initial_value_vol, x, custom_dist): """Ensures the simulation are okay and return the formatted parameters""" # Format initial value if initial_value is None or isinstance(initial_value, (float, int)): initial_value = np.asarray([initial_value] * len(self)) # Format initial volatility value if initial_value_vol is None or isinstance(initial_value, (float, int)): initial_value_vol = np.asarray([initial_value_vol] * len(self)) # Format exogenous regressors if x is not None: if isinstance(x, np.ndarray): assert x.ndim == 3, ("If numpy array passed in as MUArch `x` (exog), make sure it is a 3 " "dimensional array where the last dimension is the number of elements") x = [x[i] for i in range(len(self))] else: x = list(x) assert len(x) == len(self), ("Exogenous variable's list should have the same number of elements as the " "number of models") for i, xx in enumerate(x): if xx is None: continue elif isinstance(x[0], pd.DataFrame): # convert all data frames to numpy array then move the model axis (first) to last xx = np.moveaxis(np.array([df.values for df in x]), 0, -1) else: xx = np.asarray(xx) if xx.ndim == 1: xx = xx.reshape(-1, 1) x[i] = xx else: x = [None] * len(self) # Format custom distribution if custom_dist is not None: horizon = max(m.simulation_horizon(nobs, burn) for m in self) if isinstance(reps, int): # for monte carlo size = horizon, reps else: size = horizon if callable(custom_dist): custom_dist = np.asarray(custom_dist(size)) else: custom_dist = np.asarray(custom_dist) if custom_dist.ndim == 1: custom_dist = custom_dist.reshape(1, -1) dim = custom_dist.shape[-1] assert dim == len(self), (f"expected generator to return array with {len(self)} vectors in the last axis " f"(of shape) but got {dim}") return initial_value, initial_value_vol, x, custom_dist def _repr_html_(self): header = '<h2>Arch Models</h2>' models = '<br/>'.join(m._repr_html_() for m in self) return header + models def __getitem__(self, i: int): return self._models[i] def __setitem__(self, i: int, model: UArch): assert isinstance(model, UArch), 'only UArch models are allowed to be set as a component of the MUArch object' self._models[i] = model def __repr__(self): return self.__str__() + f', {hex(id(self))}' def __str__(self): txt = f'{"Arch Models":^80}\n' + '=' * 80 + '\n\n' txt += '\n\n\n'.join(str(m) for m in self) return txt def __len__(self): return len(self._models)