Principal Component Analysis (PCA) of various linux metrics

This is a personal project around causal impact analysis: studying grey disruptions using windowed PCA. The sky is the limit.

Starting point: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html


In [1]:
%matplotlib inline
import os
import sys

import pandas as pd
import pandas_datareader.data as web
import numpy as np

import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
from arch import arch_model

import matplotlib.pyplot as plt
import matplotlib as mpl

#print('Machine: {}\n'.format(os.uname()))
#print(sys.version)

def tsplot(title, y, lags=None, figsize=(10, 8), style='bmh'):
    """Draw a five-panel diagnostic figure for a time series.

    Panels: the series itself (full width), its ACF and PACF, a QQ plot
    and a probability plot.

    Parameters
    ----------
    title : str
        Text for the top panel; the sample mean and standard deviation of
        ``y`` are appended to it.
    y : pandas.Series or array-like
        The series to plot. Anything that is not already a Series is
        wrapped in ``pd.Series``.
    lags : int or None
        Forwarded unchanged to the ACF and PACF plotting functions.
    figsize : tuple
        Figure size handed to ``plt.figure``.
    style : str
        Matplotlib style applied only while the figure is being built.

    Returns
    -------
    None
        The figure is produced as a side effect.
    """
    if not isinstance(y, pd.Series):
        y = pd.Series(y)

    with plt.style.context(style):
        plt.figure(figsize=figsize)

        # 3 rows x 2 columns; the series spans the whole first row.
        layout = (3, 2)
        ts_ax = plt.subplot2grid(layout, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(layout, (1, 0))
        pacf_ax = plt.subplot2grid(layout, (1, 1))
        qq_ax = plt.subplot2grid(layout, (2, 0))
        pp_ax = plt.subplot2grid(layout, (2, 1))

        y.plot(ax=ts_ax)
        # Raw string: \mu and \sigma are mathtext commands, not Python
        # escape sequences, so the backslashes must reach matplotlib as-is.
        title_with_infos = r"{:s} $(\mu={:.3f}, \sigma={:.3f})$".format(
            title, y.mean(), y.std())
        ts_ax.set_title(title_with_infos)

        # NOTE(review): alpha=0.5 is forwarded to statsmodels for the
        # confidence band; confirm this width is intended (0.05 is the
        # more conventional choice).
        smt.graphics.plot_acf(y, lags=lags, ax=acf_ax, alpha=0.5)
        smt.graphics.plot_pacf(y, lags=lags, ax=pacf_ax, alpha=0.5)

        sm.qqplot(y, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')
        # The sample mean and std are passed as the distribution parameters.
        scs.probplot(y, sparams=(y.mean(), y.std()), plot=pp_ax)

        plt.tight_layout()


/Users/sbergeron/anaconda/lib/python2.7/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools

In [5]:
# Read the captured metrics CSV (path is relative to the notebook's
# working directory) into the frame used by the cells below.
metrics_csv_path = '../loganalyzer/resources/cache/cxseed/scheduling.schedule.builder.CatchupAsOfTodaySimulation/nmon/rawdata/cxseed-a-back.csv'
df = pd.read_csv(metrics_csv_path)

# Plot the three NETETOTAL throughput columns against the 'Time' column.
netetotal_columns = [
    'NETETOTAL total-KB/s',
    'NETETOTAL total-read-KB/s',
    'NETETOTAL total-write-KB/s',
]
df.plot(x='Time', y=netetotal_columns, style='o-')


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x11daf4290>

In [6]:
#pd.set_option('display.max_rows', len(df))
#print(df[['NETETOTAL total-KB/s','NETETOTAL total-read-KB/s','NETETOTAL total-write-KB/s']])
#list(df.columns.values)


In [2]:
# Seed NumPy's global generator so the draws below repeat on every fresh run.
np.random.seed(1)
n_samples = 1000

# Discrete white noise: n_samples draws from np.random.normal with its
# default parameters, shown with the diagnostic panels.
randser = np.random.normal(size=n_samples)
tsplot(title='Discrete white noise', y=randser, lags=30)



In [3]:
# Random walk: x[t] = x[t-1] + w[t], i.e. the running sum of the noise.
# np.cumsum returns a NEW array, so `randser` keeps its white-noise values
# and this cell gives the same result every time it is re-run. The earlier
# in-place loop aliased x, w and randser to one buffer, which overwrote the
# noise and, at t=0, added x[-1] (the last noise sample) through negative
# indexing.
w = randser
x = np.cumsum(w)

tsplot('Random walk', x, lags=30)
# Differencing the walk recovers the noise increments w[1:].
tsplot('First difference of random walk', np.diff(x), lags=30)



In [ ]: