In [1]:
# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze

# Standard libraries
import datetime
import json
import random

import numpy as np
import pandas as pd

%matplotlib inline


from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
output_notebook(bokeh.resources.INLINE)


Loading BokehJS ...

In [2]:
irisDf = pd.read_csv('./data/Iris.csv')
# Sample time-series data picked from https://www.backblaze.com/b2/hard-drive-test-data.html
hdd2013Df = pd.read_csv('./data/hdd_2013-11-26.csv')

In [3]:
# Create classes for showing off correlation_analyze's heatmapping ability
def createClasses(x):
    # The Species value passed in is ignored; each row gets a random label
    # (~30% 'A', ~30% 'B', ~40% 'C').
    rdm = random.random()
    if rdm < 0.3:
        return 'A'
    elif rdm < 0.6:
        return 'B'
    else:
        return 'C'
irisDf['Class'] = irisDf['Species'].apply(createClasses)
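
The Class labels above are purely random (createClasses ignores its argument), so they change on every run. A seeded, vectorized alternative (a sketch using plain numpy, not part of datascienceutils) keeps the same ~30/30/40 split but is reproducible:

# Hypothetical alternative: draw all labels at once with a fixed seed.
rng = np.random.RandomState(42)
irisDf['Class'] = rng.choice(['A', 'B', 'C'], size=len(irisDf), p=[0.3, 0.3, 0.4])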

In [4]:
irisDf.describe()


Out[4]:
               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000      2.000000       1.000000      0.100000
25%     38.250000       5.100000      2.800000       1.600000      0.300000
50%     75.500000       5.800000      3.000000       4.350000      1.300000
75%    112.750000       6.400000      3.300000       5.100000      1.800000
max    150.000000       7.900000      4.400000       6.900000      2.500000

In [5]:
irisDf.head()


Out[5]:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species Class
0   1            5.1           3.5            1.4           0.2  Iris-setosa     A
1   2            4.9           3.0            1.4           0.2  Iris-setosa     A
2   3            4.7           3.2            1.3           0.2  Iris-setosa     A
3   4            4.6           3.1            1.5           0.2  Iris-setosa     B
4   5            5.0           3.6            1.4           0.2  Iris-setosa     C

In [6]:
irisDf.corr()


Out[6]:
                     Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
Id             1.000000       0.716676     -0.397729       0.882747      0.899759
SepalLengthCm  0.716676       1.000000     -0.109369       0.871754      0.817954
SepalWidthCm  -0.397729      -0.109369      1.000000      -0.420516     -0.356544
PetalLengthCm  0.882747       0.871754     -0.420516       1.000000      0.962757
PetalWidthCm   0.899759       0.817954     -0.356544       0.962757      1.000000

In [7]:
irisDf.select_dtypes(include=[np.number]).columns


Out[7]:
Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')
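
The Id column is just a row counter, and since the file appears to be ordered by species it correlates strongly with the petal measurements. A quick sketch of restricting the correlation matrix to the actual measurement columns:

# Drop the synthetic Id column before computing correlations (sketch).
irisDf.select_dtypes(include=[np.number]).drop('Id', axis=1).corr()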

In [8]:
analyze.correlation_analyze(irisDf, exclude_columns='Id',
                            categories=['Species', 'Class'],
                            measures=['count', 'SepalLengthCm', 'SepalWidthCm',
                                      'PetalLengthCm', 'PetalWidthCm'])


# Correlation btw Numerical Columns
# Correlation btw Columns Species & Class by count
# Pandas correlation coefficients matrix
                     Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  \
Id             1.000000       0.716676     -0.397729       0.882747   
SepalLengthCm  0.716676       1.000000     -0.109369       0.871754   
SepalWidthCm  -0.397729      -0.109369      1.000000      -0.420516   
PetalLengthCm  0.882747       0.871754     -0.420516       1.000000   
PetalWidthCm   0.899759       0.817954     -0.356544       0.962757   

               PetalWidthCm  
Id                 0.899759  
SepalLengthCm      0.817954  
SepalWidthCm      -0.356544  
PetalLengthCm      0.962757  
PetalWidthCm       1.000000  
# Pandas co-variance coefficients matrix
                        Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  \
Id             1887.500000      25.782886     -7.492282      67.667785   
SepalLengthCm    25.782886       0.685694     -0.039268       1.273682   
SepalWidthCm     -7.492282      -0.039268      0.188004      -0.321713   
PetalLengthCm    67.667785       1.273682     -0.321713       3.113179   
PetalWidthCm     29.832215       0.516904     -0.117981       1.296387   

               PetalWidthCm  
Id                29.832215  
SepalLengthCm      0.516904  
SepalWidthCm      -0.117981  
PetalLengthCm      1.296387  
PetalWidthCm       0.582414  
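
The "Correlation btw Columns Species & Class by count" heatmap is essentially a contingency table of the two categorical columns. Roughly the same table can be built directly with pandas (a sketch, not the library's internal code):

# Count of rows per (Species, Class) pair (sketch).
pd.crosstab(irisDf['Species'], irisDf['Class'])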

In [9]:
analyze.dist_analyze(irisDf)


/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:262: UserWarning: Path marker shapes currently not handled, defaulting to Circle
  warnings.warn("Path marker shapes currently not handled, defaulting to Circle")
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/artist.py:224: MatplotlibDeprecationWarning: get_axes has been deprecated in mpl 1.5, please use the
axes property.  A removal date has not been set.
  stacklevel=1)
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:295: UserWarning: Path marker sizes support is limited and may not display as expected
  warnings.warn("Path marker sizes support is limited and may not display as expected")

In [10]:
analyze.dist_analyze(irisDf, 'SepalLengthCm')


/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:262: UserWarning: Path marker shapes currently not handled, defaulting to Circle
  warnings.warn("Path marker shapes currently not handled, defaulting to Circle")
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/artist.py:224: MatplotlibDeprecationWarning: get_axes has been deprecated in mpl 1.5, please use the
axes property.  A removal date has not been set.
  stacklevel=1)
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/bokeh/core/compat/bokeh_renderer.py:295: UserWarning: Path marker sizes support is limited and may not display as expected
  warnings.warn("Path marker sizes support is limited and may not display as expected")

In [11]:
analyze.regression_analyze(irisDf, 'SepalLengthCm', 'SepalWidthCm')


Regression Score
0.0119616328348
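
The score equals the squared Pearson correlation between the two columns ((-0.109369)**2 ≈ 0.01196), i.e. the R^2 of a simple linear fit. A sketch of the equivalent computation with scikit-learn, assuming that is indeed what regression_analyze reports:

# R^2 of an ordinary least-squares fit of SepalWidthCm on SepalLengthCm
# (sketch; scikit-learn is an assumption here, not necessarily what the library uses).
from sklearn.linear_model import LinearRegression
X = irisDf[['SepalLengthCm']].values
y = irisDf['SepalWidthCm'].values
print(LinearRegression().fit(X, y).score(X, y))  # ~0.0120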

In [16]:
irisDf.head()


Out[16]:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0   1            5.1           3.5            1.4           0.2
1   2            4.9           3.0            1.4           0.2
2   3            4.7           3.2            1.3           0.2
3   4            4.6           3.1            1.5           0.2
4   5            5.0           3.6            1.4           0.2

In [17]:
hdd2013Df.fillna(value=0, inplace=True)
hdd2013Df.describe()


Out[17]:
capacity_bytes failure smart_1_normalized smart_1_raw smart_2_normalized smart_2_raw smart_3_normalized smart_3_raw smart_4_normalized smart_4_raw ... smart_250_normalized smart_250_raw smart_251_normalized smart_251_raw smart_252_normalized smart_252_raw smart_254_normalized smart_254_raw smart_255_normalized smart_255_raw
count 2.655000e+04 26550.000000 26550.0 2.655000e+04 26550.0 26550.0 26550.0 26550.0 26550.0 26550.0 ... 26550.0 26550.0 26550.0 26550.0 26550.0 26550.0 26550.0 26550.0 26550.0 26550.0
mean 2.895978e+12 0.000151 0.0 5.556346e+07 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
std 8.723011e+11 0.012274 0.0 7.741375e+07 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
min 1.000205e+12 0.000000 0.0 0.000000e+00 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
25% 2.000399e+12 0.000000 0.0 0.000000e+00 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
50% 3.000593e+12 0.000000 0.0 1.025500e+03 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
75% 4.000787e+12 0.000000 0.0 1.092058e+08 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
max 4.000787e+12 1.000000 0.0 1.041384e+09 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

8 rows × 82 columns


In [18]:
hdd2013Df.head()


Out[18]:
date serial_number model capacity_bytes failure smart_1_normalized smart_1_raw smart_2_normalized smart_2_raw smart_3_normalized ... smart_250_normalized smart_250_raw smart_251_normalized smart_251_raw smart_252_normalized smart_252_raw smart_254_normalized smart_254_raw smart_255_normalized smart_255_raw
0 2013-11-26 MJ0351YNG9Z0XA Hitachi HDS5C3030ALA630 3000592982016 0 0.0 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 2013-11-26 MJ0351YNG9Z7LA Hitachi HDS5C3030ALA630 3000592982016 0 0.0 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2013-11-26 MJ0351YNGABYAA Hitachi HDS5C3030ALA630 3000592982016 0 0.0 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 2013-11-26 PL1321LAG34XWH Hitachi HDS5C4040ALE630 4000787030016 0 0.0 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 2013-11-26 PL1311LAG2205A Hitachi HDS5C4040ALE630 4000787030016 0 0.0 0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 85 columns


In [19]:
hdd2013Df['date'] = hdd2013Df['date'].astype('datetime64[ns]')
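
An equivalent, assuming the date column in the file is ISO formatted as shown above, is to parse it at load time (sketch):

# Parse the date column while reading the CSV instead of converting afterwards.
hdd2013Df = pd.read_csv('./data/hdd_2013-11-26.csv', parse_dates=['date'])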

In [20]:
hdd2013Df['date'] = [each + datetime.timedelta(seconds=i * 45) for i, each in enumerate(hdd2013Df.date)]
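
Every row in this snapshot carries the same date, so each row is offset by 45 seconds to fabricate a time series. A vectorized equivalent of the loop above (sketch):

# Same 45-second spacing, computed in one step instead of a Python loop.
hdd2013Df['date'] = hdd2013Df['date'] + pd.to_timedelta(np.arange(len(hdd2013Df)) * 45, unit='s')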

In [21]:
analyze.time_series_analysis(hdd2013Df, timeCol='date', valueCol='smart_1_raw', seasonal={'freq': '30s'})


/home/anand/playspace/data-science-utils/datascienceutils/timeSeriesUtils.py:83: FutureWarning: how in .resample() is deprecated
the new syntax is .resample(...)..apply(<func>)
  new_df = new_df.resample(timeInterval, func)
/home/anand/playspace/data-science-utils/datascienceutils/timeSeriesUtils.py:10: FutureWarning: pd.rolling_mean is deprecated for DataFrame and will be removed in a future version, replace with 
	DataFrame.rolling(center=False,window=12).mean()
  calcStatsDf['rollingMean'] = pd.rolling_mean(timeseries, window=12)[valueCol]
Results of Dickey-Fuller Test:
Test Statistic                -6.208540e+00
p-value                        5.580408e-08
#Lags Used                     1.700000e+01
Number of Observations Used    6.460000e+02
Critical Value (1%)           -3.440513e+00
Critical Value (10%)          -2.569158e+00
Critical Value (5%)           -2.866024e+00
dtype: float64
/home/anand/playspace/data-science-utils/datascienceutils/timeSeriesUtils.py:11: FutureWarning: pd.rolling_std is deprecated for DataFrame and will be removed in a future version, replace with 
	DataFrame.rolling(center=False,window=12).std()
  calcStatsDf['rollingSTD']  = pd.rolling_std(timeseries, window=12)[valueCol]
30s
/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/matplotlib/figure.py:397: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure
  "matplotlib is currently using a non-GUI backend, "
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-21-28bd060d14c5> in <module>()
----> 1 analyze.time_series_analysis(hdd2013Df, timeCol='date', valueCol='smart_1_raw', seasonal={'freq': '30s'})

/home/anand/playspace/data-science-utils/datascienceutils/analyze.py in time_series_analysis(df, timeCol, valueCol, timeInterval, plot_title, skip_stationarity, skip_autocorrelation, skip_seasonal_decompose, **kwargs)
    181         if 'seasonal' in kwargs:
    182             seasonal_args = kwargs.get('seasonal')
--> 183             tsu.seasonal_decompose(ts, **seasonal_args)
    184         else:
    185             tsu.seasonal_decompose(ts)

/home/anand/playspace/data-science-utils/datascienceutils/timeSeriesUtils.py in seasonal_decompose(timeseries_df, freq, **kwargs)
     54     print(freq)
     55     if not freq: freq = len(timeseries_df) - 2
---> 56     seasonal_components = sm.tsa.seasonal_decompose(timeseries_df, freq=freq, **kwargs)
     57     fig = seasonal_components.plot()
     58     return fig

/home/anand/anaconda3/envs/analytics/lib/python3.5/site-packages/statsmodels/tsa/seasonal.py in seasonal_decompose(x, model, filt, freq, two_sided)
     86 
     87     if filt is None:
---> 88         if freq % 2 == 0:  # split weights at ends
     89             filt = np.array([.5] + [1] * (freq - 1) + [.5]) / freq
     90         else:

TypeError: not all arguments converted during string formatting
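
The traceback shows the string '30s' reaching statsmodels' seasonal_decompose, whose freq argument must be an integer period (freq % 2 fails on a string). Passing an integer number of observations per season should avoid the error, for example (a sketch; the period of 12 is an arbitrary illustration, not a recommendation):

analyze.time_series_analysis(hdd2013Df, timeCol='date', valueCol='smart_1_raw',
                             seasonal={'freq': 12})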

In [ ]: