In [1]:
# Custom libraries
from datascienceutils import plotter
from datascienceutils import analyze
# Standard libraries
import json
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
import random
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.charts import Histogram
import bokeh
output_notebook(bokeh.resources.INLINE)
In [2]:
# Iris data set, read from a CSV file under ./data.
irisDf = pd.read_csv('./data/Iris.csv')
# Sample time series taken from Backblaze's published hard-drive test data:
# https://www.backblaze.com/b2/hard-drive-test-data.html
hdd2013Df = pd.read_csv('./data/hdd_2013-11-26.csv')
In [3]:
# Create classes for showing off correlation_analyze's heatmapping ability
def createClasses(x):
    """Return a random class label ('A', 'B' or 'C'), ignoring ``x``.

    A single uniform draw from ``random.random()`` selects the label:
    'A' for draws below 0.3, 'B' for draws in [0.3, 0.6) and 'C' otherwise,
    i.e. roughly 30% / 30% / 40% of calls.

    Parameters
    ----------
    x : any
        Unused. The parameter exists so the function can be handed to
        ``Series.apply``, which passes each element in.

    Returns
    -------
    str
        One of 'A', 'B' or 'C'.

    Notes
    -----
    The draw comes from the global ``random`` state, so the labels differ
    between runs unless ``random.seed`` is called beforehand.
    """
    rdm = random.random()
    if rdm < 0.3:
        return 'A'
    # Only an upper bound is needed here: the branch above already handled
    # draws below 0.3. The former strict `rdm > 0.3` test sent a draw of
    # exactly 0.3 to 'C' instead of 'B'.
    if rdm < 0.6:
        return 'B'
    return 'C'
# Add a 'Class' column with one random label per row. createClasses is called
# once per Species value but does not use the value it receives.
irisDf['Class'] = irisDf['Species'].apply(createClasses)
In [4]:
# Summary statistics for the iris frame.
irisDf.describe()
Out[4]:
In [5]:
# Preview the first rows of the iris frame.
irisDf.head()
Out[5]:
In [6]:
# Pairwise correlations between the numeric columns only.
# irisDf also holds string columns ('Species', 'Class'); recent pandas
# versions raise if corr() is called on them, while older versions dropped
# them silently. Selecting the numeric columns first behaves the same on both.
irisDf.select_dtypes(include=[np.number]).corr()
Out[6]:
In [7]:
# Names of the columns whose dtype is numeric.
irisDf.select_dtypes(include=[np.number]).columns
Out[7]:
In [8]:
# Run the project's correlation analysis on the iris frame. 'Id' is passed as
# the column to exclude; 'Species' and the randomly generated 'Class' column
# are passed as categories, and 'count' plus the four measurement columns as
# measures.
# NOTE(review): exclude_columns receives a plain string while the other
# arguments are lists — confirm that correlation_analyze accepts both forms.
analyze.correlation_analyze(irisDf, exclude_columns='Id',
categories=['Species', 'Class'],
measures=['count', 'SepalLengthCm','SepalWidthCm',
'PetalLengthCm', 'PetalWidthCm'])
In [9]:
# Distribution analysis with no column specified.
# NOTE(review): presumably this covers every column of the frame — confirm
# against datascienceutils.analyze.dist_analyze.
analyze.dist_analyze(irisDf)
In [10]:
# Distribution analysis for the 'SepalLengthCm' column.
analyze.dist_analyze(irisDf, 'SepalLengthCm')
In [11]:
# Regression analysis on the 'SepalLengthCm' and 'SepalWidthCm' columns.
# NOTE(review): which of the two is treated as the predictor and which as
# the response depends on regression_analyze's signature — confirm.
analyze.regression_analyze(irisDf, 'SepalLengthCm', 'SepalWidthCm')
In [16]:
# Preview the first rows of the iris frame again, after the analysis cells.
irisDf.head()
Out[16]:
In [17]:
# Substitute 0 for every missing value in the hard-drive frame, then show
# summary statistics of the filled frame.
hdd2013Df = hdd2013Df.fillna(0)
hdd2013Df.describe()
Out[17]:
In [18]:
# Preview the first rows of the hard-drive frame.
hdd2013Df.head()
Out[18]:
In [19]:
# Convert the 'date' column to datetime64[ns] so it can take part in
# datetime arithmetic.
# NOTE(review): this runs after the earlier fillna(0), so a missing date
# would arrive here as the integer 0 — confirm the column has no gaps.
hdd2013Df['date'] = hdd2013Df['date'].astype('datetime64[ns]')
In [20]:
# Spread the rows out in time: the row at position i is shifted forward by
# i * 45 seconds. The offsets are built and added in one vectorized step
# instead of a Python-level loop over every row.
# NOTE: this overwrites 'date' with a value derived from itself, so running
# the cell a second time adds the offsets again.
hdd2013Df['date'] = hdd2013Df['date'] + pd.to_timedelta(
    np.arange(len(hdd2013Df)) * 45, unit='s')
In [21]:
# Time-series analysis of 'smart_1_raw' against the 'date' column.
# NOTE(review): the rows were spaced 45 seconds apart when 'date' was
# rewritten, but the seasonal frequency passed here is '30s' — confirm the
# mismatch is intended.
analyze.time_series_analysis(hdd2013Df, timeCol='date', valueCol='smart_1_raw', seasonal={'freq': '30s'})
In [ ]: