In [13]:
# -*- coding: utf-8 -*-

###############################################################################
# Information
###############################################################################
# Created by Linwood Creekmore 

# May 10, 2015

# https://plus.google.com/+LinwoodCreekmoreIII/

###############################################################################
# Imports
###############################################################################

import os
import numpy as np
import pandas as pd
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.preprocessing import Imputer
from scipy.spatial.distance import cdist
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn.metrics import silhouette_samples, silhouette_score


###############################################################################
# Data Load
###############################################################################
path = os.path.abspath(os.getcwd())
# DataFrame.from_csv is deprecated/removed in newer pandas; read_csv with the
# first column as the index is the equivalent call
with open(os.path.normpath(os.path.join(os.path.dirname(path), 'lin.csv'))) as in_data:
    skid_data = pd.read_csv(in_data, index_col=0)


print ("Here is a list of all our calculated variables: \n\n")
print (list(enumerate(skid_data.columns[1:],start = 1)))

# Load the selected feature columns into NumPy float arrays
as_array = np.asfarray(skid_data[[
    'Average Velocity (mph)', 'Max Velocity', 'Velocity Stdev',
    'Average Acceleration (mph per s)', 'Max Acceleration (mph per s)',
    ' Acceleration Stdev', 'Displacement', 'Total Distance Traveled',
    'Max Direction Change per sec', ' Direction Stdev', 'Time (s)',
    'Turns', 'Aggressive Turns', 'Stops', 'Large Deceleration Events',
    'Deceleration Events', 'Max Deceleration Event']])
another_array = np.asfarray(skid_data[[
    'Average Velocity (mph)', 'Displacement', 'Total Distance Traveled',
    ' Direction Stdev', 'Time (s)', 'Max Deceleration Event']])


# Preprocessing: replace missing values (NaN) with the column mean
imputer = Imputer(missing_values="NaN", strategy="mean")
patched = imputer.fit_transform(as_array)
anotherpatched = imputer.fit_transform(another_array)
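# Note: scikit-learn 0.20+ replaces Imputer with SimpleImputer (and removes
# Imputer entirely in 0.22+). A minimal equivalent sketch, assuming a newer install:
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# patched = imputer.fit_transform(as_array)
# anotherpatched = imputer.fit_transform(another_array)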

###############################################################################
# Pick a k using elbow method
###############################################################################

K = range(1, 20)
mean_distortions = []

# Fit k-means for each candidate k and record the average distance of each
# point to its nearest cluster center (the "distortion")
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(patched)
    mean_distortions.append(
        sum(np.min(cdist(patched, kmeans.cluster_centers_, 'euclidean'), axis=1))
        / patched.shape[0])

plt.plot(K, mean_distortions, 'bx-')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Average distortion')
plt.title('Selecting k with the Elbow Method')
plt.show()
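
# The same elbow curve can also be drawn from KMeans' built-in inertia_
# attribute (the within-cluster sum of squared distances), avoiding the
# explicit cdist computation. A minimal alternative sketch over the same K:
# inertias = [KMeans(n_clusters=k).fit(patched).inertia_ for k in K]
# plt.plot(K, inertias, 'bx-')
# plt.xlabel('Number of clusters (k)')
# plt.ylabel('Within-cluster sum of squares (inertia)')
# plt.show()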


###############################################################################
# K-Means
###############################################################################

n_clusters = 3  # chosen from the elbow plot above
cluster = KMeans(n_clusters=n_clusters, n_init=10)   # full feature set
second = KMeans(n_clusters=n_clusters, n_init=10)    # reduced feature set
cluster.fit_predict(patched)
second.fit_predict(anotherpatched)
classified_data = cluster.labels_
morelabels = second.labels_

###############################################################################################
# Evaluate cluster quality using the Silhouette Coefficient; lowest score is -1, highest is 1
###############################################################################################

silhouette_avg = silhouette_score(patched, classified_data)
print("For n_clusters =\n", n_clusters,
      "The average silhouette_score is :", silhouette_avg)

###############################################################################
# Ordinary Least Squares Report
###############################################################################

model = sm.OLS(classified_data, patched)
results = model.fit()
print (results.summary())
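
# statsmodels' OLS does not add an intercept automatically; as a quick check,
# the same regression with a constant term added via sm.add_constant:
results_const = sm.OLS(classified_data, sm.add_constant(patched)).fit()
print("R-squared with intercept:", results_const.rsquared)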

###############################################################################
# Repeat using only the explanatory variables whose absolute t-statistic exceeds 2 (anotherpatched)
###############################################################################
print ("Here's another round with high absolute value t:\n\n")


silhouette_avg = silhouette_score(anotherpatched, morelabels)
print("For n_clusters =\n", n_clusters,
      "The average silhouette_score is :", silhouette_avg)


###############################################################################
# 2nd Ordinary Least Squares Report
###############################################################################

model = sm.OLS(morelabels, anotherpatched)
results = model.fit()
print (results.summary())


Here is a list of all our calculated variables: 


[(1, 'Average Velocity (mph)'), (2, 'Max Velocity'), (3, 'Velocity Stdev'), (4, 'Average Acceleration (mph per s)'), (5, 'Max Acceleration (mph per s)'), (6, ' Acceleration Stdev'), (7, 'Displacement'), (8, 'Total Distance Traveled'), (9, 'Max Direction Change per sec'), (10, ' Direction Stdev'), (11, 'Time (s)'), (12, 'Turns'), (13, 'Aggressive Turns'), (14, 'Stops'), (15, 'Large Deceleration Events'), (16, 'Deceleration Events'), (17, 'Max Deceleration Event')]
For n_clusters = 3 the average silhouette_score is: 0.58511308599974932
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.681
Model:                            OLS   Adj. R-squared:                  0.680
Method:                 Least Squares   F-statistic:                     576.8
Date:                Sun, 10 May 2015   Prob (F-statistic):               0.00
Time:                        02:39:44   Log-Likelihood:                -4502.2
No. Observations:                4600   AIC:                             9038.
Df Residuals:                    4583   BIC:                             9148.
Df Model:                          17                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.0689      0.003     26.943      0.000         0.064     0.074
x2            -0.0046      0.002     -2.760      0.006        -0.008    -0.001
x3             0.0159      0.005      3.414      0.001         0.007     0.025
x4            -0.2751      0.048     -5.690      0.000        -0.370    -0.180
x5             0.0106      0.002      6.519      0.000         0.007     0.014
x6            -0.1187      0.028     -4.228      0.000        -0.174    -0.064
x7         -4.993e-06   5.66e-06     -0.882      0.378     -1.61e-05  6.11e-06
x8         -6.554e-05   6.47e-06    -10.132      0.000     -7.82e-05 -5.29e-05
x9         -3.158e-05      0.000     -0.098      0.922        -0.001     0.001
x10           -0.0042      0.000    -12.472      0.000        -0.005    -0.004
x11            0.0009    6.2e-05     15.297      0.000         0.001     0.001
x12           -0.0025      0.002     -1.392      0.164        -0.006     0.001
x13            0.0187      0.005      3.855      0.000         0.009     0.028
x14            0.0070      0.003      2.301      0.021         0.001     0.013
x15            0.0272      0.009      3.182      0.001         0.010     0.044
x16            0.0172      0.003      5.553      0.000         0.011     0.023
x17            0.0079      0.002      3.925      0.000         0.004     0.012
==============================================================================
Omnibus:                      316.895   Durbin-Watson:                   1.839
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              148.669
Skew:                           0.251   Prob(JB):                     5.21e-33
Kurtosis:                       2.276   Cond. No.                     4.98e+04
==============================================================================

Warnings:
[1] The condition number is large, 4.98e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
Here's another round with high absolute value t:


For n_clusters = 3 the average silhouette_score is: 0.58522902193964066
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.868
Model:                            OLS   Adj. R-squared:                  0.868
Method:                 Least Squares   F-statistic:                     5051.
Date:                Sun, 10 May 2015   Prob (F-statistic):               0.00
Time:                        02:39:57   Log-Likelihood:                -3267.7
No. Observations:                4600   AIC:                             6547.
Df Residuals:                    4594   BIC:                             6586.
Df Model:                           6                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.0634      0.001     51.906      0.000         0.061     0.066
x2         -3.973e-05   4.11e-06     -9.668      0.000     -4.78e-05 -3.17e-05
x3            -0.0001   4.33e-06    -30.194      0.000        -0.000    -0.000
x4             0.0018      0.000      9.195      0.000         0.001     0.002
x5             0.0012    3.2e-05     37.995      0.000         0.001     0.001
x6             0.0013      0.001      1.238      0.216        -0.001     0.003
==============================================================================
Omnibus:                       64.065   Durbin-Watson:                   1.876
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               65.862
Skew:                          -0.285   Prob(JB):                     4.99e-15
Kurtosis:                       2.862   Cond. No.                     1.60e+03
==============================================================================

Warnings:
[1] The condition number is large, 1.6e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
