In [3]:

    
import pandas as pd
import numpy as np
import pydotplus
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from IPython.display import Image

import data



In [4]:

    
data = pd.read_csv('../../Data/WeatherOutagesAllJerry.csv')



In [5]:

    
data = data.dropna(how = 'all')



In [6]:

    
data.head(5)









    Out[6]:






  
    
      
      Date
      Total_outages
      Equipment
      Trees
      Animals
      Lightning
      Day_length_hr
      Max_temp_F
      Avg_Temp_F
      Min_temp_F
      ...
      Avg_windspeed_mph
      Max_windgust_mph
      Precipitation_in
      Events
      Event_fog
      Event_rain
      Event_snow
      Event_thunderstorm
      Event_Hail
      Event_Tornado
    
  
  
    
      0
      9/11/00
      0.0
      0.0
      0.0
      0.0
      0.0
      12.783333
      66.0
      58.0
      50.0
      ...
      2.0
      9.0
      0.01
      Fog
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      1
      9/12/00
      2.0
      1.0
      0.0
      1.0
      0.0
      12.716667
      75.0
      62.0
      52.0
      ...
      4.0
      9.0
      0.00
      NaN
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      2
      9/13/00
      1.0
      1.0
      0.0
      0.0
      0.0
      12.666667
      77.0
      64.0
      54.0
      ...
      7.0
      25.0
      0.00
      Fog
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      3
      9/14/00
      0.0
      0.0
      0.0
      0.0
      0.0
      12.616667
      84.0
      71.0
      60.0
      ...
      12.0
      9.0
      0.00
      NaN
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      4
      9/15/00
      1.0
      1.0
      0.0
      0.0
      0.0
      12.550000
      73.0
      66.0
      59.0
      ...
      5.0
      9.0
      0.00
      NaN
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
  

5 rows × 27 columns



In [7]:

    
np.shape(data)









    Out[7]:





(5664, 27)



In [8]:

    
#data = data.fillna(0)
#data = data.round(4)



In [9]:

    
train,test=train_test_split(data,test_size=0.1,random_state=567)



In [17]:

    
x_train = train.iloc[:,6:19].join(train.iloc[:,21:])
y_train = train.iloc[:,1]

x_test = test.iloc[:,6:19].join(test.iloc[:,21:])
y_test = test.iloc[:,1]



In [18]:

    
# data normalization
x_train = preprocessing.normalize(x_train)
#y_train = preprocessing.normalize(y_train)

x_test = preprocessing.normalize(x_test)
#y_test = preprocessing.normalize(y_test)



In [ ]:

    
# data standardization
x_train_std = preprocessing.scale(x_train)
y_train_std = preprocessing.scale(y_train)



In [19]:

    
x_train.size









    Out[19]:





96843



In [20]:

    
y_test.size









    Out[20]:





567

Decision Tree



In [40]:

    
predictor = DecisionTreeRegressor(criterion='mse',max_leaf_nodes=6)



In [41]:

    
clf = predictor.fit(x_train,y_train)



In [42]:

    
y_trainpred = predictor.predict(x_train)
y_pred = predictor.predict(x_test)



In [43]:

    
score = mean_squared_error(y_test, y_pred)
print("Train error",mean_squared_error(y_train,y_trainpred))
print("Test error",mean_squared_error(y_test,y_pred))









    



Train error 3.36246724318
Test error 3.71908873168



In [44]:

    
tree_dot = export_graphviz(clf,out_file = 'tree_all_normalized.dot')



In [45]:

    
graph = pydotplus.graphviz.graph_from_dot_file('tree_all_normalized.dot')
Image(graph.create_png())









    Out[45]:



In [46]:

    
plt.figure(figsize=(5,5))
plt.scatter(y_test,y_pred,alpha=0.2)
plt.xlabel('real test value')
plt.ylabel('predicted test value')
plt.plot(np.arange(-5,20),np.arange(-5,20))









    Out[46]:





[<matplotlib.lines.Line2D at 0x10fe74cf8>]

Random Forests



In [53]:

    
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.cross_validation import ShuffleSplit
from sklearn.learning_curve import validation_curve



In [55]:

    
rf = RandomForestRegressor(n_estimators=100)
rf.fit(x_train,y_train)









    Out[55]:





RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)



In [56]:

    
y_trainpred = rf.predict(x_train)
y_pred = rf.predict(x_test)



In [57]:

    
print("Train error",mean_squared_error(y_train,y_trainpred))
print("Test error",mean_squared_error(y_test,y_pred))









    



Train error 0.587340357073
Test error 2.99659929453



In [62]:

    
plt.figure(figsize=(5,5))
plt.scatter(y_test,y_pred,alpha=0.2)
plt.xlabel('real test value')
plt.ylabel('predicted test value')
plt.plot(np.arange(0,20),np.arange(0,20))









    Out[62]:





[<matplotlib.lines.Line2D at 0x116be0fd0>]

Extra Trees



In [59]:

    
from sklearn.ensemble import (RandomForestRegressor, ExtraTreesRegressor,
                              AdaBoostRegressor)



In [72]:

    
et = ExtraTreesRegressor(n_estimators=100)
et.fit(x_train,y_train)









    Out[72]:





ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
          verbose=0, warm_start=False)



In [73]:

    
y_trainpred = et.predict(x_train)
y_pred = et.predict(x_test)



In [74]:

    
print("Train error",mean_squared_error(y_train,y_trainpred))
print("Test error",mean_squared_error(y_test,y_pred))









    



Train error 0.0
Test error 2.76110740741



In [75]:

    
plt.figure(figsize=(5,5))
plt.scatter(y_test,y_pred,alpha=0.2)
plt.xlabel('real test value')
plt.ylabel('predicted test value')
plt.plot(np.arange(0,20),np.arange(0,20))









    Out[75]:





[<matplotlib.lines.Line2D at 0x1169c4b70>]

AdaBoost



In [76]:

    
ab = AdaBoostRegressor()
ab.fit(x_train,y_train)









    Out[76]:





AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None)



In [77]:

    
y_trainpred = ab.predict(x_train)
y_pred = ab.predict(x_test)



In [78]:

    
print("Train error",mean_squared_error(y_train,y_trainpred))
print("Test error",mean_squared_error(y_test,y_pred))









    



Train error 10.8160484877
Test error 11.9008246196



In [79]:

    
plt.figure(figsize=(5,5))
plt.scatter(y_test,y_pred,alpha=0.2)
plt.xlabel('real test value')
plt.ylabel('predicted test value')
plt.plot(np.arange(0,20),np.arange(0,20))









    Out[79]:





[<matplotlib.lines.Line2D at 0x1199c1978>]



In [ ]:

	Date	Total_outages	Equipment	Animals	Day_length_hr	Max_temp_F	Avg_Temp_F	Min_temp_F	...	Avg_windspeed_mph	Max_windgust_mph	Precipitation_in	Events	Event_fog
0	9/11/00	0.0	0.0	0.0	12.783333	66.0	58.0	50.0	...	2.0	9.0	0.01	Fog	1.0
1	9/12/00	2.0	1.0	1.0	12.716667	75.0	62.0	52.0	...	4.0	9.0	0.00	NaN	0.0
2	9/13/00	1.0	1.0	0.0	12.666667	77.0	64.0	54.0	...	7.0	25.0	0.00	Fog	1.0
3	9/14/00	0.0	0.0	0.0	12.616667	84.0	71.0	60.0	...	12.0	9.0	0.00	NaN	0.0
4	9/15/00	1.0	1.0	0.0	12.550000	73.0	66.0	59.0	...	5.0	9.0	0.00	NaN	0.0