In [3]:
import pandas as pd
import numpy as np
import pydotplus
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from IPython.display import Image
In [4]:
# Read the raw CSV (path is relative to this notebook) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../../Data/WeatherOutagesAllJerry.csv')
In [5]:
# Remove only the rows in which every column is missing; rows that are
# partially filled are kept.
data = data.dropna(axis=0, how='all')
In [6]:
# Preview the first five rows.
data.head(n=5)
Out[6]:
In [7]:
# (rows, columns) of the cleaned frame.
data.shape
Out[7]:
In [8]:
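# Optional cleaning steps, currently disabled (missing values are not filled
# and values are not rounded):
#data = data.fillna(0)
#data = data.round(4)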
In [9]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# repeatable between runs.
train, test = train_test_split(
    data,
    test_size=0.1,
    random_state=567,
)
In [17]:
# Features: positional columns 6-18 plus every column from position 21 onward
# (positions 19 and 20 are skipped). Target: positional column 1.
# Both slices of a frame share the same index, so a column-wise concat lines
# the rows up one-to-one.
x_train = pd.concat([train.iloc[:, 6:19], train.iloc[:, 21:]], axis=1)
x_test = pd.concat([test.iloc[:, 6:19], test.iloc[:, 21:]], axis=1)
y_train = train.iloc[:, 1]
y_test = test.iloc[:, 1]
In [18]:
# Data normalization: every sample (row) is rescaled to unit L2 norm.
# NOTE(review): this rescales rows, not feature columns -- confirm that
# per-sample normalization is what is wanted here.
# The targets y_train / y_test are left as they are (their normalization
# was commented out).
x_train = preprocessing.normalize(x_train, norm='l2', axis=1)
x_test = preprocessing.normalize(x_test, norm='l2', axis=1)
In [ ]:
# Data standardization: centre to zero mean and scale to unit variance
# along axis 0. Only the training split is transformed, and the *_std
# results are not referenced by any other cell visible in this notebook.
x_train_std = preprocessing.scale(x_train, axis=0, with_mean=True, with_std=True)
y_train_std = preprocessing.scale(y_train, axis=0, with_mean=True, with_std=True)
In [19]:
# Total number of entries in the training feature matrix.
np.size(x_train)
Out[19]:
In [20]:
# Number of held-out target values (y_test is one-dimensional).
len(y_test)
Out[20]:
In [40]:
# Regression tree limited to six leaves.
# The split criterion is left at its default (mean squared error) rather than
# spelled as 'mse': that literal was renamed to 'squared_error' and is rejected
# by newer scikit-learn releases, while the default means the same thing on
# every version.
# random_state is fixed because the tree permutes features at each split, so
# an unseeded fit is not guaranteed to be repeatable.
predictor = DecisionTreeRegressor(max_leaf_nodes=6, random_state=567)
In [41]:
# Fit the tree on the training split. fit() hands back the fitted estimator,
# so `clf` and `predictor` refer to the same model.
clf = predictor.fit(X=x_train, y=y_train)
In [42]:
# In-sample and held-out predictions from the fitted tree.
y_trainpred, y_pred = predictor.predict(x_train), predictor.predict(x_test)
In [43]:
# Mean squared error of the tree on both splits; `score` keeps the test MSE.
score = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Train error", mean_squared_error(y_true=y_train, y_pred=y_trainpred))
print("Test error", score)
In [44]:
# Write the fitted tree to a Graphviz DOT file in the working directory.
# NOTE(review): per the scikit-learn docs export_graphviz only returns the DOT
# text when out_file is None, so `tree_dot` is expected to be None here.
tree_dot = export_graphviz(
    decision_tree=clf,
    out_file='tree_all_normalized.dot',
)
In [45]:
# Load the exported DOT file, render it to PNG bytes and display it inline.
graph = pydotplus.graphviz.graph_from_dot_file('tree_all_normalized.dot')
Image(data=graph.create_png())
Out[45]:
In [46]:
# Predicted vs. actual target values on the test split (decision tree).
plt.figure(figsize=(5, 5))
plt.scatter(x=y_test, y=y_pred, alpha=0.2)
plt.xlabel(xlabel='real test value')
plt.ylabel(ylabel='predicted test value')
# Reference diagonal y = x over [-5, 19]: points on it are exact predictions.
plt.plot(*(np.arange(-5, 20),) * 2)
Out[46]:
In [53]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
# ShuffleSplit and validation_curve are provided by sklearn.model_selection
# (already used above for train_test_split). The former homes,
# sklearn.cross_validation and sklearn.learning_curve, no longer exist in
# current scikit-learn releases and make this cell fail with an ImportError.
from sklearn.model_selection import ShuffleSplit, validation_curve
In [55]:
# Random forest of 100 trees fitted on the training split.
# random_state is fixed so the bootstrap samples and feature draws, and
# therefore the reported errors, are repeatable across runs.
rf = RandomForestRegressor(n_estimators=100, random_state=567)
rf.fit(x_train, y_train)
Out[55]:
In [56]:
# In-sample and held-out predictions from the random forest.
y_trainpred, y_pred = rf.predict(x_train), rf.predict(x_test)
In [57]:
# Mean squared error of the random forest on the training and test splits.
print("Train error", mean_squared_error(y_true=y_train, y_pred=y_trainpred))
print("Test error", mean_squared_error(y_true=y_test, y_pred=y_pred))
In [62]:
# Predicted vs. actual target values on the test split (random forest).
plt.figure(figsize=(5, 5))
plt.scatter(x=y_test, y=y_pred, alpha=0.2)
plt.xlabel(xlabel='real test value')
plt.ylabel(ylabel='predicted test value')
# Reference diagonal y = x over [0, 19]: points on it are exact predictions.
plt.plot(*(np.arange(0, 20),) * 2)
Out[62]:
In [59]:
from sklearn.ensemble import (RandomForestRegressor, ExtraTreesRegressor,
AdaBoostRegressor)
In [72]:
# Extremely-randomized-trees ensemble of 100 trees fitted on the training split.
# random_state is fixed so the random split thresholds, and therefore the
# reported errors, are repeatable across runs.
et = ExtraTreesRegressor(n_estimators=100, random_state=567)
et.fit(x_train, y_train)
Out[72]:
In [73]:
# In-sample and held-out predictions from the extra-trees ensemble.
y_trainpred, y_pred = et.predict(x_train), et.predict(x_test)
In [74]:
# Mean squared error of the extra-trees ensemble on the training and test splits.
print("Train error", mean_squared_error(y_true=y_train, y_pred=y_trainpred))
print("Test error", mean_squared_error(y_true=y_test, y_pred=y_pred))
In [75]:
# Predicted vs. actual target values on the test split (extra trees).
plt.figure(figsize=(5, 5))
plt.scatter(x=y_test, y=y_pred, alpha=0.2)
plt.xlabel(xlabel='real test value')
plt.ylabel(ylabel='predicted test value')
# Reference diagonal y = x over [0, 19]: points on it are exact predictions.
plt.plot(*(np.arange(0, 20),) * 2)
Out[75]:
In [76]:
# AdaBoost regressor with default settings fitted on the training split.
# random_state is fixed so the boosting resampling, and therefore the
# reported errors, are repeatable across runs.
ab = AdaBoostRegressor(random_state=567)
ab.fit(x_train, y_train)
Out[76]:
In [77]:
# In-sample and held-out predictions from the AdaBoost model.
y_trainpred, y_pred = ab.predict(x_train), ab.predict(x_test)
In [78]:
# Mean squared error of the AdaBoost model on the training and test splits.
print("Train error", mean_squared_error(y_true=y_train, y_pred=y_trainpred))
print("Test error", mean_squared_error(y_true=y_test, y_pred=y_pred))
In [79]:
# Predicted vs. actual target values on the test split (AdaBoost).
plt.figure(figsize=(5, 5))
plt.scatter(x=y_test, y=y_pred, alpha=0.2)
plt.xlabel(xlabel='real test value')
plt.ylabel(ylabel='predicted test value')
# Reference diagonal y = x over [0, 19]: points on it are exact predictions.
plt.plot(*(np.arange(0, 20),) * 2)
Out[79]:
In [ ]: