In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import datetime as dt
%matplotlib inline
# Load the RECS survey CSV, parsing column 1 as dates.
# Pass the path directly so pandas opens AND closes the file itself —
# the original opened a file handle with open() and never closed it.
# (sep=',' and header='infer' are the read_csv defaults, kept explicit.)
data = pd.read_csv('recs.csv', sep=',', header='infer', parse_dates=[1])
In [5]:
# Quick sanity check: (rows, columns) of the loaded survey data.
data.shape
Out[5]:
In [6]:
# Feature matrix: the first 839 survey columns.
X = data.iloc[:, 0:839]

# Encode the urban/rural flag numerically: 'U' -> 1, 'R' -> 0.
# Work on a copy, then assign the whole column back, to avoid
# chained-assignment (SettingWithCopy) pitfalls.
a = X['UR'].copy()
a[a == 'U'] = 1
a[a == 'R'] = 0
X['UR'] = a

# Drop non-predictive identifier columns. axis is passed by keyword:
# the positional form drop('DOEID', 1) was removed in pandas 2.0.
X = X.drop('METROMICRO', axis=1).drop('DOEID', axis=1)

# Forward-fill the few missing values in these two columns.
# .ffill() replaces the deprecated fillna(method='pad').
X['NOCRCASH'] = X['NOCRCASH'].ffill()
X['NKRGALNC'] = X['NKRGALNC'].ffill()
In [37]:
# Energy-intensity targets, normalized by floor area:
#   y1 — heating BTU per square foot
#   y2 — cooling BTU per square foot
y1 = data['TOTALBTUSPH'].div(data['TOTSQFT'])
y2 = data['TOTALBTUCOL'].div(data['TOTSQFT'])
In [195]:
# Boolean masks flagging plausible (non-outlier) intensity values.
goodies1 = y1 < 200
goodies2 = y2 < 25
# NOTE(review): `temp` and the two filtered views below are computed but
# never assigned or used downstream — the outlier filters are NOT actually
# applied to X or y before modeling. Confirm whether that was intended.
temp = y1[y1<200]
X[goodies1]
X[goodies2]
Out[195]:
In [38]:
from sklearn.ensemble import ExtraTreesRegressor

# Fit one extra-trees regressor per target to rank features by importance.
# random_state pins the forest's randomness so the importances (and the
# feature_importance.csv derived from them) are reproducible across runs;
# the original was unseeded and gave different rankings every run.
model1 = ExtraTreesRegressor(random_state=0).fit(X, y1)
model2 = ExtraTreesRegressor(random_state=0).fit(X, y2)
In [39]:
# Per-feature importance scores from each fitted forest
# (aligned with X.columns; sums to 1.0 per model).
a1 = model1.feature_importances_
a2 = model2.feature_importances_
In [63]:
# Feature-importance table: one row per feature, importance for each target.
d = {'fname': X.columns.values, 'score1': a1, 'score2': a2}
fsl = pd.DataFrame(d)
# Reassign instead of inplace=True — same result, but avoids the in-place
# mutation anti-pattern and keeps the cell idempotent on re-run.
fsl = fsl.sort_values(by='score1', ascending=False)
fsl.to_csv('feature_importance.csv')
In [174]:
# Preview the ten most important features for the heating target.
# (fsl was already sorted in place above; the explicit sort here keeps
# this cell correct even if that cell is skipped or reordered.)
fsl.sort_values(ascending=False, by='score1').head(10)
Out[174]:
In [198]:
# Remember to change both bests and newY!
# Names of the ten highest-scoring features for the heating target.
bests = fsl.sort_values(by='score1', ascending=False).head(10)['fname'].values
print(bests)
# Reduced design matrix and the target it pairs with.
newX2 = X[bests]
newY = y1
#newX2 = data[['AIA_Zone','CDD30YR','USEWWAC','TEMPGONEAC','HDD30YR','TEMPNITEAC','ELCOOL','USECENAC','TOTUCSQFT','COOLTYPE']]
In [199]:
from sklearn.tree import DecisionTreeRegressor as DTR
# sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score

# levels1 was never defined anywhere in this notebook — the cell only ran
# on a stale kernel. Depths 1..19 are consistent with the s5[i-1] indexing
# and with the companion levels2 = range(1, 20, 2) sweep below.
# TODO(review): confirm the intended depth range.
levels1 = range(1, 20)

# 10-fold CV score of a decision tree on the top-10 features,
# swept over max_depth; s5[i-1] holds the mean score for depth i.
s5 = [0] * len(levels1)
for i in levels1:
    treeA2 = DTR(max_depth=i).fit(newX2, newY)
    score = cross_val_score(treeA2, newX2, newY, cv=10)
    s5[i-1] = np.mean(score)
    print(i, 'the cross validation result of treeA2 is: \n', np.mean(score))
In [200]:
# CV score vs. tree depth for the top-10-feature model.
fig1 = plt.figure()
ax1 = fig1.add_subplot(111)
ax1.plot(levels1, s5)
ax1.set_xlabel('max_level')
ax1.set_ylabel('score')
# plt.ylim(0,0.7)
ax1.set_title('10 fold CV for using the best 10 features')
# show() must be CALLED — the original's bare `plt.show` was a no-op
# attribute access that never rendered anything.
plt.show()
Out[200]:
In [178]:
# Same depth sweep, but using ALL features (odd depths 1..19).
levels2 = range(1, 20, 2)
s2 = [0] * len(levels2)
# enumerate gives (position, depth) directly instead of indexing levels2.
for i, depth in enumerate(levels2):
    treeMeta = DTR(max_depth=depth).fit(X, newY)
    score2 = cross_val_score(treeMeta, X, newY, cv=10)
    s2[i] = np.mean(score2)
    print(i, 'the cross validation result of treeMeta is: \n', np.mean(score2))
In [179]:
# Side-by-side comparison: depth sweep on the best-10 features vs all features.
fig2 = plt.figure(figsize=(15, 6))

ax1 = fig2.add_subplot(121)
ax1.plot(levels1, s5)
ax1.set_xlabel('max_level')
ax1.set_ylabel('score')
ax1.set_title('best 10 features')
ax1.set_ylim(0.1, 0.8)

ax2 = fig2.add_subplot(122)
ax2.plot(levels2, s2)
ax2.set_xlabel('max_level')
ax2.set_title('all features')
ax2.set_ylim(0.1, 0.8)

# Single call at the end renders the whole figure; the original's two
# bare `plt.show` lines (missing parentheses) never rendered anything.
plt.show()
Out[179]:
In [95]:
# Baseline: a tree built on 10 randomly chosen columns.
# The permutation is seeded so this baseline (and the comparison plot
# below) is reproducible; the original was unseeded and changed per run.
rng = np.random.RandomState(0)
randX = X.iloc[:, rng.permutation(X.shape[1])]
randX = randX.iloc[:, 0:10]
s3 = [0] * len(levels1)
for i in levels1:
    treeRand = DTR(max_depth=i).fit(randX, newY)
    score = cross_val_score(treeRand, randX, newY, cv=10)
    s3[i-1] = np.mean(score)
    # label fixed: this is treeRand (the original printed 'treeA')
    print(i, 'the cross validation result of treeRand is: \n', np.mean(score))
In [181]:
# Baseline: 10 features picked by hand ("expert's pick").
humanX = X[['REPORTABLE_DOMAIN','HDD30YR','TYPEHUQ','Climate_Region_Pub','AIA_Zone','YEARMADE','WALLTYPE','ROOFTYPE','TOTSQFT','HEATOTH']]
#humanX = X[['REPORTABLE_DOMAIN','CDD30YR','HDD30YR','AIA_Zone','YEARMADE','WALLTYPE','ROOFTYPE','TOTSQFT','KOWNRENT','UR']]
s4 = [0] * len(levels1)
for i in levels1:
    # renamed from treeRand — that name was a copy-paste remnant of the
    # random-features cell; the print label is corrected to match.
    treeHuman = DTR(max_depth=i).fit(humanX, newY)
    score = cross_val_score(treeHuman, humanX, newY, cv=10)
    s4[i-1] = np.mean(score)
    print(i, 'the cross validation result of treeHuman is: \n', np.mean(score))
In [182]:
# Overlay of the three feature-selection strategies across tree depths.
fig1 = plt.figure(figsize=(15, 10))
ax1 = fig1.add_subplot(111)
ax1.plot(levels1, s5, label='best ten features')
# plt.plot(levels1,s2)
ax1.plot(levels1, s3, label='randomly picked')
ax1.plot(levels1, s4, label='manually picked')
ax1.legend()
ax1.set_xlabel('max_level')
ax1.set_ylabel('score')
# plt.ylim(0,0.7)
ax1.set_title('10 fold CV for using 10 features')
# plt.show() is a call — the original's bare `plt.show` was a no-op.
plt.show()
Out[182]:
In [183]:
# Final model: depth-9 tree on the top-10 features (depth chosen from the
# CV sweep above).
treeA = DTR(max_depth = 9).fit(newX2,newY)
# NOTE(review): predictions are made on the TRAINING data (in-sample), so
# the actual-vs-predicted scatter below will look better than held-out
# performance — the CV scores above are the honest estimate.
predict = treeA.predict(newX2)
d = {'actual':newY, 'predict':predict}
vs = pd.DataFrame(d)
vs.head()
Out[183]:
In [184]:
# Row indices as an array, used below to color the scatter plot.
# np.asarray accepts the pandas Index directly — the original's
# list(vs.index) round-trip was redundant.
idx = np.asarray(vs.index)
# Log-scaled color values (displayed here as a sanity check).
np.log(idx + 1) * 25
Out[184]:
In [185]:
# CV score vs. depth for the hand-picked ("expert") feature set.
fig4 = plt.figure()
ax3 = fig4.add_subplot(111)
ax3.plot(levels1, s4)
ax3.set_xlabel('max_level')
ax3.set_ylabel('score')
ax3.set_title('10 fold CV for using 10 expert\'s pick')
# show() must be called with parentheses; the bare attribute access in the
# original rendered nothing.
plt.show()
Out[185]:
In [186]:
# Actual vs. predicted heating intensity, colored by log-scaled row index.
# Fixes vs. the original:
#  - removed the unused `import math`
#  - removed the throwaway plt.figure(): DataFrame.plot creates its own
#    figure when no ax= is passed, so the empty figure leaked
#  - removed plt.legend(): the scatter has no labeled artists, so legend()
#    only emitted a "no handles" warning
vs.plot(kind='scatter', x='actual', y='predict',
        c=np.log(idx + 1) * 25, figsize=(12, 8))
#plt.xlim(0,12100)
plt.show()
In [ ]: