In [79]:
# All imports for this notebook live in this first cell.
import numpy as np
import pandas
import sklearn
from sklearn import linear_model
from sklearn import svm

# Seed shared by every estimator that draws random numbers (random
# sub-sampling or shuffling), so a fresh "Restart & Run All" reproduces
# the same errors.
RANDOM_STATE = 0

# Candidate regressors that are compared on the same train/test split.
reg = linear_model.LinearRegression()
reg1 = linear_model.BayesianRidge()
reg2 = linear_model.RANSACRegressor(random_state=RANDOM_STATE)
reg3 = linear_model.LassoLars(alpha=.1)
reg4 = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
reg5 = linear_model.ElasticNetCV()
reg6 = linear_model.SGDRegressor(random_state=RANDOM_STATE)
reg7 = linear_model.PassiveAggressiveRegressor(random_state=RANDOM_STATE)
reg8 = linear_model.TheilSenRegressor(random_state=RANDOM_STATE)
reg9 = linear_model.HuberRegressor()
clf = svm.SVR(C=1.0, epsilon=0.2)

# The position in this list is significant: the results cell addresses the
# models by index (0 is the SVR, 2 is the BayesianRidge).
# NOTE: reg8 (TheilSenRegressor) is instantiated above but is not part of
# the list, so it is never fitted.
models = [clf, reg, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg9]
In [80]:
# Read the raw crime data CSV (path relative to the notebook) into a DataFrame.
crime_csv_path = '../datasets/UCIrvineCrimeData.csv'
dataframe = pandas.read_csv(crime_csv_path)
In [81]:
# A column with more than this many missing entries is dropped entirely.
MAX_MISSING_PER_COLUMN = 100

# '?' entries are treated as missing values.  `np.nan` is used because the
# `np.NAN` alias no longer exists in NumPy 2.0 and later.
dataframe = dataframe.replace('?', np.nan)

# Number of missing values per column, and the columns above the threshold.
missing_counts = dataframe.isnull().sum()
dict1 = missing_counts.to_dict()
non_zero = list(missing_counts[missing_counts > MAX_MISSING_PER_COLUMN].index)

# Remove the mostly-empty columns in a single call (no-op for an empty list).
dataframe = dataframe.drop(non_zero, axis=1)

# The remaining columns have at most MAX_MISSING_PER_COLUMN gaps, so it is
# better to drop the affected rows than to lose whole columns.
dataframe = dataframe.dropna()
cols = list(dataframe.columns.values)
In [82]:
# Columns that are not used as features; the last entry is the prediction
# target itself.
excluded = ['fold', 'state', 'community', 'communityname', 'county',
            'ViolentCrimesPerPop']
cols = [x for x in cols if x not in excluded]
# cols = ['numbUrban', 'NumInShelters']
print(len(cols))

cols1 = cols
all_errors = []

# Feature matrix and target vector, in the row order of `dataframe`.
X = dataframe[list(cols1)].values
Y = np.asarray(dataframe['ViolentCrimesPerPop'].values)
total_val = len(Y)

# The first two thirds of the rows are used for fitting, the rest for
# evaluation (positional split, no shuffling).
percent = 2/float(3)
edge_val = int(total_val*percent)

for model in models:
    model.fit(X[:edge_val], Y[:edge_val])
y_predict = [model.predict(X[edge_val:]) for model in models]

# Mean squared error of every model on the held-out rows, in the same order
# as `models`.  ravel() keeps the subtraction element-wise even if a
# predictor returns a column vector.
y_test = Y[edge_val:].astype(float)
error = [
    float(np.mean((y_test - np.asarray(pred, dtype=float).ravel()) ** 2))
    for pred in y_predict
]

# One row of errors per evaluation run; the results cell stacks these rows.
all_errors.append(error)
print("Im done")
In [83]:
# Error table: one row per entry of `all_errors`, one column per model in
# the order of `models`.  It has to be built here, otherwise `arr` is
# undefined when the notebook runs on a fresh kernel.
arr = np.array(all_errors)
print(error)

# For every model (column), the row index with the smallest error.
best_f = np.argmin(arr, axis=0)

# Row indices sorted by ascending error for two selected models
# (column 0 is the SVR, column 2 the BayesianRidge).
print("SVM ranked")
print(arr[:, 0].argsort())
print("Bayesian ranked")
print(arr[:, 2].argsort())

print(best_f)
# NOTE(review): row indices are looked up in `cols`, which presumes that
# `all_errors` holds one row per feature -- confirm against the cell that
# fills `all_errors`.
bf = [cols[b1] for b1 in best_f]
print(bf)

# For every row, the index of the model with the smallest error, followed
# by a count of how often each model index wins.
models_chosen = np.argmin(arr, axis=1)
fm = {}
for best in models_chosen:
    fm[best] = fm.get(best, 0) + 1
print(fm)
In [53]:
# Scratch lookup: show the column name stored at index 44 of `cols`.
# NOTE(review): this cell's execution count is out of order relative to the
# cells above, so its result may stem from an earlier state of `cols`.
cols[44]
Out[53]:
In [27]:
# Scratch lookup: show the column name stored at index 50 of `cols`.
# NOTE(review): this cell's execution count is out of order relative to the
# cells above, so its result may stem from an earlier state of `cols`.
cols[50]
Out[27]:
In [ ]: