In [1]:
df = pd.read_csv('../resource/final_df1.csv')
y = df['rating(y)'].values
cat_X = df.ix[:,'0':]
cat_X.head(2)
Out[1]:
In [2]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mean_absolute_error
In [3]:
cv = StratifiedKFold(y, n_folds=3, random_state=0)
i_range = []
score_range = []
sigma = []
for a in np.arange(0, 10, 0.01):
mnb = MultinomialNB(alpha = a)
#print('alpha :',a)
scores = np.zeros(3)
for i, (train_idx, test_idx) in enumerate(cv):
X_train = cat_X.ix[train_idx]
y_train = y[train_idx]
X_test = cat_X.ix[test_idx]
y_test = y[test_idx]
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)
scores[i] = mean_absolute_error(y_test, y_pred)
#print(' mean :', round(np.mean(scores), 4)," ", scores)
#print('--------------------------------------------------------')
#scores
#np.mean(scores)
i_range.append(a)
score_range.append(np.mean(scores))
sigma.append(np.std(scores))
best_idx = np.argmin(score_range)
best_alpha = i_range[best_idx]
best_score = score_range[best_idx]
sigma
plt.figure(figsize = (15, 5))
plt.plot(i_range, score_range)
plt.plot(i_range, np.array(score_range) + sigma, 'b--')
plt.plot(i_range, np.array(score_range) - sigma, 'b--')
plt.axhline(best_score + sigma[best_idx], linestyle=':', color='r')
plt.axvline(best_alpha, linestyle=':', color='r')
def find_nearest(array, value):
idx = (np.abs(array-value)).argmin()
return idx
plt.scatter(i_range[find_nearest(score_range, best_score+sigma[best_idx])], best_score+sigma[best_idx], s=100, c='red')
plt.xlim(0, 10)
plt.ylabel('CV score(mae)')
plt.xlabel('alpha')
print("best alpha : ", best_alpha)
print("best score : ", best_score)
print(' 1-sigma : ', round(sigma[best_idx], 4))
print('='*25)
print("sub_opt score : ", round(best_score + sigma[best_idx], 4))
print("sub_opt alpha : ", round( i_range[find_nearest(score_range, best_score+sigma[best_idx])] , 4))
In [4]:
cv = StratifiedKFold(y, n_folds=3, random_state=0)
i_range = []
score_range = []
sigma = []
for a in np.arange(0, 1, 0.01):
mnb = MultinomialNB(alpha = a)
#print('alpha :',a)
scores = np.zeros(3)
for i, (train_idx, test_idx) in enumerate(cv):
X_train = cat_X.ix[train_idx]
y_train = y[train_idx]
X_test = cat_X.ix[test_idx]
y_test = y[test_idx]
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)
scores[i] = mean_absolute_error(y_test, y_pred)
#print(' mean :', round(np.mean(scores), 4)," ", scores)
#print('--------------------------------------------------------')
#scores
#np.mean(scores)
i_range.append(a)
score_range.append(np.mean(scores))
sigma.append(np.std(scores))
best_idx = np.argmin(score_range)
best_alpha = i_range[best_idx]
best_score = score_range[best_idx]
sigma
plt.figure(figsize = (15, 5))
plt.plot(i_range, score_range)
plt.plot(i_range, np.array(score_range) + sigma, 'b--')
plt.plot(i_range, np.array(score_range) - sigma, 'b--')
plt.axhline(best_score + sigma[best_idx], linestyle=':', color='r')
plt.axvline(best_alpha, linestyle=':', color='r')
def find_nearest(array, value):
idx = (np.abs(array-value)).argmin()
return idx
plt.scatter(i_range[find_nearest(score_range, best_score+sigma[best_idx])], best_score+sigma[best_idx], s=100, c='red')
plt.xlim(0, 1)
plt.ylabel('CV score(mae)')
plt.xlabel('alpha')
print("best alpha : ", best_alpha)
print("best score : ", best_score)
print(' 1-sigma : ', round(sigma[best_idx], 4))
print('='*25)
print("sub_opt score : ", round(best_score + sigma[best_idx], 4))
print("sub_opt alpha : ", round( i_range[find_nearest(score_range, best_score+sigma[best_idx])] , 4))
In [5]:
cv = StratifiedKFold(y, n_folds=5, random_state=51)
i_range = []
score_range = []
sigma = []
for a in np.arange(0, 10, 0.01):
mnb = MultinomialNB(alpha = a)
#print('alpha :',a)
scores = np.zeros(5)
for i, (train_idx, test_idx) in enumerate(cv):
X_train = cat_X.ix[train_idx]
y_train = y[train_idx]
X_test = cat_X.ix[test_idx]
y_test = y[test_idx]
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)
scores[i] = mean_absolute_error(y_test, y_pred)
#print(' mean :', round(np.mean(scores), 4)," ", scores)
#print('--------------------------------------------------------')
#scores
#np.mean(scores)
i_range.append(a)
score_range.append(np.mean(scores))
sigma.append(np.std(scores))
best_idx = np.argmin(score_range)
best_alpha = i_range[best_idx]
best_score = score_range[best_idx]
sigma
plt.figure(figsize = (15, 5))
plt.plot(i_range, score_range)
plt.plot(i_range, np.array(score_range) + sigma, 'b--')
plt.plot(i_range, np.array(score_range) - sigma, 'b--')
plt.axhline(best_score + sigma[best_idx], linestyle=':', color='r')
plt.axvline(best_alpha, linestyle=':', color='r')
def find_nearest(array, value):
idx = (np.abs(array-value)).argmin()
return idx
plt.scatter(i_range[find_nearest(score_range, best_score+sigma[best_idx])], best_score+sigma[best_idx], s=100, c='red')
plt.xlim(0, 10)
plt.ylabel('CV score(mae)')
plt.xlabel('alpha')
print("best alpha : ", best_alpha)
print("best score : ", best_score)
print(' 1-sigma : ', round(sigma[best_idx], 4))
print('='*25)
print("sub_opt score : ", round(best_score + sigma[best_idx], 4))
print("sub_opt alpha : ", round( i_range[find_nearest(score_range, best_score+sigma[best_idx])] , 4))
In [30]:
cv = StratifiedKFold(y, n_folds=5, random_state=51)
i_range = []
score_range = []
sigma = []
for a in np.arange(0, 1, 0.01):
mnb = MultinomialNB(alpha = a)
#print('alpha :',a)
scores = np.zeros(5)
for i, (train_idx, test_idx) in enumerate(cv):
X_train = cat_X.ix[train_idx]
y_train = y[train_idx]
X_test = cat_X.ix[test_idx]
y_test = y[test_idx]
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)
scores[i] = mean_absolute_error(y_test, y_pred)
#print(' mean :', round(np.mean(scores), 4)," ", scores)
#print('--------------------------------------------------------')
#scores
#np.mean(scores)
i_range.append(a)
score_range.append(np.mean(scores))
sigma.append(np.std(scores))
best_idx = np.argmin(score_range)
best_alpha = i_range[best_idx]
best_score = score_range[best_idx]
sigma
plt.figure(figsize = (15, 5))
plt.plot(i_range, score_range)
plt.plot(i_range, np.array(score_range) + sigma, 'b--')
plt.plot(i_range, np.array(score_range) - sigma, 'b--')
plt.axhline(best_score + sigma[best_idx], linestyle=':', color='r')
plt.axvline(best_alpha, linestyle=':', color='r')
def find_nearest(array, value):
idx = (np.abs(array-value)).argmin()
return idx
plt.scatter(i_range[find_nearest(score_range, best_score+sigma[best_idx])], best_score+sigma[best_idx], s=100, c='red')
plt.xlim(0, 1)
plt.ylabel('CV score(mae)')
plt.xlabel('alpha')
print("best alpha : ", best_alpha)
print("best score : ", best_score)
print(' 1-sigma : ', round(sigma[best_idx], 4))
print('='*25)
print("sub_opt score : ", round(best_score + sigma[best_idx], 4))
print("sub_opt alpha : ", round( i_range[find_nearest(score_range, best_score+sigma[best_idx])] , 4))