In [1]:
# Imports used by this cell and the ones below; placed here so the notebook
# runs top-to-bottom on a fresh kernel.
import numpy as np
import pandas as pd

# Load the prepared dataset.
df = pd.read_csv('../resource/final_df2.csv')

# Target values as a NumPy array.
y = df['rating(y)'].values

# Feature frame: the label-based (inclusive) column slice from 'avg_rating'
# through 'skew'. `.loc` replaces `.ix`, which was removed in pandas 1.0.
real_X = df.loc[:, 'avg_rating':'skew']
real_X.head(2)
Out[1]:
In [2]:
import itertools

# sklearn.cross_validation was removed in scikit-learn 0.20; the splitters
# now live in sklearn.model_selection.
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_absolute_error

# Classifier instance that the evaluation cells refit on each fold.
gnb = GaussianNB()

# Stratified 3-fold split of y, materialised once as a list of
# (train_idx, test_idx) pairs so later cells can iterate `cv` repeatedly.
# No shuffling is requested (as before), so `random_state` is omitted:
# it has no effect without shuffle and newer scikit-learn rejects it.
cv = list(StratifiedKFold(n_splits=3).split(real_X, y))
In [3]:
# Baseline: mean absolute error of GaussianNB using every column of real_X,
# with one score per fold in `cv`.
scores = np.zeros(len(cv))
for i, (train_idx, test_idx) in enumerate(cv):
    # Fold indices are row positions, so select rows with .iloc
    # (.ix was removed in pandas 1.0).
    X_train = real_X.iloc[train_idx]
    y_train = y[train_idx]
    X_test = real_X.iloc[test_idx]
    y_test = y[test_idx]
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    scores[i] = mean_absolute_error(y_test, y_pred)

print('mean:', round(np.mean(scores), 4), ' ', scores)
In [4]:
# Score GaussianNB on each single feature of real_X (addressed by column
# position) and record the per-fold and mean MAE.
# NOTE(review): 9 presumably equals the number of columns in real_X — confirm.
records = []
for c in range(9):
    # Select the column by position; real_X[[c]] would treat c as a label.
    new_X = real_X.iloc[:, [c]]
    scores = np.zeros(len(cv))
    for i, (train_idx, test_idx) in enumerate(cv):
        # Fold indices are row positions, so use .iloc (not the removed .ix).
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        scores[i] = mean_absolute_error(y_test, y_pred)
    records.append({'combi': c, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result frame once instead of appending one row at a time.
combi_df1 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [5]:
# Number of single-feature candidates evaluated.
print(combi_df1.shape[0])
# Five candidates with the lowest mean MAE (ascending sort is the default).
combi_df1.sort_values(by='mean').iloc[:5]
Out[5]:
In [6]:
# Score GaussianNB on every 2-feature subset of real_X (columns addressed by
# position) and record the per-fold and mean MAE.
# NOTE(review): 9 presumably equals the number of columns in real_X — confirm.
idx_list = list(itertools.combinations(range(9), 2))
records = []
for idx in idx_list:
    # Select columns by position; real_X[list(idx)] would treat the integers
    # as column labels.
    new_X = real_X.iloc[:, list(idx)]
    scores = np.zeros(len(cv))
    for i, (train_idx, test_idx) in enumerate(cv):
        # Fold indices are row positions, so use .iloc (not the removed .ix).
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        scores[i] = mean_absolute_error(y_test, y_pred)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result frame once instead of appending one row at a time.
combi_df2 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [7]:
# Number of 2-feature subsets evaluated.
print(combi_df2.shape[0])
# Five subsets with the lowest mean MAE (ascending sort is the default).
combi_df2.sort_values(by='mean').iloc[:5]
Out[7]:
In [8]:
# Score GaussianNB on every 3-feature subset of real_X (columns addressed by
# position) and record the per-fold and mean MAE.
# NOTE(review): 9 presumably equals the number of columns in real_X — confirm.
idx_list = list(itertools.combinations(range(9), 3))
records = []
for idx in idx_list:
    # Select columns by position; real_X[list(idx)] would treat the integers
    # as column labels.
    new_X = real_X.iloc[:, list(idx)]
    scores = np.zeros(len(cv))
    for i, (train_idx, test_idx) in enumerate(cv):
        # Fold indices are row positions, so use .iloc (not the removed .ix).
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        scores[i] = mean_absolute_error(y_test, y_pred)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result frame once instead of appending one row at a time.
combi_df3 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [9]:
# Number of 3-feature subsets evaluated.
print(combi_df3.shape[0])
# Five subsets with the lowest mean MAE.
combi_df3.sort_values(by='mean').iloc[:5]
Out[9]:
In [10]:
# Score GaussianNB on every 4-feature subset of real_X (columns addressed by
# position) and record the per-fold and mean MAE.
# NOTE(review): 9 presumably equals the number of columns in real_X — confirm.
idx_list = list(itertools.combinations(range(9), 4))
records = []
for idx in idx_list:
    # Select columns by position; real_X[list(idx)] would treat the integers
    # as column labels.
    new_X = real_X.iloc[:, list(idx)]
    scores = np.zeros(len(cv))
    for i, (train_idx, test_idx) in enumerate(cv):
        # Fold indices are row positions, so use .iloc (not the removed .ix).
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        scores[i] = mean_absolute_error(y_test, y_pred)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result frame once instead of appending one row at a time.
combi_df4 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [11]:
# Number of 4-feature subsets evaluated.
print(combi_df4.shape[0])
# Five subsets with the lowest mean MAE.
combi_df4.sort_values(by='mean').iloc[:5]
Out[11]:
In [12]:
# Score GaussianNB on every 5-feature subset of real_X (columns addressed by
# position) and record the per-fold and mean MAE.
# NOTE(review): 9 presumably equals the number of columns in real_X — confirm.
idx_list = list(itertools.combinations(range(9), 5))
records = []
for idx in idx_list:
    # Select columns by position; real_X[list(idx)] would treat the integers
    # as column labels.
    new_X = real_X.iloc[:, list(idx)]
    scores = np.zeros(len(cv))
    for i, (train_idx, test_idx) in enumerate(cv):
        # Fold indices are row positions, so use .iloc (not the removed .ix).
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        scores[i] = mean_absolute_error(y_test, y_pred)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result frame once instead of appending one row at a time.
combi_df5 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [13]:
# Number of 5-feature subsets evaluated.
print(combi_df5.shape[0])
# Five subsets with the lowest mean MAE.
combi_df5.sort_values(by='mean').iloc[:5]
Out[13]:
In [14]:
# Score GaussianNB on every 6-feature subset of real_X (columns addressed by
# position) and record the per-fold and mean MAE.
# NOTE(review): 9 presumably equals the number of columns in real_X — confirm.
idx_list = list(itertools.combinations(range(9), 6))
records = []
for idx in idx_list:
    # Select columns by position; real_X[list(idx)] would treat the integers
    # as column labels.
    new_X = real_X.iloc[:, list(idx)]
    scores = np.zeros(len(cv))
    for i, (train_idx, test_idx) in enumerate(cv):
        # Fold indices are row positions, so use .iloc (not the removed .ix).
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        scores[i] = mean_absolute_error(y_test, y_pred)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result frame once instead of appending one row at a time.
combi_df6 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [15]:
# Number of 6-feature subsets evaluated.
print(combi_df6.shape[0])
# Five subsets with the lowest mean MAE.
combi_df6.sort_values(by='mean').iloc[:5]
Out[15]:
In [16]:
# Score GaussianNB on every 7-feature subset of real_X (columns addressed by
# position) and record the per-fold and mean MAE.
# NOTE(review): 9 presumably equals the number of columns in real_X — confirm.
idx_list = list(itertools.combinations(range(9), 7))
records = []
for idx in idx_list:
    # Select columns by position; real_X[list(idx)] would treat the integers
    # as column labels.
    new_X = real_X.iloc[:, list(idx)]
    scores = np.zeros(len(cv))
    for i, (train_idx, test_idx) in enumerate(cv):
        # Fold indices are row positions, so use .iloc (not the removed .ix).
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        scores[i] = mean_absolute_error(y_test, y_pred)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result frame once instead of appending one row at a time.
combi_df7 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [17]:
# Number of 7-feature subsets evaluated.
print(combi_df7.shape[0])
# Five subsets with the lowest mean MAE.
combi_df7.sort_values(by='mean').iloc[:5]
Out[17]:
In [18]:
# Score GaussianNB on every 8-feature subset of real_X (columns addressed by
# position) and record the per-fold and mean MAE.
# NOTE(review): 9 presumably equals the number of columns in real_X — confirm.
idx_list = list(itertools.combinations(range(9), 8))
records = []
for idx in idx_list:
    # Select columns by position; real_X[list(idx)] would treat the integers
    # as column labels.
    new_X = real_X.iloc[:, list(idx)]
    scores = np.zeros(len(cv))
    for i, (train_idx, test_idx) in enumerate(cv):
        # Fold indices are row positions, so use .iloc (not the removed .ix).
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        scores[i] = mean_absolute_error(y_test, y_pred)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result frame once instead of appending one row at a time.
combi_df8 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [19]:
# Number of 8-feature subsets evaluated.
print(combi_df8.shape[0])
# Five subsets with the lowest mean MAE.
combi_df8.sort_values(by='mean').iloc[:5]
Out[19]: