In [1]:
from scipy.special import comb

# Print, for every subset size i from 1 to 17, how many distinct i-element
# subsets can be drawn from 17 candidates (the later cells enumerate
# combinations of range(17)), i.e. how many models each exhaustive search fits.
# `scipy.misc.comb` was removed from SciPy; `scipy.special.comb` is its
# replacement and returns the same floating-point counts by default.
for i in range(1, 18):
    print(i, comb(17, i))
In [2]:
# numpy / pandas are used throughout the notebook but were never imported in
# any visible cell; import them here so a fresh kernel can run top to bottom.
import numpy as np
import pandas as pd

# Load the prepared dataset; the prediction target is the 'rating(y)' column.
df = pd.read_csv('../resource/final_df1.csv')
y = df['rating(y)'].values

# Label-based column slices (both end labels inclusive). `.loc` replaces the
# `.ix` indexer, which was removed from pandas.
# NOTE(review): presumably real_X holds the real-valued features and cat_X the
# categorical/one-hot ones (columns from '0' onwards) -- confirm against the CSV.
real_X = df.loc[:, 'avg_rating':'star5']
cat_X = df.loc[:, '0':]
real_X.head(2)
Out[2]:
In [3]:
import itertools

from sklearn.metrics import mean_absolute_error
# `sklearn.cross_validation` was removed from scikit-learn; the splitters now
# live in `sklearn.model_selection`.
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

# Materialise the 3 stratified folds once as a list of (train_idx, test_idx)
# pairs of positional indices, so that every later cell can iterate over `cv`
# repeatedly and is scored on identical splits.
# No random_state is passed: without shuffling the split is deterministic and
# the seed has no effect.
cv = list(StratifiedKFold(n_splits=3).split(real_X, y))
In [4]:
# Baseline: cross-validated MAE of GaussianNB trained on every column of real_X.
fold_scores = []
for train_idx, test_idx in cv:
    # The CV indices are row positions, so select rows with `.iloc`
    # (the `.ix` indexer was removed from pandas).
    X_train, X_test = real_X.iloc[train_idx], real_X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    fold_scores.append(mean_absolute_error(y_test, y_pred))
# One entry per fold; the length follows `cv` instead of a hard-coded 3.
scores = np.array(fold_scores)
print('mean:', round(np.mean(scores), 4), ' ', scores)
In [5]:
# Score each of the 17 columns of real_X on its own: fit GaussianNB on every
# CV fold and record the per-fold and mean MAE for that single feature.
records = []
for c in range(17):
    # Select the column by position; `.iloc` replaces the removed `.ix`
    # indexer and the integer fallback of `real_X[[c]]` on string labels.
    new_X = real_X.iloc[:, [c]]
    fold_scores = []
    for train_idx, test_idx in cv:
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))
    scores = np.array(fold_scores)
    records.append({'combi': c, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame once instead of growing it row by row.
combi_df1 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [6]:
# How many single-feature candidates were scored, followed by the five rows
# with the lowest mean MAE.
print(combi_df1.shape[0])
combi_df1.sort_values(by='mean').iloc[:5]
Out[6]:
In [7]:
# Exhaustive search over every 2-column subset of the first 17 columns of
# real_X: fit GaussianNB on each CV fold and record per-fold and mean MAE.
idx_list = list(itertools.combinations(range(17), 2))
records = []
for idx in idx_list:
    # Select columns by position; `.iloc` replaces the removed `.ix` indexer
    # and the integer fallback of `real_X[list(idx)]` on string labels.
    new_X = real_X.iloc[:, list(idx)]
    fold_scores = []
    for train_idx, test_idx in cv:
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))
    scores = np.array(fold_scores)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame once instead of growing it row by row.
combi_df2 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [8]:
# Count of evaluated feature pairs, then the five pairs with the lowest mean MAE.
print(combi_df2.shape[0])
combi_df2.sort_values(by='mean').iloc[:5]
Out[8]:
In [9]:
# Exhaustive search over every 3-column subset of the first 17 columns of
# real_X: fit GaussianNB on each CV fold and record per-fold and mean MAE.
idx_list = list(itertools.combinations(range(17), 3))
records = []
for idx in idx_list:
    # Select columns by position; `.iloc` replaces the removed `.ix` indexer
    # and the integer fallback of `real_X[list(idx)]` on string labels.
    new_X = real_X.iloc[:, list(idx)]
    fold_scores = []
    for train_idx, test_idx in cv:
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))
    scores = np.array(fold_scores)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame once instead of growing it row by row.
combi_df3 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [10]:
# Count of evaluated feature triples, then the five with the lowest mean MAE.
print(combi_df3.shape[0])
combi_df3.sort_values(by='mean').iloc[:5]
Out[10]:
In [11]:
## Best score obtained while manually trying assorted 3-feature subsets.
## It was the best score under the heuristic search, but the exhaustive
## combination search ranked this subset only second.
gnb = GaussianNB()
# Select columns by position; `.iloc` replaces the removed `.ix` indexer and
# the integer fallback of `real_X[[...]]` on string labels.
new_X = real_X.iloc[:, [0, 1, 14]]
# Collect this cell's own fold scores instead of overwriting the `scores`
# array left behind by whichever cell happened to run before it.
fold_scores = []
for train_idx, test_idx in cv:
    X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    fold_scores.append(mean_absolute_error(y_test, y_pred))
scores = np.array(fold_scores)
print('mean:', round(np.mean(scores), 4), ' ', scores)
In [12]:
# Cross-validated MAE of a fresh GaussianNB on the hand-picked column
# positions 0, 16 and 5 of real_X.
gnb = GaussianNB()
# Select columns by position; `.iloc` replaces the removed `.ix` indexer and
# the integer fallback of `real_X[[...]]` on string labels.
new_X = real_X.iloc[:, [0, 16, 5]]
# Collect this cell's own fold scores instead of overwriting the `scores`
# array left behind by whichever cell happened to run before it.
fold_scores = []
for train_idx, test_idx in cv:
    X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    fold_scores.append(mean_absolute_error(y_test, y_pred))
scores = np.array(fold_scores)
print('mean:', round(np.mean(scores), 4), ' ', scores)
In [13]:
# Exhaustive search over every 4-column subset of the first 17 columns of
# real_X: fit GaussianNB on each CV fold and record per-fold and mean MAE.
idx_list = list(itertools.combinations(range(17), 4))
records = []
for idx in idx_list:
    # Select columns by position; `.iloc` replaces the removed `.ix` indexer
    # and the integer fallback of `real_X[list(idx)]` on string labels.
    new_X = real_X.iloc[:, list(idx)]
    fold_scores = []
    for train_idx, test_idx in cv:
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))
    scores = np.array(fold_scores)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame once instead of growing it row by row.
combi_df4 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [14]:
# Count of evaluated 4-feature subsets, then the five with the lowest mean MAE.
print(combi_df4.shape[0])
combi_df4.sort_values(by='mean').iloc[:5]
Out[14]:
In [15]:
# Exhaustive search over every 5-column subset of the first 17 columns of
# real_X: fit GaussianNB on each CV fold and record per-fold and mean MAE.
idx_list = list(itertools.combinations(range(17), 5))
records = []
for idx in idx_list:
    # Select columns by position; `.iloc` replaces the removed `.ix` indexer
    # and the integer fallback of `real_X[list(idx)]` on string labels.
    new_X = real_X.iloc[:, list(idx)]
    fold_scores = []
    for train_idx, test_idx in cv:
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))
    scores = np.array(fold_scores)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame once instead of growing it row by row.
combi_df5 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [16]:
# Count of evaluated 5-feature subsets, then the five with the lowest mean MAE.
print(combi_df5.shape[0])
combi_df5.sort_values(by='mean').iloc[:5]
Out[16]:
In [17]:
# Exhaustive search over every 6-column subset of the first 17 columns of
# real_X: fit GaussianNB on each CV fold and record per-fold and mean MAE.
idx_list = list(itertools.combinations(range(17), 6))
records = []
for idx in idx_list:
    # Select columns by position; `.iloc` replaces the removed `.ix` indexer
    # and the integer fallback of `real_X[list(idx)]` on string labels.
    new_X = real_X.iloc[:, list(idx)]
    fold_scores = []
    for train_idx, test_idx in cv:
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))
    scores = np.array(fold_scores)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame once instead of growing it row by row.
combi_df6 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [18]:
# Count of evaluated 6-feature subsets, then the five with the lowest mean MAE.
print(combi_df6.shape[0])
combi_df6.sort_values(by='mean').iloc[:5]
Out[18]:
In [ ]:
# Exhaustive search over every 7-column subset of the first 17 columns of
# real_X: fit GaussianNB on each CV fold and record per-fold and mean MAE.
idx_list = list(itertools.combinations(range(17), 7))
records = []
for idx in idx_list:
    # Select columns by position; `.iloc` replaces the removed `.ix` indexer
    # and the integer fallback of `real_X[list(idx)]` on string labels.
    new_X = real_X.iloc[:, list(idx)]
    fold_scores = []
    for train_idx, test_idx in cv:
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))
    scores = np.array(fold_scores)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame once instead of growing it row by row.
combi_df7 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [ ]:
# Count of evaluated 7-feature subsets, then the five with the lowest mean MAE.
print(combi_df7.shape[0])
combi_df7.sort_values(by='mean').iloc[:5]
In [ ]:
# Exhaustive search over every 8-column subset of the first 17 columns of
# real_X: fit GaussianNB on each CV fold and record per-fold and mean MAE.
idx_list = list(itertools.combinations(range(17), 8))
records = []
for idx in idx_list:
    # Select columns by position; `.iloc` replaces the removed `.ix` indexer
    # and the integer fallback of `real_X[list(idx)]` on string labels.
    new_X = real_X.iloc[:, list(idx)]
    fold_scores = []
    for train_idx, test_idx in cv:
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))
    scores = np.array(fold_scores)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame once instead of growing it row by row.
combi_df8 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [ ]:
# Count of evaluated 8-feature subsets, then the five with the lowest mean MAE.
print(combi_df8.shape[0])
combi_df8.sort_values(by='mean').iloc[:5]
In [19]:
# Exhaustive search over every 12-column subset of the first 17 columns of
# real_X: fit GaussianNB on each CV fold and record per-fold and mean MAE.
idx_list = list(itertools.combinations(range(17), 12))
records = []
for idx in idx_list:
    # Select columns by position; `.iloc` replaces the removed `.ix` indexer
    # and the integer fallback of `real_X[list(idx)]` on string labels.
    new_X = real_X.iloc[:, list(idx)]
    fold_scores = []
    for train_idx, test_idx in cv:
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))
    scores = np.array(fold_scores)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame once instead of growing it row by row.
combi_df12 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [20]:
# Count of evaluated 12-feature subsets, then the five with the lowest mean MAE.
print(combi_df12.shape[0])
combi_df12.sort_values(by='mean').iloc[:5]
Out[20]:
In [21]:
# Exhaustive search over every 13-column subset of the first 17 columns of
# real_X: fit GaussianNB on each CV fold and record per-fold and mean MAE.
idx_list = list(itertools.combinations(range(17), 13))
records = []
for idx in idx_list:
    # Select columns by position; `.iloc` replaces the removed `.ix` indexer
    # and the integer fallback of `real_X[list(idx)]` on string labels.
    new_X = real_X.iloc[:, list(idx)]
    fold_scores = []
    for train_idx, test_idx in cv:
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))
    scores = np.array(fold_scores)
    records.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame once instead of growing it row by row.
combi_df13 = pd.DataFrame(records, columns=['combi', 'mean', 'scores'])
In [22]:
# Count of evaluated 13-feature subsets, then the five with the lowest mean MAE.
print(combi_df13.shape[0])
combi_df13.sort_values(by='mean').iloc[:5]
Out[22]: