In [1]:
    
import numpy as np
import pandas as pd
from scipy import misc
from scipy.special import comb
# Print the number of k-element subsets of 17 items for k = 1..17.
# scipy.special.comb replaces the removed scipy.misc.comb; like the old
# function it returns a float by default (exact=False).
for i in range(1, 18):
    print(i, comb(17, i))
    
    
In [2]:
    
# Load the prepared dataset and split it into the target vector and two
# column groups.
df = pd.read_csv('../resource/final_df1.csv')
y = df['rating(y)'].values
# Label-based column slices (both endpoints inclusive). `.ix` was removed in
# pandas 1.0; `.loc` is the equivalent for string column labels.
real_X = df.loc[:, 'avg_rating':'star5']  # presumably the real-valued features
cat_X = df.loc[:, '0':]  # presumably the categorical features; confirm against the CSV
real_X.head(2)
    
    Out[2]:
In [3]:
    
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the splitters
# now live in `sklearn.model_selection`.
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_absolute_error
import itertools

gnb = GaussianNB()
# Materialise the three stratified folds once as a list of
# (train_idx, test_idx) pairs so every later cell can iterate over the same
# splits repeatedly. Shuffling is off (the default), so the split is
# deterministic and no random_state is needed; recent scikit-learn rejects
# random_state when shuffle=False.
cv = list(StratifiedKFold(n_splits=3).split(real_X, y))
    
In [4]:
    
# Baseline: cross-validate the classifier on all columns of real_X and report
# the mean absolute error of each fold plus their mean.
scores = np.zeros(len(cv))  # one slot per fold
for i, (train_idx, test_idx) in enumerate(cv):
    # The splitter yields positional row indices, so select rows with .iloc
    # (`.ix` was removed in pandas 1.0).
    X_train = real_X.iloc[train_idx]
    y_train = y[train_idx]
    X_test = real_X.iloc[test_idx]
    y_test = y[test_idx]

    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    scores[i] = mean_absolute_error(y_test, y_pred)
print('mean:', round(np.mean(scores), 4), '   ', scores)
    
    
In [5]:
    
# Score every single column (positions 0..16 of real_X) on its own:
# cross-validate the classifier and keep the per-fold and mean MAE.
rows = []
for c in range(17):
    # Select the column by position. Plain [] indexing is label-based and
    # integer keys no longer fall back to positions on string column names.
    new_X = real_X.iloc[:, [c]]

    scores = np.zeros(len(cv))  # fresh array per candidate; stored in the result
    for i, (train_idx, test_idx) in enumerate(cv):
        # The splitter yields positional row indices, hence .iloc.
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)

        scores[i] = mean_absolute_error(y_test, y_pred)
    rows.append({'combi': c, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame in one go rather than growing it row by row.
combi_df1 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])
    
In [6]:
    
# Number of scored candidates, followed by the five rows with the lowest mean MAE.
print(combi_df1.shape[0])
combi_df1.sort_values(by='mean').head(5)
    
    
    Out[6]:
In [7]:
    
# Exhaustive search over every 2-column subset of positions 0..16 of real_X:
# cross-validate the classifier on each subset and keep the per-fold and mean MAE.
rows = []
idx_list = list(itertools.combinations(range(17), 2))
for idx in idx_list:
    # Select the subset by column position. Plain [] indexing is label-based
    # and integer keys no longer fall back to positions.
    new_X = real_X.iloc[:, list(idx)]

    scores = np.zeros(len(cv))  # fresh array per candidate; stored in the result
    for i, (train_idx, test_idx) in enumerate(cv):
        # The splitter yields positional row indices, hence .iloc.
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)

        scores[i] = mean_absolute_error(y_test, y_pred)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame in one go rather than growing it row by row.
combi_df2 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])
    
In [8]:
    
# Number of scored pairs, followed by the five rows with the lowest mean MAE.
print(combi_df2.shape[0])
combi_df2.sort_values(by='mean').head(5)
    
    
    Out[8]:
In [9]:
    
# Exhaustive search over every 3-column subset of positions 0..16 of real_X:
# cross-validate the classifier on each subset and keep the per-fold and mean MAE.
rows = []
idx_list = list(itertools.combinations(range(17), 3))
for idx in idx_list:
    # Select the subset by column position. Plain [] indexing is label-based
    # and integer keys no longer fall back to positions.
    new_X = real_X.iloc[:, list(idx)]

    scores = np.zeros(len(cv))  # fresh array per candidate; stored in the result
    for i, (train_idx, test_idx) in enumerate(cv):
        # The splitter yields positional row indices, hence .iloc.
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)

        scores[i] = mean_absolute_error(y_test, y_pred)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame in one go rather than growing it row by row.
combi_df3 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])
    
In [10]:
    
# Number of scored triples, followed by the five rows with the lowest mean MAE.
print(combi_df3.shape[0])
combi_df3.sort_values(by='mean', ascending=True).head(5)
    
    
    Out[10]:
In [11]:
    
## Best score found while trying assorted three-feature subsets by hand.
## It was the optimum of that heuristic search, but the exhaustive
## combination search ranks this subset only second.
gnb = GaussianNB()
# Select the columns by position. Plain [] indexing is label-based and integer
# keys no longer fall back to positions.
new_X = real_X.iloc[:, [0, 1, 14]]
# Fresh buffer, so the cell does not depend on a `scores` array left over
# from whichever cell happened to run before it.
scores = np.zeros(len(cv))
for i, (train_idx, test_idx) in enumerate(cv):
    # The splitter yields positional row indices, hence .iloc.
    X_train = new_X.iloc[train_idx]
    y_train = y[train_idx]
    X_test = new_X.iloc[test_idx]
    y_test = y[test_idx]

    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    scores[i] = mean_absolute_error(y_test, y_pred)
print('mean:', round(np.mean(scores), 4), '   ', scores)
    
    
In [12]:
    
# Cross-validate the classifier on one hand-picked three-column subset.
gnb = GaussianNB()
# Select the columns by position (order preserved). Plain [] indexing is
# label-based and integer keys no longer fall back to positions.
new_X = real_X.iloc[:, [0, 16, 5]]
# Fresh buffer, so the cell does not depend on a `scores` array left over
# from whichever cell happened to run before it.
scores = np.zeros(len(cv))
for i, (train_idx, test_idx) in enumerate(cv):
    # The splitter yields positional row indices, hence .iloc.
    X_train = new_X.iloc[train_idx]
    y_train = y[train_idx]
    X_test = new_X.iloc[test_idx]
    y_test = y[test_idx]

    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    scores[i] = mean_absolute_error(y_test, y_pred)
print('mean:', round(np.mean(scores), 4), '   ', scores)
    
    
In [13]:
    
# Exhaustive search over every 4-column subset of positions 0..16 of real_X:
# cross-validate the classifier on each subset and keep the per-fold and mean MAE.
rows = []
idx_list = list(itertools.combinations(range(17), 4))
for idx in idx_list:
    # Select the subset by column position. Plain [] indexing is label-based
    # and integer keys no longer fall back to positions.
    new_X = real_X.iloc[:, list(idx)]

    scores = np.zeros(len(cv))  # fresh array per candidate; stored in the result
    for i, (train_idx, test_idx) in enumerate(cv):
        # The splitter yields positional row indices, hence .iloc.
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)

        scores[i] = mean_absolute_error(y_test, y_pred)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame in one go rather than growing it row by row.
combi_df4 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])
    
In [14]:
    
# Number of scored 4-subsets, followed by the five rows with the lowest mean MAE.
print(combi_df4.shape[0])
combi_df4.sort_values(by='mean', ascending=True).head(5)
    
    
    Out[14]:
In [15]:
    
# Exhaustive search over every 5-column subset of positions 0..16 of real_X:
# cross-validate the classifier on each subset and keep the per-fold and mean MAE.
rows = []
idx_list = list(itertools.combinations(range(17), 5))
for idx in idx_list:
    # Select the subset by column position. Plain [] indexing is label-based
    # and integer keys no longer fall back to positions.
    new_X = real_X.iloc[:, list(idx)]

    scores = np.zeros(len(cv))  # fresh array per candidate; stored in the result
    for i, (train_idx, test_idx) in enumerate(cv):
        # The splitter yields positional row indices, hence .iloc.
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)

        scores[i] = mean_absolute_error(y_test, y_pred)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame in one go rather than growing it row by row.
combi_df5 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])
    
In [16]:
    
# Number of scored 5-subsets, followed by the five rows with the lowest mean MAE.
print(combi_df5.shape[0])
combi_df5.sort_values(by='mean', ascending=True).head(5)
    
    
    Out[16]:
In [17]:
    
# Exhaustive search over every 6-column subset of positions 0..16 of real_X:
# cross-validate the classifier on each subset and keep the per-fold and mean MAE.
rows = []
idx_list = list(itertools.combinations(range(17), 6))
for idx in idx_list:
    # Select the subset by column position. Plain [] indexing is label-based
    # and integer keys no longer fall back to positions.
    new_X = real_X.iloc[:, list(idx)]

    scores = np.zeros(len(cv))  # fresh array per candidate; stored in the result
    for i, (train_idx, test_idx) in enumerate(cv):
        # The splitter yields positional row indices, hence .iloc.
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)

        scores[i] = mean_absolute_error(y_test, y_pred)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame in one go rather than growing it row by row.
combi_df6 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])
    
In [18]:
    
# Number of scored 6-subsets, followed by the five rows with the lowest mean MAE.
print(combi_df6.shape[0])
combi_df6.sort_values(by='mean', ascending=True).head(5)
    
    
    Out[18]:
In [ ]:
    
# Exhaustive search over every 7-column subset of positions 0..16 of real_X:
# cross-validate the classifier on each subset and keep the per-fold and mean MAE.
rows = []
idx_list = list(itertools.combinations(range(17), 7))
for idx in idx_list:
    # Select the subset by column position. Plain [] indexing is label-based
    # and integer keys no longer fall back to positions.
    new_X = real_X.iloc[:, list(idx)]

    scores = np.zeros(len(cv))  # fresh array per candidate; stored in the result
    for i, (train_idx, test_idx) in enumerate(cv):
        # The splitter yields positional row indices, hence .iloc.
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)

        scores[i] = mean_absolute_error(y_test, y_pred)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame in one go rather than growing it row by row.
combi_df7 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])
    
In [ ]:
    
# Number of scored 7-subsets, followed by the five rows with the lowest mean MAE.
print(combi_df7.shape[0])
combi_df7.sort_values(by='mean', ascending=True).head(5)
    
In [ ]:
    
# Exhaustive search over every 8-column subset of positions 0..16 of real_X:
# cross-validate the classifier on each subset and keep the per-fold and mean MAE.
rows = []
idx_list = list(itertools.combinations(range(17), 8))
for idx in idx_list:
    # Select the subset by column position. Plain [] indexing is label-based
    # and integer keys no longer fall back to positions.
    new_X = real_X.iloc[:, list(idx)]

    scores = np.zeros(len(cv))  # fresh array per candidate; stored in the result
    for i, (train_idx, test_idx) in enumerate(cv):
        # The splitter yields positional row indices, hence .iloc.
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)

        scores[i] = mean_absolute_error(y_test, y_pred)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame in one go rather than growing it row by row.
combi_df8 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])
    
In [ ]:
    
# Number of scored 8-subsets, followed by the five rows with the lowest mean MAE.
print(combi_df8.shape[0])
combi_df8.sort_values(by='mean', ascending=True).head(5)
    
In [19]:
    
# Exhaustive search over every 12-column subset of positions 0..16 of real_X:
# cross-validate the classifier on each subset and keep the per-fold and mean MAE.
rows = []
idx_list = list(itertools.combinations(range(17), 12))
for idx in idx_list:
    # Select the subset by column position. Plain [] indexing is label-based
    # and integer keys no longer fall back to positions.
    new_X = real_X.iloc[:, list(idx)]

    scores = np.zeros(len(cv))  # fresh array per candidate; stored in the result
    for i, (train_idx, test_idx) in enumerate(cv):
        # The splitter yields positional row indices, hence .iloc.
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)

        scores[i] = mean_absolute_error(y_test, y_pred)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame in one go rather than growing it row by row.
combi_df12 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])
    
In [20]:
    
# Number of scored 12-subsets, followed by the five rows with the lowest mean MAE.
print(combi_df12.shape[0])
combi_df12.sort_values(by='mean', ascending=True).head(5)
    
    
    Out[20]:
In [21]:
    
# Exhaustive search over every 13-column subset of positions 0..16 of real_X:
# cross-validate the classifier on each subset and keep the per-fold and mean MAE.
rows = []
idx_list = list(itertools.combinations(range(17), 13))
for idx in idx_list:
    # Select the subset by column position. Plain [] indexing is label-based
    # and integer keys no longer fall back to positions.
    new_X = real_X.iloc[:, list(idx)]

    scores = np.zeros(len(cv))  # fresh array per candidate; stored in the result
    for i, (train_idx, test_idx) in enumerate(cv):
        # The splitter yields positional row indices, hence .iloc.
        X_train = new_X.iloc[train_idx]
        y_train = y[train_idx]
        X_test = new_X.iloc[test_idx]
        y_test = y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)

        scores[i] = mean_absolute_error(y_test, y_pred)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})
# Build the result frame in one go rather than growing it row by row.
combi_df13 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])
    
In [22]:
    
# Number of scored 13-subsets, followed by the five rows with the lowest mean MAE.
print(combi_df13.shape[0])
combi_df13.sort_values(by='mean', ascending=True).head(5)
    
    
    Out[22]: