Gaussian NB (real var)

  • opt feature selection : 'avg_rating'

17 combination 경우의 수


In [1]:
from scipy import misc
for i in range(1, 18):
    print(i, misc.comb(17, i))


1 17.0
2 136.0
3 680.0
4 2380.0
5 6188.0
6 12376.0
7 19448.0
8 24310.0
9 24310.0
10 19448.0
11 12376.0
12 6188.0
13 2380.0
14 680.0
15 136.0
16 17.0
17 1.0

In [2]:
df = pd.read_csv('../resource/final_df1.csv')
y = df['rating(y)'].values
real_X = df.ix[:,'avg_rating':'star5']
cat_X = df.ix[:,'0':]
real_X.head(2)


Out[2]:
avg_rating lee_rating eval_count wish_count cmt_count run_time year star0.5 star1 star1.5 star2 star2.5 star3 star3.5 star4 star4.5 star5
0 4.22683 4 13025 9796 2585 128 2015 7 10 14 83 50 1472 454 4509 4318 2108
1 2.99629 3 58122 3166 965 121 2013 1312 2238 2150 6749 6597 9397 16842 1367 9011 2459

StratifiedKFold (n_folds = 3)


In [3]:
from sklearn.cross_validation import KFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_absolute_error
import itertools

gnb = GaussianNB()
cv = StratifiedKFold(y, n_folds=3, random_state=0)

0. 모든피쳐 사용했을때


In [4]:
scores = np.zeros(3)
for i, (train_idx, test_idx) in enumerate(cv):
    X_train = real_X.ix[train_idx]
    y_train = y[train_idx]
    X_test = real_X.ix[test_idx]
    y_test = y[test_idx]
    
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    
    scores[i] = mean_absolute_error(y_test, y_pred)

#scores
print('mean:', round(np.mean(scores), 4), '   ', scores)


mean: 0.9482     [ 0.93442623  1.02197802  0.88826816]

1. 피쳐 한개씩 사용했을때(combi 1)

  • idx = 0, 16, 5, 12, 6, 7 .. 순으로 score 좋음

In [5]:
combi_df1 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

for c in range(17):
    new_X = real_X[[c]]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df1.loc[len(combi_df1)] = [c, round(np.mean(scores), 4), scores]
    #print(c, 'mean:', round(np.mean(scores), 4), '   ', scores)

In [6]:
print(len(combi_df1))
combi_df1.sort_values('mean', ascending=True).head(5)


17
Out[6]:
combi mean scores
0 0 0.6374 [0.655737704918, 0.692307692308, 0.564245810056]
16 16 0.7811 [0.819672131148, 0.758241758242, 0.765363128492]
5 5 0.7867 [0.814207650273, 0.763736263736, 0.782122905028]
12 12 0.8308 [0.83606557377, 0.840659340659, 0.815642458101]
6 6 0.8310 [0.825136612022, 0.818681318681, 0.849162011173]

2. 피쳐 2개씩 사용했을때(combi 2)


In [7]:
combi_df2 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 2))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df2.loc[len(combi_df2)] = [idx, round(np.mean(scores), 4), scores]
    #print(idx, 'mean:', round(np.mean(scores), 4), '   ', scores)

In [8]:
print(len(combi_df2))
combi_df2.sort_values('mean', ascending=True).head(5)


136
Out[8]:
combi mean scores
4 (0, 5) 0.6541 [0.661202185792, 0.708791208791, 0.59217877095]
5 (0, 6) 0.6559 [0.661202185792, 0.708791208791, 0.597765363128]
13 (0, 14) 0.6592 [0.693989071038, 0.741758241758, 0.541899441341]
0 (0, 1) 0.6913 [0.628415300546, 0.752747252747, 0.692737430168]
8 (0, 9) 0.7007 [0.655737704918, 0.708791208791, 0.737430167598]

3. 피쳐 3개씩 사용했을때(combi 3)


In [9]:
combi_df3 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 3))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df3.loc[len(combi_df3)] = [idx, round(np.mean(scores), 4), scores]
    #print(idx, 'mean:', round(np.mean(scores), 4), '   ', scores)

In [10]:
print(len(combi_df3))
combi_df3.sort_values('mean').head(5)


680
Out[10]:
combi mean scores
54 (0, 5, 6) 0.6633 [0.666666666667, 0.708791208791, 0.614525139665]
12 (0, 1, 14) 0.6666 [0.715846994536, 0.730769230769, 0.553072625698]
96 (0, 9, 14) 0.6708 [0.661202185792, 0.703296703297, 0.648044692737]
57 (0, 5, 9) 0.6858 [0.655737704918, 0.703296703297, 0.698324022346]
102 (0, 10, 14) 0.6869 [0.754098360656, 0.697802197802, 0.608938547486]

In [11]:
## 이것저것 3개 해봤을때 제일 좋은 스코어
## 휴리스틱하게 찾았을때 최적 스코어였으나 combi로 찾으니 2번째 순위의 조합이였음
gnb = GaussianNB()
new_X = real_X[[0, 1, 14,  ]]


for i, (train_idx, test_idx) in enumerate(cv):
    X_train = new_X.ix[train_idx]
    y_train = y[train_idx]
    X_test = new_X.ix[test_idx]
    y_test = y[test_idx]
    
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    
    scores[i] = mean_absolute_error(y_test, y_pred)

scores
print('mean:', round(np.mean(scores), 4), '   ', scores)


mean: 0.6666     [ 0.71584699  0.73076923  0.55307263]

3-1. 상위 3개 피쳐(avg_rating, star5, run_time)으로만 했을때 스코어 낮게나왔음


In [12]:
gnb = GaussianNB()
new_X = real_X[[0, 16, 5]]


for i, (train_idx, test_idx) in enumerate(cv):
    X_train = new_X.ix[train_idx]
    y_train = y[train_idx]
    X_test = new_X.ix[test_idx]
    y_test = y[test_idx]
    
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    
    scores[i] = mean_absolute_error(y_test, y_pred)

scores
print('mean:', round(np.mean(scores), 4), '   ', scores)


mean: 0.9109     [ 0.98360656  0.97802198  0.77094972]

4. 피쳐 4개씩 사용했을때(combi 4)


In [13]:
combi_df4 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 4))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df4.loc[len(combi_df4)] = [idx, round(np.mean(scores), 4), scores]

In [14]:
print(len(combi_df4))
combi_df4.sort_values('mean').head(5)


2380
Out[14]:
combi mean scores
371 (0, 5, 9, 14) 0.6707 [0.672131147541, 0.714285714286, 0.625698324022]
416 (0, 6, 9, 14) 0.6709 [0.650273224044, 0.703296703297, 0.659217877095]
401 (0, 6, 7, 14) 0.6744 [0.688524590164, 0.697802197802, 0.63687150838]
66 (0, 1, 7, 14) 0.6814 [0.710382513661, 0.741758241758, 0.59217877095]
356 (0, 5, 7, 14) 0.6854 [0.704918032787, 0.714285714286, 0.63687150838]

5. 피쳐 5개씩 사용시(combi 5)


In [15]:
combi_df5 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 5))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df5.loc[len(combi_df5)] = [idx, round(np.mean(scores), 4), scores]

In [16]:
print(len(combi_df5))
combi_df5.sort_values('mean').head(5)


6188
Out[16]:
combi mean scores
266 (0, 1, 5, 9, 14) 0.6745 [0.666666666667, 0.714285714286, 0.642458100559]
1346 (0, 5, 6, 9, 14) 0.6782 [0.688524590164, 0.692307692308, 0.653631284916]
251 (0, 1, 5, 7, 14) 0.6834 [0.704918032787, 0.730769230769, 0.614525139665]
1331 (0, 5, 6, 7, 14) 0.6835 [0.693989071038, 0.730769230769, 0.625698324022]
296 (0, 1, 6, 7, 14) 0.6871 [0.704918032787, 0.736263736264, 0.620111731844]

6. 피쳐 6개씩 사용시(combi 6)


In [17]:
combi_df6 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 6))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df6.loc[len(combi_df6)] = [idx, round(np.mean(scores), 4), scores]

In [18]:
print(len(combi_df6))
combi_df6.sort_values('mean').head(5)


12376
Out[18]:
combi mean scores
876 (0, 1, 5, 6, 7, 14) 0.6798 [0.68306010929, 0.736263736264, 0.620111731844]
891 (0, 1, 5, 6, 9, 14) 0.6821 [0.644808743169, 0.714285714286, 0.687150837989]
991 (0, 1, 5, 9, 12, 14) 0.6853 [0.693989071038, 0.747252747253, 0.614525139665]
994 (0, 1, 5, 9, 13, 14) 0.6948 [0.710382513661, 0.686813186813, 0.687150837989]
1062 (0, 1, 6, 7, 12, 14) 0.6978 [0.754098360656, 0.747252747253, 0.59217877095]

7. 피쳐 7개씩 사용시(combi 7)


In [ ]:
combi_df7 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 7))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df7.loc[len(combi_df7)] = [idx, round(np.mean(scores), 4), scores]

In [ ]:
print(len(combi_df7))
combi_df7.sort_values('mean').head(5)

8. 피쳐 8개씩 사용시(combi 8)


In [ ]:
combi_df8 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 8))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df8.loc[len(combi_df8)] = [idx, round(np.mean(scores), 4), scores]

In [ ]:
print(len(combi_df8))
combi_df8.sort_values('mean').head(5)

12. 피쳐 12개씩 사용했을때(combi 12)


In [19]:
combi_df12 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 12))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df12.loc[len(combi_df12)] = [idx, round(np.mean(scores), 4), scores]

In [20]:
print(len(combi_df12))
combi_df12.sort_values('mean').head(5)


6188
Out[20]:
combi mean scores
1813 (0, 1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 16) 0.7687 [0.775956284153, 0.708791208791, 0.821229050279]
1854 (0, 1, 2, 5, 6, 7, 9, 11, 12, 13, 14, 16) 0.7745 [0.743169398907, 0.708791208791, 0.871508379888]
2948 (0, 1, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16) 0.7759 [0.792349726776, 0.71978021978, 0.815642458101]
2965 (0, 1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15) 0.7760 [0.781420765027, 0.714285714286, 0.832402234637]
1930 (0, 1, 2, 5, 7, 9, 10, 11, 12, 13, 14, 15) 0.7778 [0.792349726776, 0.708791208791, 0.832402234637]

13. 피쳐 13개씩 사용했을때(combi 13)


In [21]:
combi_df13 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 13))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df13.loc[len(combi_df13)] = [idx, round(np.mean(scores), 4), scores]

In [22]:
print(len(combi_df13))
combi_df13.sort_values('mean').head(5)


2380
Out[22]:
combi mean scores
963 (0, 1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15) 0.7799 [0.770491803279, 0.697802197802, 0.871508379888]
939 (0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16) 0.7833 [0.803278688525, 0.725274725275, 0.821229050279]
946 (0, 1, 2, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16) 0.7834 [0.803278688525, 0.708791208791, 0.837988826816]
945 (0, 1, 2, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15) 0.7869 [0.814207650273, 0.71978021978, 0.826815642458]
1361 (0, 1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16) 0.7873 [0.770491803279, 0.708791208791, 0.882681564246]