Gaussian NB (real var)

opt feature selection : 'avg_rating'

17 combination 경우의 수



In [1]:

    
from scipy import misc
for i in range(1, 18):
    print(i, misc.comb(17, i))



In [2]:

    
df = pd.read_csv('../resource/final_df1.csv')
y = df['rating(y)'].values
real_X = df.ix[:,'avg_rating':'star5']
cat_X = df.ix[:,'0':]
real_X.head(2)

StratifiedKFold (n_folds = 3)



In [3]:

    
from sklearn.cross_validation import KFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_absolute_error
import itertools

gnb = GaussianNB()
cv = StratifiedKFold(y, n_folds=3, random_state=0)

0. 모든피쳐 사용했을때



In [4]:

    
scores = np.zeros(3)
for i, (train_idx, test_idx) in enumerate(cv):
    X_train = real_X.ix[train_idx]
    y_train = y[train_idx]
    X_test = real_X.ix[test_idx]
    y_test = y[test_idx]
    
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    
    scores[i] = mean_absolute_error(y_test, y_pred)

#scores
print('mean:', round(np.mean(scores), 4), '   ', scores)









    



mean: 0.9482     [ 0.93442623  1.02197802  0.88826816]

1. 피쳐 한개씩 사용했을때(combi 1)

idx = 0, 16, 5, 12, 6, 7 .. 순으로 score 좋음



In [5]:

    
combi_df1 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

for c in range(17):
    new_X = real_X[[c]]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df1.loc[len(combi_df1)] = [c, round(np.mean(scores), 4), scores]
    #print(c, 'mean:', round(np.mean(scores), 4), '   ', scores)



In [6]:

    
print(len(combi_df1))
combi_df1.sort_values('mean', ascending=True).head(5)









    



17






    Out[6]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      0
      0
      0.6374
      [0.655737704918, 0.692307692308, 0.564245810056]
    
    
      16
      16
      0.7811
      [0.819672131148, 0.758241758242, 0.765363128492]
    
    
      5
      5
      0.7867
      [0.814207650273, 0.763736263736, 0.782122905028]
    
    
      12
      12
      0.8308
      [0.83606557377, 0.840659340659, 0.815642458101]
    
    
      6
      6
      0.8310
      [0.825136612022, 0.818681318681, 0.849162011173]

2. 피쳐 2개씩 사용했을때(combi 2)



In [7]:

    
combi_df2 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 2))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df2.loc[len(combi_df2)] = [idx, round(np.mean(scores), 4), scores]
    #print(idx, 'mean:', round(np.mean(scores), 4), '   ', scores)



In [8]:

    
print(len(combi_df2))
combi_df2.sort_values('mean', ascending=True).head(5)









    



136






    Out[8]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      4
      (0, 5)
      0.6541
      [0.661202185792, 0.708791208791, 0.59217877095]
    
    
      5
      (0, 6)
      0.6559
      [0.661202185792, 0.708791208791, 0.597765363128]
    
    
      13
      (0, 14)
      0.6592
      [0.693989071038, 0.741758241758, 0.541899441341]
    
    
      0
      (0, 1)
      0.6913
      [0.628415300546, 0.752747252747, 0.692737430168]
    
    
      8
      (0, 9)
      0.7007
      [0.655737704918, 0.708791208791, 0.737430167598]

3. 피쳐 3개씩 사용했을때(combi 3)



In [9]:

    
combi_df3 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 3))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df3.loc[len(combi_df3)] = [idx, round(np.mean(scores), 4), scores]
    #print(idx, 'mean:', round(np.mean(scores), 4), '   ', scores)



In [10]:

    
print(len(combi_df3))
combi_df3.sort_values('mean').head(5)









    



680






    Out[10]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      54
      (0, 5, 6)
      0.6633
      [0.666666666667, 0.708791208791, 0.614525139665]
    
    
      12
      (0, 1, 14)
      0.6666
      [0.715846994536, 0.730769230769, 0.553072625698]
    
    
      96
      (0, 9, 14)
      0.6708
      [0.661202185792, 0.703296703297, 0.648044692737]
    
    
      57
      (0, 5, 9)
      0.6858
      [0.655737704918, 0.703296703297, 0.698324022346]
    
    
      102
      (0, 10, 14)
      0.6869
      [0.754098360656, 0.697802197802, 0.608938547486]



In [11]:

    
## 이것저것 3개 해봤을때 제일 좋은 스코어
## 휴리스틱하게 찾았을때 최적 스코어였으나 combi로 찾으니 2번째 순위의 조합이였음
gnb = GaussianNB()
new_X = real_X[[0, 1, 14,  ]]


for i, (train_idx, test_idx) in enumerate(cv):
    X_train = new_X.ix[train_idx]
    y_train = y[train_idx]
    X_test = new_X.ix[test_idx]
    y_test = y[test_idx]
    
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    
    scores[i] = mean_absolute_error(y_test, y_pred)

scores
print('mean:', round(np.mean(scores), 4), '   ', scores)









    



mean: 0.6666     [ 0.71584699  0.73076923  0.55307263]

3-1. 상위 3개 피쳐(avg_rating, star5, run_time)으로만 했을때 스코어 낮게나왔음



In [12]:

    
gnb = GaussianNB()
new_X = real_X[[0, 16, 5]]


for i, (train_idx, test_idx) in enumerate(cv):
    X_train = new_X.ix[train_idx]
    y_train = y[train_idx]
    X_test = new_X.ix[test_idx]
    y_test = y[test_idx]
    
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    
    scores[i] = mean_absolute_error(y_test, y_pred)

scores
print('mean:', round(np.mean(scores), 4), '   ', scores)









    



mean: 0.9109     [ 0.98360656  0.97802198  0.77094972]

4. 피쳐 4개씩 사용했을때(combi 4)



In [13]:

    
combi_df4 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 4))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df4.loc[len(combi_df4)] = [idx, round(np.mean(scores), 4), scores]



In [14]:

    
print(len(combi_df4))
combi_df4.sort_values('mean').head(5)









    



2380






    Out[14]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      371
      (0, 5, 9, 14)
      0.6707
      [0.672131147541, 0.714285714286, 0.625698324022]
    
    
      416
      (0, 6, 9, 14)
      0.6709
      [0.650273224044, 0.703296703297, 0.659217877095]
    
    
      401
      (0, 6, 7, 14)
      0.6744
      [0.688524590164, 0.697802197802, 0.63687150838]
    
    
      66
      (0, 1, 7, 14)
      0.6814
      [0.710382513661, 0.741758241758, 0.59217877095]
    
    
      356
      (0, 5, 7, 14)
      0.6854
      [0.704918032787, 0.714285714286, 0.63687150838]

5. 피쳐 5개씩 사용시(combi 5)



In [15]:

    
combi_df5 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 5))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df5.loc[len(combi_df5)] = [idx, round(np.mean(scores), 4), scores]



In [16]:

    
print(len(combi_df5))
combi_df5.sort_values('mean').head(5)









    



6188






    Out[16]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      266
      (0, 1, 5, 9, 14)
      0.6745
      [0.666666666667, 0.714285714286, 0.642458100559]
    
    
      1346
      (0, 5, 6, 9, 14)
      0.6782
      [0.688524590164, 0.692307692308, 0.653631284916]
    
    
      251
      (0, 1, 5, 7, 14)
      0.6834
      [0.704918032787, 0.730769230769, 0.614525139665]
    
    
      1331
      (0, 5, 6, 7, 14)
      0.6835
      [0.693989071038, 0.730769230769, 0.625698324022]
    
    
      296
      (0, 1, 6, 7, 14)
      0.6871
      [0.704918032787, 0.736263736264, 0.620111731844]

6. 피쳐 6개씩 사용시(combi 6)



In [17]:

    
combi_df6 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 6))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df6.loc[len(combi_df6)] = [idx, round(np.mean(scores), 4), scores]



In [18]:

    
print(len(combi_df6))
combi_df6.sort_values('mean').head(5)









    



12376






    Out[18]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      876
      (0, 1, 5, 6, 7, 14)
      0.6798
      [0.68306010929, 0.736263736264, 0.620111731844]
    
    
      891
      (0, 1, 5, 6, 9, 14)
      0.6821
      [0.644808743169, 0.714285714286, 0.687150837989]
    
    
      991
      (0, 1, 5, 9, 12, 14)
      0.6853
      [0.693989071038, 0.747252747253, 0.614525139665]
    
    
      994
      (0, 1, 5, 9, 13, 14)
      0.6948
      [0.710382513661, 0.686813186813, 0.687150837989]
    
    
      1062
      (0, 1, 6, 7, 12, 14)
      0.6978
      [0.754098360656, 0.747252747253, 0.59217877095]

7. 피쳐 7개씩 사용시(combi 7)



In [ ]:

    
combi_df7 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 7))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df7.loc[len(combi_df7)] = [idx, round(np.mean(scores), 4), scores]



In [ ]:

    
print(len(combi_df7))
combi_df7.sort_values('mean').head(5)

8. 피쳐 8개씩 사용시(combi 8)



In [ ]:

    
combi_df8 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 8))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df8.loc[len(combi_df8)] = [idx, round(np.mean(scores), 4), scores]



In [ ]:

    
print(len(combi_df8))
combi_df8.sort_values('mean').head(5)

12. 피쳐 12개씩 사용했을때(combi 12)



In [19]:

    
combi_df12 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 12))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df12.loc[len(combi_df12)] = [idx, round(np.mean(scores), 4), scores]



In [20]:

    
print(len(combi_df12))
combi_df12.sort_values('mean').head(5)









    



6188






    Out[20]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      1813
      (0, 1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 16)
      0.7687
      [0.775956284153, 0.708791208791, 0.821229050279]
    
    
      1854
      (0, 1, 2, 5, 6, 7, 9, 11, 12, 13, 14, 16)
      0.7745
      [0.743169398907, 0.708791208791, 0.871508379888]
    
    
      2948
      (0, 1, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16)
      0.7759
      [0.792349726776, 0.71978021978, 0.815642458101]
    
    
      2965
      (0, 1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15)
      0.7760
      [0.781420765027, 0.714285714286, 0.832402234637]
    
    
      1930
      (0, 1, 2, 5, 7, 9, 10, 11, 12, 13, 14, 15)
      0.7778
      [0.792349726776, 0.708791208791, 0.832402234637]

13. 피쳐 13개씩 사용했을때(combi 13)



In [21]:

    
combi_df13 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(17), 13))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df13.loc[len(combi_df13)] = [idx, round(np.mean(scores), 4), scores]



In [22]:

    
print(len(combi_df13))
combi_df13.sort_values('mean').head(5)









    



2380






    Out[22]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      963
      (0, 1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15)
      0.7799
      [0.770491803279, 0.697802197802, 0.871508379888]
    
    
      939
      (0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16)
      0.7833
      [0.803278688525, 0.725274725275, 0.821229050279]
    
    
      946
      (0, 1, 2, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16)
      0.7834
      [0.803278688525, 0.708791208791, 0.837988826816]
    
    
      945
      (0, 1, 2, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15)
      0.7869
      [0.814207650273, 0.71978021978, 0.826815642458]
    
    
      1361
      (0, 1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16)
      0.7873
      [0.770491803279, 0.708791208791, 0.882681564246]

	avg_rating	lee_rating	eval_count	wish_count	cmt_count	run_time	year	star0.5	star1	star1.5	star2	star2.5	star3	star3.5	star4	star4.5	star5
0	4.22683	4	13025	9796	2585	128	2015	7	10	14	83	50	1472	454	4509	4318	2108
1	2.99629	3	58122	3166	965	121	2013	1312	2238	2150	6749	6597	9397	16842	1367	9011	2459

	combi	mean	scores
0	0	0.6374	[0.655737704918, 0.692307692308, 0.564245810056]
16	16	0.7811	[0.819672131148, 0.758241758242, 0.765363128492]
5	5	0.7867	[0.814207650273, 0.763736263736, 0.782122905028]
12	12	0.8308	[0.83606557377, 0.840659340659, 0.815642458101]
6	6	0.8310	[0.825136612022, 0.818681318681, 0.849162011173]

	combi	mean	scores
4	(0, 5)	0.6541	[0.661202185792, 0.708791208791, 0.59217877095]
5	(0, 6)	0.6559	[0.661202185792, 0.708791208791, 0.597765363128]
13	(0, 14)	0.6592	[0.693989071038, 0.741758241758, 0.541899441341]
0	(0, 1)	0.6913	[0.628415300546, 0.752747252747, 0.692737430168]
8	(0, 9)	0.7007	[0.655737704918, 0.708791208791, 0.737430167598]

	combi	mean	scores
54	(0, 5, 6)	0.6633	[0.666666666667, 0.708791208791, 0.614525139665]
12	(0, 1, 14)	0.6666	[0.715846994536, 0.730769230769, 0.553072625698]
96	(0, 9, 14)	0.6708	[0.661202185792, 0.703296703297, 0.648044692737]
57	(0, 5, 9)	0.6858	[0.655737704918, 0.703296703297, 0.698324022346]
102	(0, 10, 14)	0.6869	[0.754098360656, 0.697802197802, 0.608938547486]

	combi	mean	scores
371	(0, 5, 9, 14)	0.6707	[0.672131147541, 0.714285714286, 0.625698324022]
416	(0, 6, 9, 14)	0.6709	[0.650273224044, 0.703296703297, 0.659217877095]
401	(0, 6, 7, 14)	0.6744	[0.688524590164, 0.697802197802, 0.63687150838]
66	(0, 1, 7, 14)	0.6814	[0.710382513661, 0.741758241758, 0.59217877095]
356	(0, 5, 7, 14)	0.6854	[0.704918032787, 0.714285714286, 0.63687150838]

	combi	mean	scores
266	(0, 1, 5, 9, 14)	0.6745	[0.666666666667, 0.714285714286, 0.642458100559]
1346	(0, 5, 6, 9, 14)	0.6782	[0.688524590164, 0.692307692308, 0.653631284916]
251	(0, 1, 5, 7, 14)	0.6834	[0.704918032787, 0.730769230769, 0.614525139665]
1331	(0, 5, 6, 7, 14)	0.6835	[0.693989071038, 0.730769230769, 0.625698324022]
296	(0, 1, 6, 7, 14)	0.6871	[0.704918032787, 0.736263736264, 0.620111731844]

	combi	mean	scores
876	(0, 1, 5, 6, 7, 14)	0.6798	[0.68306010929, 0.736263736264, 0.620111731844]
891	(0, 1, 5, 6, 9, 14)	0.6821	[0.644808743169, 0.714285714286, 0.687150837989]
991	(0, 1, 5, 9, 12, 14)	0.6853	[0.693989071038, 0.747252747253, 0.614525139665]
994	(0, 1, 5, 9, 13, 14)	0.6948	[0.710382513661, 0.686813186813, 0.687150837989]
1062	(0, 1, 6, 7, 12, 14)	0.6978	[0.754098360656, 0.747252747253, 0.59217877095]

	combi	mean	scores
1813	(0, 1, 2, 5, 6, 7, 8, 9, 12, 13, 14, 16)	0.7687	[0.775956284153, 0.708791208791, 0.821229050279]
1854	(0, 1, 2, 5, 6, 7, 9, 11, 12, 13, 14, 16)	0.7745	[0.743169398907, 0.708791208791, 0.871508379888]
2948	(0, 1, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16)	0.7759	[0.792349726776, 0.71978021978, 0.815642458101]
2965	(0, 1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15)	0.7760	[0.781420765027, 0.714285714286, 0.832402234637]
1930	(0, 1, 2, 5, 7, 9, 10, 11, 12, 13, 14, 15)	0.7778	[0.792349726776, 0.708791208791, 0.832402234637]

	combi	mean	scores
963	(0, 1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15)	0.7799	[0.770491803279, 0.697802197802, 0.871508379888]
939	(0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16)	0.7833	[0.803278688525, 0.725274725275, 0.821229050279]
946	(0, 1, 2, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16)	0.7834	[0.803278688525, 0.708791208791, 0.837988826816]
945	(0, 1, 2, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15)	0.7869	[0.814207650273, 0.71978021978, 0.826815642458]
1361	(0, 1, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16)	0.7873	[0.770491803279, 0.708791208791, 0.882681564246]