In [1]:
# Imports live in the first cell so the notebook survives "Restart & Run All";
# `pd` and `np` are used throughout but were not imported in any visible cell.
import numpy as np
import pandas as pd

# Load the prepared dataset; the 'rating(y)' column is the prediction target.
df = pd.read_csv('../resource/final_df2.csv')
y = df['rating(y)'].values

# Feature matrix: every column from 'avg_rating' through 'skew' (label slice,
# both endpoints inclusive). `.ix` was removed in pandas 1.0; `.loc` performs
# the same label-based selection.
real_X = df.loc[:, 'avg_rating':'skew']
real_X.head(2)


Out[1]:
avg_rating lee_rating eval_count wish_count cmt_count run_time year std skew
0 4.22683 4 13025 9796 2585 128 2015 0.625521 -1.002260
1 2.99629 3 58122 3166 965 121 2013 1.059023 -0.347552

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_absolute_error
import itertools

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20. Prefer `sklearn.model_selection`, falling back to the legacy module so
# the notebook still runs on old installations.
try:
    from sklearn.model_selection import KFold, StratifiedKFold
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import KFold, StratifiedKFold
    # Legacy API: labels are passed to the constructor and the object itself
    # iterates over (train_idx, test_idx) pairs.
    cv = list(StratifiedKFold(y, n_folds=3, random_state=0))
else:
    # Modern API: the splitter is configured first and `split` yields the
    # folds. `random_state` is omitted because shuffling is off (it had no
    # effect in the original call, and recent releases reject the combination).
    cv = list(StratifiedKFold(n_splits=3).split(real_X, y))

# `cv` is materialised as a list of (train_idx, test_idx) pairs so that the
# later cells can iterate over the same folds as many times as they need.
gnb = GaussianNB()

In [3]:
# Baseline: cross-validated mean absolute error of GaussianNB on all features.
fold_scores = []
for train_idx, test_idx in cv:
    # The fold indices are row positions, so select with `.iloc`
    # (`.ix` was removed in pandas 1.0 and was label-based for integer indexes).
    X_train, X_test = real_X.iloc[train_idx], real_X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)

    fold_scores.append(mean_absolute_error(y_test, y_pred))

# One entry per fold, independent of how many folds `cv` contains.
scores = np.array(fold_scores)
print('mean:', round(np.mean(scores), 4), '   ', scores)


mean: 1.0493     [ 1.03825137  1.12087912  0.98882682]

In [4]:
# Score every feature on its own: cross-validate GaussianNB on each single
# column and record the per-fold and mean MAE.
rows = []
for c in range(real_X.shape[1]):
    # `c` is a column position. `real_X[[c]]` would treat it as a column label
    # and raises KeyError on current pandas because the columns are named by
    # strings; `.iloc[:, [c]]` selects by position and keeps a 2-D frame.
    new_X = real_X.iloc[:, [c]]

    fold_scores = []
    for train_idx, test_idx in cv:
        # Fold indices are row positions (`.ix` was removed in pandas 1.0).
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))

    scores = np.array(fold_scores)
    rows.append({'combi': c, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result table once instead of growing it row by row.
combi_df1 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])

In [5]:
# Row count of the single-feature results, then the five lowest mean MAEs.
print(combi_df1.shape[0])
combi_df1.sort_values(by='mean').head(5)


9
Out[5]:
combi mean scores
0 0 0.6374 [0.655737704918, 0.692307692308, 0.564245810056]
8 8 0.6414 [0.633879781421, 0.67032967033, 0.620111731844]
7 7 0.7570 [0.79781420765, 0.758241758242, 0.715083798883]
5 5 0.7867 [0.814207650273, 0.763736263736, 0.782122905028]
6 6 0.8310 [0.825136612022, 0.818681318681, 0.849162011173]

In [6]:
# Exhaustive search over every 2-feature subset: cross-validate GaussianNB on
# each subset and record the per-fold and mean MAE.
idx_list = list(itertools.combinations(range(real_X.shape[1]), 2))

rows = []
for idx in idx_list:
    # `idx` holds column positions. `real_X[list(idx)]` would treat them as
    # column labels and raises KeyError on current pandas because the columns
    # are named by strings, so select by position with `.iloc`.
    new_X = real_X.iloc[:, list(idx)]

    fold_scores = []
    for train_idx, test_idx in cv:
        # Fold indices are row positions (`.ix` was removed in pandas 1.0).
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))

    scores = np.array(fold_scores)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result table once instead of growing it row by row.
combi_df2 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])

In [7]:
# Row count of the 2-feature results, then the five lowest mean MAEs.
print(len(combi_df2.index))
combi_df2.sort_values(by='mean').iloc[:5]


36
Out[7]:
combi mean scores
4 (0, 5) 0.6541 [0.661202185792, 0.708791208791, 0.59217877095]
5 (0, 6) 0.6559 [0.661202185792, 0.708791208791, 0.597765363128]
32 (5, 8) 0.6762 [0.68306010929, 0.708791208791, 0.63687150838]
34 (6, 8) 0.6912 [0.693989071038, 0.686813186813, 0.692737430168]
0 (0, 1) 0.6913 [0.628415300546, 0.752747252747, 0.692737430168]

In [8]:
# Exhaustive search over every 3-feature subset: cross-validate GaussianNB on
# each subset and record the per-fold and mean MAE.
idx_list = list(itertools.combinations(range(real_X.shape[1]), 3))

rows = []
for idx in idx_list:
    # `idx` holds column positions. `real_X[list(idx)]` would treat them as
    # column labels and raises KeyError on current pandas because the columns
    # are named by strings, so select by position with `.iloc`.
    new_X = real_X.iloc[:, list(idx)]

    fold_scores = []
    for train_idx, test_idx in cv:
        # Fold indices are row positions (`.ix` was removed in pandas 1.0).
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))

    scores = np.array(fold_scores)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result table once instead of growing it row by row.
combi_df3 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])

In [9]:
# Row count of the 3-feature results, then the five lowest mean MAEs.
print(combi_df3.shape[0])
combi_df3.sort_values(by='mean', ascending=True).head(5)


84
Out[9]:
combi mean scores
22 (0, 5, 6) 0.6633 [0.666666666667, 0.708791208791, 0.614525139665]
26 (0, 6, 8) 0.6910 [0.688524590164, 0.71978021978, 0.664804469274]
3 (0, 1, 5) 0.6928 [0.677595628415, 0.758241758242, 0.642458100559]
82 (5, 7, 8) 0.6967 [0.710382513661, 0.675824175824, 0.703910614525]
4 (0, 1, 6) 0.6970 [0.655737704918, 0.703296703297, 0.731843575419]

In [10]:
# Exhaustive search over every 4-feature subset: cross-validate GaussianNB on
# each subset and record the per-fold and mean MAE.
idx_list = list(itertools.combinations(range(real_X.shape[1]), 4))

rows = []
for idx in idx_list:
    # `idx` holds column positions. `real_X[list(idx)]` would treat them as
    # column labels and raises KeyError on current pandas because the columns
    # are named by strings, so select by position with `.iloc`.
    new_X = real_X.iloc[:, list(idx)]

    fold_scores = []
    for train_idx, test_idx in cv:
        # Fold indices are row positions (`.ix` was removed in pandas 1.0).
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))

    scores = np.array(fold_scores)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result table once instead of growing it row by row.
combi_df4 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])

In [11]:
# Row count of the 4-feature results, then the five lowest mean MAEs.
print(len(combi_df4.index))
combi_df4.sort_values(by='mean', ascending=True).iloc[:5]


126
Out[11]:
combi mean scores
15 (0, 1, 5, 6) 0.7040 [0.672131147541, 0.758241758242, 0.68156424581]
16 (0, 1, 5, 7) 0.7111 [0.72131147541, 0.747252747253, 0.664804469274]
55 (0, 6, 7, 8) 0.7130 [0.732240437158, 0.730769230769, 0.675977653631]
53 (0, 5, 6, 8) 0.7132 [0.699453551913, 0.736263736264, 0.703910614525]
54 (0, 5, 7, 8) 0.7188 [0.737704918033, 0.686813186813, 0.731843575419]

In [12]:
# Exhaustive search over every 5-feature subset: cross-validate GaussianNB on
# each subset and record the per-fold and mean MAE.
idx_list = list(itertools.combinations(range(real_X.shape[1]), 5))

rows = []
for idx in idx_list:
    # `idx` holds column positions. `real_X[list(idx)]` would treat them as
    # column labels and raises KeyError on current pandas because the columns
    # are named by strings, so select by position with `.iloc`.
    new_X = real_X.iloc[:, list(idx)]

    fold_scores = []
    for train_idx, test_idx in cv:
        # Fold indices are row positions (`.ix` was removed in pandas 1.0).
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))

    scores = np.array(fold_scores)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result table once instead of growing it row by row.
combi_df5 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])

In [13]:
# Row count of the 5-feature results, then the five lowest mean MAEs.
print(combi_df5.shape[0])
combi_df5.sort_values(by='mean', ascending=True).head(5)


126
Out[13]:
combi mean scores
32 (0, 1, 5, 6, 8) 0.7098 [0.655737704918, 0.730769230769, 0.743016759777]
69 (0, 5, 6, 7, 8) 0.7297 [0.743169398907, 0.725274725275, 0.720670391061]
31 (0, 1, 5, 6, 7) 0.7298 [0.68306010929, 0.796703296703, 0.709497206704]
33 (0, 1, 5, 7, 8) 0.7300 [0.704918032787, 0.730769230769, 0.754189944134]
104 (1, 5, 6, 7, 8) 0.7317 [0.688524590164, 0.769230769231, 0.737430167598]

In [14]:
# Exhaustive search over every 6-feature subset: cross-validate GaussianNB on
# each subset and record the per-fold and mean MAE.
idx_list = list(itertools.combinations(range(real_X.shape[1]), 6))

rows = []
for idx in idx_list:
    # `idx` holds column positions. `real_X[list(idx)]` would treat them as
    # column labels and raises KeyError on current pandas because the columns
    # are named by strings, so select by position with `.iloc`.
    new_X = real_X.iloc[:, list(idx)]

    fold_scores = []
    for train_idx, test_idx in cv:
        # Fold indices are row positions (`.ix` was removed in pandas 1.0).
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))

    scores = np.array(fold_scores)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result table once instead of growing it row by row.
combi_df6 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])

In [15]:
# Row count of the 6-feature results, then the five lowest mean MAEs.
print(len(combi_df6.index))
combi_df6.sort_values(by='mean', ascending=True).iloc[:5]


84
Out[15]:
combi mean scores
54 (0, 3, 5, 6, 7, 8) 0.7443 [0.737704918033, 0.78021978022, 0.715083798883]
34 (0, 1, 5, 6, 7, 8) 0.7483 [0.699453551913, 0.796703296703, 0.748603351955]
28 (0, 1, 3, 5, 7, 8) 0.7496 [0.79781420765, 0.758241758242, 0.692737430168]
27 (0, 1, 3, 5, 6, 8) 0.7591 [0.726775956284, 0.818681318681, 0.731843575419]
29 (0, 1, 3, 6, 7, 8) 0.7702 [0.732240437158, 0.824175824176, 0.754189944134]

In [16]:
# Exhaustive search over every 7-feature subset: cross-validate GaussianNB on
# each subset and record the per-fold and mean MAE.
idx_list = list(itertools.combinations(range(real_X.shape[1]), 7))

rows = []
for idx in idx_list:
    # `idx` holds column positions. `real_X[list(idx)]` would treat them as
    # column labels and raises KeyError on current pandas because the columns
    # are named by strings, so select by position with `.iloc`.
    new_X = real_X.iloc[:, list(idx)]

    fold_scores = []
    for train_idx, test_idx in cv:
        # Fold indices are row positions (`.ix` was removed in pandas 1.0).
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))

    scores = np.array(fold_scores)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result table once instead of growing it row by row.
combi_df7 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])

In [17]:
# Row count of the 7-feature results, then the five lowest mean MAEs.
print(combi_df7.shape[0])
combi_df7.sort_values(by='mean', ascending=True).head(5)


36
Out[17]:
combi mean scores
19 (0, 1, 3, 5, 6, 7, 8) 0.7591 [0.726775956284, 0.813186813187, 0.737430167598]
20 (0, 1, 4, 5, 6, 7, 8) 0.7883 [0.814207650273, 0.807692307692, 0.743016759777]
14 (0, 1, 2, 5, 6, 7, 8) 0.8255 [0.841530054645, 0.774725274725, 0.860335195531]
17 (0, 1, 3, 4, 5, 7, 8) 0.8635 [0.934426229508, 0.846153846154, 0.810055865922]
27 (0, 3, 4, 5, 6, 7, 8) 0.8678 [0.868852459016, 0.840659340659, 0.893854748603]

In [18]:
# Exhaustive search over every 8-feature subset: cross-validate GaussianNB on
# each subset and record the per-fold and mean MAE.
idx_list = list(itertools.combinations(range(real_X.shape[1]), 8))

rows = []
for idx in idx_list:
    # `idx` holds column positions. `real_X[list(idx)]` would treat them as
    # column labels and raises KeyError on current pandas because the columns
    # are named by strings, so select by position with `.iloc`.
    new_X = real_X.iloc[:, list(idx)]

    fold_scores = []
    for train_idx, test_idx in cv:
        # Fold indices are row positions (`.ix` was removed in pandas 1.0).
        X_train, X_test = new_X.iloc[train_idx], new_X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
        fold_scores.append(mean_absolute_error(y_test, y_pred))

    scores = np.array(fold_scores)
    rows.append({'combi': idx, 'mean': round(np.mean(scores), 4), 'scores': scores})

# Build the result table once instead of growing it row by row.
combi_df8 = pd.DataFrame(rows, columns=['combi', 'mean', 'scores'])

In [19]:
# Row count of the 8-feature results, then the five lowest mean MAEs.
print(len(combi_df8.index))
combi_df8.sort_values(by='mean', ascending=True).iloc[:5]


9
Out[19]:
combi mean scores
6 (0, 1, 3, 4, 5, 6, 7, 8) 0.8619 [0.852459016393, 0.912087912088, 0.821229050279]
4 (0, 1, 2, 3, 5, 6, 7, 8) 0.8840 [0.868852459016, 0.93956043956, 0.843575418994]
0 (0, 1, 2, 3, 4, 5, 6, 7) 1.0493 [1.03825136612, 1.12087912088, 0.988826815642]
1 (0, 1, 2, 3, 4, 5, 6, 8) 1.0493 [1.03825136612, 1.12087912088, 0.988826815642]
7 (0, 2, 3, 4, 5, 6, 7, 8) 1.0493 [1.03825136612, 1.12087912088, 0.988826815642]