notebook.community

Edit and run



In [1]:

    
df = pd.read_csv('../resource/final_df2.csv')
y = df['rating(y)'].values
real_X = df.ix[:,'avg_rating':'skew']
real_X.head(2)



In [2]:

    
from sklearn.cross_validation import KFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_absolute_error
import itertools

gnb = GaussianNB()
cv = StratifiedKFold(y, n_folds=3, random_state=0)



In [3]:

    
scores = np.zeros(3)
for i, (train_idx, test_idx) in enumerate(cv):
    X_train = real_X.ix[train_idx]
    y_train = y[train_idx]
    X_test = real_X.ix[test_idx]
    y_test = y[test_idx]
    
    gnb.fit(X_train, y_train)
    y_pred = gnb.predict(X_test)
    
    scores[i] = mean_absolute_error(y_test, y_pred)

scores
print('mean:', round(np.mean(scores), 4), '   ', scores)









    



mean: 1.0493     [ 1.03825137  1.12087912  0.98882682]



In [4]:

    
combi_df1 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

for c in range(9):
    new_X = real_X[[c]]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df1.loc[len(combi_df1)] = [c, round(np.mean(scores), 4), scores]
    #print(c, 'mean:', round(np.mean(scores), 4), '   ', scores)



In [5]:

    
print(len(combi_df1))
combi_df1.sort_values('mean', ascending=True).head(5)









    



9






    Out[5]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      0
      0
      0.6374
      [0.655737704918, 0.692307692308, 0.564245810056]
    
    
      8
      8
      0.6414
      [0.633879781421, 0.67032967033, 0.620111731844]
    
    
      7
      7
      0.7570
      [0.79781420765, 0.758241758242, 0.715083798883]
    
    
      5
      5
      0.7867
      [0.814207650273, 0.763736263736, 0.782122905028]
    
    
      6
      6
      0.8310
      [0.825136612022, 0.818681318681, 0.849162011173]



In [6]:

    
combi_df2 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(9), 2))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df2.loc[len(combi_df2)] = [idx, round(np.mean(scores), 4), scores]
    #print(idx, 'mean:', round(np.mean(scores), 4), '   ', scores)



In [7]:

    
print(len(combi_df2))
combi_df2.sort_values('mean', ascending=True).head(5)









    



36






    Out[7]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      4
      (0, 5)
      0.6541
      [0.661202185792, 0.708791208791, 0.59217877095]
    
    
      5
      (0, 6)
      0.6559
      [0.661202185792, 0.708791208791, 0.597765363128]
    
    
      32
      (5, 8)
      0.6762
      [0.68306010929, 0.708791208791, 0.63687150838]
    
    
      34
      (6, 8)
      0.6912
      [0.693989071038, 0.686813186813, 0.692737430168]
    
    
      0
      (0, 1)
      0.6913
      [0.628415300546, 0.752747252747, 0.692737430168]



In [8]:

    
combi_df3 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(9), 3))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df3.loc[len(combi_df3)] = [idx, round(np.mean(scores), 4), scores]
    #print(idx, 'mean:', round(np.mean(scores), 4), '   ', scores)



In [9]:

    
print(len(combi_df3))
combi_df3.sort_values('mean').head(5)









    



84






    Out[9]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      22
      (0, 5, 6)
      0.6633
      [0.666666666667, 0.708791208791, 0.614525139665]
    
    
      26
      (0, 6, 8)
      0.6910
      [0.688524590164, 0.71978021978, 0.664804469274]
    
    
      3
      (0, 1, 5)
      0.6928
      [0.677595628415, 0.758241758242, 0.642458100559]
    
    
      82
      (5, 7, 8)
      0.6967
      [0.710382513661, 0.675824175824, 0.703910614525]
    
    
      4
      (0, 1, 6)
      0.6970
      [0.655737704918, 0.703296703297, 0.731843575419]



In [10]:

    
combi_df4 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(9), 4))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df4.loc[len(combi_df4)] = [idx, round(np.mean(scores), 4), scores]



In [11]:

    
print(len(combi_df4))
combi_df4.sort_values('mean').head(5)









    



126






    Out[11]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      15
      (0, 1, 5, 6)
      0.7040
      [0.672131147541, 0.758241758242, 0.68156424581]
    
    
      16
      (0, 1, 5, 7)
      0.7111
      [0.72131147541, 0.747252747253, 0.664804469274]
    
    
      55
      (0, 6, 7, 8)
      0.7130
      [0.732240437158, 0.730769230769, 0.675977653631]
    
    
      53
      (0, 5, 6, 8)
      0.7132
      [0.699453551913, 0.736263736264, 0.703910614525]
    
    
      54
      (0, 5, 7, 8)
      0.7188
      [0.737704918033, 0.686813186813, 0.731843575419]



In [12]:

    
combi_df5 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(9), 5))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df5.loc[len(combi_df5)] = [idx, round(np.mean(scores), 4), scores]



In [13]:

    
print(len(combi_df5))
combi_df5.sort_values('mean').head(5)









    



126






    Out[13]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      32
      (0, 1, 5, 6, 8)
      0.7098
      [0.655737704918, 0.730769230769, 0.743016759777]
    
    
      69
      (0, 5, 6, 7, 8)
      0.7297
      [0.743169398907, 0.725274725275, 0.720670391061]
    
    
      31
      (0, 1, 5, 6, 7)
      0.7298
      [0.68306010929, 0.796703296703, 0.709497206704]
    
    
      33
      (0, 1, 5, 7, 8)
      0.7300
      [0.704918032787, 0.730769230769, 0.754189944134]
    
    
      104
      (1, 5, 6, 7, 8)
      0.7317
      [0.688524590164, 0.769230769231, 0.737430167598]



In [14]:

    
combi_df6 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(9), 6))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df6.loc[len(combi_df6)] = [idx, round(np.mean(scores), 4), scores]



In [15]:

    
print(len(combi_df6))
combi_df6.sort_values('mean').head(5)









    



84






    Out[15]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      54
      (0, 3, 5, 6, 7, 8)
      0.7443
      [0.737704918033, 0.78021978022, 0.715083798883]
    
    
      34
      (0, 1, 5, 6, 7, 8)
      0.7483
      [0.699453551913, 0.796703296703, 0.748603351955]
    
    
      28
      (0, 1, 3, 5, 7, 8)
      0.7496
      [0.79781420765, 0.758241758242, 0.692737430168]
    
    
      27
      (0, 1, 3, 5, 6, 8)
      0.7591
      [0.726775956284, 0.818681318681, 0.731843575419]
    
    
      29
      (0, 1, 3, 6, 7, 8)
      0.7702
      [0.732240437158, 0.824175824176, 0.754189944134]



In [16]:

    
combi_df7 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(9), 7))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df7.loc[len(combi_df7)] = [idx, round(np.mean(scores), 4), scores]



In [17]:

    
print(len(combi_df7))
combi_df7.sort_values('mean').head(5)









    



36






    Out[17]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      19
      (0, 1, 3, 5, 6, 7, 8)
      0.7591
      [0.726775956284, 0.813186813187, 0.737430167598]
    
    
      20
      (0, 1, 4, 5, 6, 7, 8)
      0.7883
      [0.814207650273, 0.807692307692, 0.743016759777]
    
    
      14
      (0, 1, 2, 5, 6, 7, 8)
      0.8255
      [0.841530054645, 0.774725274725, 0.860335195531]
    
    
      17
      (0, 1, 3, 4, 5, 7, 8)
      0.8635
      [0.934426229508, 0.846153846154, 0.810055865922]
    
    
      27
      (0, 3, 4, 5, 6, 7, 8)
      0.8678
      [0.868852459016, 0.840659340659, 0.893854748603]



In [18]:

    
combi_df8 = pd.DataFrame(columns=['combi', 'mean', 'scores'])

idx_list = list(itertools.combinations(range(9), 8))
for idx in idx_list:
    new_X = real_X[list(idx)]
    
    scores = np.zeros(3)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train = new_X.ix[train_idx]
        y_train = y[train_idx]
        X_test = new_X.ix[test_idx]
        y_test = y[test_idx]
    
        gnb.fit(X_train, y_train)
        y_pred = gnb.predict(X_test)
    
        scores[i] = mean_absolute_error(y_test, y_pred)

    scores
    combi_df8.loc[len(combi_df8)] = [idx, round(np.mean(scores), 4), scores]



In [19]:

    
print(len(combi_df8))
combi_df8.sort_values('mean').head(5)









    



9






    Out[19]:






  
    
      
      combi
      mean
      scores
    
  
  
    
      6
      (0, 1, 3, 4, 5, 6, 7, 8)
      0.8619
      [0.852459016393, 0.912087912088, 0.821229050279]
    
    
      4
      (0, 1, 2, 3, 5, 6, 7, 8)
      0.8840
      [0.868852459016, 0.93956043956, 0.843575418994]
    
    
      0
      (0, 1, 2, 3, 4, 5, 6, 7)
      1.0493
      [1.03825136612, 1.12087912088, 0.988826815642]
    
    
      1
      (0, 1, 2, 3, 4, 5, 6, 8)
      1.0493
      [1.03825136612, 1.12087912088, 0.988826815642]
    
    
      7
      (0, 2, 3, 4, 5, 6, 7, 8)
      1.0493
      [1.03825136612, 1.12087912088, 0.988826815642]

	avg_rating	lee_rating	eval_count	wish_count	cmt_count	run_time	year	std	skew
0	4.22683	4	13025	9796	2585	128	2015	0.625521	-1.002260
1	2.99629	3	58122	3166	965	121	2013	1.059023	-0.347552

	combi	mean	scores
0	0	0.6374	[0.655737704918, 0.692307692308, 0.564245810056]
8	8	0.6414	[0.633879781421, 0.67032967033, 0.620111731844]
7	7	0.7570	[0.79781420765, 0.758241758242, 0.715083798883]
5	5	0.7867	[0.814207650273, 0.763736263736, 0.782122905028]
6	6	0.8310	[0.825136612022, 0.818681318681, 0.849162011173]

	combi	mean	scores
4	(0, 5)	0.6541	[0.661202185792, 0.708791208791, 0.59217877095]
5	(0, 6)	0.6559	[0.661202185792, 0.708791208791, 0.597765363128]
32	(5, 8)	0.6762	[0.68306010929, 0.708791208791, 0.63687150838]
34	(6, 8)	0.6912	[0.693989071038, 0.686813186813, 0.692737430168]
0	(0, 1)	0.6913	[0.628415300546, 0.752747252747, 0.692737430168]

	combi	mean	scores
22	(0, 5, 6)	0.6633	[0.666666666667, 0.708791208791, 0.614525139665]
26	(0, 6, 8)	0.6910	[0.688524590164, 0.71978021978, 0.664804469274]
3	(0, 1, 5)	0.6928	[0.677595628415, 0.758241758242, 0.642458100559]
82	(5, 7, 8)	0.6967	[0.710382513661, 0.675824175824, 0.703910614525]
4	(0, 1, 6)	0.6970	[0.655737704918, 0.703296703297, 0.731843575419]

	combi	mean	scores
15	(0, 1, 5, 6)	0.7040	[0.672131147541, 0.758241758242, 0.68156424581]
16	(0, 1, 5, 7)	0.7111	[0.72131147541, 0.747252747253, 0.664804469274]
55	(0, 6, 7, 8)	0.7130	[0.732240437158, 0.730769230769, 0.675977653631]
53	(0, 5, 6, 8)	0.7132	[0.699453551913, 0.736263736264, 0.703910614525]
54	(0, 5, 7, 8)	0.7188	[0.737704918033, 0.686813186813, 0.731843575419]

	combi	mean	scores
32	(0, 1, 5, 6, 8)	0.7098	[0.655737704918, 0.730769230769, 0.743016759777]
69	(0, 5, 6, 7, 8)	0.7297	[0.743169398907, 0.725274725275, 0.720670391061]
31	(0, 1, 5, 6, 7)	0.7298	[0.68306010929, 0.796703296703, 0.709497206704]
33	(0, 1, 5, 7, 8)	0.7300	[0.704918032787, 0.730769230769, 0.754189944134]
104	(1, 5, 6, 7, 8)	0.7317	[0.688524590164, 0.769230769231, 0.737430167598]

	combi	mean	scores
54	(0, 3, 5, 6, 7, 8)	0.7443	[0.737704918033, 0.78021978022, 0.715083798883]
34	(0, 1, 5, 6, 7, 8)	0.7483	[0.699453551913, 0.796703296703, 0.748603351955]
28	(0, 1, 3, 5, 7, 8)	0.7496	[0.79781420765, 0.758241758242, 0.692737430168]
27	(0, 1, 3, 5, 6, 8)	0.7591	[0.726775956284, 0.818681318681, 0.731843575419]
29	(0, 1, 3, 6, 7, 8)	0.7702	[0.732240437158, 0.824175824176, 0.754189944134]

	combi	mean	scores
19	(0, 1, 3, 5, 6, 7, 8)	0.7591	[0.726775956284, 0.813186813187, 0.737430167598]
20	(0, 1, 4, 5, 6, 7, 8)	0.7883	[0.814207650273, 0.807692307692, 0.743016759777]
14	(0, 1, 2, 5, 6, 7, 8)	0.8255	[0.841530054645, 0.774725274725, 0.860335195531]
17	(0, 1, 3, 4, 5, 7, 8)	0.8635	[0.934426229508, 0.846153846154, 0.810055865922]
27	(0, 3, 4, 5, 6, 7, 8)	0.8678	[0.868852459016, 0.840659340659, 0.893854748603]

	combi	mean	scores
6	(0, 1, 3, 4, 5, 6, 7, 8)	0.8619	[0.852459016393, 0.912087912088, 0.821229050279]
4	(0, 1, 2, 3, 5, 6, 7, 8)	0.8840	[0.868852459016, 0.93956043956, 0.843575418994]
0	(0, 1, 2, 3, 4, 5, 6, 7)	1.0493	[1.03825136612, 1.12087912088, 0.988826815642]
1	(0, 1, 2, 3, 4, 5, 6, 8)	1.0493	[1.03825136612, 1.12087912088, 0.988826815642]
7	(0, 2, 3, 4, 5, 6, 7, 8)	1.0493	[1.03825136612, 1.12087912088, 0.988826815642]