In [1]:
# All third-party aliases used by the notebook (pd / np / plt) are imported
# here, in the first cell, so that Restart Kernel -> Run All works on a fresh
# kernel instead of depending on names left over from an earlier session.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the training table and split it into the pieces used below.
df = pd.read_csv('../resource/final_df3.csv')
sample = df.title                      # title column, kept for reference
y = df['rating(y)'].values             # target: the 'rating(y)' column
real_X = df[['avg_rating']].values     # 2-D array holding the single 'avg_rating' column
cat_X = df.text.fillna("").values      # text column; missing entries become empty strings
In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# Build two bag-of-words views of the same text: raw term counts and
# TF-IDF weights. Both vectorizers use their default settings.
count_vect, tfidf_vect = CountVectorizer(), TfidfVectorizer()
X_counts = count_vect.fit_transform(cat_X)
X_tfidf = tfidf_vect.fit_transform(cat_X)
In [3]:
from sklearn.cross_validation import StratifiedKFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import mean_absolute_error
In [4]:
# Sweep the MultinomialNB smoothing parameter (alpha) on the raw term-count
# features, scoring every candidate by the mean absolute error over a
# stratified K-fold split, then plot the curve with a +/- 1-sigma band.
n_folds = 5      # single source of truth for the split and the score buffer
alpha_max = 2    # upper end of the sweep; also used as the x-axis limit


def find_nearest(array, value):
    """Return the index of the entry of `array` that is closest to `value`.

    Parameters
    ----------
    array : sequence of numbers (a plain list is accepted)
    value : number

    Returns
    -------
    Index of the first entry minimising ``abs(array - value)``.
    """
    # np.asarray makes the subtraction well-defined for plain lists as well.
    return np.abs(np.asarray(array) - value).argmin()


cv = StratifiedKFold(y, n_folds=n_folds, random_state=51)
i_range = []      # candidate alphas, in sweep order
score_range = []  # mean CV MAE for each alpha
sigma = []        # standard deviation of the per-fold MAE for each alpha

# NOTE(review): the grid starts at alpha=0, i.e. no smoothing at all --
# confirm that this endpoint is intended for MultinomialNB.
for a in np.arange(0, alpha_max, 0.01):
    mnb = MultinomialNB(alpha=a)
    scores = np.zeros(n_folds)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train, y_train = X_counts[train_idx], y[train_idx]
        X_test, y_test = X_counts[test_idx], y[test_idx]
        mnb.fit(X_train, y_train)
        y_pred = mnb.predict(X_test)
        scores[i] = mean_absolute_error(y_test, y_pred)
    i_range.append(a)
    score_range.append(np.mean(scores))
    sigma.append(np.std(scores))

# Lowest mean MAE wins.
best_idx = np.argmin(score_range)
best_alpha = i_range[best_idx]
best_score = score_range[best_idx]

# "Sub-optimal" pick: the alpha whose mean score lies closest to the best
# score plus one standard deviation of the best alpha's fold scores.
sub_score = best_score + sigma[best_idx]
sub_alpha = i_range[find_nearest(score_range, sub_score)]

mean_scores = np.array(score_range)
plt.figure(figsize=(15, 5))
plt.plot(i_range, mean_scores)
plt.plot(i_range, mean_scores + sigma, 'b--')
plt.plot(i_range, mean_scores - sigma, 'b--')
plt.axhline(sub_score, linestyle=':', color='r')
plt.axvline(best_alpha, linestyle=':', color='r')
plt.scatter(sub_alpha, sub_score, s=100, c='red')
plt.xlim(0, alpha_max)
plt.ylabel('CV score(mae)')
plt.xlabel('alpha')

print("best alpha : ", best_alpha)
print("best score : ", best_score)
print(' 1-sigma : ', round(sigma[best_idx], 4))
print('='*25)
print("sub_opt alpha : ", sub_alpha)
print("sub_opt score : ", sub_score)
In [5]:
# Sweep the MultinomialNB smoothing parameter (alpha) on the TF-IDF
# features, scoring every candidate by the mean absolute error over a
# stratified K-fold split, then plot the curve with a +/- 1-sigma band.
n_folds = 5      # single source of truth for the split and the score buffer
alpha_max = 1    # upper end of the sweep; also used as the x-axis limit


def find_nearest(array, value):
    """Return the index of the entry of `array` that is closest to `value`.

    Parameters
    ----------
    array : sequence of numbers (a plain list is accepted)
    value : number

    Returns
    -------
    Index of the first entry minimising ``abs(array - value)``.
    """
    # np.asarray makes the subtraction well-defined for plain lists as well.
    return np.abs(np.asarray(array) - value).argmin()


cv = StratifiedKFold(y, n_folds=n_folds, random_state=51)
i_range = []      # candidate alphas, in sweep order
score_range = []  # mean CV MAE for each alpha
sigma = []        # standard deviation of the per-fold MAE for each alpha

# NOTE(review): the grid starts at alpha=0, i.e. no smoothing at all --
# confirm that this endpoint is intended for MultinomialNB.
for a in np.arange(0, alpha_max, 0.01):
    mnb = MultinomialNB(alpha=a)
    scores = np.zeros(n_folds)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train, y_train = X_tfidf[train_idx], y[train_idx]
        X_test, y_test = X_tfidf[test_idx], y[test_idx]
        mnb.fit(X_train, y_train)
        y_pred = mnb.predict(X_test)
        scores[i] = mean_absolute_error(y_test, y_pred)
    i_range.append(a)
    score_range.append(np.mean(scores))
    sigma.append(np.std(scores))

# Lowest mean MAE wins.
best_idx = np.argmin(score_range)
best_alpha = i_range[best_idx]
best_score = score_range[best_idx]

# "Sub-optimal" pick: the alpha whose mean score lies closest to the best
# score plus one standard deviation of the best alpha's fold scores.
sub_score = best_score + sigma[best_idx]
sub_alpha = i_range[find_nearest(score_range, sub_score)]

mean_scores = np.array(score_range)
plt.figure(figsize=(15, 5))
plt.plot(i_range, mean_scores)
plt.plot(i_range, mean_scores + sigma, 'b--')
plt.plot(i_range, mean_scores - sigma, 'b--')
plt.axhline(sub_score, linestyle=':', color='r')
plt.axvline(best_alpha, linestyle=':', color='r')
plt.scatter(sub_alpha, sub_score, s=100, c='red')
plt.xlim(0, alpha_max)
plt.ylabel('CV score(mae)')
plt.xlabel('alpha')

print("best alpha : ", best_alpha)
print("best score : ", best_score)
print(' 1-sigma : ', round(sigma[best_idx], 4))
print('='*25)
print("sub_opt alpha : ", sub_alpha)
print("sub_opt score : ", sub_score)
In [6]:
from sklearn.pipeline import Pipeline
# Text-classification pipeline: token counts -> TF-IDF transform ->
# multinomial naive Bayes, every stage left at its default settings.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)
In [7]:
from sklearn.grid_search import GridSearchCV
# Exhaustive search over n-gram range, IDF weighting on/off and the NB
# smoothing parameter, scored with 5-fold CV on 'mean_absolute_error'.
alpha_grid = np.arange(0, 1, 0.01)
parameters = {
    'vect__ngram_range': [(1, upper) for upper in range(1, 5)],
    'tfidf__use_idf': [True, False],
    'clf__alpha': alpha_grid,
}
# fit() hands back the fitted search object, so build and fit in one step.
gs_clf = GridSearchCV(
    text_clf,
    parameters,
    cv=5,
    scoring='mean_absolute_error',
    n_jobs=-1,
).fit(cat_X, y)
In [8]:
# Report the grid point with the highest score. Each grid_scores_ entry is
# unpacked into three fields here; the second one (index 1) is the value
# the entries are ranked by.
top_entry = max(gs_clf.grid_scores_, key=lambda entry: entry[1])
best_parameters, score, _ = top_entry
for param_name in sorted(parameters):
    chosen_value = best_parameters[param_name]
    print("{name}: {best}".format(name=param_name, best=chosen_value))
print("=" * 25)
print('score :', score)
In [9]:
# Sweep the MultinomialNB smoothing parameter (alpha) inside the
# unigram+bigram count -> TF-IDF -> NB pipeline, scoring every candidate by
# the mean absolute error over a stratified K-fold split, then plot the
# curve with a +/- 1-sigma band. The pipeline is refit on the raw text of
# each training fold, so the vocabulary is learned from that fold only.
n_folds = 5        # single source of truth for the split and the score buffer
alpha_max = 0.45   # upper end of the sweep; also used as the x-axis limit


def find_nearest(array, value):
    """Return the index of the entry of `array` that is closest to `value`.

    Parameters
    ----------
    array : sequence of numbers (a plain list is accepted)
    value : number

    Returns
    -------
    Index of the first entry minimising ``abs(array - value)``.
    """
    # np.asarray makes the subtraction well-defined for plain lists as well.
    return np.abs(np.asarray(array) - value).argmin()


cv = StratifiedKFold(y, n_folds=n_folds, random_state=51)
i_range = []      # candidate alphas, in sweep order
score_range = []  # mean CV MAE for each alpha
sigma = []        # standard deviation of the per-fold MAE for each alpha

# NOTE(review): the grid starts at alpha=0, i.e. no smoothing at all --
# confirm that this endpoint is intended for MultinomialNB.
for a in np.arange(0, alpha_max, 0.01):
    text_clf = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 2))),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB(alpha=a)),
    ])
    scores = np.zeros(n_folds)
    for i, (train_idx, test_idx) in enumerate(cv):
        X_train, y_train = cat_X[train_idx], y[train_idx]
        X_test, y_test = cat_X[test_idx], y[test_idx]
        text_clf.fit(X_train, y_train)
        y_pred = text_clf.predict(X_test)
        scores[i] = mean_absolute_error(y_test, y_pred)
    i_range.append(a)
    score_range.append(np.mean(scores))
    sigma.append(np.std(scores))

# Lowest mean MAE wins.
best_idx = np.argmin(score_range)
best_alpha = i_range[best_idx]
best_score = score_range[best_idx]

# "Sub-optimal" pick: the alpha whose mean score lies closest to the best
# score plus one standard deviation of the best alpha's fold scores.
sub_score = best_score + sigma[best_idx]
sub_alpha = i_range[find_nearest(score_range, sub_score)]

mean_scores = np.array(score_range)
plt.figure(figsize=(15, 5))
plt.plot(i_range, mean_scores)
plt.plot(i_range, mean_scores + sigma, 'b--')
plt.plot(i_range, mean_scores - sigma, 'b--')
plt.axhline(sub_score, linestyle=':', color='r')
plt.axvline(best_alpha, linestyle=':', color='r')
plt.scatter(sub_alpha, sub_score, s=100, c='red')
plt.xlim(0, alpha_max)
plt.ylabel('CV score(mae)')
plt.xlabel('alpha')

print("best alpha : ", best_alpha)
print("best score : ", best_score)
print(' 1-sigma : ', round(sigma[best_idx], 4))
print('='*25)
print("sub_opt alpha : ", sub_alpha)
print("sub_opt score : ", sub_score)
In [10]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
# Fit both base models on the full training set and collect their
# in-sample predictions and class probabilities.

# Gaussian NB on the numeric 'avg_rating' feature.
gnb = GaussianNB()
gnb.fit(real_X, y)
gnb_pred, gnb_prob = gnb.predict(real_X), gnb.predict_proba(real_X)

# Multinomial NB on unigram+bigram TF-IDF features of the text.
mnb = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.3)),
])
mnb.fit(cat_X, y)
mnb_pred, mnb_prob = mnb.predict(cat_X), mnb.predict_proba(cat_X)

# Blend the two models by multiplying their class probabilities element-wise.
mix_prob = gnb_prob * mnb_prob
mix_prob.shape
Out[10]:
In [11]:
def softmax(w, t=1.0):
    """Calculate the softmax of a list of numbers w.

    The inputs are shifted by their maximum before exponentiating. Softmax
    is invariant to such a shift, and it keeps ``exp`` from overflowing
    (which would otherwise yield ``nan``) when the inputs are large.

    Parameters
    ----------
    w : list of numbers
    t : float
        Divisor applied to ``w`` before exponentiating (temperature).

    Return
    ------
    a list of the same length as w of non-negative numbers
    (an empty input gives an empty array)

    Examples
    --------
    >>> softmax([0.1, 0.2])
    array([ 0.47502081, 0.52497919])
    >>> softmax([-0.1, 0.2])
    array([ 0.42555748, 0.57444252])
    >>> softmax([0.9, -10])
    array([ 9.99981542e-01, 1.84578933e-05])
    >>> softmax([0, 10])
    array([ 4.53978687e-05, 9.99954602e-01])
    """
    scaled = np.asarray(w, dtype=float) / t
    if scaled.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return scaled
    # Largest exponent becomes exp(0) == 1, so the sum is always finite and >= 1.
    e = np.exp(scaled - np.max(scaled))
    dist = e / np.sum(e)
    return dist
In [12]:
# Apply softmax to every row of the blended probabilities. The output shape
# is taken from mix_prob itself, so the cell keeps working when the number
# of samples or classes changes.
# NOTE(review): softmax preserves the ordering within a row, so the argmax
# is unaffected; it is not the same as dividing each row by its sum.
mix_prob_softmax = np.zeros(mix_prob.shape)
for i, row in enumerate(mix_prob):
    mix_prob_softmax[i] = softmax(row)

# Sanity check: the first row should sum to 1.
np.sum(mix_prob_softmax[0])
Out[12]:
In [13]:
# Predicted rating = position of the largest blended probability in each row.
# The result is kept as a float array, as before.
mix_pred = np.argmax(mix_prob_softmax, axis=1).astype(float)
mix_pred += 1  # star ratings run from 1 to 5, while argmax positions start at 0
mix_pred
Out[13]:
In [14]:
# Load the hold-out sheet and split it the same way as the training data.
test_df = pd.read_excel('../resource/test_df.xlsx')
test_sample = test_df.title
test_y = test_df['my_rating'].values
test_real_X = test_df[['avg_rating']].values
# Same treatment as the training text: missing entries become empty strings
# so the vectorizer never receives NaN, and a plain array is passed on.
test_cat_X = test_df.text.fillna("").values
test_watcha_y = test_df['watcha_rating'].values
In [15]:
# Score the hold-out set with both fitted base models.
gnb_test_pred = gnb.predict(test_real_X)
gnb_test_prob = gnb.predict_proba(test_real_X)
mnb_test_pred = mnb.predict(test_cat_X)
mnb_test_prob = mnb.predict_proba(test_cat_X)

# Blend by element-wise product, then apply softmax row by row. All sizes are
# derived from the data, so the cell works for any number of test rows.
mix_test_prob = np.multiply(gnb_test_prob, mnb_test_prob)
mix_test_prob_softmax = np.zeros(mix_test_prob.shape)
for i, row in enumerate(mix_test_prob):
    mix_test_prob_softmax[i] = softmax(row)

# Predicted rating = position of the largest blended probability in each row.
mix_test_pred = np.argmax(mix_test_prob_softmax, axis=1).astype(float)
mix_test_pred += 1  # star ratings run from 1 to 5, while argmax positions start at 0
mix_test_pred
Out[15]:
In [16]:
# Store the blended model's predicted ratings in a new 'predict' column of
# test_df (this modifies test_df in place), then display the frame.
test_df['predict'] = mix_test_pred
test_df
Out[16]:
In [17]:
# Mean absolute error against the user's own ratings (test_y) for the
# blended model's predictions and for the 'watcha_rating' column.
mix_score, watcha_score = (
    mean_absolute_error(mix_test_pred, test_y),
    mean_absolute_error(test_watcha_y, test_y),
)
for label, value in (('mix_score :', mix_score), ('watcha_score :', watcha_score)):
    print(label, value)
In [18]:
# Round watcha_rating to whole numbers and measure the score again.
test_watchar_round_y = np.around(test_watcha_y)
mean_absolute_error(test_watchar_round_y, test_y)
Out[18]: