In [1]:
import numpy as np
import pandas as pd
In [93]:
# Load the pre-scraped webnovel metadata.  cp949 encoding: the CSV contains
# Korean text saved from a Windows environment; first column is the index.
main_df = pd.read_csv("data/webnovel/main_df.csv", index_col=0, encoding="cp949")
In [94]:
# Quick sanity check of columns/dtypes.
main_df.head()
Out[94]:
In [95]:
X_data = main_df[["genre", "name", "episode_total", "is_fin"]]
In [96]:
name_size = dict(main_df.groupby("name").size())
In [97]:
X_data["name"] = X_data["name"].apply(lambda x: "기타" if name_size[x] == 1 else x)
In [98]:
from sklearn.preprocessing import LabelEncoder
# NOTE(review): OneHotEncoder is imported but never used in this notebook.
from sklearn.preprocessing import OneHotEncoder
In [99]:
# Integer-encode the author names (encoders kept so labels can be inverted).
lab_name = LabelEncoder()
In [100]:
X_data["name"] = lab_name.fit_transform(X_data["name"])
In [101]:
# Integer-encode the genre codes the same way.
lab_genre = LabelEncoder()
In [102]:
X_data["genre"] = lab_genre.fit_transform(X_data["genre"])
In [103]:
# NOTE(review): import appears mid-notebook; conventionally all imports
# belong in one top cell so a fresh-kernel run-all cannot miss them.
import statsmodels.api as sm
In [104]:
# Target: number of users who bookmarked ("concern") each work.
y = main_df[["concern_count"]]
In [105]:
Xy_data = pd.concat([X_data, y], axis=1)
In [108]:
# C(...) treats the integer-encoded genre/name as categoricals, so the
# arbitrary LabelEncoder ordering does not leak into the fit.
model = sm.OLS.from_formula("concern_count ~ C(genre) + C(name) + episode_total + is_fin", Xy_data)
In [109]:
result = model.fit()
In [110]:
print(result.summary())
In [22]:
# Per-episode texts; force ID to int64 so the merge key matches main_df.
episode_df = pd.read_csv("data/episode_df.csv", index_col=0)
episode_df["ID"] = episode_df["ID"].astype("int64")
In [63]:
# Adult (genre 109) novels kept in a separate cp949 CSV; drop incomplete rows.
novel_19_df = pd.read_csv("data/novel_19.csv", encoding="cp949", index_col=0).dropna()
In [64]:
# One row per episode with its work ID, genre label, and raw text.
text_df = main_df.merge(episode_df, on="ID")[["ID", "genre", "text"]]
In [65]:
# Append the adult novels; reset index so positional/label indexing agree.
# NOTE(review): assumes novel_19_df has matching ID/genre/text columns — confirm.
text_df = pd.concat([text_df, novel_19_df]).reset_index(drop=True)
In [66]:
# Hold out a single work (ID 466391) as the test set.  Columns by position:
# 1 = genre (label), 2 = text (input).
# FIX: .ix was deprecated in pandas 0.20 and later removed; use a boolean
# mask with .iloc for the positional column selection instead.
train_mask = text_df["ID"] != 466391
X_train = text_df.loc[train_mask].iloc[:, 2]
y_train = text_df.loc[train_mask].iloc[:, 1]
X_test = text_df.loc[~train_mask].iloc[:, 2]
y_test = text_df.loc[~train_mask].iloc[:, 1]
In [43]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# modern code imports train_test_split / GroupKFold (the replacement for
# LabelKFold) from sklearn.model_selection.  Left as-is because the
# LabelKFold call signature below depends on this legacy API.
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import LabelKFold
from sklearn.metrics import classification_report
In [89]:
vect = TfidfVectorizer()
In [90]:
%%time
vect.fit(test_df.ix[:,2])
Out[90]:
In [133]:
def make_predict_proba(ID, vect):
    """Leave-one-work-out genre prediction for a single work.

    Trains a Multinomial Naive Bayes classifier on every episode whose work
    ID differs from ``ID`` and predicts genre probabilities for the held-out
    work's episodes.

    Parameters
    ----------
    ID : int
        Work ID to hold out (reads the module-level ``text_df``).
    vect : fitted text vectorizer
        Vectorizer already fitted on the full corpus.

    Returns
    -------
    pandas.DataFrame
        One row per held-out episode with one probability column per genre
        code, a ``genre_proba`` column (probability assigned to the work's
        true genre), and the work ``ID``.
    """
    # NOTE(review): assumes all seven genre codes occur in the training
    # split, so MultinomialNB.classes_ is exactly this sorted list — confirm.
    genre_codes = [101, 102, 103, 104, 106, 108, 109]

    held_out = text_df["ID"] == ID
    # Positional columns: 1 = genre label, 2 = episode text (.ix removed
    # from pandas; use .iloc).
    X_train = text_df.loc[~held_out].iloc[:, 2]
    y_train = text_df.loc[~held_out].iloc[:, 1]
    X_test = text_df.loc[held_out].iloc[:, 2]

    model = MultinomialNB()
    model.fit(vect.transform(X_train), y_train)
    predict = model.predict_proba(vect.transform(X_test))
    predict_df = pd.DataFrame(predict, columns=genre_codes)

    # FIX: the original if/elif chain compared ...["ID"].unique()[0] against
    # the genre codes 101-108 — i.e. the work ID, not its genre — so no
    # branch ever matched for real IDs and genre 109 had no branch at all.
    # Look up the held-out work's *genre* and take that probability column.
    true_genre = text_df.loc[held_out, "genre"].unique()[0]
    predict_df["genre_proba"] = predict[:, genre_codes.index(true_genre)]
    predict_df["ID"] = ID
    return predict_df
In [134]:
def make_predict_proba_df():
    """Run leave-one-work-out genre prediction for every work in ``main_df``.

    Fits one TF-IDF vectorizer on the whole corpus, calls
    ``make_predict_proba`` per work, concatenates the results, writes them
    to data/proba_df.csv, and returns the combined DataFrame.
    """
    vect = TfidfVectorizer()
    vect.fit(text_df.iloc[:, 2])  # fit once on all episode texts (.ix removed)

    # FIX: collect the per-work frames in a list and concat once — growing a
    # DataFrame with pd.concat inside the loop is quadratic.
    frames = [make_predict_proba(ID, vect) for ID in main_df["ID"]]
    proba_df = pd.concat(frames, ignore_index=True)

    # Keep the column layout the downstream positional slices expect:
    # ID first, then the genre-code probability columns, then genre_proba.
    proba_df = proba_df[["ID", 101, 102, 103, 104, 106, 108, 109, "genre_proba"]]

    # FIX: the original called proba_df.reset_index(drop=True) without
    # assigning the result, so the index was never actually reset;
    # ignore_index=True above does it for real.
    proba_df.to_csv("data/proba_df.csv")
    return proba_df
In [135]:
%%time
# Full leave-one-work-out pass over every work (slow: one NB fit per work).
proba_df = make_predict_proba_df()
In [247]:
# Keep the old index as an "index" column (shifts all columns right by one).
proba_df = proba_df.reset_index()
In [250]:
text_df.head()
Out[250]:
In [252]:
predict_list = []
for i in range(len(proba_df)):
predict_list.append(np.argmax(proba_df.loc[i][2:]))
In [257]:
print(classification_report(text_df[text_df["genre"] != 109]["genre"], predict_list))
In [179]:
# Mean per-genre probability per work (average over its episodes).
group_proba_df = proba_df.groupby("ID", as_index=False).agg({101: np.mean, 102: np.mean, 103: np.mean, 104: np.mean, 106: np.mean, 108: np.mean, 109: np.mean})
In [180]:
all_df= main_df.merge(group_proba_df, on="ID")
In [181]:
# Rename the integer genre columns to readable names.
# NOTE(review): "matial" is presumably a typo for "martial", but it is kept —
# the OLS formulas below reference this exact column name.
all_df.rename(columns={101:"romance", 102:"SFfantasy", 103:"matial", 104:"mystery", 106:"lightnovel", 108:"fusion", 109:"adult",}, inplace=True)
In [192]:
X_data = all_df[["genre", "episodes_count", "romance", "SFfantasy", "matial", "mystery", "lightnovel", "fusion", "adult"]]
In [153]:
y_data = all_df[["concern_count"]]
In [188]:
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
In [189]:
scaler = StandardScaler()
In [194]:
# Standardize all features so OLS coefficients are comparable in magnitude.
# NOTE(review): this also z-scores the categorical `genre` code, which is
# questionable — consider C(genre) instead.
X_data = pd.DataFrame(scaler.fit_transform(X_data), columns=["genre", "episodes_count", "romance", "SFfantasy", "matial", "mystery", "lightnovel", "fusion", "adult"])
In [196]:
# add_constant supplies the intercept the formula API added implicitly.
model2 = sm.OLS(y_data, sm.add_constant(X_data))
In [197]:
print(model2.fit().summary())
In [208]:
# Length check before the horizontal stack below.
len(episode_df)
Out[208]:
len(proba_df)
Out[209]:
# Column-wise concatenation of the two frames.
# NOTE(review): np.hstack assumes both frames have identical row counts AND
# identical row order — the len() checks above only verify the former; a
# merge on ID would be safer.
stack_df = pd.DataFrame(np.hstack([episode_df, proba_df]), columns=list(episode_df.columns)+list(proba_df.columns))
In [227]:
stack_df.rename(columns={101:"romance", 102:"SFfantasy", 103:"matial", 104:"mystery", 106:"lightnovel", 108:"fusion", 109:"adult",}, inplace=True)
In [231]:
# Positional column peek (.ix is deprecated/removed in modern pandas).
stack_df.ix[:,[2,3,4,5,8,9,10,11,12,13,14]]
Out[231]:
In [237]:
model = sm.OLS.from_formula("score ~ is_first + romance + SFfantasy + matial + mystery + lightnovel + fusion + adult", stack_df)
In [240]:
result = model.fit()
In [ ]:
print(result.sumarry())
In [259]:
# NOTE(review): dumps the entire frame; prefer all_df.head() to keep the
# notebook output small.
all_df
Out[259]:
In [267]:
# Reduced model: genre as categorical plus a subset of probability features.
model = sm.OLS.from_formula("concern_count ~ C(genre) + episodes_count + romance + SFfantasy + matial", all_df)
In [268]:
result = model.fit()
In [269]:
print(result.summary())
In [150]:
# head(0) shows only the column headers.
all_df.head(0)
Out[150]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [28]:
# Korean morphological analyzer (konlpy's Twitter/Okt tagger).
from konlpy.tag import Twitter
pos_tagger = Twitter()
In [ ]:
def tokenize_pos(doc):
    """Tokenize Korean text into 'morpheme/POS' strings using the Twitter
    tagger, with normalization and stemming enabled."""
    tagged = pos_tagger.pos(doc, norm=True, stem=True)
    return ['/'.join(pair) for pair in tagged]

CountVectorizer(tokenizer=tokenize_pos)
In [ ]:
# FIX: the original line was the truncated fragment `pos_tagger = Twit`,
# which raises NameError; restore the intended tagger construction
# (matches the earlier konlpy cell).
pos_tagger = Twitter()
In [45]:
# NOTE(review): this cell re-imports Twitter and redefines pos_tagger and
# tokenize_pos (duplicating the earlier cells) so it is self-contained.
from konlpy.tag import Twitter

pos_tagger = Twitter()


def tokenize_pos(doc):
    """Turn a document into 'morpheme/POS' tokens via the Twitter tagger
    (normalized, stemmed)."""
    return ['/'.join(token) for token in pos_tagger.pos(doc, norm=True, stem=True)]


# Bag-of-POS-tokens counts feeding a Multinomial Naive Bayes classifier.
steps = [
    ('vect', CountVectorizer(tokenizer=tokenize_pos)),
    ('clf', MultinomialNB()),
]
model = Pipeline(steps)
In [98]:
# NOTE(review): `new_df` is not defined anywhere in this notebook as saved —
# presumably an earlier (deleted) cell built it like text_df; confirm, or
# this cell fails on a fresh-kernel run.
X_data = new_df["text"]
y_data = new_df["genre"]
# Group labels: all episodes of one work share its ID.
label_data = new_df["ID"]
In [99]:
cv = LabelKFold(label_data, n_folds=2)
In [101]:
for train_index, test_index in cv:
model.fit(X_data.loc[train_index], y_data.loc[train_index])
print(classification_report(y_data.loc[test_index], model.predict(X_data[test_index]), digits=4))
In [50]:
In [51]:
In [68]:
%%time
# NOTE(review): model.predict(X_test) is computed twice here (once into
# `predict`, once inline); reuse `predict` in the report to halve the work.
predict = model.predict(X_test)
print(classification_report(y_test, model.predict(X_test), digits=4))
Out[68]:
In [72]:
# NOTE(review): `_` is IPython's last-output cache — this breaks on a fresh
# Restart & Run All; bind the value to a name instead.
print(_)
In [74]:
# Per-class probabilities for the held-out work's episodes.
model.predict_proba(X_test)
Out[74]:
In [91]:
predict[2]
Out[91]:
In [272]:
# NOTE(review): prefer all_df["comments_count"].sum() — vectorized and
# NaN-aware, unlike the Python builtin sum().
sum(all_df["comments_count"])
Out[272]:
In [ ]: