In [1]:
import pandas as pd
import numpy as np
In [2]:
!ls
In [4]:
!cat ../input/train.csv | head -n 5
In [5]:
df = pd.read_csv("../input/train.csv")
In [6]:
df.head(7)
Out[6]:
In [7]:
y_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
const_answer = df[y_names].mean().values
const_answer
Out[7]:
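As a quick sanity check (not in the original run), this constant baseline can be scored on the training labels themselves with column-wise log loss; a minimal sketch, reusing const_answer from above:

from sklearn.metrics import log_loss

# Average log loss of predicting each label's base rate for every row.
losses = [
    log_loss(df[name].values, np.full(len(df), const_answer[i]), labels=[0, 1])
    for i, name in enumerate(y_names)
]
print(np.mean(losses))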
In [8]:
df_sample_ans = pd.read_csv("../input/sample_submission.csv")
In [9]:
df_sample_ans.head()
Out[9]:
In [10]:
df_test = pd.read_csv("../input/test.csv")
df_test.head()
Out[10]:
In [11]:
df_my_const_ans = pd.DataFrame()
df_my_const_ans["id"] = df_test["id"].values
for i, name in enumerate(y_names):
    df_my_const_ans[name] = np.ones(len(df_test)) * const_answer[i]
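As an aside (not from the original notebook), the same constant-prediction frame can be built without the loop by tiling the per-label means; df_alt below is a hypothetical name:

# Repeat the six class means for every test row, then attach the ids.
df_alt = pd.DataFrame(np.tile(const_answer, (len(df_test), 1)), columns=y_names)
df_alt.insert(0, "id", df_test["id"].values)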
In [12]:
df_my_const_ans.head()
Out[12]:
In [13]:
df_my_const_ans.to_csv("my_const_ans.csv", index=False)
In [14]:
!cat my_const_ans.csv | head -n 5
In [15]:
!cat ../input/sample_submission.csv | head -n 5
In [16]:
df_my_const_ans_2 = pd.DataFrame()
df_my_const_ans_2["id"] = df_test["id"].values
for i, name in enumerate(y_names):
    df_my_const_ans_2[name] = np.zeros(len(df_test))

def save_prediction(df, name):
    df.to_csv(name, index=False)

save_prediction(df_my_const_ans_2, "my_const_ans_2.csv")
In [17]:
!cat my_const_ans_2.csv | head -n 5
In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [36]:
%%time
# Fit TF-IDF on the training comments: top 10k features over word n-grams of length 1-4.
vectorizer = TfidfVectorizer(max_features=10000, stop_words="english", ngram_range=(1, 4))
vectorizer.fit(df["comment_text"])
In [37]:
%%time
train_v = vectorizer.transform(df["comment_text"])  # training-set features
In [38]:
%%time
test_v = vectorizer.transform(df_test["comment_text"])  # test-set features
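A quick sanity check on the fitted vectorizer (not in the original run); get_feature_names_out assumes a recent scikit-learn, older versions expose get_feature_names instead:

print(train_v.shape, test_v.shape)  # sparse matrices with 10000 columns each
print(vectorizer.get_feature_names_out()[:10])  # a few of the learned n-grams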
In [39]:
from sklearn.linear_model import LogisticRegression
import tqdm
In [41]:
df_prediction = pd.DataFrame()
df_prediction["id"] = df_test["id"].values
for name in tqdm.tqdm(y_names):
    LR = LogisticRegression()
    LR.fit(train_v, df[name].values)                       # one binary classifier per label
    df_prediction[name] = LR.predict_proba(test_v)[:, 1]   # probability of the positive class
    # print(LR.predict_proba(test_v)[:5])
In [42]:
df_prediction.head()
Out[42]:
In [43]:
save_prediction(df_prediction, "tfid_logreg_prediction_10k.csv")
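Before submitting, a rough local score can be estimated with cross-validation on the training matrix; a sketch (not part of the original notebook) using per-label ROC AUC:

from sklearn.model_selection import cross_val_score

# 3-fold ROC AUC for a fresh LogisticRegression on each label.
for name in y_names:
    scores = cross_val_score(LogisticRegression(), train_v, df[name].values, cv=3, scoring="roc_auc")
    print(name, scores.mean())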