In [1]:
import pandas as pd
import numpy as np

In [2]:
# List the current working directory to see which files are available there.
!ls


__notebook_source__.ipynb

In [4]:
!cat ../input/train.csv | head -n 5


"id","comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate"
"0000997932d777bf","Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
"000103f0d9cfb60f","D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
"000113f07ec002fd","Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
cat: write error: Broken pipe

In [5]:
# Load the training data (comment text plus the six label columns) into a DataFrame.
df = pd.read_csv("../input/train.csv")

In [6]:
# Display the first seven training rows as a sanity check of the parsed columns.
df.head(7)


Out[6]:
id comment_text toxic severe_toxic obscene threat insult identity_hate
0 0000997932d777bf Explanation\nWhy the edits made under my usern... 0 0 0 0 0 0
1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 0 0 0 0 0
2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 0 0 0 0 0
3 0001b41b1c6bb37e "\nMore\nI can't make any real suggestions on ... 0 0 0 0 0 0
4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 0 0 0 0 0
5 00025465d4725e87 "\n\nCongratulations from me as well, use the ... 0 0 0 0 0 0
6 0002bcb3da6cb337 COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK 1 1 1 0 1 0

In [7]:
# Label columns of the training frame that have to be predicted.
y_names = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Column-wise mean of each label over the training frame (for 0/1 labels this is
# the share of positive rows); used below as a constant baseline prediction.
const_answer = df[y_names].mean().values
const_answer


Out[7]:
array([ 0.09584448,  0.00999555,  0.05294822,  0.00299553,  0.04936361,
        0.00880486])


In [8]:
# Load the provided sample submission to inspect the expected output format.
df_sample_ans = pd.read_csv("../input/sample_submission.csv")

In [9]:
# Show the first rows of the sample submission (an id column plus one column per label).
df_sample_ans.head()


Out[9]:
id toxic severe_toxic obscene threat insult identity_hate
0 00001cee341fdb12 0.5 0.5 0.5 0.5 0.5 0.5
1 0000247867823ef7 0.5 0.5 0.5 0.5 0.5 0.5
2 00013b17ad220c46 0.5 0.5 0.5 0.5 0.5 0.5
3 00017563c3f7919a 0.5 0.5 0.5 0.5 0.5 0.5
4 00017695ad8997eb 0.5 0.5 0.5 0.5 0.5 0.5


In [10]:
# Load the test comments; predictions are produced for every row of this frame.
df_test = pd.read_csv("../input/test.csv")
df_test.head()


Out[10]:
id comment_text
0 00001cee341fdb12 Yo bitch Ja Rule is more succesful then you'll...
1 0000247867823ef7 == From RfC == \n\n The title is fine as it is...
2 00013b17ad220c46 " \n\n == Sources == \n\n * Zawe Ashton on Lap...
3 00017563c3f7919a :If you have a look back at the source, the in...
4 00017695ad8997eb I don't anonymously edit articles at all.

In [11]:
# Constant baseline: every test row receives the training-set mean of each label.
df_my_const_ans = pd.DataFrame()
df_my_const_ans["id"] = df_test["id"].values

n_test_rows = len(df_test)
for name, label_mean in zip(y_names, const_answer):
    # One column per label, filled with that label's constant value.
    df_my_const_ans[name] = np.full(n_test_rows, label_mean)

In [12]:
# Check the constant-baseline frame: every row should carry the same per-label values.
df_my_const_ans.head()


Out[12]:
id toxic severe_toxic obscene threat insult identity_hate
0 00001cee341fdb12 0.095844 0.009996 0.052948 0.002996 0.049364 0.008805
1 0000247867823ef7 0.095844 0.009996 0.052948 0.002996 0.049364 0.008805
2 00013b17ad220c46 0.095844 0.009996 0.052948 0.002996 0.049364 0.008805
3 00017563c3f7919a 0.095844 0.009996 0.052948 0.002996 0.049364 0.008805
4 00017695ad8997eb 0.095844 0.009996 0.052948 0.002996 0.049364 0.008805

In [13]:
# Write the constant-baseline submission; index=False keeps the pandas row index out of the file.
df_my_const_ans.to_csv("my_const_ans.csv", index=False)

In [14]:
!cat my_const_ans.csv | head -n 5


id,toxic,severe_toxic,obscene,threat,insult,identity_hate
00001cee341fdb12,0.09584448302009764,0.009995550569965721,0.052948217407925,0.002995531769557125,0.04936360616904074,0.00880485802558109
0000247867823ef7,0.09584448302009764,0.009995550569965721,0.052948217407925,0.002995531769557125,0.04936360616904074,0.00880485802558109
00013b17ad220c46,0.09584448302009764,0.009995550569965721,0.052948217407925,0.002995531769557125,0.04936360616904074,0.00880485802558109
00017563c3f7919a,0.09584448302009764,0.009995550569965721,0.052948217407925,0.002995531769557125,0.04936360616904074,0.00880485802558109
cat: write error: Broken pipe

In [15]:
!cat sample_submission.csv | head -n 5


cat: sample_submission.csv: No such file or directory


In [16]:
def save_prediction(df, name):
    """Write a prediction frame to a CSV file without the row index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write; its columns become the CSV columns.
    name : str
        Path of the CSV file to create.
    """
    df.to_csv(name, index=False)


# All-zeros baseline: predict 0.0 for every label of every test comment.
df_my_const_ans_2 = pd.DataFrame()
df_my_const_ans_2["id"] = df_test["id"].values

for name in y_names:
    df_my_const_ans_2[name] = np.zeros(len(df_test))

save_prediction(df_my_const_ans_2, "my_const_ans_2.csv")

In [17]:
!cat my_const_ans_2.csv | head -n 5


id,toxic,severe_toxic,obscene,threat,insult,identity_hate
00001cee341fdb12,0.0,0.0,0.0,0.0,0.0,0.0
0000247867823ef7,0.0,0.0,0.0,0.0,0.0,0.0
00013b17ad220c46,0.0,0.0,0.0,0.0,0.0,0.0
00017563c3f7919a,0.0,0.0,0.0,0.0,0.0,0.0
cat: write error: Broken pipe


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
%%time
vectorizer = TfidfVectorizer(max_features=10000, stop_words="english",ngram_range=(1,4))
vectorizer.fit(df["comment_text"])


CPU times: user 1min 40s, sys: 3.4 s, total: 1min 44s
Wall time: 1min 44s

In [37]:
%%time
# NOTE(review): misleading name. `test_v` holds the TF-IDF matrix of the
# *training* comments (`df`, read from train.csv), not of the test set. It is the
# matrix the classifiers are fitted on further down.
test_v = vectorizer.transform(df["comment_text"])


CPU times: user 23.9 s, sys: 44 ms, total: 23.9 s
Wall time: 23.9 s

In [38]:
%%time
# NOTE(review): misleading name. `train_v` holds the TF-IDF matrix of the *test*
# comments (`df_test`, read from test.csv), not of the training set. It is the
# matrix predictions are made for further down.
train_v = vectorizer.transform(df_test["comment_text"])


CPU times: user 20.6 s, sys: 16 ms, total: 20.6 s
Wall time: 20.6 s

In [39]:
from sklearn.linear_model import LogisticRegression
import tqdm

In [41]:
# NOTE: the feature matrices were given swapped names when they were created:
# `test_v` holds the TF-IDF features of the training comments (`df`) and
# `train_v` holds those of the test comments (`df_test`). Alias them under
# accurate names so the fit/predict calls below read correctly.
X_train, X_test = test_v, train_v

df_prediction = pd.DataFrame()
df_prediction["id"] = df_test["id"].values

# One independent logistic-regression classifier per label.
for name in tqdm.tqdm(y_names):
    LR = LogisticRegression()
    LR.fit(X_train, df[name].values)
    # Column 1 of predict_proba is the probability of the positive class (label 1).
    df_prediction[name] = LR.predict_proba(X_test)[:, 1]


100%|██████████| 6/6 [00:10<00:00,  1.70s/it]

In [42]:
# Inspect the predicted per-label probabilities for the first test comments.
df_prediction.head()


Out[42]:
id toxic severe_toxic obscene threat insult identity_hate
0 00001cee341fdb12 0.998795 0.303645 0.998578 0.048328 0.953897 0.317764
1 0000247867823ef7 0.009675 0.003067 0.005122 0.001660 0.008082 0.002929
2 00013b17ad220c46 0.012895 0.001593 0.005902 0.000790 0.008677 0.002338
3 00017563c3f7919a 0.004818 0.002405 0.003940 0.001048 0.004142 0.001030
4 00017695ad8997eb 0.051644 0.003045 0.010046 0.001802 0.018701 0.002593

In [43]:
# Write the TF-IDF + logistic-regression predictions to a CSV file (row index omitted).
save_prediction(df_prediction, "tfid_logreg_prediction_10k.csv")

In [ ]: