notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np



In [2]:

    
# List the contents of the notebook's working directory.
!ls









    



__notebook_source__.ipynb



In [4]:

    
!cat ../input/train.csv | head -n 5









    



"id","comment_text","toxic","severe_toxic","obscene","threat","insult","identity_hate"
"0000997932d777bf","Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
"000103f0d9cfb60f","D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
"000113f07ec002fd","Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
cat: write error: Broken pipe



In [5]:

    
# Load the labelled training set (id, comment_text and six label columns).
df = pd.read_csv("../input/train.csv")



In [6]:

    
# Preview the first 7 training rows.
df.head(7)









    Out[6]:







  
    
      
      id
      comment_text
      toxic
      severe_toxic
      obscene
      threat
      insult
      identity_hate
    
  
  
    
      0
      0000997932d777bf
      Explanation\nWhy the edits made under my usern...
      0
      0
      0
      0
      0
      0
    
    
      1
      000103f0d9cfb60f
      D'aww! He matches this background colour I'm s...
      0
      0
      0
      0
      0
      0
    
    
      2
      000113f07ec002fd
      Hey man, I'm really not trying to edit war. It...
      0
      0
      0
      0
      0
      0
    
    
      3
      0001b41b1c6bb37e
      "\nMore\nI can't make any real suggestions on ...
      0
      0
      0
      0
      0
      0
    
    
      4
      0001d958c54c6e35
      You, sir, are my hero. Any chance you remember...
      0
      0
      0
      0
      0
      0
    
    
      5
      00025465d4725e87
      "\n\nCongratulations from me as well, use the ...
      0
      0
      0
      0
      0
      0
    
    
      6
      0002bcb3da6cb337
      COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK
      1
      1
      1
      0
      1
      0



In [7]:

    
# Label columns to predict, in the order used by the submission file.
y_names = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Mean of each label column over the training set; used later as a constant
# per-label prediction.
label_means = df[y_names].mean()
const_answer = label_means.values
const_answer









    Out[7]:





array([ 0.09584448,  0.00999555,  0.05294822,  0.00299553,  0.04936361,
        0.00880486])



In [8]:

    
# Load the provided sample submission to see the expected output layout.
df_sample_ans = pd.read_csv("../input/sample_submission.csv")



In [9]:

    
# Preview the sample submission: an id column plus one column per label.
df_sample_ans.head()









    Out[9]:







  
    
      
      id
      toxic
      severe_toxic
      obscene
      threat
      insult
      identity_hate
    
  
  
    
      0
      00001cee341fdb12
      0.5
      0.5
      0.5
      0.5
      0.5
      0.5
    
    
      1
      0000247867823ef7
      0.5
      0.5
      0.5
      0.5
      0.5
      0.5
    
    
      2
      00013b17ad220c46
      0.5
      0.5
      0.5
      0.5
      0.5
      0.5
    
    
      3
      00017563c3f7919a
      0.5
      0.5
      0.5
      0.5
      0.5
      0.5
    
    
      4
      00017695ad8997eb
      0.5
      0.5
      0.5
      0.5
      0.5
      0.5



In [10]:

    
# Load the test set (id and comment_text only, no label columns) and preview it.
df_test = pd.read_csv("../input/test.csv")
df_test.head()









    Out[10]:







  
    
      
      id
      comment_text
    
  
  
    
      0
      00001cee341fdb12
      Yo bitch Ja Rule is more succesful then you'll...
    
    
      1
      0000247867823ef7
      == From RfC == \n\n The title is fine as it is...
    
    
      2
      00013b17ad220c46
      " \n\n == Sources == \n\n * Zawe Ashton on Lap...
    
    
      3
      00017563c3f7919a
      :If you have a look back at the source, the in...
    
    
      4
      00017695ad8997eb
      I don't anonymously edit articles at all.



In [11]:

    
# Constant baseline: every test row gets the training-set mean of each label.
df_my_const_ans = pd.DataFrame()
df_my_const_ans["id"] = df_test["id"].values

n_test_rows = len(df_test)
for col_idx, label in enumerate(y_names):
    # Fill the whole column with the label's mean rate.
    df_my_const_ans[label] = np.full(n_test_rows, const_answer[col_idx])



In [12]:

    
# Sanity-check the constant baseline: each label column should repeat one value.
df_my_const_ans.head()









    Out[12]:







  
    
      
      id
      toxic
      severe_toxic
      obscene
      threat
      insult
      identity_hate
    
  
  
    
      0
      00001cee341fdb12
      0.095844
      0.009996
      0.052948
      0.002996
      0.049364
      0.008805
    
    
      1
      0000247867823ef7
      0.095844
      0.009996
      0.052948
      0.002996
      0.049364
      0.008805
    
    
      2
      00013b17ad220c46
      0.095844
      0.009996
      0.052948
      0.002996
      0.049364
      0.008805
    
    
      3
      00017563c3f7919a
      0.095844
      0.009996
      0.052948
      0.002996
      0.049364
      0.008805
    
    
      4
      00017695ad8997eb
      0.095844
      0.009996
      0.052948
      0.002996
      0.049364
      0.008805



In [13]:

    
# Write the constant-mean baseline as a CSV; index=False keeps the row index
# out of the file.
df_my_const_ans.to_csv("my_const_ans.csv", index=False)



In [14]:

    
!cat my_const_ans.csv | head -n 5









    



id,toxic,severe_toxic,obscene,threat,insult,identity_hate
00001cee341fdb12,0.09584448302009764,0.009995550569965721,0.052948217407925,0.002995531769557125,0.04936360616904074,0.00880485802558109
0000247867823ef7,0.09584448302009764,0.009995550569965721,0.052948217407925,0.002995531769557125,0.04936360616904074,0.00880485802558109
00013b17ad220c46,0.09584448302009764,0.009995550569965721,0.052948217407925,0.002995531769557125,0.04936360616904074,0.00880485802558109
00017563c3f7919a,0.09584448302009764,0.009995550569965721,0.052948217407925,0.002995531769557125,0.04936360616904074,0.00880485802558109
cat: write error: Broken pipe



In [15]:

    
!cat sample_submission.csv | head -n 5









    



cat: sample_submission.csv: No such file or directory



In [16]:

    
# Second baseline: predict 0.0 for every label of every test row.
df_my_const_ans_2 = pd.DataFrame()
df_my_const_ans_2["id"] = df_test["id"].values

# Iterate over the label names directly; the enumerate() index was never used.
for name in y_names:
    df_my_const_ans_2[name] = np.zeros(len(df_test))

def save_prediction(df, name):
    """Write a prediction frame to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write; all of its columns are written as-is.
    name : str
        Destination path of the CSV file.

    The row index is not written (``index=False``).
    """
    df.to_csv(path_or_buf=name, index=False)
    
# Write the all-zeros baseline to disk.
save_prediction(df_my_const_ans_2, "my_const_ans_2.csv")



In [17]:

    
!cat my_const_ans_2.csv | head -n 5









    



id,toxic,severe_toxic,obscene,threat,insult,identity_hate
00001cee341fdb12,0.0,0.0,0.0,0.0,0.0,0.0
0000247867823ef7,0.0,0.0,0.0,0.0,0.0,0.0
00013b17ad220c46,0.0,0.0,0.0,0.0,0.0,0.0
00017563c3f7919a,0.0,0.0,0.0,0.0,0.0,0.0
cat: write error: Broken pipe



In [18]:

    
from sklearn.feature_extraction.text import TfidfVectorizer



In [36]:

    
%%time
vectorizer = TfidfVectorizer(max_features=10000, stop_words="english",ngram_range=(1,4))
vectorizer.fit(df["comment_text"])









    



CPU times: user 1min 40s, sys: 3.4 s, total: 1min 44s
Wall time: 1min 44s



In [37]:

    
%%time
# NOTE(review): misleading name. `test_v` holds the TF-IDF features of the
# TRAINING comments (`df` is loaded from train.csv); the model is later fitted on it.
test_v = vectorizer.transform(df["comment_text"])









    



CPU times: user 23.9 s, sys: 44 ms, total: 23.9 s
Wall time: 23.9 s



In [38]:

    
%%time
# NOTE(review): misleading name. `train_v` holds the TF-IDF features of the
# TEST comments (`df_test` is loaded from test.csv); the model later predicts on it.
train_v = vectorizer.transform(df_test["comment_text"])









    



CPU times: user 20.6 s, sys: 16 ms, total: 20.6 s
Wall time: 20.6 s



In [39]:

    
from sklearn.linear_model import LogisticRegression
import tqdm



In [41]:

    
# Train one independent binary logistic regression per label and collect the
# predicted positive-class probabilities in a submission-shaped frame.
# NOTE: the feature-matrix names are swapped relative to their contents.
# `test_v` was built from the training comments (`df`) and `train_v` from
# `df_test`, so fitting on `test_v` and predicting on `train_v` is what is wanted.
df_prediction = pd.DataFrame({"id": df_test["id"].values})

for name in tqdm.tqdm(y_names):
    LR = LogisticRegression().fit(test_v, df[name].values)
    # Column 1 of predict_proba is the probability of the second class (label 1).
    df_prediction[name] = LR.predict_proba(train_v)[:, 1]









    



100%|██████████| 6/6 [00:10<00:00,  1.70s/it]



In [42]:

    
df_prediction.head()









    Out[42]:







  
    
      
      id
      toxic
      severe_toxic
      obscene
      threat
      insult
      identity_hate
    
  
  
    
      0
      00001cee341fdb12
      0.998795
      0.303645
      0.998578
      0.048328
      0.953897
      0.317764
    
    
      1
      0000247867823ef7
      0.009675
      0.003067
      0.005122
      0.001660
      0.008082
      0.002929
    
    
      2
      00013b17ad220c46
      0.012895
      0.001593
      0.005902
      0.000790
      0.008677
      0.002338
    
    
      3
      00017563c3f7919a
      0.004818
      0.002405
      0.003940
      0.001048
      0.004142
      0.001030
    
    
      4
      00017695ad8997eb
      0.051644
      0.003045
      0.010046
      0.001802
      0.018701
      0.002593



In [43]:

    
# Write the TF-IDF + logistic-regression predictions to a CSV file.
save_prediction(df_prediction, "tfid_logreg_prediction_10k.csv")



In [ ]:

	id	comment_text	toxic	severe_toxic	obscene	insult
0	0000997932d777bf	Explanation\nWhy the edits made under my usern...	0	0	0	0
1	000103f0d9cfb60f	D'aww! He matches this background colour I'm s...	0	0	0	0
2	000113f07ec002fd	Hey man, I'm really not trying to edit war. It...	0	0	0	0
3	0001b41b1c6bb37e	"\nMore\nI can't make any real suggestions on ...	0	0	0	0
4	0001d958c54c6e35	You, sir, are my hero. Any chance you remember...	0	0	0	0
5	00025465d4725e87	"\n\nCongratulations from me as well, use the ...	0	0	0	0
6	0002bcb3da6cb337	COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK	1	1	1	1

	id	toxic	severe_toxic	obscene	threat	insult	identity_hate
0	00001cee341fdb12	0.5	0.5	0.5	0.5	0.5	0.5
1	0000247867823ef7	0.5	0.5	0.5	0.5	0.5	0.5
2	00013b17ad220c46	0.5	0.5	0.5	0.5	0.5	0.5
3	00017563c3f7919a	0.5	0.5	0.5	0.5	0.5	0.5
4	00017695ad8997eb	0.5	0.5	0.5	0.5	0.5	0.5

	id	comment_text
0	00001cee341fdb12	Yo bitch Ja Rule is more succesful then you'll...
1	0000247867823ef7	== From RfC == \n\n The title is fine as it is...
2	00013b17ad220c46	" \n\n == Sources == \n\n * Zawe Ashton on Lap...
3	00017563c3f7919a	:If you have a look back at the source, the in...
4	00017695ad8997eb	I don't anonymously edit articles at all.

	id	toxic	severe_toxic	obscene	threat	insult	identity_hate
0	00001cee341fdb12	0.095844	0.009996	0.052948	0.002996	0.049364	0.008805
1	0000247867823ef7	0.095844	0.009996	0.052948	0.002996	0.049364	0.008805
2	00013b17ad220c46	0.095844	0.009996	0.052948	0.002996	0.049364	0.008805
3	00017563c3f7919a	0.095844	0.009996	0.052948	0.002996	0.049364	0.008805
4	00017695ad8997eb	0.095844	0.009996	0.052948	0.002996	0.049364	0.008805

	id	toxic	severe_toxic	obscene	threat	insult	identity_hate
0	00001cee341fdb12	0.998795	0.303645	0.998578	0.048328	0.953897	0.317764
1	0000247867823ef7	0.009675	0.003067	0.005122	0.001660	0.008082	0.002929
2	00013b17ad220c46	0.012895	0.001593	0.005902	0.000790	0.008677	0.002338
3	00017563c3f7919a	0.004818	0.002405	0.003940	0.001048	0.004142	0.001030
4	00017695ad8997eb	0.051644	0.003045	0.010046	0.001802	0.018701	0.002593