notebook.community

Edit and run



In [1]:

    
import pandas as pd
%matplotlib inline



In [4]:

    
# Read the raw toxicity sample CSV into the working frame.
raw_sample_path = "~/detox/data/wikilabels/toxicity_sample_raw.csv"
df = pd.read_csv(raw_sample_path)



In [5]:

    
# Dimensions of the loaded frame as (rows, columns).
df.shape









    Out[5]:





(15930, 13)



In [6]:

    
# Preview the first rows to see the available columns and sample values.
df.head()









    Out[6]:






  
    
      
      id
      source
      query
      comment_text
      avg_score
      frac_pos
      frac_neg
      count_-2
      count_-1
      count_0
      count_1
      count_2
      rev_id
    
  
  
    
      0
      1087503978
      wiki
      article_blocked
      "\n\nThe so-called Wikipedia:Synthesis in this...
      0.2
      0.4
      0.1
      1.0
      0.0
      5.0
      4.0
      0.0
      605378703.0
    
    
      1
      1087420422
      wiki
      user_blocked
      YOU'RE BLOCKED FOR 700 DAYS. \n SHUSHING COW
      -0.8
      0.0
      0.8
      0.0
      8.0
      2.0
      0.0
      0.0
      614033876.0
    
    
      2
      1087438277
      wiki
      user_blocked
      i'm sorry for vandalising
      0.4
      0.4
      0.0
      0.0
      0.0
      6.0
      4.0
      0.0
      214047294.0
    
    
      3
      1087399589
      wiki
      user_blocked
      please, please killyourself you whiney little ...
      -1.5
      0.0
      1.0
      5.0
      5.0
      0.0
      0.0
      0.0
      239408401.0
    
    
      4
      1087420682
      wiki
      user_blocked
      "\n\nRequest for help on mediation board\nHell...
      0.6
      0.6
      0.0
      0.0
      0.0
      4.0
      6.0
      0.0
      698623252.0



In [7]:

    
# Histogram of the per-comment average score.
avg_scores = df['avg_score']
avg_scores.hist()









    Out[7]:





<matplotlib.axes._subplots.AxesSubplot at 0x114c72198>



In [8]:

    
# Count comments whose average score is below zero (True) versus not (False).
df['avg_score'].lt(0).value_counts()









    Out[8]:





False    12972
True      2958
Name: avg_score, dtype: int64



In [9]:

    
# Breakdown of the `query` column over the first 1000 rows in file order.
df['query'].head(1000).value_counts()









    Out[9]:





user_blocked       331
user_random        277
article_blocked    203
article_random     189
Name: query, dtype: int64



In [10]:

    
# Breakdown of the `query` column over 1000 rows taken from a full shuffle,
# for comparison with the first-1000-rows breakdown in file order.
# frac=1 shuffles every row whatever the frame's length (the literal 15930
# raised on smaller frames and subsampled larger ones); random_state makes
# the comparison reproducible across runs.
df.sample(frac=1, random_state=0).head(1000)['query'].value_counts()









    Out[10]:





user_blocked       320
user_random        293
article_blocked    195
article_random     192
Name: query, dtype: int64



In [11]:

    
# Shuffle all rows so the fixed-size slices taken later are random batches.
# frac=1 keeps every row regardless of the frame's length (no hard-coded row
# count), and random_state pins the permutation so a fresh
# Restart & Run All reproduces the same row order.
# NOTE: re-running this cell alone permutes the already-shuffled frame again.
df = df.sample(frac=1, random_state=0)



In [12]:

    
# How many rev_ids occur once, twice, ...: the inner count is occurrences per
# rev_id, the outer count is how many rev_ids share each occurrence count.
rev_id_occurrences = df['rev_id'].value_counts()
rev_id_occurrences.value_counts()









    Out[12]:





1    15930
Name: rev_id, dtype: int64



In [16]:

    
# Write the frame out as consecutive fixed-size batches, one CSV per batch,
# keeping only the columns listed in `export_columns`.
# The number of batches is derived from the frame length so no rows are
# dropped (and no empty files are written) if the input size changes; the
# last batch holds the remainder.
chunk_size = 2000
export_columns = ['rev_id', 'query', 'comment_text']
n_chunks = -(-len(df) // chunk_size)  # ceiling division
for i in range(n_chunks):
    # .iloc makes the positional row slice explicit (the index is shuffled).
    chunk = df.iloc[i * chunk_size:(i + 1) * chunk_size][export_columns]
    chunk.to_csv("~/detox/data/wikilabels/toxicity_data_%d.csv" % i, index=False)



In [18]:

    
# Read the last exported batch back in and report its (rows, columns).
last_chunk = pd.read_csv("~/detox/data/wikilabels/toxicity_data_7.csv")
last_chunk.shape









    Out[18]:





(1930, 3)



In [ ]:

	id	source	query	comment_text	avg_score	frac_pos	frac_neg	count_-2	count_-1	count_0	count_1	rev_id
0	1087503978	wiki	article_blocked	"\n\nThe so-called Wikipedia:Synthesis in this...	0.2	0.4	0.1	1.0	0.0	5.0	4.0	605378703.0
1	1087420422	wiki	user_blocked	YOU'RE BLOCKED FOR 700 DAYS. \n SHUSHING COW	-0.8	0.0	0.8	0.0	8.0	2.0	0.0	614033876.0
2	1087438277	wiki	user_blocked	i'm sorry for vandalising	0.4	0.4	0.0	0.0	0.0	6.0	4.0	214047294.0
3	1087399589	wiki	user_blocked	please, please killyourself you whiney little ...	-1.5	0.0	1.0	5.0	5.0	0.0	0.0	239408401.0
4	1087420682	wiki	user_blocked	"\n\nRequest for help on mediation board\nHell...	0.6	0.6	0.0	0.0	0.0	4.0	6.0	698623252.0