In [1]:
import pandas as pd
%matplotlib inline

In [4]:
# Read the raw toxicity sample CSV into the working DataFrame.
raw_sample_csv = "~/detox/data/wikilabels/toxicity_sample_raw.csv"
df = pd.read_csv(raw_sample_csv)

In [5]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape


Out[5]:
(15930, 13)

In [6]:
# Preview the first rows (head() defaults to five) to inspect the columns.
df.head()


Out[6]:
id source query comment_text avg_score frac_pos frac_neg count_-2 count_-1 count_0 count_1 count_2 rev_id
0 1087503978 wiki article_blocked "\n\nThe so-called Wikipedia:Synthesis in this... 0.2 0.4 0.1 1.0 0.0 5.0 4.0 0.0 605378703.0
1 1087420422 wiki user_blocked YOU'RE BLOCKED FOR 700 DAYS. \n SHUSHING COW -0.8 0.0 0.8 0.0 8.0 2.0 0.0 0.0 614033876.0
2 1087438277 wiki user_blocked i'm sorry for vandalising 0.4 0.4 0.0 0.0 0.0 6.0 4.0 0.0 214047294.0
3 1087399589 wiki user_blocked please, please killyourself you whiney little ... -1.5 0.0 1.0 5.0 5.0 0.0 0.0 0.0 239408401.0
4 1087420682 wiki user_blocked "\n\nRequest for help on mediation board\nHell... 0.6 0.6 0.0 0.0 0.0 4.0 6.0 0.0 698623252.0

In [7]:
# Histogram of the avg_score column (pandas' default binning).
df['avg_score'].hist()


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x114c72198>

In [8]:
# Tally rows by whether avg_score is below zero (True) or not (False).
is_negative = df['avg_score'].lt(0)
is_negative.value_counts()


Out[8]:
False    12972
True      2958
Name: avg_score, dtype: int64

In [9]:
# Distribution of `query` values over the first 1000 rows, in the frame's
# current row order.
leading_queries = df['query'].iloc[:1000]
leading_queries.value_counts()


Out[9]:
user_blocked       331
user_random        277
article_blocked    203
article_random     189
Name: query, dtype: int64

In [10]:
# Preview of a full shuffle: tally `query` over the first 1000 rows of a
# shuffled copy, for comparison with the unshuffled head(1000) tally.
# `frac=1` shuffles every row without hard-coding the row count, and the fixed
# seed makes the preview repeatable after a kernel restart.
df.sample(frac=1, random_state=0).head(1000)['query'].value_counts()


Out[10]:
user_blocked       320
user_random        293
article_blocked    195
article_random     192
Name: query, dtype: int64

In [11]:
# Shuffle every row so the fixed-size chunks exported further down are random
# subsets of the sample.
# - `frac=1` replaces the hard-coded row count (15930), so the cell keeps
#   working if the input file changes size.
# - `random_state` pins the permutation, so Restart & Run All regenerates the
#   same row order and therefore the same chunk files.
# NOTE: re-running only this cell permutes the already-shuffled frame again;
# reload `df` first to reproduce the exact order.
df = df.sample(frac=1, random_state=0)

In [12]:
# Duplicate check on rev_id: first count how often each rev_id occurs, then
# count how many rev_ids share each occurrence count. A single row with
# occurrence count 1 means every rev_id appears exactly once.
occurrences_per_rev_id = df['rev_id'].value_counts()
occurrences_per_rev_id.value_counts()


Out[12]:
1    15930
Name: rev_id, dtype: int64

In [16]:
# Export the columns needed for labelling in consecutive fixed-size chunks,
# one CSV per chunk (toxicity_data_0.csv, toxicity_data_1.csv, ...).
# The chunk boundaries are derived from the frame length rather than a
# hard-coded chunk count, so no rows are dropped if the frame grows and no
# header-only files are written if it shrinks. The last chunk holds the
# remainder and may be shorter than CHUNK_SIZE.
CHUNK_SIZE = 2000
export_frame = df[['rev_id', 'query', 'comment_text']]
for i, start in enumerate(range(0, len(export_frame), CHUNK_SIZE)):
    # .iloc makes the positional slice explicit (the index is shuffled labels).
    export_frame.iloc[start:start + CHUNK_SIZE].to_csv(
        "~/detox/data/wikilabels/toxicity_data_%d.csv" % i, index=False
    )

In [18]:
# Read chunk 7 back from disk and show its (rows, columns) as a sanity check
# on the export.
chunk_7 = pd.read_csv("~/detox/data/wikilabels/toxicity_data_7.csv")
chunk_7.shape


Out[18]:
(1930, 3)

In [ ]: