In [1]:
import pandas as pd
%matplotlib inline
In [4]:
# Load the raw toxicity sample; later cells read its 'avg_score', 'query',
# 'rev_id' and 'comment_text' columns.
df = pd.read_csv("~/detox/data/wikilabels/toxicity_sample_raw.csv")
In [5]:
# (rows, columns) of the loaded frame.
df.shape
Out[5]:
In [6]:
# Peek at the first five rows (head() defaults to n=5).
df.head()
Out[6]:
In [7]:
# Histogram of the 'avg_score' column using pandas' default binning.
df['avg_score'].hist()
Out[7]:
In [8]:
# Tally rows by whether avg_score is below zero (True) or not (False).
df['avg_score'].lt(0).value_counts()
Out[8]:
In [9]:
# Frequency of each 'query' value among the first 1000 rows of df as currently ordered.
df['query'].head(1000).value_counts()
Out[9]:
In [10]:
# Same tally as for the first 1000 rows, but taken from a random draw of
# 15930 rows (without replacement), to see how 'query' values are spread
# once the row order is randomized.
# The fixed random_state makes this preview identical on every run.
df.sample(15930, random_state=42).head(1000)['query'].value_counts()
Out[10]:
In [11]:
# Randomize the row order by drawing 15930 rows without replacement.
# The fixed random_state makes the order reproducible after a kernel restart,
# so the chunk files written from df contain the same rows each time.
# NOTE(review): 15930 is presumably the full row count of df - confirm against df.shape.
df = df.sample(15930, random_state=42)
In [12]:
df['rev_id'].value_counts().value_counts()
Out[12]:
In [16]:
# Export the three selected columns as eight CSV files
# (toxicity_data_0.csv ... toxicity_data_7.csv), each holding one consecutive
# slice of up to 2000 rows of df in its current order.
for i in range(8):
    # .iloc makes the slice explicitly positional; a slice that runs past the
    # end of the frame is simply shorter (or empty), never an error.
    chunk = df[['rev_id', 'query', 'comment_text']].iloc[i * 2000:(i + 1) * 2000]
    chunk.to_csv("~/detox/data/wikilabels/toxicity_data_%d.csv" % i, index=False)
In [18]:
# Read the last chunk file back and show its (rows, columns) shape as a check on the export.
pd.read_csv("~/detox/data/wikilabels/toxicity_data_7.csv").shape
Out[18]:
In [ ]: