The goal of this notebook is to split the set of annotated comments into fixed train, dev, and test splits. The catch is that we want to ensure that a certain subset of comments, each labeled at least 20 times, ends up in the test split. This is so that we can run a clean baseline experiment.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../../data/annotations/clean/annotations.tsv', sep='\t')
# index by rev_id so that the value_counts Series (also keyed by rev_id)
# aligns on assignment, giving every annotation row its revision's total count
df.index = df['rev_id']
df['counts'] = df['rev_id'].value_counts()
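
As a quick illustration of the alignment trick above (a toy sketch, not part of the pipeline): because the frame is indexed by rev_id, the Series returned by value_counts, which is also keyed by rev_id, lines up row by row on assignment.

In [ ]:
# toy sketch: index-aligned assignment of per-rev_id counts
toy = pd.DataFrame({'rev_id': [1, 1, 2]})
toy.index = toy['rev_id']
toy['counts'] = toy['rev_id'].value_counts()
print(toy)  # both rows for rev_id 1 get counts == 2; the row for rev_id 2 gets 1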

In [3]:
# one row per revision, keeping its sample type and annotation count
df_revs = df.drop_duplicates(subset = ['rev_id'])[['rev_id', 'sample', 'counts']]
n_revs = df_revs.shape[0]
print(n_revs)


116179

In [4]:
# decide how many baseline revs to reserve per sample group

In [5]:
df_revs.query("counts >=20")['sample'].value_counts()


Out[5]:
blocked    6715
random     4285
Name: sample, dtype: int64

In [6]:
n_baseline_revs_per_sample = 4000
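
A defensive check (an addition, not in the original run) that each sample group actually has at least n_baseline_revs_per_sample qualifying revisions before sampling:

In [ ]:
# sanity check: every sample group must have enough revs with >= 20 annotations
available = df_revs.query("counts >= 20")['sample'].value_counts()
assert (available >= n_baseline_revs_per_sample).all(), available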

In [7]:
# create data frame of just baseline revs

In [8]:
df_baseline_random_revs = df_revs.query("counts >=20 and sample == 'random'")\
.sample(n=n_baseline_revs_per_sample, random_state = 12)[['rev_id']]

df_baseline_blocked_revs = df_revs.query("counts >=20 and sample == 'blocked'")\
.sample(n=n_baseline_revs_per_sample, random_state = 12)[['rev_id']]

df_baseline_revs = pd.concat([df_baseline_random_revs, df_baseline_blocked_revs])
df_baseline_revs['baseline'] = True

In [9]:
# create data frame of non-baseline revs

In [10]:
# left-join against the baseline revs and keep everything that did not match
df_non_baseline_revs = df_revs[['rev_id']]
df_non_baseline_revs = df_non_baseline_revs.merge(df_baseline_revs, how = 'left', on = 'rev_id')
df_non_baseline_revs = df_non_baseline_revs.fillna(False)
df_non_baseline_revs = df_non_baseline_revs.query("baseline == False")[['rev_id']]
df_baseline_revs = df_baseline_revs[['rev_id']]
n_non_baseline_revs = df_non_baseline_revs.shape[0]
print(n_non_baseline_revs)


108179
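
An equivalent and somewhat simpler construction uses isin (a sketch; the merge/fillna version above is what was actually run):

In [ ]:
# sketch of an equivalent construction: drop baseline revs by membership test
alt = df_revs.loc[~df_revs['rev_id'].isin(df_baseline_revs['rev_id']), ['rev_id']]
assert alt.shape[0] == n_non_baseline_revs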

In [11]:
# make a 3:1:1 (train:dev:test) split, but put all baseline revs in test

In [12]:
# train should hold 0.6 of all revs but is drawn only from the non-baseline pool,
# so scale the fraction up accordingly; train_test_split assigns the test_size
# fraction to its second return value, which is used as train here
train_fraction = (0.6 * n_revs) / n_non_baseline_revs
temp, train = train_test_split(df_non_baseline_revs, random_state = 12, test_size = train_fraction)
# likewise, dev should hold 0.2 of all revs, drawn from what remains in temp
dev_fraction = (0.2 * n_revs) / ((1-train_fraction) * n_non_baseline_revs)
test, dev = train_test_split(temp, random_state = 12, test_size = dev_fraction)
# fold the held-out baseline revs into the test split
test = pd.concat([test, df_baseline_revs])
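
A quick partition check (an addition): since the baseline revs were folded into test, the three pieces should together cover all n_revs revisions exactly once.

In [ ]:
# sanity check: the splits partition the full set of revisions
assert train.shape[0] + dev.shape[0] + test.shape[0] == n_revs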

In [13]:
print(train.shape)
print(dev.shape)
print(test.shape)


(69708, 1)
(23236, 1)
(23235, 1)

In [14]:
d_train = df.merge(train, how='inner', on = 'rev_id')
d_train.to_csv('../../data/annotations/split/train/annotations.tsv', index=False, sep='\t')

d_dev = df.merge(dev, how='inner', on = 'rev_id')
d_dev.to_csv('../../data/annotations/split/dev/annotations.tsv', index=False, sep='\t')

d_test = df.merge(test, how='inner', on = 'rev_id')
d_test.to_csv('../../data/annotations/split/test/annotations.tsv', index=False, sep='\t')

d_baseline = df.merge(df_baseline_revs, how='inner', on = 'rev_id')
d_baseline.to_csv('../../data/annotations/split/baseline/annotations.tsv', index=False, sep='\t')

In [15]:
print(d_train.shape)
print(d_dev.shape)
print(d_test.shape)
print(d_baseline.shape)


(764050, 23)
(254449, 23)
(350459, 23)
(183489, 23)
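
Because the rev_ids partition, the annotation-level splits should also sum to the full annotation set (a check added here, not in the original run):

In [ ]:
# sanity check: annotation-level splits cover every annotation exactly once
assert d_train.shape[0] + d_dev.shape[0] + d_test.shape[0] == df.shape[0]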

In [16]:
# check that shapes match when the splits are read back in
print( pd.read_csv('../../data/annotations/split/train/annotations.tsv', sep='\t').shape)
print( pd.read_csv('../../data/annotations/split/dev/annotations.tsv', sep='\t').shape)
print( pd.read_csv('../../data/annotations/split/test/annotations.tsv', sep='\t').shape)
print( pd.read_csv('../../data/annotations/split/baseline/annotations.tsv', sep='\t').shape)


(764050, 23)
(254449, 23)
(350459, 23)
(183489, 23)

In [17]:
# check that splits are distinct in terms of ids
col = 'rev_id'
print(d_train.merge(d_dev, how = 'inner', on = col).shape)
print(d_train.merge(d_test, how = 'inner', on = col).shape)
print(d_test.merge(d_dev, how = 'inner', on = col).shape)
print(d_train.merge(d_baseline, how = 'inner', on = col).shape)
print(d_dev.merge(d_baseline, how = 'inner', on = col).shape)


(0, 45)
(0, 45)
(0, 45)
(0, 45)
(0, 45)
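
The same distinctness check can be phrased with set intersections on the ids (a sketch of an equivalent test):

In [ ]:
# sketch: equivalent distinctness check via set intersection of rev_ids
for a, b in [(d_train, d_dev), (d_train, d_test), (d_test, d_dev)]:
    assert not set(a['rev_id']) & set(b['rev_id'])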

In [18]:
# check that the test and baseline splits overlap; the merge is on rev_id at the
# annotation level, so each shared rev contributes (annotation count)^2 rows and
# a large, non-zero count is expected
print(d_test.merge(d_baseline, how = 'inner', on = col).shape)


(4338373, 45)

In [19]:
# check that splits are distinct in terms of text
col = 'clean_diff'
print(d_train.merge(d_dev, how = 'inner', on = col).shape)
print(d_train.merge(d_test, how = 'inner', on = col).shape)
print(d_test.merge(d_dev, how = 'inner', on = col).shape)


(0, 45)
(0, 45)
(0, 45)

In [20]:
d_baseline.drop_duplicates(subset='rev_id').groupby(['ns', 'sample'])[['ns', 'sample']].count()


Out[20]:
                   ns  sample
ns      sample
article blocked  1160    1160
        random   1625    1625
user    blocked  2840    2840
        random   2375    2375
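
The same table can be produced a bit more directly with pd.crosstab (a sketch; not re-run here):

In [ ]:
# sketch of an equivalent tabulation: one count per (ns, sample) cell
uniq = d_baseline.drop_duplicates(subset='rev_id')
print(pd.crosstab(uniq['ns'], uniq['sample']))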
