The goal of this notebook is to split the set of annotated comments into fixed train, dev, and test splits. The catch is that we want to ensure that a certain subset of comments labeled at least 20 times ends up in the test split, so that we can run a clean baseline experiment.
In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
In [2]:
df = pd.read_csv('../../data/annotations/clean/annotations.tsv', sep='\t')
df.index = df['rev_id']
df['counts'] = df['rev_id'].value_counts()
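The counts assignment relies on pandas index alignment: value_counts() returns a Series indexed by rev_id, and since the frame's index is also rev_id, each row picks up the number of times its revision was annotated. A toy illustration with made-up rev_ids (not from the annotations file):

toy = pd.DataFrame({'rev_id': [101, 101, 205]})
toy.index = toy['rev_id']
toy['counts'] = toy['rev_id'].value_counts()  # rows for 101 get 2, the row for 205 gets 1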
In [3]:
# one row per revision, keeping its sample type and annotation count
df_revs = df.drop_duplicates(subset = ['rev_id'])[['rev_id', 'sample', 'counts']]
n_revs = df_revs.shape[0]
print(n_revs)
In [4]:
# choose how many baseline revs to reserve per sample
In [5]:
df_revs.query("counts >=20")['sample'].value_counts()
Out[5]:
In [6]:
n_baseline_revs_per_sample = 4000
In [7]:
# create a data frame of just the baseline revs
In [8]:
df_baseline_random_revs = df_revs.query("counts >=20 and sample == 'random'")\
.sample(n=n_baseline_revs_per_sample, random_state = 12)[['rev_id']]
df_baseline_blocked_revs = df_revs.query("counts >=20 and sample == 'blocked'")\
.sample(n=n_baseline_revs_per_sample, random_state = 12)[['rev_id']]
df_baseline_revs = pd.concat([df_baseline_random_revs, df_baseline_blocked_revs])
df_baseline_revs['baseline'] = True
In [9]:
# create a data frame of the non-baseline revs
In [10]:
# flag baseline revs via a left merge, then keep only the unflagged ones
df_non_baseline_revs = df_revs[['rev_id']]
df_non_baseline_revs = df_non_baseline_revs.merge(df_baseline_revs, how = 'left', on = 'rev_id')
df_non_baseline_revs = df_non_baseline_revs.fillna(False)
df_non_baseline_revs = df_non_baseline_revs.query("baseline == False")[['rev_id']]
df_baseline_revs = df_baseline_revs[['rev_id']]
n_non_baseline_revs = df_non_baseline_revs.shape[0]
print(n_non_baseline_revs)
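An equivalent way to pick out the non-baseline revs, sketched here only as a cross-check of the merge/fillna/query approach above, is to filter with isin:

# same set of rev_ids as df_non_baseline_revs, via a boolean mask instead of a merge
df_non_baseline_alt = df_revs[~df_revs['rev_id'].isin(df_baseline_revs['rev_id'])][['rev_id']]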
In [11]:
# make a 3:1:1 (train:dev:test) split, but put all baseline revs in test
In [12]:
# split fractions are targets over ALL revs, rescaled to the non-baseline pool
# (the baseline revs get appended to test afterwards); note that the second
# output of train_test_split, i.e. the test_size share, is taken as train here
train_fraction = (0.6 * n_revs) / n_non_baseline_revs
temp, train = train_test_split(df_non_baseline_revs, random_state = 12, test_size = train_fraction)
dev_fraction = (0.2 * n_revs) / ((1-train_fraction) * n_non_baseline_revs)
test, dev = train_test_split(temp, random_state = 12, test_size = dev_fraction)
test = pd.concat([test, df_baseline_revs])
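As a quick sanity check (a sketch, not part of the original pipeline), each split's share of all annotated revisions should come out near the 3:1:1 target, i.e. roughly 0.6 / 0.2 / 0.2:

for name, split in [('train', train), ('dev', dev), ('test', test)]:
    print(name, round(split.shape[0] / n_revs, 3))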
In [13]:
print(train.shape)
print(dev.shape)
print(test.shape)
In [14]:
d_train = df.merge(train, how='inner', on = 'rev_id')
d_train.to_csv('../../data/annotations/split/train/annotations.tsv', index=False, sep='\t')
d_dev = df.merge(dev, how='inner', on = 'rev_id')
d_dev.to_csv('../../data/annotations/split/dev/annotations.tsv', index=False, sep='\t')
d_test = df.merge(test, how='inner', on = 'rev_id')
d_test.to_csv('../../data/annotations/split/test/annotations.tsv', index=False, sep='\t')
d_baseline = df.merge(df_baseline_revs, how='inner', on = 'rev_id')
d_baseline.to_csv('../../data/annotations/split/baseline/annotations.tsv', index=False, sep='\t')
In [15]:
print(d_train.shape)
print(d_dev.shape)
print(d_test.shape)
print(d_baseline.shape)
In [16]:
# check that the shapes match when the data is read back in
print( pd.read_csv('../../data/annotations/split/train/annotations.tsv', sep='\t').shape)
print( pd.read_csv('../../data/annotations/split/dev/annotations.tsv', sep='\t').shape)
print( pd.read_csv('../../data/annotations/split/test/annotations.tsv', sep='\t').shape)
print( pd.read_csv('../../data/annotations/split/baseline/annotations.tsv', sep='\t').shape)
In [17]:
# check that splits are distinct in terms of ids
col = 'rev_id'
print(d_train.merge(d_dev, how = 'inner', on = col).shape)
print(d_train.merge(d_test, how = 'inner', on = col).shape)
print(d_test.merge(d_dev, how = 'inner', on = col).shape)
print(d_train.merge(d_baseline, how = 'inner', on = col).shape)
print(d_dev.merge(d_baseline, how = 'inner', on = col).shape)
In [18]:
# check that the test and baseline splits overlap
print(d_test.merge(d_baseline, how = 'inner', on = col).shape)
In [19]:
# check that splits are distinct in terms of text
col = 'clean_diff'
print(d_train.merge(d_dev, how = 'inner', on = col).shape)
print(d_train.merge(d_test, how = 'inner', on = col).shape)
print(d_test.merge(d_dev, how = 'inner', on = col).shape)
In [20]:
d_baseline.drop_duplicates(subset='rev_id').groupby(['ns', 'sample'])[['ns', 'sample']].count()
Out[20]:
In [ ]: