The goal of this notebook is to split the set of annotated comments into fixed train, dev, and test splits. The catch is that we want to ensure that a certain subset of comments, each labeled at least 20 times, ends up in the test split. This is so that we can run a clean baseline experiment.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../../data/annotations/clean/annotations.tsv', sep='\t')
# index by rev_id so that the value_counts Series (also keyed by rev_id)
# aligns on assignment, giving every annotation row its revision's total count
df.index = df['rev_id']
df['counts'] = df['rev_id'].value_counts()
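
As a quick illustration of the alignment trick above (a toy sketch, not part of the pipeline): because the frame is indexed by rev_id, the Series returned by value_counts, which is also keyed by rev_id, lines up row by row on assignment.

In [ ]:
# toy sketch: index-aligned assignment of per-rev_id counts
toy = pd.DataFrame({'rev_id': [1, 1, 2]})
toy.index = toy['rev_id']
toy['counts'] = toy['rev_id'].value_counts()
print(toy)  # both rows for rev_id 1 get counts == 2; the row for rev_id 2 gets 1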

In [3]:
# one row per revision, keeping its sample type and annotation count
df_revs = df.drop_duplicates(subset = ['rev_id'])[['rev_id', 'sample', 'counts']]
n_revs = df_revs.shape[0]
print(n_revs)


116179

In [4]:
# decide how many baseline revs to reserve per sample group

In [5]:
df_revs.query("counts >=20")['sample'].value_counts()


Out[5]:
blocked    6715
random     4285
Name: sample, dtype: int64

In [6]:
n_baseline_revs_per_sample = 4000
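
A defensive check (an addition, not in the original run) that each sample group actually has at least n_baseline_revs_per_sample qualifying revisions before sampling:

In [ ]:
# sanity check: every sample group must have enough revs with >= 20 annotations
available = df_revs.query("counts >= 20")['sample'].value_counts()
assert (available >= n_baseline_revs_per_sample).all(), available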

In [7]:
# create data frame of just baseline revs

In [8]:
df_baseline_random_revs = df_revs.query("counts >=20 and sample == 'random'")\
.sample(n=n_baseline_revs_per_sample, random_state = 12)[['rev_id']]

df_baseline_blocked_revs = df_revs.query("counts >=20 and sample == 'blocked'")\
.sample(n=n_baseline_revs_per_sample, random_state = 12)[['rev_id']]

df_baseline_revs = pd.concat([df_baseline_random_revs, df_baseline_blocked_revs])
df_baseline_revs['baseline'] = True

In [9]:
# create data frame of non-baseline revs

In [10]:
# left-join against the baseline revs and keep everything that did not match
df_non_baseline_revs = df_revs[['rev_id']]
df_non_baseline_revs = df_non_baseline_revs.merge(df_baseline_revs, how = 'left', on = 'rev_id')
df_non_baseline_revs = df_non_baseline_revs.fillna(False)
df_non_baseline_revs = df_non_baseline_revs.query("baseline == False")[['rev_id']]
df_baseline_revs = df_baseline_revs[['rev_id']]
n_non_baseline_revs = df_non_baseline_revs.shape[0]
print(n_non_baseline_revs)


108179
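
An equivalent and somewhat simpler construction uses isin (a sketch; the merge/fillna version above is what was actually run):

In [ ]:
# sketch of an equivalent construction: drop baseline revs by membership test
alt = df_revs.loc[~df_revs['rev_id'].isin(df_baseline_revs['rev_id']), ['rev_id']]
assert alt.shape[0] == n_non_baseline_revs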

In [11]:
# make a 3:1:1 (train:dev:test) split, but put all baseline revs in test

In [12]:
# train should hold 0.6 of all revs but is drawn only from the non-baseline pool,
# so scale the fraction up accordingly; train_test_split assigns the test_size
# fraction to its second return value, which is used as train here
train_fraction = (0.6 * n_revs) / n_non_baseline_revs
temp, train = train_test_split(df_non_baseline_revs, random_state = 12, test_size = train_fraction)
# likewise, dev should hold 0.2 of all revs, drawn from what remains in temp
dev_fraction = (0.2 * n_revs) / ((1-train_fraction) * n_non_baseline_revs)
test, dev = train_test_split(temp, random_state = 12, test_size = dev_fraction)
# fold the held-out baseline revs into the test split
test = pd.concat([test, df_baseline_revs])
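
A quick partition check (an addition): since the baseline revs were folded into test, the three pieces should together cover all n_revs revisions exactly once.

In [ ]:
# sanity check: the splits partition the full set of revisions
assert train.shape[0] + dev.shape[0] + test.shape[0] == n_revs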

In [13]:
print(train.shape)
print(dev.shape)
print(test.shape)


(69708, 1)
(23236, 1)
(23235, 1)

In [14]:
d_train = df.merge(train, how='inner', on = 'rev_id')
d_train.to_csv('../../data/annotations/split/train/annotations.tsv', index=False, sep='\t')

d_dev = df.merge(dev, how='inner', on = 'rev_id')
d_dev.to_csv('../../data/annotations/split/dev/annotations.tsv', index=False, sep='\t')

d_test = df.merge(test, how='inner', on = 'rev_id')
d_test.to_csv('../../data/annotations/split/test/annotations.tsv', index=False, sep='\t')

d_baseline = df.merge(df_baseline_revs, how='inner', on = 'rev_id')
d_baseline.to_csv('../../data/annotations/split/baseline/annotations.tsv', index=False, sep='\t')

In [15]:
print(d_train.shape)
print(d_dev.shape)
print(d_test.shape)
print(d_baseline.shape)


(764050, 23)
(254449, 23)
(350459, 23)
(183489, 23)
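
Because the rev_ids partition, the annotation-level splits should also sum to the full annotation set (a check added here, not in the original run):

In [ ]:
# sanity check: annotation-level splits cover every annotation exactly once
assert d_train.shape[0] + d_dev.shape[0] + d_test.shape[0] == df.shape[0]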

In [16]:
# check that shapes match when the splits are read back in
print( pd.read_csv('../../data/annotations/split/train/annotations.tsv', sep='\t').shape)
print( pd.read_csv('../../data/annotations/split/dev/annotations.tsv', sep='\t').shape)
print( pd.read_csv('../../data/annotations/split/test/annotations.tsv', sep='\t').shape)
print( pd.read_csv('../../data/annotations/split/baseline/annotations.tsv', sep='\t').shape)


(764050, 23)
(254449, 23)
(350459, 23)
(183489, 23)

In [17]:
# check that splits are distinct in terms of ids
col = 'rev_id'
print(d_train.merge(d_dev, how = 'inner', on = col).shape)
print(d_train.merge(d_test, how = 'inner', on = col).shape)
print(d_test.merge(d_dev, how = 'inner', on = col).shape)
print(d_train.merge(d_baseline, how = 'inner', on = col).shape)
print(d_dev.merge(d_baseline, how = 'inner', on = col).shape)


(0, 45)
(0, 45)
(0, 45)
(0, 45)
(0, 45)
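
The same distinctness check can be phrased with set intersections on the ids (a sketch of an equivalent test):

In [ ]:
# sketch: equivalent distinctness check via set intersection of rev_ids
for a, b in [(d_train, d_dev), (d_train, d_test), (d_test, d_dev)]:
    assert not set(a['rev_id']) & set(b['rev_id'])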

In [18]:
# check that the test and baseline splits overlap; the merge is on rev_id at the
# annotation level, so each shared rev contributes (annotation count)^2 rows and
# a large, non-zero count is expected
print(d_test.merge(d_baseline, how = 'inner', on = col).shape)


(4338373, 45)

In [19]:
# check that splits are distinct in terms of text
col = 'clean_diff'
print(d_train.merge(d_dev, how = 'inner', on = col).shape)
print(d_train.merge(d_test, how = 'inner', on = col).shape)
print(d_test.merge(d_dev, how = 'inner', on = col).shape)


(0, 45)
(0, 45)
(0, 45)

In [20]:
d_baseline.drop_duplicates(subset='rev_id').groupby(['ns', 'sample'])[['ns', 'sample']].count()


Out[20]:
                   ns  sample
ns      sample
article blocked  1160    1160
        random   1625    1625
user    blocked  2840    2840
        random   2375    2375
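
The same table can be produced a bit more directly with pd.crosstab (a sketch; not re-run here):

In [ ]:
# sketch of an equivalent tabulation: one count per (ns, sample) cell
uniq = d_baseline.drop_duplicates(subset='rev_id')
print(pd.crosstab(uniq['ns'], uniq['sample']))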
