In [1]:
import pandas as pd
import random
import numpy as np

In [2]:
# Input: the cleaned tweet corpus that gets split below.
path = '../../data/twitter-hate-speech-classifier-clean.csv'
# Outputs: where the cross-validation and test splits are written at the end.
cvpath, testpath = (
    '../../data/twitter_cross_val.csv',
    '../../data/twitter_test.csv',
)

In [3]:
# Load the full corpus from `path`. No index column is specified, so pandas
# assigns its default 0..n-1 integer row index.
df = pd.read_csv(path)

In [4]:
# Peek at the first rows. The output shows an `Unnamed: 0` column next to
# `tweet_text` and `label` -- presumably a row index saved by an earlier
# to_csv; TODO confirm.
df.head()


Out[4]:
Unnamed: 0 tweet_text label
0 0 Warning: penny boards will make you a faggot 1
1 1 Fuck dykes 1
2 2 @sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon... 1
3 3 "@jayswaggkillah: "@JacklynAnnn: @jayswaggkill... 1
4 4 @Zhugstubble You heard me bitch but any way I'... 1

In [17]:
# Row count of the corpus; the next cell uses it as the size of the sampling
# population.
numtweets = len(df.index)
# Call form instead of the Python 2-only print statement: with a single
# argument it prints the same text on Python 2 and is valid on Python 3.
print(numtweets)


14175

In [18]:
# Split the row positions 0 .. numtweets-1 into a 10,000-row cross-validation
# sample and a test set made of every remaining row.
# random.sample raises ValueError if numtweets is smaller than 10000.
random.seed(0)  # pin the RNG so Restart & Run All reproduces the same split
population = range(numtweets)
crossValind = random.sample(population,10000)
# Test membership against a set: `x not in crossValind` on the list scanned up
# to 10,000 entries for every row, making the comprehension quadratic.
sampled = set(crossValind)
testind = [x for x in population if x not in sampled]

In [19]:
# Report the size of each split (cross-validation first, then test).
# Call form instead of the Python 2-only print statement: with a single
# argument it prints the same text on Python 2 and is valid on Python 3.
print(len(crossValind))
print(len(testind))


10000
4175

In [21]:
# Sanity check: the two index lists must not share any row, so the displayed
# intersection is expected to be empty.
cvset, tset = set(crossValind), set(testind)
cvset & tset


Out[21]:
set()

In [22]:
# Materialise the two splits, keeping only the text and label columns.
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# 1.0. On an integer row index `.ix` resolved integers as labels, which is
# exactly what `.loc` does, so the selected rows are the same as long as df
# still carries the default integer index from read_csv.
dfcv = df.loc[crossValind,['tweet_text','label']]
dft = df.loc[testind,['tweet_text','label']]

In [24]:
# Class balance of the cross-validation split: number of rows per label value.
dfcv['label'].value_counts()


Out[24]:
1    5034
0    4966
Name: label, dtype: int64

In [25]:
# Class balance of the test split: number of rows per label value.
dft['label'].value_counts()


Out[25]:
1    2091
0    2084
Name: label, dtype: int64

In [26]:
# Persist both splits. to_csv is called with its defaults, so the DataFrame
# index (the original row labels of the selected tweets) is written as an
# unnamed leading column alongside `tweet_text` and `label`.
dfcv.to_csv(cvpath)
dft.to_csv(testpath)

In [ ]: