In [1]:
import pandas as pd
import random
import numpy as np
In [2]:
# Input: the CSV loaded into `df` by pd.read_csv below.
path = '../../data/twitter-hate-speech-classifier-clean.csv'

# Outputs: destinations for the two splits written with to_csv at the end
# of the notebook (test remainder and cross-validation sample).
testpath = '../../data/twitter_test.csv'
cvpath = '../../data/twitter_cross_val.csv'
In [3]:
# Load the full tweet table from the CSV at `path`; pandas assigns its
# default integer index because no index column is requested.
df = pd.read_csv(filepath_or_buffer=path)
In [4]:
# Preview the first five rows (the default for head) as a sanity check.
df.head(n=5)
Out[4]:
In [17]:
# Total number of rows in the loaded frame; used as the population size
# when drawing the split indices.
numtweets = len(df.index)
# Call form of print: identical output on Python 2 for a single argument,
# and valid syntax on Python 3 (the bare print statement is not).
print(numtweets)
In [18]:
# Partition the row positions 0..numtweets-1 into a fixed-size random
# cross-validation sample and a test remainder (everything not sampled).
CV_SIZE = 10000   # number of rows drawn for the cross-validation split
SPLIT_SEED = 42   # fixed seed so re-running the notebook yields the same split

population = range(numtweets)

# A private Random instance makes the draw reproducible without touching the
# global `random` module state. sample() raises ValueError if the population
# has fewer than CV_SIZE rows.
crossValind = random.Random(SPLIT_SEED).sample(population, CV_SIZE)

# Membership tests go through a set: checking each row against the
# 10000-element list made this comprehension accidentally quadratic.
_cv_lookup = set(crossValind)
testind = [x for x in population if x not in _cv_lookup]
In [19]:
# Report the size of each split. The call form of print behaves the same on
# Python 2 for a single argument and is also valid on Python 3.
print(len(crossValind))
print(len(testind))
In [21]:
# Sanity check: display the row positions shared by the two splits.
cvset, tset = set(crossValind), set(testind)
cvset & tset
Out[21]:
In [22]:
# Materialise the two splits, keeping only the tweet text and its label.
# `.loc` replaces `.ix`, which is deprecated and was removed in pandas 1.0.
# On an integer index `.ix` was label-based, so `.loc` selects the same rows.
split_cols = ['tweet_text', 'label']
dfcv = df.loc[crossValind, split_cols]
dft = df.loc[testind, split_cols]
In [24]:
# Label distribution within the cross-validation split.
cv_labels = dfcv['label']
cv_labels.value_counts()
Out[24]:
In [25]:
# Label distribution within the test split.
test_labels = dft['label']
test_labels.value_counts()
Out[25]:
In [26]:
# Persist each split to its CSV path (cross-validation first, then test).
# to_csv is called with defaults, so the row index is written as well.
for frame, destination in ((dfcv, cvpath), (dft, testpath)):
    frame.to_csv(destination)
In [ ]: