notebook.community

Edit and run



In [1]:

    
import pandas as pd
import random
import numpy as np



In [2]:

    
# File locations, relative to the notebook's working directory.
#   path     - CSV that is loaded into `df`
#   cvpath   - destination CSV for the cross-validation split
#   testpath - destination CSV for the test split
path, cvpath, testpath = (
    '../../data/twitter-hate-speech-classifier-clean.csv',
    '../../data/twitter_cross_val.csv',
    '../../data/twitter_test.csv',
)



In [3]:

    
# Load the full labelled tweet table from the configured input CSV.
df = pd.read_csv(filepath_or_buffer=path)



In [4]:

    
# Preview the first five rows to confirm the expected columns are present.
df.head(n=5)









    Out[4]:






  
    
      
      Unnamed: 0
      tweet_text
      label
    
  
  
    
      0
      0
      Warning: penny boards will make you a faggot
      1
    
    
      1
      1
      Fuck dykes
      1
    
    
      2
      2
      @sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...
      1
    
    
      3
      3
      "@jayswaggkillah: "@JacklynAnnn: @jayswaggkill...
      1
    
    
      4
      4
      @Zhugstubble You heard me bitch but any way I'...
      1



In [17]:

    
# Number of rows in the loaded frame; this is the size of the index
# population that the split below is drawn from.
numtweets = len(df.index)
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(numtweets)



In [18]:

    
# Split the row positions 0..numtweets-1 into two disjoint groups:
# 10000 randomly chosen positions for cross-validation, the rest for testing.
population = range(numtweets)
# A dedicated, seeded generator makes the split reproducible across kernel
# restarts without altering the global `random` module state.
split_rng = random.Random(42)
crossValind = split_rng.sample(population, 10000)
# Membership tests against a set are O(1); testing against the 10000-element
# list made this comprehension O(numtweets * 10000).
cv_lookup = set(crossValind)
testind = [x for x in population if x not in cv_lookup]



In [19]:

    
# Report the size of each split. print() with a single argument works on both
# Python 2 and Python 3; the bare `print x` statement is a SyntaxError on 3.
print(len(crossValind))
print(len(testind))



In [21]:

    
# Sanity check: the two index lists must not share any element, so the
# intersection displayed by this cell should be the empty set.
cvset, tset = set(crossValind), set(testind)
cvset & tset









    Out[21]:





set()



In [22]:

    
# Materialise the two splits, keeping only the text and label columns.
# `.loc` replaces `.ix`, which was deprecated in pandas 0.20 and removed in
# pandas 1.0. `df` comes straight from read_csv without an index_col, so its
# index is the default integer range and label-based `.loc` selects the same
# rows that `.ix` did.
dfcv = df.loc[crossValind, ['tweet_text', 'label']]
dft = df.loc[testind, ['tweet_text', 'label']]



In [24]:

    
# Class balance of the cross-validation split: row count per label value.
dfcv.loc[:, 'label'].value_counts()









    Out[24]:





1    5034
0    4966
Name: label, dtype: int64



In [25]:

    
# Class balance of the test split: row count per label value.
dft.loc[:, 'label'].value_counts()









    Out[25]:





1    2091
0    2084
Name: label, dtype: int64



In [26]:

    
# Persist each split to its configured CSV location. The DataFrame index is
# written as the first column, which is to_csv's default behaviour.
for split_frame, target_csv in ((dfcv, cvpath), (dft, testpath)):
    split_frame.to_csv(target_csv)



In [ ]:

	Unnamed: 0	tweet_text	label
0	0	Warning: penny boards will make you a faggot	1
1	1	Fuck dykes	1
2	2	@sizzurp__ @ILIKECATS74 @yoPapi_chulo @brandon...	1
3	3	"@jayswaggkillah: "@JacklynAnnn: @jayswaggkill...	1
4	4	@Zhugstubble You heard me bitch but any way I'...	1