In [27]:
import pandas as pd
import re
import math
from nltk.classify import NaiveBayesClassifier
import collections
import nltk

In [28]:
# Let's pull in the extracted ICE tweets
df = pd.read_csv('https://s3.amazonaws.com/d4d-public/public/ice_extract.csv')

In [41]:
# Let's take a look at it
df.head()


Out[41]:
| | id | user_followers | created | user_name | text | user_created | id_str | user_description | original_id | original_name | friends_count | retweet_count | hashtags | retweet | user_location |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 92298 | 2017-02-16 22:53:31.000000 | IndigndeVerdad | LIVE NYC #ICERaids https://t.co/PZnleLAkpv | 2013-03-21 13:02:04.000000 | 832362301672009728 | NaN | NaN | NaN | 90376 | 0 | ["ICERaids"] | N | Madrid |
| 1 | 2 | 18927 | 2017-02-16 22:53:44.000000 | MaketheRoadNY | Antonio Alarcon from MRNY introduces passionat... | 2010-05-07 17:35:33.000000 | 832362355581448192 | Building power of Latinx & working class commu... | NaN | NaN | 2882 | 0 | ["ICERaids", "freedaniel"] | N | New York City |
| 2 | 3 | 16479 | 2017-02-16 22:53:56.000000 | sickjew | RT @altochulo: ¡Trump escucha, estamos en la l... | 2009-08-13 16:41:02.000000 | 832362403845251072 | Health care is a human right. // backup accoun... | NaN | NaN | 12004 | 0 | ["FreeDaniel", "ICERaids"] | N | NaN |
| 3 | 4 | 16479 | 2017-02-16 22:54:06.000000 | sickjew | RT @altochulo: Getting packed here to say #Fre... | 2009-08-13 16:41:02.000000 | 832362447231127554 | Health care is a human right. // backup accoun... | NaN | NaN | 12004 | 0 | ["FreeDaniel", "ICERaids"] | N | NaN |
| 4 | 5 | 129 | 2017-02-16 22:54:08.000000 | Babar_392 | RT @MaketheRoadNY: Staten Island--where recent... | 2011-11-23 15:46:54.000000 | 832362454650912768 | NaN | NaN | NaN | 346 | 0 | ["ICERaids"] | N | Virginia, USA |

In [30]:
# OK, now that we have that, let's save the text column to a file. This gives us a quick
# way to look at the tweets and a file we can feed to the classifier below
df['text'].to_csv('training_data/test.txt', index=False)
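One thing to watch: to_csv applies CSV quoting, so any tweet containing a comma or quote gets wrapped in quotation marks in test.txt. If we ever need the raw text, a plain write loop avoids that (a sketch, not what this notebook uses):

# Alternative: write each tweet on its own line with no CSV quoting
with open('training_data/test.txt', 'w', encoding='utf-8') as f:
    for tweet in df['text'].dropna():
        f.write(tweet.replace('\n', ' ') + '\n')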

Now we're going to completely switch gears and build a model to determine whether a statement is positive or negative. We're going to use quotes from movie reviews, which is a bit of a stretch, but it's a place to start. This method is from Andy Bromberg's webpage. My goal is to build on it, but for now let's just try to get it working.


In [31]:
#I have two files, one with positive statements (from movie reviews) and one with negative ones. To run this,
#download the files from Andy Bromberg's GitHub page: https://github.com/abromberg/sentiment_analysis_python
positive_statements = 'C:/Users/HMGSYS/Google Drive/JupyterNotebooks/Data4Democracy/training_data/pos.txt'
negative_statements = 'C:/Users/HMGSYS/Google Drive/JupyterNotebooks/Data4Democracy/training_data/neg.txt'
#And we also have the file we just created with the ICERaids tweets
test_statements = 'C:/Users/HMGSYS/Google Drive/JupyterNotebooks/Data4Democracy/training_data/test.txt'

In [32]:
#feature extractor: map every word in a statement to True (i.e., use all words as features)
def make_full_dict(words):
    return {word: True for word in words}
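A quick look at what this produces for a short token list:

make_full_dict(['great', 'movie', '!'])
# {'great': True, 'movie': True, '!': True}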

In [33]:
# Let's open the files and create lists with all the words in them
posFeatures = []
negFeatures = []
mytestFeatures = []
with open(positive_statements, 'r') as posSentences:
    for i in posSentences:
        posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        posWords = [make_full_dict(posWords), 'pos']
        posFeatures.append(posWords)
with open(negative_statements, 'r') as negSentences:
    for i in negSentences:
        negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        negWords = [make_full_dict(negWords), 'neg']
        negFeatures.append(negWords)
# Now let's do the same with our test data
with open(test_statements, 'r') as mytestSentences:
    for i in mytestSentences:
        mytestWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
        # Label them all 'pos'; later, 'accuracy' on this set will just be the fraction the model calls positive
        mytestWords = [make_full_dict(mytestWords), 'pos']
        mytestFeatures.append(mytestWords)
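The three loops above are identical except for the file and the label, so they could be collapsed into one helper. A sketch of that cleanup (load_features is a name I'm introducing here, not something from Bromberg's code):

def load_features(path, label):
    # Read one statement per line and return [features, label] pairs
    features = []
    with open(path, 'r') as f:
        for line in f:
            words = re.findall(r"[\w']+|[.,!?;]", line.rstrip())
            features.append([make_full_dict(words), label])
    return features

# posFeatures = load_features(positive_statements, 'pos')
# negFeatures = load_features(negative_statements, 'neg')
# mytestFeatures = load_features(test_statements, 'pos')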

In [43]:
# Let's take a quick look at our result
posFeatures[0:2]


Out[43]:
[[{',': True,
   '.': True,
   '21st': True,
   'a': True,
   'and': True,
   'arnold': True,
   'be': True,
   "century's": True,
   'claud': True,
   'conan': True,
   'damme': True,
   'destined': True,
   'even': True,
   'going': True,
   'greater': True,
   "he's": True,
   'is': True,
   'jean': True,
   'make': True,
   'new': True,
   'or': True,
   'rock': True,
   'schwarzenegger': True,
   'segal': True,
   'splash': True,
   'steven': True,
   'than': True,
   'that': True,
   'the': True,
   'to': True,
   'van': True},
  'pos'],
 [{'.': True,
   'a': True,
   'adequately': True,
   'cannot': True,
   'co': True,
   'column': True,
   'continuation': True,
   'describe': True,
   'director': True,
   'earth': True,
   'elaborate': True,
   'expanded': True,
   'gorgeously': True,
   'huge': True,
   'is': True,
   'j': True,
   "jackson's": True,
   'lord': True,
   'middle': True,
   'of': True,
   'peter': True,
   'r': True,
   'rings': True,
   'so': True,
   'that': True,
   'the': True,
   "tolkien's": True,
   'trilogy': True,
   'vision': True,
   'words': True,
   'writer': True},
  'pos']]

Now we have two big lists, posFeatures and negFeatures. Each is a list of [dictionary, label] pairs, one per review: the dictionary maps every word in that review to True, and the label is 'pos' or 'neg' depending on which corpus the review came from.


In [35]:
#selects 3/4 of the labeled features to be used for training and 1/4 to be used for testing
posCutoff = int(math.floor(len(posFeatures)*3/4))
negCutoff = int(math.floor(len(negFeatures)*3/4))
#The naming is a bit tricky because we have testFeatures and mytestFeatures: testFeatures is the
#held-out movie-review data from the Bromberg model; mytestFeatures is our ICERaids tweets run
#through the same process. We score all of the tweets, so they get no cutoff
trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
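One caveat: the split takes the first three quarters of each file in order. If the files had any systematic ordering, that would bias the held-out set; shuffling before taking the cutoffs would guard against it (a sketch, not something the original recipe does):

import random

random.seed(42)  # fixed seed so the split stays reproducible
random.shuffle(posFeatures)
random.shuffle(negFeatures)
# ...then recompute the cutoffs and slices above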

In [36]:
# We'll start with a Naive Bayes Classifier. There's a lot more we could do here but it's a start
classifier = NaiveBayesClassifier.train(trainFeatures)

#initialize referenceSets and testSets
referenceSets = collections.defaultdict(set)
testSets = collections.defaultdict(set)
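Before scoring anything in bulk, it's worth poking at the classifier with a single made-up sentence (purely illustrative; the sentence is not from either corpus):

# Tokenize the same way as the training data, then classify
sentence = "an engrossing and refreshing portrait"
words = re.findall(r"[\w']+|[.,!?;]", sentence)
print(classifier.classify(make_full_dict(words)))                   # likely 'pos'
print(classifier.prob_classify(make_full_dict(words)).prob('pos'))  # the model's confidence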

In [37]:
# put the true labels in referenceSets and the predicted labels in testSets
for i, (features, label) in enumerate(testFeatures):
    referenceSets[label].add(i)
    predicted = classifier.classify(features)
    testSets[predicted].add(i)

In [38]:
#print metrics to show how well the classifier did on the held-out movie reviews
print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
print('accuracy:', nltk.classify.util.accuracy(classifier, testFeatures))
print('pos precision:', nltk.metrics.precision(referenceSets['pos'], testSets['pos']))
print('pos recall:', nltk.metrics.recall(referenceSets['pos'], testSets['pos']))
print('neg precision:', nltk.metrics.precision(referenceSets['neg'], testSets['neg']))
print('neg recall:', nltk.metrics.recall(referenceSets['neg'], testSets['neg']))


train on 7997 instances, test on 2666 instances
accuracy: 0.7730682670667667
pos precision: 0.7875197472353871
pos recall: 0.7479369842460615
neg precision: 0.76
neg recall: 0.7981995498874719
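
nltk.metrics also provides an F-measure over the same reference/test sets, if we want a single score per class:

print('pos F-measure:', nltk.metrics.f_measure(referenceSets['pos'], testSets['pos']))
print('neg F-measure:', nltk.metrics.f_measure(referenceSets['neg'], testSets['neg']))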

Now we have a Naive Bayes classifier that looks at the words in a movie review and predicts whether the review is positive or negative. As it stands, the accuracy is only 77%, which means we're on the right path (better than the 50% we'd get by guessing) but it's not very impressive. Still, we have a bunch of words that correlate with a positive or negative review. Let's take a look at some of the most predictive words and see what we've got.


In [39]:
classifier.show_most_informative_features(10)


Most Informative Features
              engrossing = True              pos : neg    =     17.0 : 1.0
                   quiet = True              pos : neg    =     15.7 : 1.0
                mediocre = True              neg : pos    =     13.7 : 1.0
               absorbing = True              pos : neg    =     13.0 : 1.0
                portrait = True              pos : neg    =     12.4 : 1.0
                   flaws = True              pos : neg    =     12.3 : 1.0
               inventive = True              pos : neg    =     12.3 : 1.0
              refreshing = True              pos : neg    =     12.3 : 1.0
                 triumph = True              pos : neg    =     11.7 : 1.0
            refreshingly = True              pos : neg    =     11.7 : 1.0

Those words aren't great for determining someone's thoughts about ICE raids. "Engrossing" is definitely a movie word. It's also interesting that "flaws" is such a positive word.
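If we ever wanted these words programmatically, say to filter obvious movie vocabulary out of the feature set before scoring tweets, the classifier exposes them as a list:

# most_informative_features returns (feature_name, feature_value) pairs, strongest first
movie_words = [word for word, value in classifier.most_informative_features(50)]
print(movie_words[:10])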

OK, now for the moment of truth: let's see what percentage of tweets about the ICE raids this model classifies as positive. Because we gave every tweet a placeholder 'pos' label earlier, the "accuracy" on this set is simply the fraction of tweets the model labels positive. And remember, the model only got 77% right on movie reviews, so this number could be wildly inaccurate.
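To make that explicit, the same number can be computed directly, without the placeholder labels (an equivalent sketch):

# Ignore the placeholder labels and count 'pos' predictions ourselves
predictions = [classifier.classify(features) for features, _ in mytestFeatures]
print('{:.1%}'.format(predictions.count('pos') / len(predictions)))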


In [40]:
print('This model predicts that {:.1%} of tweets about the ICE raids have been positive'
      .format(nltk.classify.util.accuracy(classifier, mytestFeatures)))


This model predicts that 67.6% of tweets about the ICE raids have been positive

Well, we got significantly over 50%. Can we conclude that the majority of tweets about the raids have been positive? Perhaps, though given the movie-review training data that's a shaky conclusion. We could also check it every month or so to see how this number changes over time.

