In [2]:
%load_ext Cython

Twitter Sentiment

Amy Ma

In [3]:
### Prepare for training tweets with mood
#df = pd.read_csv("Data/SocialMediaData/training.1600000.processed.noemoticon.csv",header=None)
#df = df.rename(columns = {0:'motion',5:"tweet"})
#df = df[df["motion"]==4]["tweet"]
# save as txt
#df.to_csv(r'positive.txt', header=None, index=None, sep=' ', mode='a')
#df = df[df["motion"]==0]["tweet"]
# save as txt
#df.to_csv(r'negative.txt', header=None, index=None, sep=' ', mode='a')

Training models

Training tweets download from http://help.sentiment140.com/for-students/


In [6]:
%%time
%%cython
from gensim.models import doc2vec
import nltk
import re

## postive model
def split_sentence (sentence):
    return re.split('\W+',sentence)
class MyDocs(object):
    def __iter__(self):
        for i, text in enumerate(open('train_set/positive.txt')): # doesn't use brand name
            yield doc2vec.LabeledSentence(words=split_sentence(text), tags=['%s' % i])
            # Train the doc2vec model
cdef pos = MyDocs()
model = doc2vec.Doc2Vec(pos, size = 200, window = 8, min_count = 5, workers = 4)
model.save('positive_tweets.model')# save the model
#model = doc2vec.Doc2Vec.load('cab_tweets.model')

## negative model
def split_sentence (sentence):
    return re.split('\W+',sentence)
class MyDocs(object):
    def __iter__(self):
        for i, text in enumerate(open('train_set/negative.txt')): # doesn't use brand name
            yield doc2vec.LabeledSentence(words=split_sentence(text), tags=['%s' % i])
            # Train the doc2vec model
cdef neg = MyDocs()
model = doc2vec.Doc2Vec(neg, size = 200, window = 8, min_count = 5, workers = 4)
model.save('negative_tweets.model')# save the model


CPU times: user 512 µs, sys: 325 µs, total: 837 µs
Wall time: 571 µs

Bulid models

1. CAB


In [13]:
# load models
model1 = doc2vec.Doc2Vec.load('positive_tweets.model')
model2 = doc2vec.Doc2Vec.load('negative_tweets.model')

In [14]:
# Calculate the vector of search term according to our model
import numpy as np
cab_pos = np.zeros((17800, 200))
for i,text in enumerate(open("train_set/cab_tweets.txt")): # input search terms 
    cab_pos[i]=model1.infer_vector(split_sentence(text))   
cab_pos


Out[14]:
array([[ 0.0173258 , -0.00898504, -0.00372691, ..., -0.04584163,
         0.04676172, -0.00129063],
       [-0.00223486,  0.02191991,  0.03007294, ...,  0.021487  ,
        -0.00660933, -0.0134338 ],
       [-0.01097532, -0.01810212, -0.02172616, ...,  0.00685231,
         0.01789958,  0.00049202],
       ..., 
       [ 0.05336217,  0.00263875, -0.00278048, ...,  0.01692542,
         0.03695101,  0.02515182],
       [ 0.02517324,  0.03924625, -0.02224799, ..., -0.01132001,
        -0.00593403,  0.04400497],
       [-0.0551214 , -0.02106871, -0.02813808, ...,  0.00797126,
         0.05846385, -0.04289546]])

In [15]:
cab_neg = np.zeros((17800, 200))
for i,text in enumerate(open("train_set/cab_tweets.txt")): # input search terms 
    cab_neg[i]=model2.infer_vector(split_sentence(text))   
cab_neg


Out[15]:
array([[-0.01521745,  0.02379482,  0.00525517, ...,  0.0085166 ,
         0.04483327,  0.00420388],
       [ 0.04234859, -0.07045103, -0.00678941, ..., -0.00261229,
        -0.00978813,  0.00431221],
       [ 0.02446519, -0.06751232, -0.02617534, ..., -0.02577785,
        -0.03009019,  0.00126595],
       ..., 
       [-0.04035848,  0.00359454,  0.0385383 , ...,  0.02497121,
        -0.00496958,  0.01990214],
       [-0.00276907,  0.01855428, -0.006     , ...,  0.01592065,
        -0.00485722,  0.00648402],
       [-0.02652839,  0.03481127, -0.01361123, ..., -0.05516719,
        -0.05775858, -0.04353976]])

2.DKS


In [17]:
import numpy as np
dks_pos = np.zeros((26480, 200))
for i,text in enumerate(open("train_set/dks_tweets.txt")): # input search terms 
    dks_pos[i]=model1.infer_vector(split_sentence(text))   
dks_pos


Out[17]:
array([[-0.03769807, -0.05903878,  0.03301163, ..., -0.04514879,
         0.01143621, -0.0092718 ],
       [ 0.03541106,  0.06419629,  0.02617208, ..., -0.03641075,
        -0.03344924,  0.06227129],
       [ 0.05381933,  0.06853318,  0.0335772 , ..., -0.03769714,
        -0.0425928 ,  0.0616048 ],
       ..., 
       [-0.05196267,  0.0226444 ,  0.02209938, ..., -0.02884795,
         0.01701822,  0.01468133],
       [-0.00889524,  0.00961463, -0.0320751 , ...,  0.02887218,
         0.01376016,  0.00777442],
       [-0.02629085,  0.03822378,  0.04691299, ..., -0.03198734,
        -0.01330235,  0.00199453]])

In [16]:
import numpy as np
dks_neg = np.zeros((26480, 200))
for i,text in enumerate(open("train_set/dks_tweets.txt")): # input search terms 
    dks_neg[i]=model1.infer_vector(split_sentence(text))   
dks_neg


Out[16]:
array([[-0.04076815,  0.01920363,  0.03726069, ...,  0.00924385,
         0.06227892,  0.0344903 ],
       [ 0.05956909,  0.04329693,  0.01750396, ..., -0.03252006,
        -0.03920196,  0.08062712],
       [ 0.03949922,  0.11058771,  0.01037455, ..., -0.05932995,
        -0.01978178,  0.05335316],
       ..., 
       [-0.03477366,  0.01279694,  0.00508458, ..., -0.00918794,
        -0.00195322, -0.02989849],
       [-0.0089085 ,  0.01510515, -0.02993284, ...,  0.03266199,
         0.02061392,  0.00195406],
       [-0.03854375,  0.03387105,  0.04029921, ..., -0.02916235,
        -0.00458328, -0.00497512]])

Calculate the cosin distance to check the change of moods


In [18]:
## calculate the cosin distance between 2 vectors to generate the 3rd features
# Compute the cosine similarity values between the input text and all archived reviews
# cossims_with_input = map(lambda v: cossim(input_vec, v), model.docvecs) 
# need to chaneg the code into rows: calculate the cosin distance between the vectors in same rows

# Calculate the cosine similarity between two vecotrs 
def cossim(v1, v2):
    return np.dot(v1, v2) / np.sqrt(np.dot(v1, v1)) / np.sqrt(np.dot(v2, v2))

cossims_cab=np.zeros((17800,1))
for i in range(17800):
    cossims_cab[i] = cossim(cab_pos[i], cab_neg[i])
np.savetxt("cab_sentiment.csv",cossims_cab, delimiter=",")

In [19]:
cossims_dks=np.zeros((26480,1))
for i in range(26480):
    cossims_dks[i] = cossim(dks_pos[i], dks_neg[i])
np.savetxt("dks_sentiment.csv",cossims_dks, delimiter=",")

In [ ]: