%load_ext Cython

Twitter Sentiment

### Prepare the training tweets labelled with mood
#df = pd.read_csv("Data/SocialMediaData/training.1600000.processed.noemoticon.csv",header=None)
#df = df.rename(columns = {0:'motion',5:"tweet"})
#df = df[df["motion"]==4]["tweet"]
# save as txt
#df.to_csv(r'positive.txt', header=None, index=None, sep=' ', mode='a')
#df = df[df["motion"]==0]["tweet"]
# save as txt
#df.to_csv(r'negative.txt', header=None, index=None, sep=' ', mode='a')

Training models

Training tweets were downloaded as `training.1600000.processed.noemoticon.csv` (the source link is missing from this note).

from gensim.models import doc2vec
import nltk
import re

## positive model
def split_sentence(sentence):
    r"""Split *sentence* into tokens on runs of non-word characters.

    Args:
        sentence: The text to tokenise.

    Returns:
        A list of strings produced by ``re.split(r'\W+', sentence)``.
        When the text starts or ends with a non-word character (for
        example the trailing newline of a line read from a file) the
        list contains an empty string at that end.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    return re.split(r'\W+', sentence)
class MyDocs(object):
    """Iterable over the positive training tweets as tagged documents.

    Every call to ``__iter__`` reopens ``train_set/positive.txt``, so the
    same instance can be iterated more than once. Each line is tokenised
    with ``split_sentence`` and wrapped in a ``doc2vec.LabeledSentence``
    whose single tag is the zero-based line number as a string.
    """

    def __iter__(self):
        # The context manager closes the file once the generator is
        # exhausted or discarded; the original left the handle open.
        with open('train_set/positive.txt') as corpus:  # doesn't use brand name
            for i, text in enumerate(corpus):
                yield doc2vec.LabeledSentence(words=split_sentence(text), tags=[str(i)])
            # Train the doc2vec model
# Plain Python assignment: `cdef` is Cython-only syntax and is a
# SyntaxError in a regular Python cell.
pos = MyDocs()
model = doc2vec.Doc2Vec(pos, size = 200, window = 8, min_count = 5, workers = 4)
# NOTE(review): in the source the save call was fused onto the line above
# (only "'positive_tweets.model')" survived); restored as model.save — confirm.
model.save('positive_tweets.model')# save the model
#model = doc2vec.Doc2Vec.load('cab_tweets.model')

## negative model
def split_sentence(sentence):
    r"""Split *sentence* into tokens on runs of non-word characters.

    Args:
        sentence: The text to tokenise.

    Returns:
        A list of strings produced by ``re.split(r'\W+', sentence)``.
        When the text starts or ends with a non-word character (for
        example the trailing newline of a line read from a file) the
        list contains an empty string at that end.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    return re.split(r'\W+', sentence)
class MyDocs(object):
    """Iterable over the negative training tweets as tagged documents.

    Every call to ``__iter__`` reopens ``train_set/negative.txt``, so the
    same instance can be iterated more than once. Each line is tokenised
    with ``split_sentence`` and wrapped in a ``doc2vec.LabeledSentence``
    whose single tag is the zero-based line number as a string.
    """

    def __iter__(self):
        # The context manager closes the file once the generator is
        # exhausted or discarded; the original left the handle open.
        with open('train_set/negative.txt') as corpus:  # doesn't use brand name
            for i, text in enumerate(corpus):
                yield doc2vec.LabeledSentence(words=split_sentence(text), tags=[str(i)])
            # Train the doc2vec model
# Plain Python assignment: `cdef` is Cython-only syntax and is a
# SyntaxError in a regular Python cell.
neg = MyDocs()
model = doc2vec.Doc2Vec(neg, size = 200, window = 8, min_count = 5, workers = 4)
# NOTE(review): in the source the save call was fused onto the line above
# (only "'negative_tweets.model')" survived); restored as model.save — confirm.
model.save('negative_tweets.model')# save the model

Build models

1. CAB

# Load the two Doc2Vec models from disk by file name:
# model1 <- 'positive_tweets.model', model2 <- 'negative_tweets.model'.
model1 = doc2vec.Doc2Vec.load('positive_tweets.model')
model2 = doc2vec.Doc2Vec.load('negative_tweets.model')

# Calculate the vector of search term according to our model
import numpy as np

# NOTE(review): both loop bodies were missing from the source (the `for`
# lines had nothing under them). They are reconstructed here as
# `infer_vector` calls on the tokenised tweet — confirm against the
# original notebook.
# Row i holds the vector for line i of the input file; 200 columns matches
# the `size = 200` used when the models were trained. The 17800 row count
# is presumably the number of lines in cab_tweets.txt — a longer file
# would raise IndexError.
cab_pos = np.zeros((17800, 200))
with open("train_set/cab_tweets.txt") as tweets:  # input search terms
    for i, text in enumerate(tweets):
        cab_pos[i] = model1.infer_vector(split_sentence(text))

cab_neg = np.zeros((17800, 200))
with open("train_set/cab_tweets.txt") as tweets:  # input search terms
    for i, text in enumerate(tweets):
        cab_neg[i] = model2.infer_vector(split_sentence(text))

import numpy as np

# NOTE(review): both loop bodies were missing from the source (the `for`
# lines had nothing under them). They are reconstructed here as
# `infer_vector` calls on the tokenised tweet — confirm against the
# original notebook.
# Row i holds the vector for line i of the input file; 200 columns matches
# the `size = 200` used when the models were trained. The 26480 row count
# is presumably the number of lines in dks_tweets.txt — a longer file
# would raise IndexError.
dks_pos = np.zeros((26480, 200))
with open("train_set/dks_tweets.txt") as tweets:  # input search terms
    for i, text in enumerate(tweets):
        dks_pos[i] = model1.infer_vector(split_sentence(text))

dks_neg = np.zeros((26480, 200))
with open("train_set/dks_tweets.txt") as tweets:  # input search terms
    for i, text in enumerate(tweets):
        dks_neg[i] = model2.infer_vector(split_sentence(text))

Calculate the cosine distance to check the change of moods

## Calculate the cosine distance between two vectors to generate the third feature
# Compute the cosine similarity values between the input text and all archived reviews
# cossims_with_input = map(lambda v: cossim(input_vec, v), model.docvecs) 
# Need to change the code to work row-wise: calculate the cosine distance between the vectors in the same rows

# Calculate the cosine similarity between two vectors 
def cossim(v1, v2):
    """Return the cosine similarity of two 1-D vectors.

    Computed as ``dot(v1, v2) / sqrt(dot(v1, v1)) / sqrt(dot(v2, v2))``.

    Args:
        v1: First vector (anything ``np.dot`` accepts).
        v2: Second vector, same length as *v1*.

    Returns:
        The cosine of the angle between *v1* and *v2*. If either vector
        has zero norm the division is 0/0 and NumPy yields ``nan``.
    """
    # The `np.dot(...)` calls were garbled in the source
    # ("return, v2) / np.sqrt(, v1)) / np.sqrt(, v2))"); restored here.
    return np.dot(v1, v2) / np.sqrt(np.dot(v1, v1)) / np.sqrt(np.dot(v2, v2))

# Row-wise cosine similarity between the positive-model and negative-model
# vectors of each CAB tweet, written out as a one-column CSV.
# The result array is allocated here because `cossims_cab` was otherwise
# never defined before being indexed (NameError); its length follows the
# input arrays instead of a hard-coded 17800.
cossims_cab = np.zeros(len(cab_pos))
for i, (pos_vec, neg_vec) in enumerate(zip(cab_pos, cab_neg)):
    cossims_cab[i] = cossim(pos_vec, neg_vec)
np.savetxt("cab_sentiment.csv",cossims_cab, delimiter=",")

# Row-wise cosine similarity between the positive-model and negative-model
# vectors of each DKS tweet, written out as a one-column CSV.
# The result array is allocated here because `cossims_dks` was otherwise
# never defined before being indexed (NameError); its length follows the
# input arrays instead of a hard-coded 26480.
cossims_dks = np.zeros(len(dks_pos))
for i, (pos_vec, neg_vec) in enumerate(zip(dks_pos, dks_neg)):
    cossims_dks[i] = cossim(pos_vec, neg_vec)
np.savetxt("dks_sentiment.csv",cossims_dks, delimiter=",")

