In [2]:
%load_ext Cython
In [3]:
### Prepare training tweets labeled by sentiment (run once; kept commented out for provenance)
#df = pd.read_csv("Data/SocialMediaData/training.1600000.processed.noemoticon.csv",header=None)
#df = df.rename(columns = {0:'motion',5:"tweet"})
#df = df[df["motion"]==4]["tweet"]
# save as txt
#df.to_csv(r'positive.txt', header=None, index=None, sep=' ', mode='a')
#df = df[df["motion"]==0]["tweet"]
# save as txt
#df.to_csv(r'negative.txt', header=None, index=None, sep=' ', mode='a')
Training tweets were downloaded from http://help.sentiment140.com/for-students/
In [6]:
%%time
%%cython
from gensim.models import doc2vec
import nltk
import re
## positive model
def split_sentence(sentence):
    """Split a tweet into word tokens on any run of non-word characters.

    Note: leading/trailing punctuation yields empty-string tokens at the
    edges of the result (standard re.split behavior).
    """
    # Raw string: '\W' is an invalid string escape and raises a
    # DeprecationWarning (eventually a SyntaxError) without the r-prefix.
    return re.split(r'\W+', sentence)
class MyDocs(object):
    """Iterable corpus of labeled tweets for gensim doc2vec training.

    Streams one tweet per line from `path`, re-tokenizing on each pass so
    the corpus can be iterated multiple times (doc2vec makes several passes).
    """

    def __init__(self, path='train_set/positive.txt'):
        # Default preserves the originally hard-coded positive-tweet file,
        # so existing MyDocs() call sites behave the same.
        self.path = path

    def __iter__(self):
        for i, text in enumerate(open(self.path)):  # doesn't use brand name
            # Line number doubles as the document tag.
            # NOTE(review): LabeledSentence is deprecated/removed in newer
            # gensim (TaggedDocument is the replacement) — confirm version.
            yield doc2vec.LabeledSentence(words=split_sentence(text), tags=['%s' % i])
# Train the doc2vec model on the positive-tweet corpus.
# NOTE(review): untyped `cdef` is Cython syntax (this is a %%cython cell);
# it adds nothing over a plain Python assignment here.
cdef pos = MyDocs()
# 200-dim vectors, context window 8, ignore words seen <5 times, 4 workers.
# NOTE(review): `size` was renamed `vector_size` in gensim 4 — confirm version.
model = doc2vec.Doc2Vec(pos, size = 200, window = 8, min_count = 5, workers = 4)
model.save('positive_tweets.model')# save the model
#model = doc2vec.Doc2Vec.load('cab_tweets.model')
## negative model
# NOTE(review): redundant redefinition — identical to the split_sentence
# defined for the positive model earlier in this cell.
def split_sentence(sentence):
    """Split a tweet into word tokens on any run of non-word characters."""
    # Raw string: avoids the invalid '\W' escape DeprecationWarning.
    return re.split(r'\W+', sentence)
# NOTE(review): this shadows the earlier MyDocs; only the file path differed,
# so the path is now a parameter (defaulting to the negative-tweet file).
class MyDocs(object):
    """Iterable corpus of labeled tweets for gensim doc2vec training.

    Streams one tweet per line from `path`, re-tokenizing on each pass so
    the corpus can be iterated multiple times (doc2vec makes several passes).
    """

    def __init__(self, path='train_set/negative.txt'):
        # Default preserves the originally hard-coded negative-tweet file,
        # so existing MyDocs() call sites behave the same.
        self.path = path

    def __iter__(self):
        for i, text in enumerate(open(self.path)):  # doesn't use brand name
            # Line number doubles as the document tag.
            # NOTE(review): LabeledSentence is deprecated/removed in newer
            # gensim (TaggedDocument is the replacement) — confirm version.
            yield doc2vec.LabeledSentence(words=split_sentence(text), tags=['%s' % i])
# Train the doc2vec model on the negative-tweet corpus.
# Reuses the name `model` — the positive model was already saved above.
cdef neg = MyDocs()
# Same hyperparameters as the positive model: 200-dim vectors, window 8,
# min_count 5, 4 worker threads.
model = doc2vec.Doc2Vec(neg, size = 200, window = 8, min_count = 5, workers = 4)
model.save('negative_tweets.model')# save the model
In [13]:
# Load the two trained sentiment models back into the main kernel.
# NOTE(review): relies on `doc2vec` being visible here from the %%cython
# cell's import — confirm this still works on Restart & Run All.
model1 = doc2vec.Doc2Vec.load('positive_tweets.model')
model2 = doc2vec.Doc2Vec.load('negative_tweets.model')
In [14]:
# Infer a 200-dim vector for each cab tweet under the positive model.
import numpy as np
# 17800 rows: presumably the line count of cab_tweets.txt — TODO confirm;
# if the file is shorter, the trailing rows remain all zeros.
cab_pos = np.zeros((17800, 200))
for i,text in enumerate(open("train_set/cab_tweets.txt")): # input search terms
    cab_pos[i]=model1.infer_vector(split_sentence(text))
# Bare expression: display the array as this cell's output.
cab_pos
Out[14]:
In [15]:
# Infer a 200-dim vector for each cab tweet under the negative model.
# Same 17800-row assumption as cab_pos — TODO confirm against the file.
cab_neg = np.zeros((17800, 200))
for i,text in enumerate(open("train_set/cab_tweets.txt")): # input search terms
    cab_neg[i]=model2.infer_vector(split_sentence(text))
# Bare expression: display the array as this cell's output.
cab_neg
Out[15]:
In [17]:
# Infer a 200-dim vector for each DKS tweet under the positive model.
import numpy as np  # redundant re-import (already imported above); harmless
# 26480 rows: presumably the line count of dks_tweets.txt — TODO confirm.
dks_pos = np.zeros((26480, 200))
for i,text in enumerate(open("train_set/dks_tweets.txt")): # input search terms
    dks_pos[i]=model1.infer_vector(split_sentence(text))
# Bare expression: display the array as this cell's output.
dks_pos
Out[17]:
In [16]:
# Infer a 200-dim vector for each DKS tweet under the NEGATIVE model.
import numpy as np  # redundant re-import (already imported above); harmless
# 26480 rows: presumably the line count of dks_tweets.txt — TODO confirm.
dks_neg = np.zeros((26480, 200))
for i, text in enumerate(open("train_set/dks_tweets.txt")):  # input search terms
    # BUG FIX: the original called model1 (the positive model) here, making
    # dks_neg a duplicate of dks_pos — the downstream cosine similarity then
    # compared the positive vectors with themselves. Use model2 (negative).
    dks_neg[i] = model2.infer_vector(split_sentence(text))
# Bare expression: display the array as this cell's output.
dks_neg
Out[16]:
In [18]:
## Calculate the cosine similarity between the two vectors to generate the third feature
# Compute the cosine similarity values between the input text and all archived reviews
# cossims_with_input = map(lambda v: cossim(input_vec, v), model.docvecs)
# Need to change the code to work row-wise: compute the cosine similarity between the vectors in the same row
# Calculate the cosine similarity between two vectors
def cossim(v1, v2):
    """Cosine similarity of two 1-D vectors: dot(v1, v2) / (|v1| * |v2|)."""
    numerator = np.dot(v1, v2)
    denominator = np.sqrt(np.dot(v1, v1)) * np.sqrt(np.dot(v2, v2))
    return numerator / denominator
# Row-wise cosine similarity between the positive- and negative-model
# vectors for the cab tweets; one value per tweet, saved as a CSV column.
cossims_cab = np.zeros((17800, 1))
for row in range(17800):
    cossims_cab[row, 0] = cossim(cab_pos[row], cab_neg[row])
np.savetxt("cab_sentiment.csv", cossims_cab, delimiter=",")
In [19]:
# Row-wise cosine similarity between the positive- and negative-model
# vectors for the DKS tweets; one value per tweet, saved as a CSV column.
cossims_dks = np.zeros((26480, 1))
for row in range(26480):
    cossims_dks[row, 0] = cossim(dks_pos[row], dks_neg[row])
np.savetxt("dks_sentiment.csv", cossims_dks, delimiter=",")
In [ ]: