In [ ]:
import pandas as pd
from pandas import DataFrame
import os
from nltk.corpus import stopwords
import re
import enchant
from nltk.stem.porter import *
import numpy as np
import cPickle as pickle
from collections import Counter
from keras.models import model_from_json
import math
import signal
import h5py
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution1D, MaxPooling1D, Convolution2D, MaxPooling2D
from keras.optimizers import SGD
from keras import backend as K
from scipy.sparse import csr_matrix
from sklearn.manifold import TSNE

In [ ]:
#feature extraction - TFIDF and unigrams
def vec(preprocessed_data_sample):
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
#    no_features = 1000#500#806#150#800#600#350

    #ngram_range=(1, 1)
#    input=u'content', encoding=u'utf-8', decode_error=u'strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer=u'word', stop_words=None, token_pattern=u'(?u)\b\w\w+\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<type 'numpy.int64'>, norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False

    vectorizer = TfidfVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, vocabulary = None, ngram_range=(1,1), strip_accents=None)#, max_features = no_features)#, ngram_range=(2,2))
    #vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, ngram_range=(2,2), max_features = no_features)
    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of
    # strings.
    train_data_features = vectorizer.fit_transform(preprocessed_data_sample)

    # Numpy arrays are easy to work with, so convert the result to an
    # array
    train_data_features = train_data_features.toarray()
    return [train_data_features, vectorizer, no_features]

In [ ]:
import pandas as pd
from pandas import DataFrame
import os
from nltk.corpus import stopwords
import re
import enchant
from nltk.stem.porter import *
import numpy as np
import cPickle as pickle
from collections import Counter
from keras.models import model_from_json
import math
import signal
import h5py
from scipy.sparse import csr_matrix
from sklearn.manifold import TSNE
import codecs
import pylab as plot
K.set_image_dim_ordering('th')


X_train = []
Y_train = []

def handler(signum, frame):
    print 'Ctrl+Z pressed'
    assert False
signal.signal(signal.SIGTSTP, handler)

# In[2]:

def readData(filename1, filename2):
    cwd = os.getcwd()
    req_attributes = ['tweet_id', 'topic', 'sentiment', 'tweet', 'user_id']#, 'followers_count', 'statuses_count', 'description', 'friends_count', 'location']
    user_req_attributes = ['tweet_id', 'user_id']#, 'followers_count', 'statuses_count', 'description', 'friends_count', 'location']
    tweet_req_attributes = ['tweet_id', 'topic', 'sentiment', 'tweet']
    path = cwd + "/data/" + filename1;
    tweet_df = pd.read_csv(path, sep='\t');
    
    tweet_df = tweet_df.drop_duplicates(['tweet_id'])
    tweet_df = tweet_df[tweet_req_attributes]

    path = cwd + "/data/" + filename2;
    user_df = pd.read_csv(path, sep='\t')
    user_df = user_df.dropna(subset=['user_id'])
    user_df = user_df[user_req_attributes]
    
    data = tweet_df.merge(user_df, left_on="tweet_id", right_on="tweet_id", how="inner")
    data = data.drop_duplicates(['tweet_id'])
    data = data.dropna(subset=['user_id', 'tweet'])
    data = data.dropna(subset=['user_id'])
    print len(user_df), len(tweet_df), len(data)
    
    print "From user data\n", Counter(list(data["user_id"])).most_common(50)
    return data[req_attributes]


def tokenize_and_stopwords(data_sample):

    print type(data_sample)
    print len(data_sample)
    #Get all english stopwords
    try:
        words = open("common_words.txt", "r").readlines()
        for i in range(len(words)):
            words[i] = words[i].strip()
    except: 
        words = []
    print "words", words
    #abb_dict = pickle.load(open("abbreviations", "r"))
    stop = stopwords.words('english') + words #list(string.punctuation) + ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    #Use only characters from reviews
    data_sample = data_sample.str.replace("[^a-zA-Z ]", " ")#, " ")
    data_sample = data_sample.str.lower()
                
    return [(" ").join([i for i in sentence.split() if i not in stop]) for sentence in data_sample]

def cleanhtml(tweet):
      cleanr = re.compile('<.*?>')
      cleantext = re.sub(cleanr, '', tweet)
      return cleantext

def cleanUrl(tweet):
    tweet= re.sub(r"http\S+", "",  tweet)
    return tweet;

def removeMention(tweet):
    tweet = tweet.replace("rt@","").rstrip()
    tweet = tweet.replace("rt ","").rstrip()
    tweet = tweet.replace("@","").rstrip()
    return tweet;

def stemmer(preprocessed_data_sample):
    print "stemming "
    #Create a new Porter stemmer.
    stemmer = PorterStemmer()
    #try:
    for i in range(len(preprocessed_data_sample)):
        #Stemming
        try:
            preprocessed_data_sample[i] = preprocessed_data_sample[i].replace(preprocessed_data_sample[i], " ".join([stemmer.stem(str(word)) for word in preprocessed_data_sample[i].split()]))
        except:
        #No stemming
            preprocessed_data_sample[i] = preprocessed_data_sample[i].replace(preprocessed_data_sample[i], " ".join([str(word) for word in preprocessed_data_sample[i].split()]))
    return preprocessed_data_sample

    
#feature extraction - TFIDF and unigrams
def vectorize(preprocessed_data_sample):
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool.
#    no_features = 1000#500#806#150#800#600#350

    #ngram_range=(1, 1)
#    input=u'content', encoding=u'utf-8', decode_error=u'strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer=u'word', stop_words=None, token_pattern=u'(?u)\b\w\w+\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<type 'numpy.int64'>, norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False

    vectorizer = TfidfVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, vocabulary = None, ngram_range=(1,1), strip_accents=None)#, max_features = no_features)#, ngram_range=(2,2))
    #vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, ngram_range=(2,2), max_features = no_features)
    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of
    # strings.
    train_data_features = vectorizer.fit_transform(preprocessed_data_sample)

    # Numpy arrays are easy to work with, so convert the result to an
    # array
    train_data_features = train_data_features.toarray()
    return [train_data_features, vectorizer, no_features]

    
def preprocess(filename1, filename2):
    #filename = "Homework2_data.csv"
    df = readData(filename1, filename2)
    print "from joined data\n", Counter(list(df["user_id"])).most_common(50)
    indices = []
#    df['tweet'] = df['tweet'].apply(cleanhtml).apply(cleanUrl).apply(removeMention).apply(removeTrailingHash);

    df['tweet'] = df['tweet'].apply(cleanhtml).apply(cleanUrl)#.apply(removeTrailingHash);
    df['tweet'] = tokenize_and_stopwords(df['tweet'])
    data = DataFrame(df.groupby('topic')['tweet'].apply(list)).reset_index()

    for i in range(len(data)):
        data['tweet'][i] = " ".join(data['tweet'][i])
    
    topics = list(data["topic"])
#    Watch out
#.   topics = topics[0:10]
#    Word topic mapping
    try:
        word_dict = pickle.load(open("word_dict", "r"))
    except:
        tweets = ""
        for index, i in data.iterrows():
#        for i in data['tweet']:
            if i['topic'] in topics:
                tweets += str(i['tweet'])
        
        word_dict = {}
        tweets = tweets.split()
        for word in tweets:
            word_dict[word] = []
            for i in range(len(topics)):
                if word in data["tweet"][i]:
                    word_dict[word].append(topics[i])        
        pickle.dump(word_dict, open("word_dict", "wb"))
    
    print "the word 'election' is present in", (word_dict['register'])
    print len(word_dict)
    return df, topics
#    Word model

In [ ]:
data = os.getcwd() + "/data/"
print data

In [ ]:
filename1 = "tweets.txt"#twitter-2016dev-CE-output.txt_semeval_tweets.txt"
print filename1
filename2 = "users.txt"#"twitter-2016dev-CE-output.txt_semeval_userinfo.txt"
print filename2
df, topics =preprocess(filename1, filename2)

In [ ]:
print df.shape

In [ ]:
from gensim.models import Word2Vec
all_text = df["tweet"].str.cat(sep=' ')

In [ ]:
import nltk
df['tokenized_sents'] = df.apply(lambda row: nltk.word_tokenize(row['tweet']), axis=1)

In [ ]:
print df.shape

In [ ]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

In [ ]:
import gensim.models.word2vec as wv
model = wv.Word2Vec(df["tokenized_sents"], size=100, window=5, min_count=5, workers=4)

In [ ]:
model.save("word2vec")

In [ ]:
model = Word2Vec.load("word2vec")
#model.similarity("this", "is")
model.init_sims(replace=True)

In [ ]:
def getEmbedding(sentence):
    list = np.array([])
    for word in sentence:
        if word in model.wv.vocab:
             list = np.append(list, model.wv[word])
    
    #print list.size
    if(list.size > 5000):
        list = list[0:5000]
    #print sentence
    pad = np.zeros(5000 - list.size)
    list = np.append(list, pad)
    #print list.shape
    return list
    
#getEmbedding(df['tokenized_sents'][0])

In [ ]:
print df.size
df['embedding'] = df['tokenized_sents'].apply(getEmbedding)

In [ ]:
print df

In [294]:
genre = np.array(['tech', 'politics', 'music', 'sports'])
tech = np.array(['@microsoft', 'nokia', 'amazon', 'amazon prime', 'amazon prime day', 'apple', 'apple watch', 'ipad', 'iphone', 'ipod', 'oracle', 'ibm', 'nintendo', 'moto g', 'google', 'google +', 'ps4', 'netflix'])                        

politics = np.array(['angela merkel',  'bernie sanders', 'david cameron',' donald trump', 'hillary', 'joe biden', 'michelle obama', 'obama', 'rahul gandhi', 'tony blair'])

music = np.array(['bee gees', 'beyonce', 'bob marley', 'chris brown', 'david bowie', 'katy perry',  'ed sheeran', 'foo fighters', 'janet jackson', 'lady gaga', 'michael jackson',  'ac/dc', 'the vamps', 'iron maiden', 'rolling stone', 'jay-z', 'snoop dogg', 'nirvana'])

sports = np.array(['arsenal', 'barca', 'federer', 'floyd mayweather', 'hulk hogan', 'john cena', 'kris bryant', 'randy orton', 'real madrid', 'serena', 'messi', 'david beckham', 'rousey', 'super eagles', 'kane', 'red sox', 'white sox'])

In [295]:
all_topics = np.concatenate((tech, politics, music, sports))
df_filter = df[df["topic"].isin(all_topics)]
topics_array = np.array(([tech, politics, music, sports]))
df_filter['topic_id'] = df_filter['topic'].apply(getTopicId)


/Users/shubhi/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [296]:
def getTopicId(topic):
    return all_topics.tolist().index(topic) 
df_filter['topic_id'] = df_filter['topic'].apply(getTopicId)


/Users/shubhi/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()

In [297]:
from sklearn import svm

#print df.columns
import pandas as pd 
one_hot = pd.get_dummies(all_topics)

In [298]:
df_filter['vector'] = df_filter['embedding'] # + one_hot[df['topic']].T
for index, row in df_filter.iterrows():
    one_hot_encoding  =  one_hot[row['topic']]
    row['vector'] = np.concatenate([row['vector'], one_hot_encoding])


/Users/shubhi/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [299]:
#list contains objects and you want to copy them as well, use generic copy.deepcopy():

import copy
#new_list = copy.deepcopy(old_list)
X =copy.deepcopy( np.vstack(df_filter['embedding'][0:5000]))
X_test = copy.deepcopy(np.vstack(df_filter['embedding'][5001:6357]))
Y = copy.deepcopy(df_filter['sentiment'][0:5000])
Y_test=copy.deepcopy(df_filter['sentiment'][5001:6357])

In [300]:
df
import collections
counter=collections.Counter(df['sentiment'])
print counter
counter2=collections.Counter(df_filter['sentiment'])
print counter2


Counter({'0': 10125, '1': 9576, '-1': 2483, '2': 692, '-2': 200})
Counter({'0': 2994, '1': 2808, '-1': 617, '2': 269, '-2': 69})

In [ ]:
Y

In [ ]:
#print Y
print X.shape
print Y.shape
Y_train = Y
Y_train[ Y_train < 0] = 0
Y_train[ Y_train == 0] = 1
Y_train[ Y_train > 0] = 2
print Y_train, Y_test
#np.reshape(X,(20000, 5000) )

In [ ]:
#np.reshape(Y_train, (1,5000))
print X.shape
print Y.shape
clf = svm.SVC()
clf.fit(X,Y)

In [ ]:
Y_pred_new = clf.predict(X_test)

In [ ]:
from sklearn.metrics import accuracy_score

In [ ]:
accuracy_score(Y_test, pred)

In [ ]:
Y_pred = np.array(Y_pred)
Y_pred[Y_pred < 0] = -1
Y_pred[Y_pred == 0] = 0
Y_pred[Y_pred > 0] = 1

In [ ]:
Y_pred.shape

In [ ]:
Y.shape

In [ ]:
Y_test[np.where(Y_test < 0)] = -1
Y_test[np.where(Y_test == 0)] = 0
Y_test[np.where(Y_test > 0)] = 1

In [307]:
def groupClasses(Y) :
    Y[Y < 0] = -1
    Y[Y == 0] = 0
    Y[Y > 0] = 1
    return Y

In [309]:
counter2=collections.Counter(Y)
print counter2
counter2=collections.Counter(pred_train)
print counter2
counter2=collections.Counter(Y_pred)
print counter2
counter2=collections.Counter(Y_test)
print counter2
counter2=collections.Counter(pred)
print counter2


Counter({1: 5000})
Counter({1: 5000})
Counter({2: 1356})
Counter({'1': 666, '0': 423, '-1': 153, '2': 89, '-2': 25})
Counter({'0': 733, '1': 557, '-1': 65, '2': 1})

In [301]:
print "logistic Regression"
from sklearn.linear_model import LogisticRegression
logregr = LogisticRegression()
logregr.fit(X, Y)
pred = logregr.predict(X_test)


logistic Regression

In [302]:
pred_train = logregr.predict(X)

In [306]:
print accuracy_score(Y, pred_train)
print accuracy_score(Y_test, pred)


0.6658
0.411504424779

In [308]:
Y = groupClasses(Y)
pred_train = groupClasses(pred_train)

In [ ]:
word2topic = pickle.load(open("word2topic", "r"))

In [ ]:
def getEmbeddingWord2Topic(sentence):
   
    list = np.array([])
    for word in sentence:
        if word in keys:
             list = np.append(list, word2topic[word])
    
    #print list.size
    if(list.size > 5000):
        list = list[0:5000]
    #print sentence
    pad = np.zeros(5000 - list.size)
    list = np.append(list, pad)
    #print list.shape
    return list

In [ ]:
keys = word2topic.keys()

In [293]:
df['embedding'] = df['tokenized_sents'].apply(getEmbeddingWord2Topic)

In [ ]: