In [ ]:
import pandas as pd
from pandas import DataFrame
import os
from nltk.corpus import stopwords
import re
import enchant
from nltk.stem.porter import *
import numpy as np
import cPickle as pickle
from collections import Counter
from keras.models import model_from_json
import math
import signal
import h5py
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution1D, MaxPooling1D, Convolution2D, MaxPooling2D
from keras.optimizers import SGD
from keras import backend as K
from scipy.sparse import csr_matrix
from sklearn.manifold import TSNE
In [ ]:
#feature extraction - TFIDF and unigrams
def vec(preprocessed_data_sample):
    from sklearn.feature_extraction.text import TfidfVectorizer
    # Initialize the TfidfVectorizer, scikit-learn's TF-IDF bag-of-words tool,
    # over word unigrams.
    vectorizer = TfidfVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                                 stop_words=None, vocabulary=None,
                                 ngram_range=(1, 1), strip_accents=None)
    # fit_transform() does two things: it fits the model and learns the
    # vocabulary, then transforms the training data into feature vectors.
    # The input should be a list of strings.
    train_data_features = vectorizer.fit_transform(preprocessed_data_sample)
    # Convert the sparse result to a dense numpy array for easier handling.
    train_data_features = train_data_features.toarray()
    return [train_data_features, vectorizer]
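In [ ]:
# Quick illustrative check of the TF-IDF helper above (a sketch on two toy
# sentences, not on the project data): the returned matrix has one row per
# document and one column per learned vocabulary term.
toy_docs = ["the cat sat on the mat", "the dog chased the cat"]
toy_features, toy_vectorizer = vec(toy_docs)
print toy_features.shape
print sorted(toy_vectorizer.vocabulary_.keys())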
In [ ]:
import codecs
import pylab as plot
K.set_image_dim_ordering('th')
X_train = []
Y_train = []
def handler(signum, frame):
print 'Ctrl+Z pressed'
assert False
signal.signal(signal.SIGTSTP, handler)
def readData(filename1, filename2):
cwd = os.getcwd()
req_attributes = ['tweet_id', 'topic', 'sentiment', 'tweet', 'user_id']#, 'followers_count', 'statuses_count', 'description', 'friends_count', 'location']
user_req_attributes = ['tweet_id', 'user_id']#, 'followers_count', 'statuses_count', 'description', 'friends_count', 'location']
tweet_req_attributes = ['tweet_id', 'topic', 'sentiment', 'tweet']
path = cwd + "/data/" + filename1;
tweet_df = pd.read_csv(path, sep='\t');
tweet_df = tweet_df.drop_duplicates(['tweet_id'])
tweet_df = tweet_df[tweet_req_attributes]
path = cwd + "/data/" + filename2;
user_df = pd.read_csv(path, sep='\t')
user_df = user_df.dropna(subset=['user_id'])
user_df = user_df[user_req_attributes]
data = tweet_df.merge(user_df, left_on="tweet_id", right_on="tweet_id", how="inner")
data = data.drop_duplicates(['tweet_id'])
data = data.dropna(subset=['user_id', 'tweet'])
data = data.dropna(subset=['user_id'])
print len(user_df), len(tweet_df), len(data)
print "From user data\n", Counter(list(data["user_id"])).most_common(50)
return data[req_attributes]
def tokenize_and_stopwords(data_sample):
print type(data_sample)
print len(data_sample)
#Get all english stopwords
try:
words = open("common_words.txt", "r").readlines()
for i in range(len(words)):
words[i] = words[i].strip()
except:
words = []
print "words", words
#abb_dict = pickle.load(open("abbreviations", "r"))
stop = stopwords.words('english') + words #list(string.punctuation) + ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
#Use only characters from reviews
data_sample = data_sample.str.replace("[^a-zA-Z ]", " ")#, " ")
data_sample = data_sample.str.lower()
return [(" ").join([i for i in sentence.split() if i not in stop]) for sentence in data_sample]
def cleanhtml(tweet):
cleanr = re.compile('<.*?>')
cleantext = re.sub(cleanr, '', tweet)
return cleantext
def cleanUrl(tweet):
tweet= re.sub(r"http\S+", "", tweet)
return tweet;
def removeMention(tweet):
tweet = tweet.replace("rt@","").rstrip()
tweet = tweet.replace("rt ","").rstrip()
tweet = tweet.replace("@","").rstrip()
return tweet;
def stemmer(preprocessed_data_sample):
    print "stemming "
    # Create a new Porter stemmer.
    stemmer = PorterStemmer()
    for i in range(len(preprocessed_data_sample)):
        try:
            # Stem each token and rejoin the sentence.
            preprocessed_data_sample[i] = " ".join([stemmer.stem(str(word)) for word in preprocessed_data_sample[i].split()])
        except:
            # If stemming fails (e.g. on non-ASCII tokens), keep the tokens unstemmed.
            preprocessed_data_sample[i] = " ".join([str(word) for word in preprocessed_data_sample[i].split()])
    return preprocessed_data_sample
#feature extraction - TFIDF and unigrams
def vectorize(preprocessed_data_sample):
    from sklearn.feature_extraction.text import TfidfVectorizer
    # Initialize the TfidfVectorizer, scikit-learn's TF-IDF bag-of-words tool,
    # over word unigrams.
    vectorizer = TfidfVectorizer(analyzer="word", tokenizer=None, preprocessor=None,
                                 stop_words=None, vocabulary=None,
                                 ngram_range=(1, 1), strip_accents=None)
    # fit_transform() does two things: it fits the model and learns the
    # vocabulary, then transforms the training data into feature vectors.
    # The input should be a list of strings.
    train_data_features = vectorizer.fit_transform(preprocessed_data_sample)
    # Convert the sparse result to a dense numpy array for easier handling.
    train_data_features = train_data_features.toarray()
    return [train_data_features, vectorizer]
def preprocess(filename1, filename2):
#filename = "Homework2_data.csv"
df = readData(filename1, filename2)
print "from joined data\n", Counter(list(df["user_id"])).most_common(50)
indices = []
# df['tweet'] = df['tweet'].apply(cleanhtml).apply(cleanUrl).apply(removeMention).apply(removeTrailingHash);
df['tweet'] = df['tweet'].apply(cleanhtml).apply(cleanUrl)#.apply(removeTrailingHash);
df['tweet'] = tokenize_and_stopwords(df['tweet'])
data = DataFrame(df.groupby('topic')['tweet'].apply(list)).reset_index()
for i in range(len(data)):
data['tweet'][i] = " ".join(data['tweet'][i])
topics = list(data["topic"])
# Watch out
#. topics = topics[0:10]
# Word topic mapping
try:
word_dict = pickle.load(open("word_dict", "r"))
except:
tweets = ""
for index, i in data.iterrows():
# for i in data['tweet']:
if i['topic'] in topics:
tweets += str(i['tweet'])
word_dict = {}
tweets = tweets.split()
for word in tweets:
word_dict[word] = []
for i in range(len(topics)):
if word in data["tweet"][i]:
word_dict[word].append(topics[i])
pickle.dump(word_dict, open("word_dict", "wb"))
print "the word 'election' is present in", (word_dict['register'])
print len(word_dict)
return df, topics
# Word model
In [ ]:
data = os.getcwd() + "/data/"
print data
In [ ]:
filename1 = "tweets.txt"#twitter-2016dev-CE-output.txt_semeval_tweets.txt"
print filename1
filename2 = "users.txt"#"twitter-2016dev-CE-output.txt_semeval_userinfo.txt"
print filename2
df, topics =preprocess(filename1, filename2)
In [ ]:
print df.shape
In [ ]:
from gensim.models import Word2Vec
all_text = df["tweet"].str.cat(sep=' ')
In [ ]:
import nltk
df['tokenized_sents'] = df.apply(lambda row: nltk.word_tokenize(row['tweet']), axis=1)
In [ ]:
print df.shape
In [ ]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
level=logging.INFO)
In [ ]:
import gensim.models.word2vec as wv
model = wv.Word2Vec(df["tokenized_sents"], size=100, window=5, min_count=5, workers=4)
In [ ]:
model.save("word2vec")
In [ ]:
model = Word2Vec.load("word2vec")
#model.similarity("this", "is")
model.init_sims(replace=True)
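In [ ]:
# Illustrative sanity check of the trained embeddings (a sketch; 'iphone' is
# one of the topic words and may or may not survive the min_count filter).
print len(model.wv.vocab)
if 'iphone' in model.wv.vocab:
    print model.wv.most_similar('iphone', topn=5)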
In [ ]:
def getEmbedding(sentence):
    # Concatenate the word2vec vectors of the in-vocabulary tokens, then
    # truncate or zero-pad the result to a fixed length of 5000.
    emb = np.array([])
    for word in sentence:
        if word in model.wv.vocab:
            emb = np.append(emb, model.wv[word])
    if emb.size > 5000:
        emb = emb[0:5000]
    pad = np.zeros(5000 - emb.size)
    emb = np.append(emb, pad)
    return emb
#getEmbedding(df['tokenized_sents'][0])
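In [ ]:
# Illustrative check: getEmbedding should always return a fixed-length
# 5000-dimensional vector, whether the tweet is short (zero-padded) or long
# (truncated).
print getEmbedding(df['tokenized_sents'].iloc[0]).shape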
In [ ]:
print df.size
df['embedding'] = df['tokenized_sents'].apply(getEmbedding)
In [ ]:
print df
In [294]:
genre = np.array(['tech', 'politics', 'music', 'sports'])
tech = np.array(['@microsoft', 'nokia', 'amazon', 'amazon prime', 'amazon prime day', 'apple', 'apple watch', 'ipad', 'iphone', 'ipod', 'oracle', 'ibm', 'nintendo', 'moto g', 'google', 'google +', 'ps4', 'netflix'])
politics = np.array(['angela merkel', 'bernie sanders', 'david cameron', 'donald trump', 'hillary', 'joe biden', 'michelle obama', 'obama', 'rahul gandhi', 'tony blair'])
music = np.array(['bee gees', 'beyonce', 'bob marley', 'chris brown', 'david bowie', 'katy perry', 'ed sheeran', 'foo fighters', 'janet jackson', 'lady gaga', 'michael jackson', 'ac/dc', 'the vamps', 'iron maiden', 'rolling stone', 'jay-z', 'snoop dogg', 'nirvana'])
sports = np.array(['arsenal', 'barca', 'federer', 'floyd mayweather', 'hulk hogan', 'john cena', 'kris bryant', 'randy orton', 'real madrid', 'serena', 'messi', 'david beckham', 'rousey', 'super eagles', 'kane', 'red sox', 'white sox'])
In [295]:
all_topics = np.concatenate((tech, politics, music, sports))
df_filter = df[df["topic"].isin(all_topics)]
topics_array = np.array(([tech, politics, music, sports]))
In [296]:
def getTopicId(topic):
return all_topics.tolist().index(topic)
df_filter['topic_id'] = df_filter['topic'].apply(getTopicId)
In [297]:
from sklearn import svm
#print df.columns
import pandas as pd
one_hot = pd.get_dummies(all_topics)
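In [ ]:
# Illustrative peek at the one-hot encoding: get_dummies over all_topics
# yields one column per topic, with a single 1 at that topic's position.
print one_hot.shape
print one_hot['iphone'].values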
In [298]:
# Append each tweet's topic one-hot encoding to its embedding. Mutating the
# row returned by iterrows() would only change a copy, so the concatenated
# vectors are collected and written back as a column.
vectors = []
for index, row in df_filter.iterrows():
    one_hot_encoding = one_hot[row['topic']]
    vectors.append(np.concatenate([row['embedding'], one_hot_encoding]))
df_filter['vector'] = vectors
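In [ ]:
# Illustrative check: each combined vector should now hold the 5000 embedding
# dimensions plus one extra slot per topic in all_topics.
print len(df_filter['vector'].iloc[0]), 5000 + len(all_topics)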
In [299]:
#list contains objects and you want to copy them as well, use generic copy.deepcopy():
import copy
#new_list = copy.deepcopy(old_list)
X = copy.deepcopy(np.vstack(df_filter['embedding'][0:5000]))
X_test = copy.deepcopy(np.vstack(df_filter['embedding'][5000:6357]))
Y = copy.deepcopy(df_filter['sentiment'][0:5000])
Y_test = copy.deepcopy(df_filter['sentiment'][5000:6357])
In [300]:
df
import collections
counter=collections.Counter(df['sentiment'])
print counter
counter2=collections.Counter(df_filter['sentiment'])
print counter2
In [ ]:
Y
In [ ]:
#print Y
print X.shape
print Y.shape
Y_train = Y
# Map sentiment scores onto three classes. Compute the masks first so that a
# value remapped by one step is not remapped again by the next.
neg, neu, pos = Y_train < 0, Y_train == 0, Y_train > 0
Y_train[neg] = 0
Y_train[neu] = 1
Y_train[pos] = 2
print Y_train, Y_test
#np.reshape(X,(20000, 5000) )
In [ ]:
#np.reshape(Y_train, (1,5000))
print X.shape
print Y.shape
clf = svm.SVC()
clf.fit(X,Y)
In [ ]:
Y_pred = clf.predict(X_test)
In [ ]:
from sklearn.metrics import accuracy_score
In [ ]:
accuracy_score(Y_test, Y_pred)
In [ ]:
Y_pred = np.array(Y_pred)
Y_pred[Y_pred < 0] = -1
Y_pred[Y_pred == 0] = 0
Y_pred[Y_pred > 0] = 1
In [ ]:
Y_pred.shape
In [ ]:
Y.shape
In [ ]:
Y_test[np.where(Y_test < 0)] = -1
Y_test[np.where(Y_test == 0)] = 0
Y_test[np.where(Y_test > 0)] = 1
In [307]:
def groupClasses(Y) :
Y[Y < 0] = -1
Y[Y == 0] = 0
Y[Y > 0] = 1
return Y
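In [ ]:
# Tiny illustration of groupClasses on a toy array (not project data):
# negative scores collapse to -1, zero stays 0, positive scores become 1,
# so the expected output is [-1 -1  0  1  1].
print groupClasses(np.array([-2, -1, 0, 1, 2]))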
In [309]:
counter2=collections.Counter(Y)
print counter2
counter2=collections.Counter(pred_train)
print counter2
counter2=collections.Counter(Y_pred)
print counter2
counter2=collections.Counter(Y_test)
print counter2
counter2=collections.Counter(pred)
print counter2
In [301]:
print "logistic Regression"
from sklearn.linear_model import LogisticRegression
logregr = LogisticRegression()
logregr.fit(X, Y)
pred = logregr.predict(X_test)
In [302]:
pred_train = logregr.predict(X)
In [306]:
print accuracy_score(Y, pred_train)
print accuracy_score(Y_test, pred)
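In [ ]:
# A finer-grained look than raw accuracy (a sketch using the logistic
# regression predictions above): per-class precision/recall and the confusion
# matrix, which matter here because the sentiment classes are imbalanced.
from sklearn.metrics import classification_report, confusion_matrix
print classification_report(Y_test, pred)
print confusion_matrix(Y_test, pred)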
In [308]:
Y = groupClasses(Y)
pred_train = groupClasses(pred_train)
In [ ]:
word2topic = pickle.load(open("word2topic", "r"))
In [ ]:
def getEmbeddingWord2Topic(sentence):
    # Same fixed-length scheme as getEmbedding, but built from the word2topic
    # vectors instead of the word2vec model.
    emb = np.array([])
    for word in sentence:
        if word in keys:
            emb = np.append(emb, word2topic[word])
    if emb.size > 5000:
        emb = emb[0:5000]
    pad = np.zeros(5000 - emb.size)
    emb = np.append(emb, pad)
    return emb
In [ ]:
keys = word2topic.keys()
In [293]:
df['embedding'] = df['tokenized_sents'].apply(getEmbeddingWord2Topic)
In [ ]: