In [7]:
# -*- coding: utf-8 -*-
from pymongo import MongoClient
import numpy as np
import gensim, codecs, os, logging, re
from gensim import models, corpora

# MongoDB connection settings
hostname = "localhost"
port = 27017
database_name = "twitter-data"
collection_name = "ns"

# Output directories for the feature matrix and embedding logs
DATA_DIR = '/home/nipg1/Documents/summer_project/data/' + collection_name
LOG_DIR = os.path.join(DATA_DIR, 'embedding_logs')
In [8]:
# Pre-trained GoogleNews vectors; limit=100000 keeps only the 100k most frequent words to save memory
#word2vec = gensim.models.Word2Vec.load(os.path.join(DATA_DIR, 'word2vec_model'))
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    '/home/nipg1/Documents/summer_project/data/models/GoogleNews-vectors-negative300.bin',
    binary=True, limit=100000)
# Dictionary and TF-IDF model built over the tweet corpus
dictionary = corpora.Dictionary.load(os.path.join(DATA_DIR, 'words.dict'))
tfidf = models.tfidfmodel.TfidfModel.load(os.path.join(DATA_DIR, 'tfidf_model'))
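A quick sanity check that the three models loaded correctly (a minimal sketch; 'twitter' and 'data' are arbitrary probe words and may be absent from a given vocabulary):
In [ ]:
# The embedding, dictionary and TF-IDF model should all be queryable
print(len(dictionary), 'dictionary entries')
print(word2vec['twitter'].shape)  # (300,) for the GoogleNews vectors
print(tfidf[dictionary.doc2bow(['twitter', 'data'])])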
In [9]:
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

stopwords = ['the', 'be', 'and', 'of', 'a', 'an', 'in', 'to', 'or', 'have', 'has',
             'it', 'i', 'that', 'for', 'you', 'he', 'with', 'on', 'do', 'say',
             'this', 'they', 'at', 'but', 'we', 'rt', '']
# Strip any character that is not a letter or whitespace
pattern = re.compile(r'[^a-zA-Z\s]', re.UNICODE)
# Links, mentions, hashtags and standalone numbers
stopword_patterns = [r'www\.[^ ]+\.[^ ]+', r'https?://[^ ]+', r'[@#][a-z\d_]+', r'\b\d+\b']
stopword_pattern = re.compile('|'.join(stopword_patterns))
In [10]:
def standardize_vector(X):
    """Scale a vector to zero mean and unit variance."""
    return (X - np.mean(X)) / np.std(X)

def tweet_preprocessor(tweet):
    """Lowercase a tweet and strip links, mentions, hashtags, numbers and other non-letter characters."""
    tweet = tweet.lower()
    tweet = re.sub(stopword_pattern, '', tweet)
    tweet = re.sub(r'[\t\n]', ' ', tweet)
    tweet = re.sub(pattern, '', tweet)
    return tweet

def filter_stopwords(word):
    return word not in stopwords

class MyCorpus(object):
    """Streams preprocessed, tokenized tweets from MongoDB without loading them all into memory."""
    def __iter__(self):
        for tweet in collection.find({"text": {"$exists": True}}, {'text': 1}):
            yield [w for w in tweet_preprocessor(tweet['text']).split(' ') if filter_stopwords(w)]
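A quick check of the cleaning pipeline on a made-up tweet (the sample string is hypothetical):
In [ ]:
sample = "RT @user check https://t.co/abc #nlp 42 times!"
tokens = [w for w in tweet_preprocessor(sample).split(' ') if filter_stopwords(w)]
print(tokens)  # mention, link, hashtag, number and 'rt' are all removed -> ['check', 'times']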
In [11]:
client = MongoClient(hostname, port)
db = client[database_name]
collection = db[collection_name]
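Before streaming the corpus, it can help to confirm the collection is reachable and non-empty (assumes pymongo >= 3.7 for count_documents):
In [ ]:
print(collection.count_documents({"text": {"$exists": True}}), 'tweets with a text field')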
In [ ]:
feature = []
#metadata = codecs.open(os.path.join(DATA_DIR, 'metadata.tsv'), 'w', encoding="utf-8")
#metadata.write("Tweet\n")
for tweet in MyCorpus():
    #tweet_meta = ' '.join(tweet)
    tweet_tfidf = tfidf[dictionary.doc2bow(tweet)]
    tweet_tfidf.sort(key=lambda x: x[1], reverse=True)  # highest-weighted terms first (the mean below is order-independent)
    vec = []
    for word_id, w_tfidf in tweet_tfidf:
        # Weight each in-vocabulary word vector by its TF-IDF score
        if dictionary[word_id] in word2vec:
            vec.append(word2vec[dictionary[word_id]] * w_tfidf)
    # Skip tweets with too few in-vocabulary words; as a result feature rows
    # do not align 1:1 with tweets in the collection
    if len(vec) > 2:
        feature.append(standardize_vector(np.mean(vec, axis=0)))
        #metadata.write(tweet_meta + '\n')
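Before saving, it is worth confirming every row has the expected dimensionality (a minimal check; 300 assumes the GoogleNews vectors loaded above):
In [ ]:
features = np.asarray(feature)
print(features.shape)  # (n_kept_tweets, 300)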
In [ ]:
np.save(os.path.join(DATA_DIR, 'feature'), feature)
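The matrix can be loaded back later with np.load; note that np.save appends the .npy extension:
In [ ]:
feature = np.load(os.path.join(DATA_DIR, 'feature.npy'))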