In [7]:
# -*- coding: utf-8 -*-
from pymongo import MongoClient
import numpy as np
import gensim, codecs, os, logging, re
from gensim import models, corpora

# MongoDB connection settings
hostname = "localhost"
port = 27017
database_name = "twitter-data"
collection_name = "ns"

# Output directories for the feature matrix and embedding logs
DATA_DIR = '/home/nipg1/Documents/summer_project/data/' + collection_name
LOG_DIR = os.path.join(DATA_DIR, 'embedding_logs')
In [8]:
# Pre-trained GoogleNews vectors; limit=100000 keeps only the 100k most frequent words to save memory
#word2vec = gensim.models.Word2Vec.load(os.path.join(DATA_DIR, 'word2vec_model'))
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
    '/home/nipg1/Documents/summer_project/data/models/GoogleNews-vectors-negative300.bin',
    binary=True, limit=100000)
# Dictionary and TF-IDF model built over the tweet corpus
dictionary = corpora.Dictionary.load(os.path.join(DATA_DIR, 'words.dict'))
tfidf = models.tfidfmodel.TfidfModel.load(os.path.join(DATA_DIR, 'tfidf_model'))
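A quick sanity check that the three models loaded correctly (a minimal sketch; 'twitter' and 'data' are arbitrary probe words and may be absent from a given vocabulary):
In [ ]:
# The embedding, dictionary and TF-IDF model should all be queryable
print(len(dictionary), 'dictionary entries')
print(word2vec['twitter'].shape)  # (300,) for the GoogleNews vectors
print(tfidf[dictionary.doc2bow(['twitter', 'data'])])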
In [9]:
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

stopwords = ['the', 'be', 'and', 'of', 'a', 'an', 'in', 'to', 'or', 'have', 'has',
             'it', 'i', 'that', 'for', 'you', 'he', 'with', 'on', 'do', 'say',
             'this', 'they', 'at', 'but', 'we', 'rt', '']
# Strip any character that is not a letter or whitespace
pattern = re.compile(r'[^a-zA-Z\s]', re.UNICODE)
# Links, mentions, hashtags and standalone numbers
stopword_patterns = [r'www\.[^ ]+\.[^ ]+', r'https?://[^ ]+', r'[@#][a-z\d_]+', r'\b\d+\b']
stopword_pattern = re.compile('|'.join(stopword_patterns))
In [10]:
def standardize_vector(X):
    """Scale a vector to zero mean and unit variance."""
    return (X - np.mean(X)) / np.std(X)

def tweet_preprocessor(tweet):
    """Lowercase a tweet and strip links, mentions, hashtags, numbers and other non-letter characters."""
    tweet = tweet.lower()
    tweet = re.sub(stopword_pattern, '', tweet)
    tweet = re.sub(r'[\t\n]', ' ', tweet)
    tweet = re.sub(pattern, '', tweet)
    return tweet

def filter_stopwords(word):
    return word not in stopwords

class MyCorpus(object):
    """Streams preprocessed, tokenized tweets from MongoDB without loading them all into memory."""
    def __iter__(self):
        for tweet in collection.find({"text": {"$exists": True}}, {'text': 1}):
            yield [w for w in tweet_preprocessor(tweet['text']).split(' ') if filter_stopwords(w)]
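A quick check of the cleaning pipeline on a made-up tweet (the sample string is hypothetical):
In [ ]:
sample = "RT @user check https://t.co/abc #nlp 42 times!"
tokens = [w for w in tweet_preprocessor(sample).split(' ') if filter_stopwords(w)]
print(tokens)  # mention, link, hashtag, number and 'rt' are all removed -> ['check', 'times']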
In [11]:
client = MongoClient(hostname, port)
db = client[database_name]
collection = db[collection_name]
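Before streaming the corpus, it can help to confirm the collection is reachable and non-empty (assumes pymongo >= 3.7 for count_documents):
In [ ]:
print(collection.count_documents({"text": {"$exists": True}}), 'tweets with a text field')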
In [ ]:
feature = []
#metadata = codecs.open(os.path.join(DATA_DIR, 'metadata.tsv'), 'w', encoding="utf-8")
#metadata.write("Tweet\n")
for tweet in MyCorpus():
    #tweet_meta = ' '.join(tweet)
    tweet_tfidf = tfidf[dictionary.doc2bow(tweet)]
    tweet_tfidf.sort(key=lambda x: x[1], reverse=True)  # highest-weighted terms first (the mean below is order-independent)
    vec = []
    for word_id, w_tfidf in tweet_tfidf:
        # Weight each in-vocabulary word vector by its TF-IDF score
        if dictionary[word_id] in word2vec:
            vec.append(word2vec[dictionary[word_id]] * w_tfidf)
    # Skip tweets with too few in-vocabulary words; as a result feature rows
    # do not align 1:1 with tweets in the collection
    if len(vec) > 2:
        feature.append(standardize_vector(np.mean(vec, axis=0)))
        #metadata.write(tweet_meta + '\n')
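Before saving, it is worth confirming every row has the expected dimensionality (a minimal check; 300 assumes the GoogleNews vectors loaded above):
In [ ]:
features = np.asarray(feature)
print(features.shape)  # (n_kept_tweets, 300)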
In [ ]:
np.save(os.path.join(DATA_DIR, 'feature'), feature)
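The matrix can be loaded back later with np.load; note that np.save appends the .npy extension:
In [ ]:
feature = np.load(os.path.join(DATA_DIR, 'feature.npy'))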