In [22]:
from pymongo import MongoClient
import numpy as np
import scipy.sparse as sps
import codecs, os

In [23]:
# Configuration: MongoDB source and output directories.
database_name = "twitter-data"
collection_name = "brussels_mentions"
# Matrices (.npz) are written under DATA_DIR; embedding metadata (TSV) under LOG_DIR.
DATA_DIR = '../../data/'
LOG_DIR = '../../data/embedding_logs'

In [24]:
# connect to collection
client = MongoClient()
db = client[database_name]
collection = db[collection_name]

# user - user id
# ind - matrix row or column
# username - username

# get unique sources and targets, and set user to matrix row or column dictionary
src_users = set()
trg_users = set()
user_to_username = {}

for rec in collection.find():
    src_ind = rec["src"]
    trg_ind = rec["trg"]
    
    src_users.add(src_ind)
    trg_users.add(trg_ind)
    # user id to username
    user_to_username[src_ind] = rec["src_str"]
    user_to_username[trg_ind] = rec["trg_str"]

In [25]:
# assign matrix row, col to users id and vice versa
src_to_ind = {}
trg_to_ind = {}
ind_to_src = {}
ind_to_trg = {}

for ind, src in enumerate(src_users):
    src_to_ind[src] = ind
    ind_to_src[ind] = src

for ind, trg in enumerate(trg_users):
    trg_to_ind[trg] = ind
    ind_to_trg[ind] = trg

ind_to_user = ind_to_src.copy()
ind_to_user.update(ind_to_trg)

In [26]:
# create feature matrix

X = sps.lil_matrix((len(src_users),len(trg_users)))
#X = sps.lil_matrix((n,n))
#matrix = sps.dok_matrix((n, n))

for rec in collection.find():
    src_ind = src_to_ind[rec["src"]]
    trg_ind = trg_to_ind[rec["trg"]]
    X[src_ind, trg_ind] += 1

# save source to target directed graphs matrix representation
X = X.tocoo()
sps.save_npz(os.path.join(DATA_DIR, "src_trg_mtx.npz"), X)

In [27]:
# standard deviation of vector
def normalize(X, axis=None):
    if(sum(X)==0):
        return X
    return ((X - np.mean(X, axis=axis, keepdims=True)) / np.std(X, axis=axis, keepdims=True))

In [28]:
# Convert to CSR for fast row slicing; `feature` is a working copy so the
# raw count matrix `X` stays intact for the metadata export below.
X = X.tocsr()
feature = X.copy()

In [29]:
# set CSR sparse matrix row
def set_row_csr(A, row_idx, new_row):
    N_cols = A.shape[1]
    
    idx_start_row = A.indptr[row_idx]
    idx_end_row = A.indptr[row_idx + 1]
    additional_nnz = N_cols - (idx_end_row - idx_start_row)

    A.data = np.r_[A.data[:idx_start_row], new_row, A.data[idx_end_row:]]
    A.indices = np.r_[A.indices[:idx_start_row], np.arange(N_cols), A.indices[idx_end_row:]]
    A.indptr = np.r_[A.indptr[:row_idx + 1], A.indptr[(row_idx + 1):] + additional_nnz]

In [30]:
# standard deviation of vectors (avg=0, deviation=1)
for row_idx in range(0,feature.shape[0]):
    set_row_csr(feature, row_idx, normalize(feature.getrow(row_idx).toarray()[0]))

In [31]:
# save feature matrix
sps.save_npz(os.path.join(DATA_DIR, "feature_mtx.npz"), feature)

In [32]:
# write out metadata to file (4 most mentioned users by the user)
# Write out embedding metadata: for each row (source user), the 4 targets
# it mentioned most, with the raw mention counts. The `with` block fixes a
# resource leak — the original file handle was never closed/flushed.
with codecs.open(os.path.join(LOG_DIR, 'metadata_brussels.tsv'), "w", "utf-8") as f:
    f.write("Vector\tFirst\tSecond\tThird\tFourth\n")

    for row_idx in range(0, feature.shape[0]):
        row = feature.getrow(row_idx).toarray()[0]   # normalized scores
        row2 = X.getrow(row_idx).toarray()[0]        # raw mention counts
        # NOTE(review): argpartition with -4 assumes at least 4 columns — confirm.
        ind = np.argpartition(row, -4)[-4:]  # 4 most mentioned
        ind = ind[np.argsort(row[ind])]      # sorted by normalized score
        ind = [x for x in ind if row2[x] != 0]  # filter out zeros (never mentioned)
        # NOTE(review): these are column indices, so ind_to_trg[i] is the intended
        # lookup; ind_to_user happens to agree because target entries were merged last.
        out = [user_to_username[ind_to_user[i]] for i in ind]
        out += [""] * (4 - len(ind))  # pad to four name columns
        f.write(",".join([str(int(row2[i])) for i in ind]) + "\t" + "\t".join(out) + "\n")

In [33]:
# Densify the feature matrix for downstream use. Converting straight from
# CSR with toarray() yields the identical ndarray; the intermediate
# tocoo() step in the original was redundant work.
feature = feature.toarray()

In [34]:
# FIXME: cell content lost — judging from Out[34] below, this cell saved a
# TensorFlow embedding checkpoint to LOG_DIR ('model_brussels.ckpt-1').


Out[34]:
'../../data/embedding_logs/model_brussels.ckpt-1'

In [ ]: