In [22]:
from pymongo import MongoClient
import numpy as np
import scipy.sparse as sps
import codecs, os
In [23]:
# MongoDB database / collection that hold the mention records
database_name = "twitter-data"
collection_name = "brussels_mentions"

# Output locations (relative paths); the log directory lives under DATA_DIR
DATA_DIR = '../../data/'
LOG_DIR = os.path.join(DATA_DIR, 'embedding_logs')
In [24]:
# Open the mentions collection (MongoClient() is called with its default settings).
client = MongoClient()
db = client[database_name]
collection = db[collection_name]

# Naming used in this notebook:
#   user     - user id
#   ind      - matrix row or column
#   username - username

# One pass over all records: gather the distinct source and target user ids
# and remember the username that belongs to each id.
src_users = set()
trg_users = set()
user_to_username = {}
for rec in collection.find():
    src_id, trg_id = rec["src"], rec["trg"]
    src_users.add(src_id)
    trg_users.add(trg_id)
    # user id -> username (the target entry is written last, as before)
    user_to_username.update({src_id: rec["src_str"], trg_id: rec["trg_str"]})
In [25]:
# Give every source user a matrix row and every target user a matrix column,
# keeping the lookup in both directions (index -> user id, user id -> index).
ind_to_src = dict(enumerate(src_users))
src_to_ind = {user: ind for ind, user in ind_to_src.items()}
ind_to_trg = dict(enumerate(trg_users))
trg_to_ind = {user: ind for ind, user in ind_to_trg.items()}

# Merged index -> user id lookup: where a row index and a column index
# coincide, the target (column) user overrides the source (row) user.
ind_to_user = dict(ind_to_src)
ind_to_user.update(ind_to_trg)
In [26]:
# Mention-count matrix: one row per source user, one column per target user.
# It is filled incrementally in LIL format, one record at a time.
mention_counts = sps.lil_matrix((len(src_users), len(trg_users)))
for rec in collection.find():
    row = src_to_ind[rec["src"]]
    col = trg_to_ind[rec["trg"]]
    mention_counts[row, col] += 1

# Store the directed source -> target graph as a COO matrix on disk.
X = mention_counts.tocoo()
mtx_path = os.path.join(DATA_DIR, "src_trg_mtx.npz")
sps.save_npz(mtx_path, X)
In [27]:
def normalize(X, axis=None):
    """Standardize values to zero mean and unit standard deviation (z-score).

    Args:
        X: array-like of numbers.
        axis: axis along which mean and standard deviation are computed
            (None = over the whole array); forwarded to np.mean / np.std.

    Returns:
        A float ndarray with the same shape as X. Any slice whose standard
        deviation is 0 (all zeros, or any other constant) is returned as
        zeros instead of NaN/inf. Empty input is returned as an empty array.
    """
    values = np.asarray(X, dtype=float)
    if values.size == 0:
        return values
    centered = values - np.mean(values, axis=axis, keepdims=True)
    spread = np.std(values, axis=axis, keepdims=True)
    # A zero spread means every value equals the mean, so `centered` is
    # already all zeros there; divide by 1 instead of 0 to avoid 0/0 -> NaN.
    return centered / np.where(spread == 0, 1.0, spread)
In [28]:
# Keep the raw counts in CSR form and start `feature` as an independent
# CSR copy, so later changes to `feature` leave X untouched.
X = X.tocsr()
feature = sps.csr_matrix(X, copy=True)
In [29]:
def set_row_csr(A, row_idx, new_row):
    """Overwrite one row of a CSR matrix, in place, with a dense vector.

    Every column of the row becomes an explicitly stored entry (zeros in
    ``new_row`` included), so the row is fully dense afterwards.

    Args:
        A: scipy.sparse CSR matrix; its ``data``, ``indices`` and ``indptr``
            arrays are replaced by new arrays.
        row_idx: row to overwrite, ``0 <= row_idx < A.shape[0]``.
        new_row: sequence holding exactly ``A.shape[1]`` values.

    Raises:
        IndexError: if ``row_idx`` is outside the matrix.
        ValueError: if ``new_row`` does not have ``A.shape[1]`` values.
    """
    n_rows, n_cols = A.shape
    if not 0 <= row_idx < n_rows:
        raise IndexError(
            "row_idx %r is out of range for a matrix with %d rows" % (row_idx, n_rows))
    new_row = np.asarray(new_row).ravel()
    if new_row.shape[0] != n_cols:
        # A wrong length would silently desynchronise data/indices/indptr.
        raise ValueError(
            "new_row has %d values, expected %d" % (new_row.shape[0], n_cols))

    row_start = int(A.indptr[row_idx])
    row_end = int(A.indptr[row_idx + 1])
    additional_nnz = n_cols - (row_end - row_start)

    # Keep indices and indptr in one shared integer dtype; only widen it when
    # the new number of stored entries would no longer fit.
    idx_dtype = np.result_type(A.indices.dtype, A.indptr.dtype)
    if int(A.indptr[-1]) + additional_nnz > np.iinfo(idx_dtype).max:
        idx_dtype = np.dtype(np.int64)

    data = np.concatenate([A.data[:row_start], new_row, A.data[row_end:]])
    indices = np.concatenate([
        A.indices[:row_start].astype(idx_dtype, copy=False),
        np.arange(n_cols, dtype=idx_dtype),
        A.indices[row_end:].astype(idx_dtype, copy=False),
    ])
    indptr = A.indptr.astype(idx_dtype, copy=True)
    # Every row after the replaced one starts `additional_nnz` entries later.
    indptr[row_idx + 1:] += additional_nnz

    # Assign only once all three arrays are built, so a failure above can
    # never leave A half-updated.
    A.data, A.indices, A.indptr = data, indices, indptr
In [30]:
# Replace each row of `feature`, in place, by its normalize()d dense version
# (target: average 0, deviation 1).
n_rows = feature.shape[0]
for r in range(n_rows):
    dense_row = feature.getrow(r).toarray()[0]
    set_row_csr(feature, r, normalize(dense_row))
In [31]:
# Persist the standardized feature matrix next to the raw count matrix.
feature_path = os.path.join(DATA_DIR, "feature_mtx.npz")
sps.save_npz(feature_path, feature)
In [32]:
# Write one metadata line per source user (matrix row): the mention counts of
# the (up to) 4 users they mention most, followed by those users' usernames.
TOP_K = 4  # must match the four name columns in the header below
if not os.path.isdir(LOG_DIR):
    os.makedirs(LOG_DIR)  # opening the file does not create missing directories
metadata_path = os.path.join(LOG_DIR, 'metadata_brussels.tsv')
# `with` guarantees the file is flushed and closed, even if a row fails.
with codecs.open(metadata_path, "w", "utf-8") as f:
    f.write("Vector\tFirst\tSecond\tThird\tFourth\n")
    for row_idx in range(0, X.shape[0]):
        # Rank on the raw counts in X: standardizing a row does not change its
        # ordering, and the stored entries of the sparse row are exactly the
        # mentioned users, so no dense row is needed (this also works when
        # the matrix has fewer than 4 columns).
        counts_row = X.getrow(row_idx).tocsr()
        cols, counts = counts_row.indices, counts_row.data
        mentioned = counts != 0  # drop explicitly stored zeros, if any
        cols, counts = cols[mentioned], counts[mentioned]
        # Most mentioned first, so the "First" column holds the top user.
        top = np.argsort(-counts, kind="stable")[:TOP_K]
        # Column indices identify target users.
        out = [user_to_username[ind_to_trg[int(c)]] for c in cols[top]]
        out += [""] * (TOP_K - len(out))
        f.write(",".join([str(int(n)) for n in counts[top]]) + "\t" + "\t".join(out) + "\n")
In [33]:
# Convert the sparse feature matrix to COO and then to a dense NumPy array.
feature = feature.tocoo().toarray()
In [34]:
-
Out[34]:
In [ ]: