In [101]:
import scipy as sc
import numpy as np
import pandas as pd
import powerlaw
import scipy.sparse as sps
import scipy.sparse.linalg as slin
import matplotlib.pyplot as plt
In [68]:
# Load the (directed) author co-comment adjacency matrix saved as a
# scipy sparse .npz. NOTE(review): hardcoded absolute local path --
# consider a configurable DATA_DIR so the notebook runs elsewhere.
adj_matrix_path = r'C:/Users/dmpas/data/reddit/_all/adjacency_matrix.npz'
# LIL format allows the element-wise assignment done in later cells.
matrix = sps.load_npz(adj_matrix_path).tolil()
In [69]:
# Symmetrize: treat the graph as undirected by adding the transpose.
adj_matrix = matrix + matrix.T
In [70]:
## shape -- (n_nodes, n_nodes); sanity check the matrix is square
print(adj_matrix.shape)
In [71]:
## Degree of each node = row sum of the symmetrized adjacency matrix.
# Replaces the original O(n) Python loop that built an explicit ones
# vector and multiplied. Sparse .sum(axis=1) returns an (n, 1) numpy
# matrix, so .tolist() still yields one single-element list per node
# ([deg]), matching the x[0] unwrapping done in later cells.
degrees = adj_matrix.sum(axis=1).tolist()
degrees[0:10]
Out[71]:
In [72]:
# Map each author to its degree. nodes.csv row order is assumed to
# match the adjacency-matrix row order -- TODO confirm against the
# code that wrote both files.
nodes_in_adj_matrix = r'C:/Users/dmpas/data/reddit/_nodes/nodes.csv'
df = pd.read_csv(nodes_in_adj_matrix, header=0)
# zip over the column instead of DataFrame.iterrows (much faster),
# and drop the dead `index = 0` initializer the loop overwrote.
# Values stay single-element lists ([deg]), as downstream cells expect.
authors = {author: deg for author, deg in zip(df['author'], degrees)}
authors_length = len(authors)
In [73]:
# number of distinct authors (nodes)
print(len(authors))
In [74]:
# Build the author/degree frame. The original first stored a column of
# single-element lists and then mapped x[0] over it; unwrap the lists
# while constructing instead -- same resulting frame, one step.
df = pd.DataFrame({'author': list(authors.keys()),
                   'degree': [d[0] for d in authors.values()]})
In [75]:
# Basic graph size statistics.
n = authors_length            # number of nodes
te = df['degree'].sum()       # total degree over all nodes (name reused below)
e = te / 2                    # undirected edges: each edge counted twice in te
In [76]:
# report graph size (e is a float because of the /2 above)
print('nodes:', n)
print('edges:', e)
In [77]:
## average degree ...
In [78]:
# Average degree = e/n for this symmetrized graph: roughly, how many
# posts an author shares with at least one other commenter (posts with
# a single commenter produce no edges and so are not counted).
print('average degree:', e/n)
In [79]:
## degree distribution ...
In [80]:
# Degree histogram, normalized to a density. `density=True` replaces
# the `normed` keyword, which was removed in matplotlib 3.1 -- the
# original call raises TypeError on any modern matplotlib.
_ = plt.hist('degree', data=df, bins=100, density=True)
plt.show()
In [81]:
# Power Law?
In [82]:
# Empirical degree distribution: k = observed degrees, pk = each
# degree's share of the total degree mass. Reads the column directly
# instead of the original row-by-row iterrows loop (same values,
# same order, far faster).
k = df['degree'].tolist()
pk = [deg / te for deg in k]
In [83]:
# sanity check: the empirical probability mass should sum to ~1
total_p = np.sum(pk)
assert 0.99 < total_p <= 1
In [84]:
# Degree distribution on log-log axes; an (approximately) straight
# line here is the visual signature of a power law.
plt.loglog(k, pk, linestyle='none', marker='.', color='blue')
plt.title('loglog')
plt.xlabel('k')
plt.ylabel('p(k)')
plt.show()
In [85]:
# Silence divide-by-zero / invalid-value warnings that the powerlaw
# fitting below triggers internally. NOTE(review): this changes numpy
# error handling globally for the rest of the kernel session.
np.seterr(divide='ignore', invalid='ignore')
Out[85]:
In [86]:
## Powerlaw
# http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0085777
# R is the loglikelihood ratio between the two distributions, + = first, - = second
In [87]:
# Fit a discrete power law to the degree sequence; the powerlaw
# package estimates both the exponent alpha and the cutoff xmin
# (degrees below xmin are excluded from the fit).
all_degrees = df['degree'].values
results = powerlaw.Fit(all_degrees, discrete=True)
print('alpha:',results.power_law.alpha)
print('xmin:',results.power_law.xmin)
In [88]:
# Log-likelihood ratio test between candidate distributions:
# R > 0 favors the first (power_law), R < 0 the second (lognormal);
# p is the significance of that preference.
R, p = results.distribution_compare('power_law', 'lognormal')
print('R:',R)
print('p:',p)
In [89]:
# Overlay the empirical degree pdf with the two fitted curves on one
# pair of axes.
ax = results.plot_pdf(color='b', linewidth=2)                  # empirical
results.power_law.plot_pdf(color='r', linestyle='-', ax=ax)    # power-law fit
results.lognormal.plot_pdf(color='g', linestyle='-', ax=ax)    # lognormal fit
plt.show()
In [90]:
## Degree Centrality
In [91]:
# Degree centrality: degree normalized by the maximum possible number
# of neighbors (n - 1). Dict comprehension over the columns replaces
# the original iterrows loop (same mapping, much faster).
degree_centrality = {
    author: deg / (n - 1)
    for author, deg in zip(df['author'], df['degree'])
}
In [92]:
# rank authors by degree centrality, highest first
dc_sorted = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)
In [99]:
# top 20 authors by degree centrality
dc_sorted[:20]
Out[99]:
In [97]:
# bottom 20 authors by degree centrality
dc_sorted[-20:]
Out[97]:
In [ ]:
# eigenvector ...
In [111]:
# Eigenvector centrality: the leading eigenvector (largest real part)
# of the adjacency matrix. eigs returns eigenvectors as columns of an
# (n, k) array; the original flatten()ed all k=5 columns together,
# interleaving components of different eigenvectors into a length-5n
# vector with meaningless indices. Select the dominant column instead.
eigenvalue, eigenvector = slin.eigs(adj_matrix.astype(np.float32).T, which='LR', k=5, maxiter=100)
leading = eigenvector[:, np.argmax(eigenvalue.real)].real
# np.sign replaces sc.sign (removed from scipy's top-level namespace);
# the sign factor fixes the eigenvector's arbitrary +/- orientation so
# centralities come out non-negative for a connected component.
norm = np.sign(leading.sum()) * np.linalg.norm(leading)
eig_centrality = leading / norm
In [107]:
# Map each author to its eigenvector-centrality score. The original
# indexed eig_centrality with the author's *degree* (authors maps
# author -> [degree]), not the node's row index -- a lookup bug.
# `authors` was built in nodes.csv row order, which matches the
# adjacency-matrix rows, so enumerate() over its insertion order
# recovers the correct node index. (TODO confirm that row ordering.)
eigs = {}
for node_idx, author in enumerate(authors):
    eigs[author] = float(eig_centrality[node_idx])
In [112]:
# rank authors by eigenvector centrality, highest first; show top 20
sorted_eigs = sorted(eigs.items(), key=lambda x: x[1], reverse=True)
sorted_eigs[:20]
Out[112]:
In [ ]: