In [101]:
import scipy as sc
import numpy as np
import pandas as pd
import powerlaw
import scipy.sparse as sps
import scipy.sparse.linalg as slin
import matplotlib.pyplot as plt

In [68]:
adj_matrix_path = r'C:/Users/dmpas/data/reddit/_all/adjacency_matrix.npz'
matrix = sps.load_npz(adj_matrix_path).tolil()

In [69]:
adj_matrix = (matrix + matrix.transpose())

In [70]:
## shape
print(adj_matrix.shape)


(329929, 329929)

In [71]:
## degree ...
ones = sps.lil_matrix((adj_matrix.shape[0], 1), dtype=np.int8)
for i in range(0,adj_matrix.shape[0]):
    ones[i,0] = 1

degrees = (adj_matrix * ones).todense().tolist()
degrees[0:10]


Out[71]:
[[147], [19], [2], [145], [355], [69], [79], [49], [3], [354]]

In [72]:
authors = {}

index = 0
nodes_in_adj_matrix = r'C:/Users/dmpas/data/reddit/_nodes/nodes.csv'
df = pd.read_csv(nodes_in_adj_matrix, header=0)
for index,row in df.iterrows():
    authors[row['author']]=degrees[index]
    
authors_length = len(authors.keys())

In [73]:
print(len(authors.keys()))


329929

In [74]:
df = pd.DataFrame({'author':list(authors.keys()),'degree':list(authors.values())})
df['degree'] = df['degree'].map(lambda x: x[0])

In [75]:
n = authors_length
te = df['degree'].values.sum()
e = te / 2

In [76]:
print('nodes:', n)
print('edges:', e)


nodes: 329929
edges: 201948451.0

In [77]:
## average degree ...

In [78]:
# author on average, comments on 619 posts where others also posted on ... 
# (posts w/ len(comments) == 1 not counted) ...
print('average degree:', e/n)


average degree: 612.096696562

In [79]:
## degree distribution ...

In [80]:
_ = plt.hist('degree', data=df, bins=100, normed=True)
plt.show()



In [81]:
# Power Law?

In [82]:
k  = []
pk = []

for _, row in df.iterrows():
    deg = row['degree']
    k.append(deg)
    pk.append(deg/te)

In [83]:
# assert the probablity is decently close to 1 ...
assert (np.sum(pk) > .99) and (np.sum(pk) <= 1)

In [84]:
_ = plt.loglog(k,pk, linestyle='none', marker='.', color='blue')
_ = plt.title('loglog')
_ = plt.xlabel('k')
_ = plt.ylabel('p(k)')

plt.show()



In [85]:
np.seterr(divide='ignore', invalid='ignore')


Out[85]:
{'divide': 'ignore', 'invalid': 'ignore', 'over': 'warn', 'under': 'ignore'}

In [86]:
## Powerlaw

# http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0085777
# R is the loglikelihood ratio between the two distributions, + = first, - = second

In [87]:
all_degrees = df['degree'].values
results = powerlaw.Fit(all_degrees, discrete=True)

print('alpha:',results.power_law.alpha)
print('xmin:',results.power_law.xmin)


Calculating best minimal value for power law fit
alpha: 2.17652577565
xmin: 2136.0

In [88]:
R, p = results.distribution_compare('power_law', 'lognormal')
print('R:',R)
print('p:',p)


R: -218.414706981
p: 7.92162402735e-43

In [89]:
# actual,
f=results.plot_pdf(color='b',linewidth=2)

# power,
results.power_law.plot_pdf(color='r',linestyle='-',ax=f)

# lognormal,
results.lognormal.plot_pdf(color='g',linestyle='-',ax=f)

plt.show()



In [90]:
## Degree Centrality

In [91]:
degree_centrality = {}
for _, row in df.iterrows():
    author = row['author']
    deg    = row['degree']
    
    degree_centrality[author] = deg / (n-1)

In [92]:
dc_sorted = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)

In [99]:
dc_sorted[:20]


Out[99]:
[('rydan', 0.8739361315196043),
 ('ebaley', 0.7552526611866831),
 ('Explodicle', 0.6387029897432167),
 ('Natanael_L', 0.6086236997163017),
 ('Introshine', 0.5352349603549865),
 ('ztsmart', 0.5343953832351301),
 ('killerstorm', 0.5330981305012003),
 ('BashCo', 0.5232990228170995),
 ('sreaka', 0.5227898208093887),
 ('Cryptolution', 0.5213440508232099),
 ('dellintelbitcoin', 0.49164666230207804),
 ('Frogolocalypse', 0.4885884192914818),
 ('paleh0rse', 0.48169600640139665),
 ('GibbsSamplePlatter', 0.4780376324531413),
 ('Bitcoin_Bug', 0.466556339564996),
 ('bitsteiner', 0.44255110205863096),
 ('physalisx', 0.4257504667685071),
 ('thieflar', 0.4214374045246235),
 ('walloon5', 0.4174789651075386),
 ('cqm', 0.41692429863485364)]

In [97]:
dc_sorted[-20:]


Out[97]:
[('ziglef', 3.0309643316117456e-06),
 ('zip_13', 3.0309643316117456e-06),
 ('zixteee', 3.0309643316117456e-06),
 ('ziyadp', 3.0309643316117456e-06),
 ('zlryan', 3.0309643316117456e-06),
 ('zn3qbc', 3.0309643316117456e-06),
 ('zndg', 3.0309643316117456e-06),
 ('zolaph', 3.0309643316117456e-06),
 ('zombielordzero0', 3.0309643316117456e-06),
 ('zombieztcz', 3.0309643316117456e-06),
 ('zreedx', 3.0309643316117456e-06),
 ('zubairarshad', 3.0309643316117456e-06),
 ('zukdoge', 3.0309643316117456e-06),
 ('zulukilocharlie', 3.0309643316117456e-06),
 ('zuzuk2', 3.0309643316117456e-06),
 ('zwschlei', 3.0309643316117456e-06),
 ('zwt', 3.0309643316117456e-06),
 ('zx_909', 3.0309643316117456e-06),
 ('zxcvbnmFreak714', 3.0309643316117456e-06),
 ('zylstrar', 3.0309643316117456e-06)]

In [ ]:
# eigenvector ...

In [111]:
eigenvalue, eigenvector = slin.eigs(adj_matrix.astype(np.float32).T, which='LR', k=5, maxiter=100)
largest = eigenvector.flatten().real
norm = sc.sign(largest.sum()) * np.linalg.norm(largest)

eig_centrality = (largest / norm)

In [107]:
eigs = {}
for key,value in authors.items():
    author = key
    eigs[author] = eig_centrality[value].tolist()[0]

In [112]:
sorted_eigs = sorted(eigs.items(), key=lambda x: x[1], reverse=True)
sorted_eigs[:20]


Out[112]:
[('skereMan', 0.0943111926317215),
 ('BluSyn', 0.046781279146671295),
 ('Myceilingfan', 0.041636910289525986),
 ('DeanMaverick', 0.039750922471284866),
 ('0xDDDD', 0.033229414373636246),
 ('Ancient7755', 0.033229414373636246),
 ('GeorgiPetrov', 0.033229414373636246),
 ('Guntermonkey', 0.033229414373636246),
 ('MagicNonce', 0.033229414373636246),
 ('OversTom', 0.033229414373636246),
 ('Sansia', 0.033229414373636246),
 ('Tharejamudit', 0.033229414373636246),
 ('adambit', 0.033229414373636246),
 ('agorathrow8080', 0.033229414373636246),
 ('bluebachcrypto', 0.033229414373636246),
 ('glennmatthew', 0.033229414373636246),
 ('iMrDot', 0.033229414373636246),
 ('jamesfrown', 0.033229414373636246),
 ('kingmathers12', 0.033229414373636246),
 ('mmafan666', 0.033229414373636246)]

In [ ]: