In [101]:
import scipy as sc
import numpy as np
import pandas as pd
import powerlaw
import scipy.sparse as sps
import scipy.sparse.linalg as slin
import matplotlib.pyplot as plt
In [68]:
# Load the (directed) author co-comment adjacency matrix saved as a
# scipy sparse .npz. NOTE(review): hardcoded absolute local path --
# consider a configurable DATA_DIR so the notebook runs elsewhere.
adj_matrix_path = r'C:/Users/dmpas/data/reddit/_all/adjacency_matrix.npz'
# LIL format allows the element-wise assignment done in later cells.
matrix = sps.load_npz(adj_matrix_path).tolil()
In [69]:
# Symmetrize: treat the graph as undirected by adding the transpose.
adj_matrix = matrix + matrix.T
In [70]:
## shape -- (n_nodes, n_nodes); sanity check the matrix is square
print(adj_matrix.shape)
In [71]:
## Degree of each node = row sum of the symmetrized adjacency matrix.
# Replaces the original O(n) Python loop that built an explicit ones
# vector and multiplied. Sparse .sum(axis=1) returns an (n, 1) numpy
# matrix, so .tolist() still yields one single-element list per node
# ([deg]), matching the x[0] unwrapping done in later cells.
degrees = adj_matrix.sum(axis=1).tolist()
degrees[0:10]
Out[71]:
In [72]:
# Map each author to its degree. nodes.csv row order is assumed to
# match the adjacency-matrix row order -- TODO confirm against the
# code that wrote both files.
nodes_in_adj_matrix = r'C:/Users/dmpas/data/reddit/_nodes/nodes.csv'
df = pd.read_csv(nodes_in_adj_matrix, header=0)
# zip over the column instead of DataFrame.iterrows (much faster),
# and drop the dead `index = 0` initializer the loop overwrote.
# Values stay single-element lists ([deg]), as downstream cells expect.
authors = {author: deg for author, deg in zip(df['author'], degrees)}
authors_length = len(authors)
In [73]:
# number of distinct authors (nodes)
print(len(authors))
In [74]:
# Build the author/degree frame. The original first stored a column of
# single-element lists and then mapped x[0] over it; unwrap the lists
# while constructing instead -- same resulting frame, one step.
df = pd.DataFrame({'author': list(authors.keys()),
                   'degree': [d[0] for d in authors.values()]})
In [75]:
# Basic graph size statistics.
n = authors_length            # number of nodes
te = df['degree'].sum()       # total degree over all nodes (name reused below)
e = te / 2                    # undirected edges: each edge counted twice in te
In [76]:
# report graph size (e is a float because of the /2 above)
print('nodes:', n)
print('edges:', e)
In [77]:
## average degree ...
In [78]:
# Average degree = e/n for this symmetrized graph: roughly, how many
# posts an author shares with at least one other commenter (posts with
# a single commenter produce no edges and so are not counted).
print('average degree:', e/n)
In [79]:
## degree distribution ...
In [80]:
# Degree histogram, normalized to a density. `density=True` replaces
# the `normed` keyword, which was removed in matplotlib 3.1 -- the
# original call raises TypeError on any modern matplotlib.
_ = plt.hist('degree', data=df, bins=100, density=True)
plt.show()
In [81]:
# Power Law?
In [82]:
# Empirical degree distribution: k = observed degrees, pk = each
# degree's share of the total degree mass. Reads the column directly
# instead of the original row-by-row iterrows loop (same values,
# same order, far faster).
k = df['degree'].tolist()
pk = [deg / te for deg in k]
In [83]:
# sanity check: the empirical probability mass should sum to ~1
total_p = np.sum(pk)
assert 0.99 < total_p <= 1
In [84]:
# Degree distribution on log-log axes; an (approximately) straight
# line here is the visual signature of a power law.
plt.loglog(k, pk, linestyle='none', marker='.', color='blue')
plt.title('loglog')
plt.xlabel('k')
plt.ylabel('p(k)')
plt.show()
In [85]:
# Silence divide-by-zero / invalid-value warnings that the powerlaw
# fitting below triggers internally. NOTE(review): this changes numpy
# error handling globally for the rest of the kernel session.
np.seterr(divide='ignore', invalid='ignore')
Out[85]:
In [86]:
## Powerlaw
# http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0085777
# R is the loglikelihood ratio between the two distributions, + = first, - = second
In [87]:
# Fit a discrete power law to the degree sequence; the powerlaw
# package estimates both the exponent alpha and the cutoff xmin
# (degrees below xmin are excluded from the fit).
all_degrees = df['degree'].values
results = powerlaw.Fit(all_degrees, discrete=True)
print('alpha:',results.power_law.alpha)
print('xmin:',results.power_law.xmin)
In [88]:
# Log-likelihood ratio test between candidate distributions:
# R > 0 favors the first (power_law), R < 0 the second (lognormal);
# p is the significance of that preference.
R, p = results.distribution_compare('power_law', 'lognormal')
print('R:',R)
print('p:',p)
In [89]:
# Overlay the empirical degree pdf with the two fitted curves on one
# pair of axes.
ax = results.plot_pdf(color='b', linewidth=2)                  # empirical
results.power_law.plot_pdf(color='r', linestyle='-', ax=ax)    # power-law fit
results.lognormal.plot_pdf(color='g', linestyle='-', ax=ax)    # lognormal fit
plt.show()
In [90]:
## Degree Centrality
In [91]:
# Degree centrality: degree normalized by the maximum possible number
# of neighbors (n - 1). Dict comprehension over the columns replaces
# the original iterrows loop (same mapping, much faster).
degree_centrality = {
    author: deg / (n - 1)
    for author, deg in zip(df['author'], df['degree'])
}
In [92]:
# rank authors by degree centrality, highest first
dc_sorted = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)
In [99]:
# top 20 authors by degree centrality
dc_sorted[:20]
Out[99]:
In [97]:
# bottom 20 authors by degree centrality
dc_sorted[-20:]
Out[97]:
In [ ]:
# eigenvector ...
In [111]:
# Eigenvector centrality: the leading eigenvector (largest real part)
# of the adjacency matrix. eigs returns eigenvectors as columns of an
# (n, k) array; the original flatten()ed all k=5 columns together,
# interleaving components of different eigenvectors into a length-5n
# vector with meaningless indices. Select the dominant column instead.
eigenvalue, eigenvector = slin.eigs(adj_matrix.astype(np.float32).T, which='LR', k=5, maxiter=100)
leading = eigenvector[:, np.argmax(eigenvalue.real)].real
# np.sign replaces sc.sign (removed from scipy's top-level namespace);
# the sign factor fixes the eigenvector's arbitrary +/- orientation so
# centralities come out non-negative for a connected component.
norm = np.sign(leading.sum()) * np.linalg.norm(leading)
eig_centrality = leading / norm
In [107]:
# Map each author to its eigenvector-centrality score. The original
# indexed eig_centrality with the author's *degree* (authors maps
# author -> [degree]), not the node's row index -- a lookup bug.
# `authors` was built in nodes.csv row order, which matches the
# adjacency-matrix rows, so enumerate() over its insertion order
# recovers the correct node index. (TODO confirm that row ordering.)
eigs = {}
for node_idx, author in enumerate(authors):
    eigs[author] = float(eig_centrality[node_idx])
In [112]:
# rank authors by eigenvector centrality, highest first; show top 20
sorted_eigs = sorted(eigs.items(), key=lambda x: x[1], reverse=True)
sorted_eigs[:20]
Out[112]:
In [ ]: