In [1]:
import cPickle
import pandas as pd
import networkx as nx

In [6]:
# Load the pickled (photos_df, users_photos_df) pair.
# The file is opened in binary mode ('rb'): pickle data is binary, and text
# mode corrupts it on platforms that translate line endings. This also matches
# the 'wb' mode this notebook uses when it writes its own pickle.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open('../500px.pickle', 'rb') as f:
    photos_df, users_photos_df = cPickle.load(f)

In [37]:
# Build an undirected graph linking photos to users: one edge per row of
# users_photos_df. Node labels carry a 'photo_' / 'user_' prefix so the two
# kinds of node can be told apart by name later on.
#
# The two id columns are iterated directly rather than via iterrows(), which
# builds a Series for every row (slow on large frames) and can upcast the ids
# to a common dtype before they are formatted into the labels.
g = nx.Graph()
g.add_edges_from(
    ('photo_{}'.format(photo_id), 'user_{}'.format(user_id))
    for photo_id, user_id
    in users_photos_df[['photo_id', 'user_id']].itertuples(index=False)
)

In [38]:
# Collect the degree of every photo node (label starts with 'photo').
photo_degrees = []
for node, degree in nx.degree(g).iteritems():
    if node.startswith('photo'):
        photo_degrees.append(degree)
# Unit-width bins with edges 0..99; degrees above 99 fall outside the bin
# range and are not drawn.
hist(photo_degrees, bins=arange(100))


Out[38]:
(array([    0.,   225.,   393.,   495.,   494.,   603.,   799.,   970.,
         1206.,  1276.,  1168.,  1108.,  1080.,  1036.,   972.,   889.,
          821.,   797.,   700.,   694.,   621.,   572.,   494.,   502.,
          484.,   459.,   450.,   376.,   365.,   331.,   324.,   289.,
          266.,   251.,   267.,   222.,   196.,   240.,   195.,   170.,
          176.,   188.,   179.,   144.,   152.,   137.,   127.,   135.,
          136.,   129.,   124.,   121.,   125.,   121.,    99.,   119.,
          105.,    84.,    93.,    80.,   101.,    78.,   114.,    86.,
           90.,    83.,    75.,    85.,    94.,    70.,    72.,    66.,
           59.,    60.,    67.,    58.,    66.,    70.,    62.,    63.,
           60.,    52.,    47.,    49.,    60.,    47.,    53.,    52.,
           49.,    56.,    53.,    51.,    49.,    53.,    52.,    46.,
           38.,    43.,    86.]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
 <a list of 99 Patch objects>)

In [40]:
# Keep the photo nodes whose degree in g is strictly greater than 20.
well_represented_photos = []
for node, degree in nx.degree(g).iteritems():
    if not node.startswith('photo'):
        continue
    if degree > 20:
        well_represented_photos.append(node)
print(len(well_represented_photos))


18565

In [41]:
# Collect the degree of every user node (label starts with 'user').
user_degrees = []
for node, degree in nx.degree(g).iteritems():
    if node.startswith('user'):
        user_degrees.append(degree)
# Unit-width bins with edges 0..99; degrees above 99 fall outside the bin
# range and are not drawn.
hist(user_degrees, bins=arange(100))


Out[41]:
(array([  0.00000000e+00,   1.31286000e+05,   4.97170000e+04,
          2.79700000e+04,   1.81010000e+04,   1.27830000e+04,
          9.64900000e+03,   7.44500000e+03,   5.82300000e+03,
          4.93500000e+03,   4.12600000e+03,   3.50000000e+03,
          3.06900000e+03,   2.69700000e+03,   2.25800000e+03,
          2.03800000e+03,   1.80300000e+03,   1.70100000e+03,
          1.49600000e+03,   1.35700000e+03,   1.19100000e+03,
          1.11000000e+03,   9.81000000e+02,   9.09000000e+02,
          8.61000000e+02,   7.69000000e+02,   7.44000000e+02,
          7.10000000e+02,   6.77000000e+02,   5.79000000e+02,
          5.30000000e+02,   5.26000000e+02,   4.98000000e+02,
          4.77000000e+02,   4.25000000e+02,   4.17000000e+02,
          3.85000000e+02,   3.72000000e+02,   3.74000000e+02,
          3.55000000e+02,   3.02000000e+02,   3.12000000e+02,
          2.75000000e+02,   2.78000000e+02,   2.78000000e+02,
          3.02000000e+02,   2.44000000e+02,   2.58000000e+02,
          2.32000000e+02,   2.42000000e+02,   2.10000000e+02,
          1.89000000e+02,   1.98000000e+02,   1.59000000e+02,
          1.78000000e+02,   2.00000000e+02,   1.74000000e+02,
          1.55000000e+02,   1.59000000e+02,   1.47000000e+02,
          1.31000000e+02,   1.32000000e+02,   1.35000000e+02,
          1.10000000e+02,   1.29000000e+02,   1.13000000e+02,
          1.07000000e+02,   1.29000000e+02,   1.12000000e+02,
          1.22000000e+02,   9.40000000e+01,   8.90000000e+01,
          9.50000000e+01,   1.12000000e+02,   9.20000000e+01,
          9.20000000e+01,   8.60000000e+01,   8.80000000e+01,
          7.10000000e+01,   8.30000000e+01,   8.50000000e+01,
          7.70000000e+01,   7.60000000e+01,   9.10000000e+01,
          7.40000000e+01,   8.40000000e+01,   7.90000000e+01,
          8.50000000e+01,   7.20000000e+01,   6.30000000e+01,
          7.90000000e+01,   4.80000000e+01,   5.90000000e+01,
          7.80000000e+01,   7.50000000e+01,   5.00000000e+01,
          5.60000000e+01,   5.40000000e+01,   1.15000000e+02]),
 array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
        51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
        68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
        85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]),
 <a list of 99 Patch objects>)

In [43]:
# Keep the user nodes whose degree in g is strictly greater than 20.
well_represented_users = []
for node, degree in nx.degree(g).iteritems():
    if not node.startswith('user'):
        continue
    if degree > 20:
        well_represented_users.append(node)
print(len(well_represented_users))


25044

In [48]:
# Recover the integer user ids from the 'user_<id>' node labels and persist
# them as a protocol-2 pickle.
# The prefix length is derived from the prefix itself instead of a bare 5, and
# the loop variable is given a real name: in Python 2 a list-comprehension
# variable leaks into the enclosing scope, and `_` there would overwrite
# IPython's "last output" variable.
user_prefix = 'user_'
well_represented_user_ids = [
    int(user_node[len(user_prefix):]) for user_node in well_represented_users
]
with open('well_represented_user_ids.pickle', 'wb') as f:
    cPickle.dump(well_represented_user_ids, f, protocol=2)