Transformar em matriz



In [1]:

    
import pandas as pd
import numpy as np



In [2]:

    
origin_df = pd.DataFrame.from_csv("networks/real2.csv")
# ceil = 1000



In [3]:

    
source = set(origin_df.index.unique())
target = set(origin_df['1'].unique())
nodes = source | target
len(nodes)









    Out[3]:





1957027



In [4]:

    
nodes = list(nodes)[:ceil]
len_nodes = len(nodes)



In [5]:

    
len_nodes









    Out[5]:





1000



In [11]:

    
matrix = np.zeros((len_nodes, len_nodes))
new_df = pd.DataFrame(matrix, index=nodes, columns=nodes, dtype=np.int32)



In [15]:

    
for i in range(len_nodes):
    # filtra casos que não estão no index
    if i in source:
        outgoing_nodes = np.array(origin_df.ix[i, '1']).flatten()
        for j in [var for var in outgoing_nodes if var < ceil]:
            new_df.ix[i, j] = 1



In [16]:

    
# df2.ix[:, 102].tail();



In [21]:

    
np.bincount(new_df.ix[:, 389] > 0)









    Out[21]:





array([998,   2])



In [142]:

    
df2.to_csv('input_test.csv')



In [ ]:



In [99]:

    
df2[df2.ix[:, 370] > 0]









    Out[99]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      990
      991
      992
      993
      994
      995
      996
      997
      998
      999
    
  
  
    
      202
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      204
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      349
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      369
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

4 rows × 1000 columns



In [105]:

    
df2[df2.ix[:, 369] > 0]









    Out[105]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      990
      991
      992
      993
      994
      995
      996
      997
      998
      999
    
  
  
    
      203
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      212
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

2 rows × 1000 columns



In [110]:

    
np.bincount(df2.sum(axis=0))









    Out[110]:





array([110, 579, 261,  44,   6])



In [111]:

    
import pickle



In [112]:

    
a = pd.Series([1,2,3,3])



In [113]:

    
a.to_pickle('test2.p')



In [ ]:

    
with open('filename.pickle', 'wb') as handle:
    pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('filename.pickle', 'rb') as handle:
    b = pickle.load(handle)



In [122]:

    
dir(pickle);



In [121]:

    
b = pickle.loads(open('test2.p', 'rb'))









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-121-9505128abd7e> in <module>()
----> 1 b = pickle.loads(open('test2.p', 'rb'))

TypeError: a bytes-like object is required, not '_io.BufferedReader'



In [123]:

    
with open('a.p', 'wb') as handle:
    pickle.dump(a, handle)

with open('a.p', 'rb') as handle:
    b = pickle.load(handle)



In [143]:

    
with open('page_rank_results.p', 'rb') as handle:
    pr = pickle.load(handle)
pr.order(ascending=False)









    



/Users/lucasosouza/anaconda/envs/udacity/lib/python3.5/site-packages/ipykernel/__main__.py:3: FutureWarning: order is deprecated, use sort_values(...)
  app.launch_new_instance()






    Out[143]:





370    0.001438
978    0.001332
249    0.001173
592    0.001173
996    0.001173
429    0.001119
872    0.001119
368    0.001119
477    0.001013
438    0.001013
865    0.001013
315    0.001013
184    0.001013
111    0.001013
118    0.001013
199    0.001013
33     0.001013
307    0.001013
459    0.001013
428    0.001013
359    0.001013
346    0.001013
658    0.001013
384    0.001013
509    0.001013
661    0.001013
656    0.001013
748    0.001013
516    0.001013
543    0.001013
         ...   
594    0.000375
977    0.000375
845    0.000375
859    0.000375
860    0.000375
872    0.000375
874    0.000375
879    0.000375
880    0.000375
881    0.000375
882    0.000375
884    0.000375
903    0.000375
905    0.000375
917    0.000375
921    0.000375
922    0.000375
923    0.000375
930    0.000375
932    0.000375
936    0.000375
938    0.000375
942    0.000375
944    0.000375
951    0.000375
952    0.000375
956    0.000375
960    0.000375
962    0.000375
301    0.000375
dtype: float64



In [129]:

    
import pandas
import numpy
def to_matrix(origin_df):
    # get unique nodes
    source = set(origin_df.index.unique()) 
    target = set(origin_df['1'].unique()) 
    nodes = source | target  

    # create zero matrix and init dataframe
    len_nodes = len(nodes)
    zero_matrix = numpy.zeros((len_nodes, len_nodes))  
    new_df = pandas.DataFrame(zero_matrix, index=nodes, columns=nodes, dtype=numpy.int32)

    # itera e cria a matriz
    ceil= 1000
    for i in range(len_nodes):
        # filtra casos que nao estao no index
        if i in source:
            # cria a lista com os target/outgoing
            target = numpy.array(origin_df.ix[i, '1']).flatten()
            # itero pelo target
            for j in [var for var in target if var < ceil]: 
                # atualizo o valor na matriz
                new_df.ix[i, j] = 1

    return new_df



In [ ]:

    
mt = to_matrix(pd.DataFrame.from_csv("networks/real2.csv"))



In [32]:

    
test_df = pd.read_csv('input_test.csv',dtype=np.int32, index_col=0)



In [33]:

    
test_df.head()









    Out[33]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      990
      991
      992
      993
      994
      995
      996
      997
      998
      999
    
  
  
    
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      0
      0
      0
      0
      0
      0
      1
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 1000 columns



In [34]:

    
test_df.dtypes









    Out[34]:





0      int32
1      int32
2      int32
3      int32
4      int32
5      int32
6      int32
7      int32
8      int32
9      int32
10     int32
11     int32
12     int32
13     int32
14     int32
15     int32
16     int32
17     int32
18     int32
19     int32
20     int32
21     int32
22     int32
23     int32
24     int32
25     int32
26     int32
27     int32
28     int32
29     int32
       ...  
970    int32
971    int32
972    int32
973    int32
974    int32
975    int32
976    int32
977    int32
978    int32
979    int32
980    int32
981    int32
982    int32
983    int32
984    int32
985    int32
986    int32
987    int32
988    int32
989    int32
990    int32
991    int32
992    int32
993    int32
994    int32
995    int32
996    int32
997    int32
998    int32
999    int32
dtype: object



In [36]:

    
new_df.columns = new_df.columns









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-36-a62b7bb00809> in <module>()
----> 1 new_df.columns = new_df.columns.to_numeric()

AttributeError: 'Int64Index' object has no attribute 'to_numeric'



In [41]:

    
new_df.columns.value()









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-41-5c105c923476> in <module>()
----> 1 new_df.columns.value()

AttributeError: 'Int64Index' object has no attribute 'value'



In [ ]:

	0	1	2	3	4	5	6	7	8	9	...	990	991	992	993	994	995	996	997	998	999
202	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
204	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
349	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
369	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	2	3	4	5	6	...
0	1	0	0	0	0	...
1	0	0	0	0	1	...
2	0	1	0	0	0	...
3	0	0	1	0	0	...
4	0	0	0	1	0	...

	0	1	2	3	4	5	6	7	8	9	...	990	991	992	993	994	995	996	997	998	999
202	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
204	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
349	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
369	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	2	3	4	5	6	...
0	1	0	0	0	0	...
1	0	0	0	0	1	...
2	0	1	0	0	0	...
3	0	0	1	0	0	...
4	0	0	0	1	0	...

	0	1	2	3	4	5	6	7	8	9	...	990	991	992	993	994	995	996	997	998	999
202	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
204	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
349	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
369	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	2	3	4	5	6	...
0	1	0	0	0	0	...
1	0	0	0	0	1	...
2	0	1	0	0	0	...
3	0	0	1	0	0	...
4	0	0	0	1	0	...