network_analysis



In [54]:
import os
import networkx as nx
import matplotlib.pyplot as plt
import cython
import numpy as np
import pandas as pd
import datetime
import pylab

In [55]:
# load in merged data
data_path = '../../../UCB_dept_merge_CorrectDeptLabels.csv'
merged = pd.read_csv(data_path)

In [56]:
#clean up the department dataset
def convert_strings_to_specials(s):
    s = s.replace(' ', '_')
    s = s.replace(':', '_')
    s = s.replace('#', 'num')
    s = s.lower()
    return s
merged.columns = [convert_strings_to_specials(col) for col in merged.columns]

In [57]:
#convert creation_date to datetime object
merged['creation_date']=pd.to_datetime(pd.Series(merged['creation_date']))

In [58]:
month = [date.month for date in merged['creation_date']]
year = [date.year for date in merged['creation_date']]
day = [date.day for date in merged['creation_date']]
merged['month'] = month
merged['year'] = year
merged['day'] = day

In [59]:
#iterator for getting month,year
def month_year_iter( start_month, start_year, end_month, end_year ):
    ym_start= 12*start_year + start_month - 1
    ym_end= 12*end_year + end_month - 1
    for ym in range( ym_start, ym_end ):
        y, m = divmod( ym, 12 )
        yield y, m+1

In [10]:
def remove_zeroes(dictionary):
    for key,value in dictionary.items():
        if value == 0.0:
            del dictionary[key]
    return dictionary

In [62]:
#function to create a series of bipartite graphs across two sets of nodes(two columns) subsetted by month
#inputs are the data frame, set 1 and set 2 represent the names of the columns of interest, time_column represents
#the name of the time column in the dataset. year_column and month_columns are created because I did not know how
#to be able to specify a month year combination any other way
#returns a list of calculations(density, centrality, etc.) subseted by month in a list
#where each item in the list is in the following form: ([calculations],month,year)
def testCentralityGraph(data, set1, set2, time_column, month_column=None, year_column=None):
    results = []
    # higher order function that does the calculations by month
    def calculate(set1, set2, current):
        G = nx.Graph()
        G.add_nodes_from(current[set1],bipartite = 0)
        G.add_nodes_from(current[set2],bipartite = 1)
        edgeList = [tuple(x) for x in current.values]
        G.add_edges_from(edgeList)
        bottom_nodes, top_nodes = nx.bipartite.sets(G)
        #projecting both set1 onto set2 and vice versa to see if they give different results
        projected_bottom = nx.bipartite.projected_graph(G, bottom_nodes, multigraph = True)
        projected_top = nx.bipartite.projected_graph(G, top_nodes, multigraph = True)
        #bipartite density
        density = nx.bipartite.density(G, top_nodes)
        #rest of the calculations made for both projected graphs
        top_degree = nx.degree_centrality(projected_top)
        bot_degree = nx.degree_centrality(projected_bottom)
        top_between = nx.betweenness_centrality(projected_top)
        bot_between = nx.betweenness_centrality(projected_bottom)
        top_projected_density = nx.density(projected_top)
        bot_projected_density = nx.density(projected_bottom)
        
        return (G,density,bot_degree,top_degree,top_between,bot_between,
                top_projected_density,bot_projected_density)
            
    firstDate = min(data[time_column])
    lastDate = max(data[time_column])
    iter = month_year_iter(firstDate.month, firstDate.year, lastDate.month, lastDate.year)
    for year,month in iter:
        current = data[data[month_column] == month]
        current = current[current[year_column] == year]
        current = current[[set1,set2]]
        current.dropna()
        temp = calculate(set1,set2,current)
        results +=[(temp,month,year)]

    return results

In [63]:
#calculate the results of the data
masterlist = testCentralityGraph(merged,'department_name_update', 'supplier_name','creation_date','month','year')

In [100]:
#saving graphs for each month to image file
for item in masterlist:
    G = item[0][0]
    edgeList = G.edges()
    nodelistDept, nodelistSup = nx.bipartite.sets(G)
    nodelistDept = list(nodelistDept)
    nodelistSup = list(nodelistSup)
    pos=nx.networkx.spring_layout(G)
    nx.draw_networkx_nodes(G, pos, nodelist = nodelistDept, node_color = 'w', node_size = 50)
    nx.draw_networkx_nodes(G, pos, nodelist = nodelistSup, node_color= 'r', node_size=50)
    nx.draw_networkx_edges(G,pos,width=0.5,alpha=0.5)
    nx.draw_networkx_edges(G,pos,edgelist=edgeList)
    name = str(item[1]) + ', ' + str(item[2])
    plt.savefig(name)
    plt.clf()
    plt.cla()
    plt.close()

In [49]:
import json

In [65]:
#saving results from masterlist to csv files for further analysis
for item in masterlist:
    sup_degree_centrality = pd.DataFrame(data=item[0][2].items(), columns = ['supplier', 'degree_centrality'])
    dept_degree_centrality = pd.DataFrame(data = item[0][3].items(),columns = ['department','degree_centrality'])
    dept_between_centrality = pd.DataFrame(data = item[0][4].items(),columns = ['department','betweenness_centrality'])
    sup_betweeen_centrality = pd.DataFrame(data = item[0][5].items(), columns = ['supplier','betweenness_centrality'])
    name = str(item[1]) + '_' + str(item[2])
    sdc = name + '_' +'supplier_degree_centrality.csv'
    ddc = name + '_' + 'dept_degree_centrality.csv'
    dbc = name + '_' + 'dept_between_centrality.csv'
    sbc = name + '_' + 'sup_between_centrality.csv'
    sup_degree_centrality.to_csv(sdc)
    dept_degree_centrality.to_csv(ddc)
    dept_between_centrality.to_csv(dbc)
    sup_betweeen_centrality.to_csv(sbc)
    dense = {'density' : item[0][1], 'department_projected_density' : item[0][6], 'supplier_projected_density' : item[0][7]}
    density = name+'_' + 'density_calculations.json'
    with open(density, 'wb') as fp:
        json.dump(dense, fp)

In [83]:
purchase_type = set(merged['item_type'])
purchase_type = list(purchase_type)

In [85]:
purchase_type[1]


Out[85]:
'NonCatalog Product'

In [94]:
def testEdgeGraph(data, set1, set2,subset_set, time_column, month_column=None, year_column=None):
    density = {}
    # higher order function that does the calculations by month
    def calculate(set1, set2, current):
        G = nx.Graph()
        G.add_nodes_from(current[set1],bipartite = 0)
        G.add_nodes_from(current[set2],bipartite = 1)
        edgeList = [tuple(x) for x in current.values]
        G.add_edges_from(edgeList)
        bottom_nodes, top_nodes = nx.bipartite.sets(G)
        #projecting both set1 onto set2 and vice versa to see if they give different results
        projected_bottom = nx.bipartite.projected_graph(G, bottom_nodes, multigraph = True)
        projected_top = nx.bipartite.projected_graph(G, top_nodes, multigraph = True)
        #bipartite density
        density = nx.bipartite.density(G, top_nodes)
        #rest of the calculations made for both projected graphs
        """
        top_degree = nx.degree_centrality(projected_top)
        bot_degree = nx.degree_centrality(projected_bottom)
        top_between = nx.betweenness_centrality(projected_top)
        bot_between = nx.betweenness_centrality(projected_bottom)
        top_projected_density = nx.density(projected_top)
        bot_projected_density = nx.density(projected_bottom)
        """
        return (G,density)
            
    year = 2013
    subset = list(set(data[subset_set]))
    for month in range(1,13):
        current = data[data[month_column] == month]
        current = current[current[year_column] == year]
        for item in subset:
            current = current[current[subset_set] == item]
            current = current[[set1,set2]]
            current.dropna()
            temp = calculate(set1,set2,current)
            G = temp[0]
            edgeList = G.edges()
            nodelistDept, nodelistSup = nx.bipartite.sets(G)
            nodelistDept = list(nodelistDept)
            nodelistSup = list(nodelistSup)
            pos=nx.networkx.spring_layout(G)
            nx.draw_networkx_nodes(G, pos, nodelist = nodelistDept, node_color = 'r', node_size = 50)
            nx.draw_networkx_nodes(G, pos, nodelist = nodelistSup, node_color= 'w', node_size=50)
            nx.draw_networkx_edges(G,pos,width=0.5,alpha=0.5)
            nx.draw_networkx_edges(G,pos,edgelist=edgeList)
            name = item + '_'+ str(item[1]) + '_' + str(item[2])
            plt.savefig(name)
            plt.clf()
            plt.cla()
            plt.close()
            density[name] = temp[1]
    return density

In [98]:
def calculate(set1, set2, current):
    G = nx.Graph()
    G.add_nodes_from(current[set1],bipartite = 0)
    G.add_nodes_from(current[set2],bipartite = 1)
    edgeList = [tuple(x) for x in current.values]
    G.add_edges_from(edgeList)
    bottom_nodes, top_nodes = nx.bipartite.sets(G)
    density = nx.bipartite.density(G, top_nodes)
    return (G,density)
    #projecting both set1 onto set2 and vice versa to see if they give different results
    #projected_bottom = nx.bipartite.projected_graph(G, bottom_nodes, multigraph = True)
    #projected_top = nx.bipartite.projected_graph(G, top_nodes, multigraph = True)
    #bipartite density
    #rest of the calculations made for both projected graphs
    """
        top_degree = nx.degree_centrality(projected_top)
        bot_degree = nx.degree_centrality(projected_bottom)
        top_between = nx.betweenness_centrality(projected_top)
        bot_between = nx.betweenness_centrality(projected_bottom)
        top_projected_density = nx.density(projected_top)
        bot_projected_density = nx.density(projected_bottom)
    """

In [99]:
density = {}

purchase_type = set(merged['item_type'])
purchase_type = list(purchase_type)
set1,set2= 'department_name_update', 'supplier_name'
data2013 = merged[merged['year'] == 2013]
for month in range(1,13):
    current = data2013[data2013['month'] == month]
    for item in purchase_type:
        current = current[current['item_type'] == item]
        current = current[[set1,set2]]
        current.dropna()
        temp = calculate(set1,set2,current)
        G = temp[0]
        edgeList = G.edges()
        nodelistDept, nodelistSup = nx.bipartite.sets(G)
        nodelistDept = list(nodelistDept)
        nodelistSup = list(nodelistSup)
        pos=nx.networkx.spring_layout(G)
        nx.draw_networkx_nodes(G, pos, nodelist = nodelistDept, node_color = 'r', node_size = 50)
        nx.draw_networkx_nodes(G, pos, nodelist = nodelistSup, node_color= 'w', node_size=50)
        nx.draw_networkx_edges(G,pos,width=0.5,alpha=0.5)
        nx.draw_networkx_edges(G,pos,edgelist=edgeList)
        name = item + '_'+ str(month) + '_' + str(2013)
        plt.savefig(name)
        plt.clf()
        plt.cla()
        plt.close()
        density[name] = temp[1]


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-99-6bc6cf7e358a> in <module>()
      7     current = data2013[data2013['month'] == month]
      8     for item in purchase_type:
----> 9         current = current[current['item_type'] == item]
     10         current = current[[set1,set2]]
     11         current.dropna()

c:\Anaconda\lib\site-packages\pandas\core\frame.pyc in __getitem__(self, key)
   1778             return self._getitem_multilevel(key)
   1779         else:
-> 1780             return self._getitem_column(key)
   1781 
   1782     def _getitem_column(self, key):

c:\Anaconda\lib\site-packages\pandas\core\frame.pyc in _getitem_column(self, key)
   1785         # get column
   1786         if self.columns.is_unique:
-> 1787             return self._get_item_cache(key)
   1788 
   1789         # duplicate columns & possible reduce dimensionaility

c:\Anaconda\lib\site-packages\pandas\core\generic.pyc in _get_item_cache(self, item)
   1066         res = cache.get(item)
   1067         if res is None:
-> 1068             values = self._data.get(item)
   1069             res = self._box_item_values(item, values)
   1070             cache[item] = res

c:\Anaconda\lib\site-packages\pandas\core\internals.pyc in get(self, item, fastpath)
   2847 
   2848             if not isnull(item):
-> 2849                 loc = self.items.get_loc(item)
   2850             else:
   2851                 indexer = np.arange(len(self.items))[isnull(self.items)]

c:\Anaconda\lib\site-packages\pandas\core\index.pyc in get_loc(self, key)
   1400         loc : int if unique index, possibly slice or mask if not
   1401         """
-> 1402         return self._engine.get_loc(_values_from_object(key))
   1403 
   1404     def get_value(self, series, key):

pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:3807)()

pandas\index.pyx in pandas.index.IndexEngine.get_loc (pandas\index.c:3687)()

pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12310)()

pandas\hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas\hashtable.c:12261)()

KeyError: 'item_type'

In [ ]: