In [54]:
import os
import networkx as nx
import matplotlib.pyplot as plt
import cython
import numpy as np
import pandas as pd
import datetime
import pylab
In [55]:
# Load the merged CSV into a DataFrame. The path is relative, so it is
# resolved against the directory the notebook kernel is running in.
data_path = '../../../UCB_dept_merge_CorrectDeptLabels.csv'
merged = pd.read_csv(data_path)
In [56]:
#clean up the department dataset
def convert_strings_to_specials(s):
    """Return ``s`` rewritten as a lower-case, identifier-friendly label.

    Spaces and colons each become an underscore, every ``#`` becomes the
    text ``num``, and the result is lower-cased.
    """
    substitutions = ((' ', '_'), (':', '_'), ('#', 'num'))
    for old, new in substitutions:
        s = s.replace(old, new)
    return s.lower()
# Normalise every column label of the merged frame with the helper above.
merged.columns = merged.columns.map(convert_strings_to_specials)
In [57]:
# Parse the creation_date column into pandas datetime values.
merged['creation_date'] = pd.to_datetime(merged['creation_date'])
In [58]:
# Split creation_date into calendar components. The vectorized .dt accessor
# replaces three Python-level loops over the column and avoids leaving
# throwaway `month` / `year` / `day` lists in the notebook namespace.
creation_dates = merged['creation_date']
merged['month'] = creation_dates.dt.month
merged['year'] = creation_dates.dt.year
merged['day'] = creation_dates.dt.day
In [59]:
#iterator for getting month,year
def month_year_iter(start_month, start_year, end_month, end_year):
    """Yield ``(year, month)`` pairs for consecutive calendar months.

    Iteration starts at ``start_month``/``start_year`` and stops *before*
    ``end_month``/``end_year``: the end month itself is not yielded.
    Months are numbered 1-12.
    """
    # Count months from year 0 so that stepping by one crosses year
    # boundaries without special cases.
    first_index = start_year * 12 + (start_month - 1)
    stop_index = end_year * 12 + (end_month - 1)
    for month_index in range(first_index, stop_index):
        yield month_index // 12, month_index % 12 + 1
In [10]:
def remove_zeroes(dictionary):
    """Delete every entry whose value equals zero from ``dictionary``.

    The dictionary is modified in place and the same object is returned.
    Any value that compares equal to ``0.0`` is removed, so integer ``0``
    is dropped as well.
    """
    # Collect the keys first: deleting entries while iterating over the live
    # items view raises "dictionary changed size during iteration" on Python 3.
    zero_keys = [key for key, value in dictionary.items() if value == 0.0]
    for key in zero_keys:
        del dictionary[key]
    return dictionary
In [62]:
def testCentralityGraph(data, set1, set2, time_column, month_column=None, year_column=None):
    """Compute bipartite-network statistics for every calendar month in ``data``.

    For each month from the earliest to the latest value of
    ``data[time_column]`` (both inclusive), the rows of that month are turned
    into a bipartite graph whose edges are the ``(set1, set2)`` value pairs.
    Density and centrality measures are then computed on that graph and on
    both of its one-mode projections.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows.
    set1, set2 : column labels
        The two columns whose values form the two node sets. Nodes from
        ``set1`` are the "top" set, nodes from ``set2`` the "bottom" set.
    time_column : column label
        Column of datetime values; its minimum and maximum bound the months
        that are processed.
    month_column, year_column : column labels, optional
        Columns holding the month number and the year of each row. When
        omitted, they are derived from ``time_column``.

    Returns
    -------
    list
        One ``(calculations, month, year)`` tuple per month, in chronological
        order, where ``calculations`` is the tuple
        ``(G, density, bot_degree, top_degree, top_between, bot_between,
        top_projected_density, bot_projected_density)``.
        An empty list is returned when ``data`` has no usable dates.
    """
    def calculate(set1, set2, current):
        # Nested helper: builds the graph for one month's rows and computes
        # every statistic on it.
        G = nx.Graph()
        G.add_nodes_from(current[set1], bipartite = 0)
        G.add_nodes_from(current[set2], bipartite = 1)
        G.add_edges_from([tuple(x) for x in current.values])
        # Recover the two node sets from the ``bipartite`` attribute assigned
        # above instead of nx.bipartite.sets(): that function two-colours each
        # connected component on its own, so on a disconnected graph it raises
        # AmbiguousSolution (NetworkX >= 2) or may place nodes of both columns
        # in the same set, and it never guarantees which set comes first.
        # NOTE(review): assumes no label occurs in both columns - confirm.
        top_nodes = set(node for node, attrs in G.nodes(data=True)
                        if attrs.get('bipartite') == 0)
        bottom_nodes = set(G) - top_nodes
        # Project onto each node set in turn so both views can be compared.
        projected_bottom = nx.bipartite.projected_graph(G, bottom_nodes, multigraph = True)
        projected_top = nx.bipartite.projected_graph(G, top_nodes, multigraph = True)
        # Bipartite density of the two-mode graph.
        density = nx.bipartite.density(G, top_nodes)
        # Remaining measures, computed for both projections.
        top_degree = nx.degree_centrality(projected_top)
        bot_degree = nx.degree_centrality(projected_bottom)
        top_between = nx.betweenness_centrality(projected_top)
        bot_between = nx.betweenness_centrality(projected_bottom)
        top_projected_density = nx.density(projected_top)
        bot_projected_density = nx.density(projected_bottom)
        return (G,density,bot_degree,top_degree,top_between,bot_between,
                top_projected_density,bot_projected_density)

    results = []
    stamps = data[time_column]
    # Series.min()/max() skip missing timestamps, unlike the builtins.
    first_date = stamps.min()
    last_date = stamps.max()
    if pd.isnull(first_date):
        return results  # no rows, or no valid dates at all

    months = data[month_column] if month_column is not None else stamps.dt.month
    years = data[year_column] if year_column is not None else stamps.dt.year

    # month_year_iter stops *before* its end month, so hand it the month that
    # follows last_date; otherwise the final month of data is never processed.
    end_year, end_month_index = divmod(12 * last_date.year + last_date.month, 12)
    month_pairs = month_year_iter(first_date.month, first_date.year,
                                  end_month_index + 1, end_year)
    for year, month in month_pairs:
        in_month = (years == year) & (months == month)
        # dropna() returns a new frame; keep it so missing values never
        # become graph nodes.
        current = data.loc[in_month, [set1, set2]].dropna()
        results.append((calculate(set1, set2, current), month, year))
    return results
In [63]:
# Run the month-by-month network calculations on the department and
# supplier columns of the merged frame.
masterlist = testCentralityGraph(
    data=merged,
    set1='department_name_update',
    set2='supplier_name',
    time_column='creation_date',
    month_column='month',
    year_column='year',
)
In [100]:
# Save one image of the bipartite graph per month.
for item in masterlist:
    G = item[0][0]
    # Split the nodes by the ``bipartite`` attribute given to them when the
    # graph was built (0 for the first column, 1 for the second).
    # nx.bipartite.sets() does not guarantee which set is returned first and
    # raises AmbiguousSolution on disconnected graphs in NetworkX >= 2.
    nodelistDept = [node for node, attrs in G.nodes(data=True)
                    if attrs.get('bipartite') == 0]
    nodelistSup = [node for node, attrs in G.nodes(data=True)
                   if attrs.get('bipartite') == 1]
    pos = nx.spring_layout(G)
    fig, ax = plt.subplots()
    nx.draw_networkx_nodes(G, pos, nodelist = nodelistDept, node_color = 'w', node_size = 50, ax = ax)
    nx.draw_networkx_nodes(G, pos, nodelist = nodelistSup, node_color= 'r', node_size=50, ax = ax)
    # One edge pass is enough: the earlier translucent pass was completely
    # covered by a second, opaque pass over the same edges.
    nx.draw_networkx_edges(G, pos, ax = ax)
    name = str(item[1]) + ', ' + str(item[2])
    fig.savefig(name)
    # Release the figure so memory does not grow with the number of months.
    plt.close(fig)
In [49]:
import json
In [65]:
# Save the per-month results from masterlist to CSV / JSON files for further analysis.
# Each entry: (position inside the calculations tuple, node-label column,
# measure column, file-name suffix).
centrality_tables = [
    (2, 'supplier', 'degree_centrality', 'supplier_degree_centrality.csv'),
    (3, 'department', 'degree_centrality', 'dept_degree_centrality.csv'),
    (4, 'department', 'betweenness_centrality', 'dept_between_centrality.csv'),
    (5, 'supplier', 'betweenness_centrality', 'sup_between_centrality.csv'),
]
for item in masterlist:
    calculations = item[0]
    name = str(item[1]) + '_' + str(item[2])
    for position, label, measure, suffix in centrality_tables:
        # list(...) so the constructor receives a concrete sequence of
        # (node, value) pairs rather than a dict view on Python 3.
        table = pd.DataFrame(data=list(calculations[position].items()),
                             columns=[label, measure])
        table.to_csv(name + '_' + suffix)
    dense = {'density' : calculations[1],
             'department_projected_density' : calculations[6],
             'supplier_projected_density' : calculations[7]}
    # json.dump writes text, so the file must be opened in text mode;
    # binary mode ('wb') raises TypeError on Python 3.
    with open(name + '_' + 'density_calculations.json', 'w') as fp:
        json.dump(dense, fp)
In [83]:
# Distinct values of the item_type column, as a list.
purchase_type = list(set(merged['item_type']))
In [85]:
# Peek at one item type. The list was built from a set, so which value
# sits at index 1 is not guaranteed to be the same from run to run.
purchase_type[1]
Out[85]:
In [94]:
def testEdgeGraph(data, set1, set2, subset_set, time_column, month_column=None, year_column=None, year=2013):
    """Draw and measure one bipartite graph per (month, subset value) of a year.

    For every month of ``year`` and every distinct non-null value of
    ``data[subset_set]``, the matching rows are turned into a bipartite graph
    whose edges are the ``(set1, set2)`` value pairs. The graph is saved as
    an image named ``<value>_<month>_<year>`` and its bipartite density is
    recorded. Combinations with no rows are skipped: no image is written and
    no density is recorded for them.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows.
    set1, set2 : column labels
        Columns whose values form the two node sets.
    subset_set : column label
        Column whose distinct values define the subsets.
    time_column : column label
        Datetime column; only read when ``month_column`` or ``year_column``
        is omitted, to derive the missing one.
    month_column, year_column : column labels, optional
        Columns holding the month number and the year of each row.
    year : int, optional
        Year to process (default 2013).

    Returns
    -------
    dict
        Maps each image name to the bipartite density of that graph.
    """
    def calculate(set1, set2, current):
        # Nested helper: graph, its set1 nodes and its bipartite density.
        G = nx.Graph()
        G.add_nodes_from(current[set1],bipartite = 0)
        G.add_nodes_from(current[set2],bipartite = 1)
        G.add_edges_from([tuple(x) for x in current.values])
        # Read the partition from the node attribute: nx.bipartite.sets()
        # raises AmbiguousSolution on disconnected graphs (NetworkX >= 2) and
        # does not guarantee which set it returns first.
        set1_nodes = set(node for node, attrs in G.nodes(data=True)
                         if attrs.get('bipartite') == 0)
        return G, set1_nodes, nx.bipartite.density(G, set1_nodes)

    density = {}
    months = data[month_column] if month_column is not None else data[time_column].dt.month
    years = data[year_column] if year_column is not None else data[time_column].dt.year
    # Null subset values can never match an equality test, so leave them out.
    subset = data[subset_set].dropna().unique()
    for month in range(1,13):
        in_month = (years == year) & (months == month)
        for item in subset:
            # Filter from ``data`` every time: narrowing a shared frame inside
            # this loop would strip the subset column after the first value.
            # dropna() returns a new frame, so its result has to be kept.
            current = data.loc[in_month & (data[subset_set] == item), [set1, set2]].dropna()
            if current.empty:
                continue
            G, set1_nodes, graph_density = calculate(set1, set2, current)
            set2_nodes = [node for node in G if node not in set1_nodes]
            pos = nx.spring_layout(G)
            fig, ax = plt.subplots()
            nx.draw_networkx_nodes(G, pos, nodelist = list(set1_nodes), node_color = 'r', node_size = 50, ax = ax)
            nx.draw_networkx_nodes(G, pos, nodelist = set2_nodes, node_color= 'w', node_size=50, ax = ax)
            nx.draw_networkx_edges(G, pos, ax = ax)
            # Name the image after the subset value, month and year.
            name = str(item) + '_' + str(month) + '_' + str(year)
            fig.savefig(name)
            plt.close(fig)
            density[name] = graph_density
    return density
In [98]:
def calculate(set1, set2, current):
    """Build the bipartite graph of ``current`` and return it with its density.

    Parameters
    ----------
    set1, set2 : column labels
        Columns of ``current`` whose values form the two node sets. Nodes
        from ``set1`` get the node attribute ``bipartite=0``, nodes from
        ``set2`` get ``bipartite=1``.
    current : pandas.DataFrame
        Rows to turn into edges; each row contributes one
        ``(set1 value, set2 value)`` edge. Rows with a missing value in
        either column are ignored.

    Returns
    -------
    tuple
        ``(G, density)``: the ``networkx.Graph`` and its bipartite density.
    """
    # Keep only the two node columns so every row is exactly one
    # (node, node) pair, and drop incomplete rows so NaN never becomes a node.
    pairs = current[[set1, set2]].dropna()
    G = nx.Graph()
    G.add_nodes_from(pairs[set1], bipartite = 0)
    G.add_nodes_from(pairs[set2], bipartite = 1)
    G.add_edges_from(tuple(row) for row in pairs.values)
    # Take one side of the partition from the node attribute rather than
    # nx.bipartite.sets(), which raises AmbiguousSolution on disconnected
    # graphs in NetworkX >= 2.
    set1_nodes = set(node for node, attrs in G.nodes(data=True)
                     if attrs.get('bipartite') == 0)
    density = nx.bipartite.density(G, set1_nodes)
    return (G,density)
In [99]:
# Draw one department/supplier graph per (month, item type) of 2013 and
# record the bipartite density of each graph under its image name.
density = {}
purchase_type = list(set(merged['item_type']))
set1,set2= 'department_name_update', 'supplier_name'
data2013 = merged[merged['year'] == 2013]
for month in range(1,13):
    month_rows = data2013[data2013['month'] == month]
    for item in purchase_type:
        if pd.isnull(item):
            continue  # a missing item type matches no row and cannot name a file
        # Filter from month_rows every time: overwriting the month's frame
        # here would strip the item_type column after the first item type.
        # dropna() returns a new frame, so its result has to be kept.
        current = month_rows.loc[month_rows['item_type'] == item, [set1, set2]].dropna()
        if current.empty:
            continue  # nothing to draw for this combination
        temp = calculate(set1,set2,current)
        G = temp[0]
        # calculate() tags nodes with bipartite=0 (set1) / bipartite=1 (set2);
        # use that tag instead of nx.bipartite.sets(), which does not guarantee
        # which set is returned first.
        nodelistDept = [node for node, attrs in G.nodes(data=True)
                        if attrs.get('bipartite') == 0]
        nodelistSup = [node for node, attrs in G.nodes(data=True)
                       if attrs.get('bipartite') == 1]
        pos = nx.spring_layout(G)
        fig, ax = plt.subplots()
        nx.draw_networkx_nodes(G, pos, nodelist = nodelistDept, node_color = 'r', node_size = 50, ax = ax)
        nx.draw_networkx_nodes(G, pos, nodelist = nodelistSup, node_color= 'w', node_size=50, ax = ax)
        nx.draw_networkx_edges(G, pos, ax = ax)
        name = str(item) + '_'+ str(month) + '_' + str(2013)
        fig.savefig(name)
        plt.close(fig)
        density[name] = temp[1]
In [ ]: