In [40]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from networkx.algorithms import bipartite
In [2]:
#this file has the org level 4 and 5 department names and numbers
data = pd.read_csv('/Users/dariushbozorgmehri/Documents/My Work/Berkeley Classes Spring 2015/BIDS/data/modified_data_4_15/departmentDataV3_ALL_DATA_CLEAN.csv', low_memory=False)
In [3]:
data = data.rename(columns={'Org.Level.4': 'org_level_4', 'Org.Level.4.Number': 'org_level_4_number','Org.Level.5': 'org_level_5', 'Org.Level.5.Number': 'org_level_5_number'})
In [4]:
data.head(3)
Out[4]:
In [7]:
data4Number = data[["org_level_4","org_level_4_number"]]
data4Number = data4Number.drop_duplicates()
data4Number = data4Number[["org_level_4_number", "org_level_4"]]
#reindex (1-based) after dropping duplicates
data4Number.index = range(1,len(data4Number) + 1)
In [8]:
data5Number = data[["org_level_5","org_level_5_number"]]
data5Number = data5Number.drop_duplicates()
data5Number = data5Number[["org_level_5_number", "org_level_5"]]
data5Number.index = range(1,len(data5Number) + 1)
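In [ ]:
# Added sanity check (a sketch, not part of the original run): each org number
# should map to exactly one name before it is used as a dictionary key below.
print data4Number['org_level_4_number'].is_unique, data5Number['org_level_5_number'].is_unique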
In [10]:
len(data4Number)
#data4Number = data4Number.drop(data4Number.index[[0]])
#data4Number
Out[10]:
In [40]:
#data4Number['org_level_4_number'] = data4['org_level_4_number'].to_string()
In [52]:
#a = data4Number.org_level_4_number[3]
#type(data4Number)
In [93]:
#data4Number['org_level_4_number'] = data4Number['org_level_4_number'].map(str.strip)
In [94]:
#convert data frame into a dictionary
#data4Dic = data4Number.set_index('org_level_4_number').to_dict()
In [11]:
data5Number.org_level_5_number[7]
Out[11]:
In [12]:
data4Number.head(10)
#reindex
#data4Number = data4Number.reindex(index = [0,291])
#data4Number.org_level_4_number[1]
Out[12]:
In [13]:
data4Dic = {}
In [14]:
#CREATE A DICTIONARY for org level 4: number -> name
#data4Dic2[data4Number.org_level_4_number[1]] = data4Number.org_level_4[1]
#the reindexed frame runs 1..len(data4Number), so include the last row
for i in range(1, len(data4Number) + 1):
    data4Dic.update({data4Number.org_level_4_number[i]: data4Number.org_level_4[i]})
In [15]:
data5Dic = {}
In [16]:
#CREATE A DICTIONARY for org level 5: number -> name
for i in range(1, len(data5Number) + 1):
    data5Dic.update({data5Number.org_level_5_number[i]: data5Number.org_level_5[i]})
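In [ ]:
# Equivalent one-liners (a sketch of the set_index(...).to_dict() idea commented
# out earlier): rebuild the same number -> name lookups without an explicit loop.
data4Dic = data4Number.set_index('org_level_4_number')['org_level_4'].to_dict()
data5Dic = data5Number.set_index('org_level_5_number')['org_level_5'].to_dict()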
In [17]:
data5Dic
len(data5Number)
Out[17]:
In [18]:
data5Number.org_level_5_number[1]
Out[18]:
In [19]:
data4Number.org_level_4_number[1]
#len(data4Number)
Out[19]:
In [20]:
dataOrig = pd.read_csv('/Users/dariushbozorgmehri/Documents/My Work/Berkeley Classes Spring 2015/BIDS/data/UCB_dept_merge.csv', low_memory=False)
In [21]:
dataOrig.head(7)
Out[21]:
In [22]:
dataOrig['department_number'] = dataOrig["department_name"].apply(lambda x: x[:5])
dataOrig['department_name_stripped'] = dataOrig["department_name"].apply(lambda x: x[6:])
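In [ ]:
# The same split using pandas' vectorized .str accessor (a sketch; same result as
# the two apply(lambda ...) calls above).
dataOrig['department_number'] = dataOrig['department_name'].str.slice(0, 5)
dataOrig['department_name_stripped'] = dataOrig['department_name'].str.slice(6)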
In [23]:
dataOrig['department_name_update'] = ""
In [24]:
dataOrig.head()
Out[24]:
In [48]:
"""#this creates a list
dataOrigEdit = dataOrig["department_number"]
#convert back to dataframe
dataOrigEdit = pd.DataFrame(dataOrigEdit)
dataOrigEdit """
Out[48]:
In [114]:
dataMerged = pd.merge(dataOrig, data4Number, left_on='department_number', right_index=True)
dataMerged
Out[114]:
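In [ ]:
# Note: the merge above joins department_number against data4Number's integer row
# index, which probably does not line up. A sketch of merging on the actual org
# number column instead (the dictionary lookups below are what the notebook
# ultimately uses):
dataMergedByCol = pd.merge(dataOrig, data4Number,
                           left_on='department_number', right_on='org_level_4_number',
                           how='left')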
In [58]:
i=0
#dataOrig.department_number[i]
j=1
data4Number.org_level_4_number[j]
Out[58]:
In [151]:
""" i=0
for i in range(10000,50000):
j=0
for j in range(1, len(data4Number)):
#print j
if dataOrig.department_number[i] == data4Number.org_level_4_number[j]:
dataOrig.department_name_update[i] = data4Number.org_level_4[j] """
Out[151]:
In [25]:
#USE DICTIONARY lookups instead of the nested loop above
for i in range(len(dataOrig)):
    if dataOrig.department_number[i] in data4Dic:
        #print "in there"
        dataOrig.loc[i, 'department_name_update'] = data4Dic[dataOrig.department_number[i]]
    elif dataOrig.department_number[i] in data5Dic:
        dataOrig.loc[i, 'department_name_update'] = data5Dic[dataOrig.department_number[i]]
    else:
        dataOrig.loc[i, 'department_name_update'] = dataOrig.department_name_stripped[i]
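In [ ]:
# Vectorized version of the lookup above (a sketch using Series.map; it should
# produce the same department_name_update column without the Python-level loop).
dataOrig['department_name_update'] = (
    dataOrig['department_number'].map(data4Dic)
    .fillna(dataOrig['department_number'].map(data5Dic))
    .fillna(dataOrig['department_name_stripped']))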
In [27]:
dataOrig.head()
Out[27]:
In [28]:
dataOrig.department_name_update.describe()
Out[28]:
In [3]:
#READ IN CLEANED UP DATA
#dataOrig.to_csv('/Users/dariushbozorgmehri/Documents/My Work/Berkeley Classes Spring 2015/BIDS/data/temp.csv')
dataOrig = pd.read_csv('/Users/dariushbozorgmehri/Documents/My Work/Berkeley Classes Spring 2015/BIDS/data/temp.csv')
In [4]:
#clean up data
dataOrig['supplier_name'] = dataOrig['supplier_name'].str.replace(',', ' ')
dataOrig['supplier_name'] = dataOrig['supplier_name'].str.replace(' ', '_')
dataOrig['department_name_update'] = dataOrig['department_name_update'].str.replace(',', ' ')
dataOrig['department_name_update'] = dataOrig['department_name_update'].str.replace(' ', '_')
In [5]:
dataOrig['creation_date'] = dataOrig['creation_date'].str.replace('00:00:00', '')
In [6]:
#create separate year, month, day columns
dataOrig['year'] = dataOrig["creation_date"].apply(lambda x: x[:4])
dataOrig['month'] = dataOrig['creation_date'].apply(lambda x: x[5:7])
dataOrig['day'] = dataOrig['creation_date'].apply(lambda x: x[8:10])
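In [ ]:
# Alternative sketch: let pandas parse the dates and pull the parts from the .dt
# accessor. Note this yields integer year/month/day, while the slicing above keeps
# strings (the year == '2013' filter further down assumes strings).
parsedDates = pd.to_datetime(dataOrig['creation_date'])
parsedDates.dt.year.head(3), parsedDates.dt.month.head(3), parsedDates.dt.day.head(3)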
In [8]:
dataOrig.head(3)
Out[8]:
In [9]:
#subset data
#data = data[['creation_date', 'supplier_name','item_type','org_level_3']]
dataOrigSubset = dataOrig[[ 'supplier_name','department_name_update','item_type','year','month','day']]
In [11]:
dataOrigSubset.head(50)
Out[11]:
In [12]:
dataOrigSubset.year.describe()
#data4['year']= data4['year'].astype(int)
#data4['year'].isnull().sum()
#data4[50000:100000]
#len(data4)
len(dataOrigSubset)
Out[12]:
In [26]:
#subset by row value: keep the year 2013 only (i.e. drop all 2014 rows) and exclude the major office-supply vendors
#data = data[data['year'] == '2013']
dataOrigSubset = dataOrigSubset[dataOrigSubset['year'] == '2013']
dataOrigSubset = dataOrigSubset[dataOrigSubset.supplier_name != 'OFFICE_MAX']
dataOrigSubset = dataOrigSubset[dataOrigSubset.supplier_name != 'GIVE_SOMETHING_BACK']
dataOrigSubset = dataOrigSubset[dataOrigSubset.supplier_name != 'ALKO_OFFICE_SUPPLY']
dataOrigSubset.head()
Out[26]:
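In [ ]:
# The same supplier filter in one step (a sketch), using isin on a list of the
# excluded office-supply vendors.
excludedSuppliers = ['OFFICE_MAX', 'GIVE_SOMETHING_BACK', 'ALKO_OFFICE_SUPPLY']
dataOrigSubset = dataOrigSubset[~dataOrigSubset['supplier_name'].isin(excludedSuppliers)]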
In [37]:
dataOrigSubset.notnull().sum()
Out[37]:
In [48]:
"""data13Feb = data[data['month'].isin(['02'])]
#data13Feb = data13Feb.drop_duplicates()
data13March = data[data['month'].isin(['03'])]"""
Out[48]:
In [27]:
#PREPARE DATA FOR GRAPH
dataToGraph = dataOrigSubset[dataOrigSubset['month'].isin(['02'])]
dataToGraph = dataToGraph[dataToGraph['item_type'].isin(['NonCatalog Product'])]
dataToGraph = dataToGraph[['supplier_name','department_name_update']]
dataToGraph = dataToGraph.drop_duplicates()
#GraphData(dataToGraph, month)
dataToGraph.to_csv('/Users/dariushbozorgmehri/Documents/My Work/Berkeley Classes Spring 2015/BIDS/data/DATA_CUT_JAN13ALL.csv')
In [23]:
dataToGraph.to_csv('/Users/dariushbozorgmehri/Documents/My Work/Berkeley Classes Spring 2015/BIDS/data/DATA2013FEB_NONCAT.csv')
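In [ ]:
# Possible extension (a sketch, not part of the original run): instead of
# drop_duplicates, count how often each department-supplier pair occurs in
# February, so repeat purchasing could later serve as an edge weight.
edgeCounts = (dataOrigSubset[dataOrigSubset['month'] == '02']
              .groupby(['department_name_update', 'supplier_name'])
              .size())
edgeCounts.head(10)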
In [142]:
"""#CREATE GRAPH AND ADD NODES AND EDGES
#def GraphData(dataToGraph, month):
G = nx.Graph()
G.add_nodes_from(dataToGraph['department_name_update'], bipartite = 0)
G.add_nodes_from(dataToGraph['supplier_name'], bipartite = 1)
edgeList = [tuple(x) for x in dataToGraph.values]
G.add_edges_from(edgeList)
nx.draw(G)
plt.show()"""
In [69]:
#CLEAR THE GRAPH FOR REGRAPHING
print "clearing graph"
G.clear()   #Graph.clear() works in place and returns None, so don't reassign G
#print "number of nodes and edges:"
#if G.nodes() == 0 and G.edges() == 0:
#print "node and edges are clear"
nodelistDept = []
nodelistSup = []
edgeList = []
print "clearing nodelistDept", nodelistDept, nodelistSup, edgeList
In [71]:
#CREATE GRAPH AND ADD NODES AND EDGES
#def GraphData(dataToGraph, month):
G=nx.Graph()
#G = nx.Graph()
G.add_nodes_from(dataToGraph['department_name_update'], bipartite = 0)
G.add_nodes_from(dataToGraph['supplier_name'], bipartite = 1)
edgeList = [tuple(x) for x in dataToGraph.values]
G.add_edges_from(edgeList)
"""nx.draw(G)
plt.show() """
#CREATE A GRAPH WITH NODES COLORED ACCORDING TO DEPARTMENT OR SUPPLIER, AND SIZED
#WITH DEPT LARGER THAN SUPPLIER, CREATE NETWORK X NODELIST FROM DATAFRAME
nodelistDept = dataToGraph['department_name_update'].tolist()
nodelistSup = dataToGraph['supplier_name'].tolist()
#node_color can be an (R, G, B, A) tuple such as (.2, .4, .6, .8), a grey shade from 0.1 to 1,
#or a letter: b: blue, g: green, r: red, c: cyan, m: magenta, y: yellow, k: black, w: white
#network layout
pos = nx.spring_layout(G)
#draw nodes
nx.draw_networkx_nodes(G, pos, nodelist = nodelistDept, node_color = 'r', node_size = 50)
nx.draw_networkx_nodes(G, pos, nodelist = nodelistSup, node_color= 'w', node_size=50)
#draw edges
nx.draw_networkx_edges(G,pos,width=0.5,alpha=0.5)
nx.draw_networkx_edges(G,pos,edgelist=edgeList)
"""#draw the labels
labels = {k:k for k in nodelistDept}
plt.clf()
nx.draw_networkx_labels(G,pos, labels)"""
#nx.draw_networkx_labels(G,pos,font_size=12,font_family='sans-serif')
#plt.show()
#NEWpicName = 'data2013' + month
plt.savefig('/Users/dariushbozorgmehri/Documents/My Work/Berkeley Classes Spring 2015/BIDS/data/temp_pics/NEWpicName.png')
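In [ ]:
# A sketch of the GraphData(dataToGraph, month) helper hinted at in the comments
# above, so the same bipartite plot can be regenerated per month; the filename
# pattern follows the commented-out NEWpicName idea and is an assumption.
def GraphData(dataToGraph, month):
    G = nx.Graph()
    G.add_nodes_from(dataToGraph['department_name_update'], bipartite=0)
    G.add_nodes_from(dataToGraph['supplier_name'], bipartite=1)
    G.add_edges_from([tuple(x) for x in dataToGraph.values])
    pos = nx.spring_layout(G)
    #departments in red, suppliers in white
    nx.draw_networkx_nodes(G, pos, nodelist=dataToGraph['department_name_update'].tolist(),
                           node_color='r', node_size=50)
    nx.draw_networkx_nodes(G, pos, nodelist=dataToGraph['supplier_name'].tolist(),
                           node_color='w', node_size=50)
    nx.draw_networkx_edges(G, pos, width=0.5, alpha=0.5)
    plt.savefig('/Users/dariushbozorgmehri/Documents/My Work/Berkeley Classes Spring 2015/BIDS/data/temp_pics/data2013' + month + '.png')
    plt.clf()
    return G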
In [48]:
#the bipartite betweenness routine expects the nodes of ONE bipartite set as its
#second argument, so pass the department nodes rather than all of G.nodes()
cent = nx.bipartite.betweenness_centrality(G, set(nodelistDept))
centDf = pd.DataFrame(cent.items(), columns=['actor', 'centrality']).sort(columns="centrality", ascending=False)
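In [ ]:
# Quick look at the most central actors (an added sketch; output not shown here).
centDf.head(10)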
In [61]:
bottom_nodes, top_nodes = bipartite.sets(G)
In [62]:
nx.bipartite.density(G, top_nodes), nx.bipartite.density(G, bottom_nodes)
Out[62]:
In [63]:
nx.bipartite.average_clustering(G)
Out[63]:
In [64]:
#nx.bipartite.clustering(G)
In [53]:
#nx.bipartite.color(G)
In [65]:
GProjected = nx.bipartite.projected_graph(G, top_nodes)
In [66]:
nx.draw(GProjected)
plt.show()
In [67]:
nx.density(GProjected)
Out[67]:
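In [ ]:
# Possible follow-up (a sketch): keep tie strength when collapsing the bipartite
# graph, using networkx's weighted projection; edge weights count shared neighbors
# from the other node set.
GProjectedWeighted = nx.bipartite.weighted_projected_graph(G, top_nodes)
nx.density(GProjectedWeighted)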
In [ ]: