In [1]:
from traitlets.config.manager import BaseJSONConfigManager
# Update the 'livereveal' section of the notebook config stored under `path`
# with the slideshow display settings below (16:9 canvas, `pixels` high).
path = "/Users/bob/anaconda/etc/jupyter/nbconfig"
pixels = 900
livereveal_settings = dict(
    transition='convex',
    start_slideshow_at='selected',
    scroll=True,
    width=pixels * 16 / 9,
    height=pixels,
    controls=False,
)
cm = BaseJSONConfigManager(config_dir=path)
cm.update('livereveal', livereveal_settings)
Out[1]:
In [2]:
from __future__ import division
%matplotlib inline
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = (8.0, 6.0)
import json
import itertools
import pandas as pd
import numpy as np
import networkx as nx
A 501(c)(3) nonprofit started to connect professional and aspiring data scientists with problems involving social good.
|
In [3]:
# Load the scraped posts. Identifier-like columns are kept as strings,
# names are lower-cased on read, and 'postdate' is parsed as a date.
df = pd.read_csv(
    '../data/scraped_data.csv.gz',
    parse_dates=['postdate'],
    converters=dict(
        name=lambda raw: str(raw).lower(),
        number=str,
        oid=str,
        post_id=str,
    ),
)
df.head()
Out[3]:
In [6]:
# Summary statistics for every column, not only the numeric ones.
df.describe(include='all')
Out[6]:
In [7]:
# Count rows per phone number and show the numbers with the most post_ids.
(
    df.groupby('number')
    .count()
    .sort_values('post_id', ascending=False)[['post_id']]
    .head()
)
Out[7]:
In [8]:
# Sample: every post that lists one specific phone number.
ph_sample = df[df['number'] == '7865032020']
ph_sample.sort_values(by='name', ascending=False).head()
Out[8]:
In [9]:
def plot_graph_data(in_data, data_type, color, G=None):
    '''
    Add a fully connected set of edges to a graph and draw it on a circle.

    Every ordered pair from ``in_data`` x ``in_data`` (including each item
    paired with itself) is added as an edge carrying ``type`` and ``color``
    attributes, then the whole graph is drawn with a circular layout.

    Parameters
    ----------
    in_data : iterable
        Node identifiers to connect to one another.
    data_type :
        Value stored as the ``'type'`` attribute of each new edge.
    color :
        Value stored as the ``'color'`` attribute of each new edge and used
        as its colour when drawing.
    G : networkx graph, optional
        Graph to add the edges to. A new empty ``nx.Graph`` is created for
        each call when omitted.

    Returns
    -------
    The graph with the new edges added.
    '''
    # A default of ``nx.Graph()`` in the signature is built once at definition
    # time and shared by every call, so edges would accumulate across plots.
    if G is None:
        G = nx.Graph()
    # Materialise once so a one-shot iterator can still be paired with itself.
    items = list(in_data)
    out = [(a, b, {'type': data_type, 'color': color})
           for a, b in itertools.product(items, repeat=2)]
    G.add_edges_from(out)
    pos = nx.circular_layout(G)
    # Colour each drawn edge by the 'color' attribute stored on it.
    colors = [G[u][v]['color'] for u, v in G.edges()]
    nx.draw(G, pos, node_color='k', edge_color=colors, width=1, node_size=15)
    return G
In [10]:
# Fully connect the phone-sample posts in a fresh graph (blue edges).
G_samp = plot_graph_data(
    in_data=ph_sample.post_id,
    data_type='phone',
    color='b',
    G=nx.Graph(),
);
In [11]:
# Sample: posts whose name contains 'tuc' (case-insensitive match).
name_has_tuc = df['name'].str.contains('tuc', case=False)
em_sample = df[name_has_tuc]
em_sample
Out[11]:
In [12]:
# Fully connect the email-sample posts in a fresh graph (red edges).
G_samp_em = plot_graph_data(
    in_data=em_sample.post_id,
    data_type='email',
    color='r',
    G=nx.Graph(),
)
In [13]:
# Overlay the email-sample posts (red edges, all pairs) on the phone-sample
# graph and redraw everything with a spring layout.
email_ids = em_sample.post_id
out = [
    (a, b, {'type': 'email', 'color': 'r'})
    for a, b in itertools.product(email_ids, email_ids)
]
G_samp.add_edges_from(out)
pos = nx.spring_layout(G_samp)
colors = [G_samp[n1][n2]['color'] for n1, n2 in G_samp.edges()]
nx.draw(
    G_samp,
    pos,
    node_color='k',
    edge_color=colors,
    width=1,
    node_size=15,
)
In [14]:
G_samp_loop = nx.Graph()
# No product for loop: instead of connecting every pair, join the phone-sample
# posts in a ring (each post to the next one, the last back to the first).
v = ph_sample.post_id.values.tolist()
# Rotate by one to get each post's successor. Overwriting the final successor
# (v_right[-1] = v[0]) dropped the last post from the graph and raised on an
# empty sample; rotation keeps every post and yields a self-loop for one post.
v_right = v[1:] + v[:1]
out = [(a, b, {'type': 'phone', 'color': 'b'}) for a, b in zip(v, v_right)]
G_samp_loop.add_edges_from(out)
pos = nx.spectral_layout(G_samp_loop)
# Edge colours for the draw call in the following cell.
colors = [G_samp_loop[n1][n2]['color'] for n1, n2 in G_samp_loop.edges()]
In [15]:
# Draw the ring graph using the layout and edge colours computed beforehand.
nx.draw(
    G_samp_loop,
    pos,
    node_color='k',
    edge_color=colors,
    width=2,
    node_size=15,
)
In [16]:
# Add the email-sample posts to the ring graph as a second ring (red edges).
v = em_sample.post_id.values.tolist()
# Rotate by one so every post, including the last, gets linked; the previous
# v_right[-1] = v[0] overwrite silently dropped the final post.
v_right = v[1:] + v[:1]
# These edges come from the email sample, so they are labelled 'email'
# (they were previously tagged 'phone', unlike the other email edges).
out += [(a, b, {'type': 'email', 'color': 'r'}) for a, b in zip(v, v_right)]
G_samp_loop.add_edges_from(out)
pos = nx.spring_layout(G_samp_loop)
colors = [G_samp_loop[n1][n2]['color'] for n1, n2 in G_samp_loop.edges()]
nx.draw(G_samp_loop, pos, node_color='k', edge_color=colors, width=2, node_size=15)
In [17]:
def make_graph(df, color, data_type):
    '''
    Build the edge list that links together all rows sharing a key value.

    Rows of ``df`` are grouped by its last column. Within each group the
    values of the first column are joined into a closed ring: each value is
    paired with the next one and the last with the first. A group with a
    single member yields one self-loop, so that node still appears in a
    graph built from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        First column holds the node identifiers, last column the grouping key.
    color :
        Value stored as the ``'color'`` attribute of every edge.
    data_type :
        Value stored as the ``'type'`` attribute of every edge.

    Returns
    -------
    list
        Flat list of ``(node, node, attrs)`` tuples, one per edge.
    '''
    out = []
    for _, group in df.groupby(df.columns[-1]):
        nodes = group.iloc[:, 0].tolist()
        # Rotate by one to obtain each node's successor. Overwriting the last
        # successor instead (v_right[-1] = v[0]) left the final member of
        # every group out of the edge list, so it was never linked.
        successors = nodes[1:] + nodes[:1]
        out.extend(
            (a, b, {'type': data_type, 'color': color})
            for a, b in zip(nodes, successors)
        )
    return out
In [18]:
# Build one combined edge list: posts linked by shared name (red), shared
# phone number (blue) and shared oid (green). Empty keys are skipped.
edge_specs = [
    ('name', 'r', 'email'),
    ('number', 'b', 'number'),
    ('oid', 'g', 'oid'),
]
out = []
for column, edge_color, edge_type in edge_specs:
    has_value = df[column] != ''
    out.extend(
        make_graph(df.loc[has_value, ['post_id', column]], edge_color, edge_type)
    )
In [19]:
# Build the full graph and label every node with the index of the connected
# component it belongs to.
G = nx.Graph()
G.add_edges_from(out)
# nx.connected_components yields the node set of each component directly.
# Only the nodes are needed here, so no per-component subgraph has to be
# built, and this also works on NetworkX releases (2.4+) where
# connected_component_subgraphs is no longer available.
sub_graphs = [
    (i, node)
    for i, component in enumerate(nx.connected_components(G))
    for node in component
]
In [20]:
# One row per (component index, post): the component index is the entity id.
df_out = pd.DataFrame(sub_graphs, columns=['entity_id', 'post_id'])
df_out.head(10)
Out[20]:
In [21]:
# Attach the scraped post details to each entity assignment and index the
# result by entity and by the three linking keys.
df_out = (
    df_out
    .merge(df, on='post_id')
    .set_index(['entity_id', 'number', 'name', 'oid'])
)
df_out.head(10)
Out[21]:
In [22]:
# Cross-section: all rows whose 'name' index level equals this address.
df_out.xs(key='tucenicienta360@gmail.com', level='name')
Out[22]:
In [23]:
# Inspect the neighbours (and edge attributes) of node '104780' in the graph.
G['104780']
Out[23]:
In [24]:
# Show the rows indexed under entity_id 560, sorted by the remaining index levels.
df_out.loc[560].sort_index()
Out[24]:
In [28]:
# Draw the part of the graph made up of the posts whose entity_id falls in
# the 560:610 label slice, as a visual check of the entity assignment.
check_post_ids = df_out.loc[560:610].post_id.values
G_check = G.subgraph(check_post_ids)
pos = nx.spring_layout(G_check)
colors = [G_check[n1][n2]['color'] for n1, n2 in G_check.edges()]
nx.draw(
    G_check,
    pos,
    node_color='k',
    edge_color=colors,
    width=2,
    node_size=5,
)