In [1]:
from traitlets.config.manager import BaseJSONConfigManager
# Write the slideshow settings into the 'livereveal' section of the notebook
# config directory.
# NOTE(review): `path` is an absolute, machine-specific location; adjust it to
# the local Jupyter nbconfig directory before running.
path = "/Users/bob/anaconda/etc/jupyter/nbconfig"
pixels = 900
livereveal_settings = {
    'transition': 'convex',
    'start_slideshow_at': 'selected',
    'scroll': True,
    # 16:9 slide area derived from the chosen height.
    'width': pixels * 16 / 9,
    'height': pixels,
    'controls': False,
}
cm = BaseJSONConfigManager(config_dir=path)
cm.update('livereveal', livereveal_settings)
Out[1]:
In [2]:
from __future__ import division
%matplotlib inline
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = (8.0, 6.0)
import json
import itertools
import pandas as pd
import numpy as np
import networkx as nx
A 501(c)(3) nonprofit founded to connect professional and aspiring data scientists with problems involving social good.
In [3]:
# Identifier-like columns are read as text rather than parsed as numbers;
# names are additionally lower-cased at load time.
column_converters = {
    'name': lambda raw: str(raw).lower(),
    'number': str,
    'oid': str,
    'post_id': str,
}
df = pd.read_csv('../data/scraped_data.csv.gz',
                 converters=column_converters,
                 parse_dates=['postdate'])
df.head()
Out[3]:
In [4]:
# Summarise every column, not only the numeric ones.
df.describe(include='all')
Out[4]:
In [5]:
# Row counts per phone number, largest first (counted via non-null post_id).
(
    df.groupby('number')
    .count()
    .sort_values('post_id', ascending=False)[['post_id']]
    .head()
)
Out[5]:
In [6]:
# All rows that share one specific phone number.
ph_sample = df.loc[df['number'] == '7865032020']
ph_sample.sort_values('name', ascending=False).head()
Out[6]:
In [18]:
def plot_graph_data(in_data, data_type, color, G=None, do_plot=True):
    '''
    Connect every item of ``in_data`` to every item (itself included) and
    optionally draw the resulting graph.

    Parameters
    ----------
    in_data : iterable
        Node identifiers. Consumed once, so one-shot iterators work too.
    data_type : object
        Stored under the ``'type'`` key of every edge added.
    color : sequence of three numbers
        RGB components on a 0..1 scale. Stored unchanged under ``'color'``
        and as a ``#rrggbb`` string under ``'d3color'``.
    G : graph, optional
        Graph to add the edges to (anything with ``add_edges_from``). When
        omitted, a fresh ``nx.Graph()`` is created for this call.
    do_plot : bool
        When true, draw ``G`` on a circular layout with ``nx.draw``.

    Returns
    -------
    The graph the edges were added to.
    '''
    # A default of ``nx.Graph()`` in the signature would be built once and
    # silently shared (and grown) by every call that omits ``G``.
    if G is None:
        G = nx.Graph()

    # Round to int so fractional colour components do not break ``%x``.
    d3color = '#%02x%02x%02x' % tuple(int(round(c * 255)) for c in color)

    # ``repeat=2`` materialises ``in_data`` once; the pairs include (a, a)
    # self-loops and both (a, b) and (b, a).
    out = [
        (a, b, {'type': data_type, 'color': color, 'd3color': d3color})
        for a, b in itertools.product(in_data, repeat=2)
    ]
    G.add_edges_from(out)

    if do_plot:
        pos = nx.circular_layout(G)
        colors = [G[u][v]['color'] for u, v in G.edges()]
        nx.draw(G, pos, node_color='k', edge_color=colors, width=1, node_size=15)
    return G
In [8]:
# Fully connect the posts that share the sampled phone number (blue edges).
G_samp = plot_graph_data(ph_sample.post_id, 'phone', [0, 0, 1],
                         G=nx.Graph());
In [9]:
from networkx.readwrite import json_graph
from IPython.display import Javascript
import json
def draw_d3graph(G, link_distance=200, charge=-1000,
                 node_radius = 5, stroke_width=1):
    '''
    Build an IPython ``Javascript`` object that renders ``G`` as a d3
    force-directed layout.

    The graph is converted with ``json_graph.node_link_data`` and dumped to
    JSON; it and the four layout parameters are assigned to ``window.*``
    globals, which the script then reads.

    Parameters
    ----------
    G : graph accepted by ``json_graph.node_link_data``
        Links are stroked with their ``d3color`` attribute.
    link_distance : passed to the force layout's ``linkDistance``.
    charge : passed to the force layout's ``charge``.
    node_radius : radius (``r``) of each node circle.
    stroke_width : ``stroke-width`` of each link line.

    Returns
    -------
    IPython.display.Javascript
        Displaying it runs the script. The script first removes any element
        with id ``chart1``, so a previously drawn chart on the page is
        replaced rather than duplicated.
    '''
    # Only this short prefix goes through str.format: each ``{}`` is filled
    # positionally with the JSON graph and the numeric parameters.
    variables = """
window.graph={};
window.link_distance={};
window.charge={};
window.node_radius={};
window.stroke_width={};
""".format(json.dumps(json_graph.node_link_data(G)),
           link_distance,
           charge,
           node_radius,
           stroke_width)
    # The drawing script is kept as a separate, unformatted string: it is full
    # of literal JS braces that str.format would treat as placeholders.
    # It loads d3 3.4.8 through require.js and appends the chart to `element`
    # (presumably the output-area element supplied by the notebook front end).
    meat = """
require.config({
paths: {
d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min'
}
});
require(['d3'], function(d3){
//a weird idempotency thing
$("#chart1").remove();
//create canvas
element.append("<div id='chart1'></div>");
var margin = {top: 20, right: 20, bottom: 30, left: 40};
var width = 960 - margin.left - margin.right;
var height = 600 - margin.top - margin.bottom;
var svg = d3.select("#chart1").append("svg")
.style("position", "relative")
.style("max-width", "960px")
.attr("width", width + "px")
.attr("height", (height + 50) + "px")
.append("g")
.attr("transform", "translate(" + margin.left + "," + margin.top + ")");
var color = d3.scale.category10();
var force = d3.layout.force()
.charge(window.charge)
.linkDistance(window.link_distance)
.size([width, height]);
force
.nodes(window.graph.nodes)
.links(window.graph.links)
.start();
var link = svg.selectAll("line.link")
.data(window.graph.links)
.enter().append("line")
.attr("class", "link")
.style("stroke", function(d) { return d.d3color; })
.style("stroke-opacity", .5)
.style("stroke-width", window.stroke_width);
var node = svg.selectAll("circle.node")
.data(window.graph.nodes)
.enter().append("circle")
.attr("class", "node")
.attr("r", window.node_radius)
.style("fill", "#999")
.style("opacity", .4)
.call(force.drag);
force.on("tick", function() {
link.attr("x1", function(d) { return d.source.x; })
.attr("y1", function(d) { return d.source.y; })
.attr("x2", function(d) { return d.target.x; })
.attr("y2", function(d) { return d.target.y; });
node.attr("cx", function(d) { return d.x; })
.attr("cy", function(d) { return d.y; });
});
});
"""
    return Javascript(variables+meat)
In [10]:
# Interactive d3 rendering of the phone-match graph.
draw_d3graph(G_samp, link_distance=200)
Out[10]:
In [15]:
# Rows whose name contains 'tuc'; the match is case-insensitive.
name_has_tuc = df['name'].str.contains('tuc', case=False)
em_sample = df[name_has_tuc]
em_sample
Out[15]:
In [21]:
# Fully connect the name-matched posts (red edges) in their own graph and
# show it with d3 only; the matplotlib drawing is skipped.
G_samp_em = plot_graph_data(em_sample.post_id, 'email', [1, 0, 0],
                            G=nx.Graph(), do_plot=False)
draw_d3graph(G_samp_em, link_distance=200)
Out[21]:
In [23]:
# Overlay the name-matched posts (red, all-pairs) on the existing G_samp graph.
email_post_ids = em_sample.post_id
out = [
    (a, b, {'type': 'email', 'color': 'r', 'd3color': '#f00'})
    for a, b in itertools.product(email_post_ids, email_post_ids)
]
G_samp.add_edges_from(out)
pos = nx.spring_layout(G_samp)
colors = [G_samp[u][v]['color'] for u, v in G_samp.edges()]
# nx.draw(G_samp, pos, node_color='k', edge_color=colors, width=1,node_size=15)
draw_d3graph(G_samp, link_distance=200)
Out[23]:
In [24]:
G_samp_loop = nx.Graph()
# Link the phone-matched posts in a ring (each post to the next one, the last
# back to the first) instead of the all-pairs product: n posts need n edges.
v = ph_sample.post_id.values.tolist()
# Rotate the list by one to get each post's successor. Overwriting the last
# successor with v[0] would drop the final post from the graph entirely;
# the rotation keeps it, gives a self-loop for a single post, and yields no
# edges (instead of an IndexError) for an empty sample.
v_right = v[1:] + v[:1]
out = [(a, b, {'type': 'phone', 'color': 'b', 'd3color': '#00f'}) for a, b in zip(v, v_right)]
G_samp_loop.add_edges_from(out)
pos = nx.spectral_layout(G_samp_loop)
colors = [G_samp_loop[a][b]['color'] for a, b in G_samp_loop.edges()]
In [25]:
# nx.draw(G_samp_loop,pos,node_color='k',edge_color=colors,width=2,node_size=15)
# Short links and weak repulsion keep the ring compact.
draw_d3graph(G_samp_loop, link_distance=20, charge=-100)
Out[25]:
In [26]:
# Add the name-matched posts to G_samp_loop as a second (red) ring.
v = em_sample.post_id.values.tolist()
# Rotate by one to pair each post with its successor; the last post links
# back to the first, so no post is left out of the ring.
v_right = v[1:] + v[:1]
# These edges come from the e-mail/name sample, so they are labelled 'email'.
out += [(a, b, {'type': 'email', 'color': 'r', 'd3color': '#f00'}) for a, b in zip(v, v_right)]
G_samp_loop.add_edges_from(out)
pos = nx.spring_layout(G_samp_loop)
colors = [G_samp_loop[a][b]['color'] for a, b in G_samp_loop.edges()]
# nx.draw(G_samp_loop,pos,node_color='k',edge_color=colors,width=2,node_size=15)
draw_d3graph(G_samp_loop, 20, -100)
Out[26]:
In [27]:
def make_graph(df, color, data_type):
    '''
    Build an edge list that links, in a ring, all ids sharing a key value.

    Rows are grouped by the LAST column of ``df``. Within each group the
    values of the FIRST column are chained: each id is linked to the next
    one and the last id is linked back to the first. A group with a single
    id produces one self-loop, so every id appears in at least one edge.

    Parameters
    ----------
    df : pandas.DataFrame
        First column holds the node ids, last column the grouping key.
    color : sequence of three numbers
        RGB components on a 0..1 scale. Stored unchanged under ``'color'``
        and as a ``#rrggbb`` string under ``'d3color'``.
    data_type : object
        Stored under the ``'type'`` key of every edge.

    Returns
    -------
    list of (id, id, dict) tuples, suitable for ``Graph.add_edges_from``.
    '''
    # Round to int so fractional colour components do not break ``%x``.
    d3color = '#%02x%02x%02x' % tuple(int(round(c * 255)) for c in color)
    key_column = df.columns[-1]

    out = []
    for _, group in df.groupby(key_column):
        ids = group.iloc[:, 0].tolist()
        # Rotating by one pairs every id with its successor and closes the
        # ring. Overwriting the last successor instead would drop the final
        # id of every group from the graph.
        successors = ids[1:] + ids[:1]
        out.extend(
            (a, b, {'type': data_type, 'color': color, 'd3color': d3color})
            for a, b in zip(ids, successors)
        )
    return out
In [28]:
# One edge list per linking attribute (rows with an empty value are skipped),
# concatenated in the order name, number, oid.
out = (
    make_graph(df[df.name != ''][['post_id', 'name']], [1, 0, 0], 'email')
    + make_graph(df[df.number != ''][['post_id', 'number']], [0, 0, 1], 'number')
    + make_graph(df[df.oid != ''][['post_id', 'oid']], [0, 1, 0], 'oid')
)
In [29]:
G = nx.Graph()
G.add_edges_from(out)
# Each connected component is one entity: emit an (entity_id, post_id) pair
# for every node. nx.connected_components yields the node sets directly, so
# no subgraph copy is built per component, and it is still available in
# networkx releases where connected_component_subgraphs has been removed.
sub_graphs = [
    (entity_id, post_id)
    for entity_id, component in enumerate(nx.connected_components(G))
    for post_id in component
]
In [30]:
# Tabulate the (entity_id, post_id) pairs.
entity_columns = ['entity_id', 'post_id']
df_out = pd.DataFrame(sub_graphs, columns=entity_columns)
df_out.head(10)
Out[30]:
In [31]:
# Attach the scraped columns to each entity and index by entity plus the three
# linking attributes.
df_out = (
    df_out
    .merge(df, on='post_id')
    .set_index(['entity_id', 'number', 'name', 'oid'])
)
df_out.head(10)
Out[31]:
In [32]:
# Cross-section: every row whose 'name' index level equals this address.
df_out.xs('tucenicienta360@gmail.com', level='name')
Out[32]:
In [33]:
# Neighbours of post '104780' in the full graph, with their edge attributes.
G['104780']
Out[33]:
In [34]:
# Rows whose first index level (entity_id) is 560, sorted by the remaining levels.
df_out.loc[560].sort_index()
Out[34]:
In [35]:
# Subgraph of the posts belonging to entities 560 through 610, plus a layout
# and per-edge colours for drawing it.
check_post_ids = df_out.loc[560:610].post_id.values
G_check = G.subgraph(check_post_ids)
pos = nx.spring_layout(G_check)
colors = [G_check[u][v]['color'] for u, v in G_check.edges()]
In [38]:
# Tight layout: short links, weak repulsion, small nodes, thick edges.
draw_d3graph(G_check, link_distance=2, charge=-5, node_radius=3, stroke_width=3)
Out[38]:
In [37]:
# Static matplotlib rendering of G_check, using the `pos` layout and `colors`
# list computed for it.
nx.draw(G_check, pos,
        node_color='k',
        edge_color=colors,
        width=2,
        node_size=5)