In [1]:
from traitlets.config.manager import BaseJSONConfigManager
path = "/Users/bob/anaconda/etc/jupyter/nbconfig"
cm = BaseJSONConfigManager(config_dir=path)
pixels = 900
cm.update('livereveal', {
    'transition': 'convex',
    'start_slideshow_at': 'selected',
    'scroll': True,
    'width': pixels * 16 / 9,
    'height': pixels,
    'controls': False
})


Out[1]:
{u'controls': False,
 u'height': 900,
 u'minScale': 0.2,
 u'scroll': True,
 u'start_slideshow_at': 'selected',
 u'theme': u'serif',
 u'transition': 'convex',
 u'width': 1600}

In [2]:
from __future__ import division

%matplotlib inline
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = (8.0, 6.0)

import json
import itertools

import pandas as pd
import numpy as np
import networkx as nx

501c3 Nonprofit started to connect Professional and Aspiring Data Scientists with problems involving Social Good.

  • Hackathons
  • Meetups

Data

  • Scraped several months of ad data from seedy websites
  • Columns:
    • name
    • phone number
    • oid (poster unique ID)
    • posterage
    • region
    • type
  • Sample data includes three flat files that pair a post_id with an email, user ID, or email address.

Data Sample


In [3]:
df = pd.read_csv(
    '../data/scraped_data.csv.gz',
    converters={'name': lambda x: str(x).lower(),
                'number': str,
                'oid': str,
                'post_id': str},
    parse_dates=['postdate'])
df.head()


Out[3]:
post_id name number oid postdate posterage region
0 0 6242414310 9635571 2015-11-28 12:00:00 19.0 birmingham
1 1 13957915 2015-12-23 09:13:00 21.0 nashville
2 3 33808981 2015-12-24 01:03:00 24.0 miami
3 4 3059227034 32821362 2015-12-23 01:51:00 35.0 miami
4 6 6242414310 16767542 2015-12-18 06:20:00 25.0 tampa

Data Description


In [4]:
df.describe(include = 'all')


Out[4]:
post_id name number oid postdate posterage region
count 232920 232920 232920 232920 232920 232868.000000 232920
unique 232920 1319 21933 194721 84966 NaN 18
top 217189 21070057 2015-11-24 02:17:00 NaN atlanta
freq 1 228994 72361 110 112 NaN 98439
first NaN NaN NaN NaN 2012-10-12 12:22:00 NaN NaN
last NaN NaN NaN NaN 2016-02-28 12:59:00 NaN NaN
mean NaN NaN NaN NaN NaN 25.678316 NaN
std NaN NaN NaN NaN NaN 8.096470 NaN
min NaN NaN NaN NaN NaN 18.000000 NaN
25% NaN NaN NaN NaN NaN 22.000000 NaN
50% NaN NaN NaN NaN NaN 24.000000 NaN
75% NaN NaN NaN NaN NaN 27.000000 NaN
max NaN NaN NaN NaN NaN 112.000000 NaN

Entity Resolution

After wrestling with the data a bit, we realized that we can conceptualize the data as a graph.

Entity Graph

  • Vertices: Backpage posts
  • Edged: Common attributes (email, phone number, poster ID)

Explore Subgraph Sizes


In [5]:
df\
.groupby('number')\
.count()\
.sort_values('post_id',ascending=False)[['post_id']]\
.head()


Out[5]:
post_id
number
72361
7863556827 1436
4047236489 1336
6242414310 1221
4044511961 945

Example Sub-Graph

  • One challenge is to efficiently create the sub graphs.
  • Our first approach was to make fully connected graph out of the data subsets.

Here is an example of a phone number that is seen on 11 posts:


In [6]:
ph_sample = df[df.number=='7865032020']
ph_sample.sort_values('name',ascending=False).head()


Out[6]:
post_id name number oid postdate posterage region
220676 221500 tucenicienta360@gmail.com 7865032020 26583449 2015-12-22 10:32:00 21.0 miami
134390 134885 tucenicienta360@gmail.com 7865032020 26659871 2015-12-02 11:52:00 21.0 miami
16418 16500 dallaz360@hotmail.com 7865032020 28354889 2015-12-05 04:57:00 22.0 miami
2276 2304 7865032020 31811642 2015-12-19 10:44:00 25.0 miami
177430 178083 7865032020 31857356 2015-12-16 09:53:00 24.0 miami

Fully Connected

Phone Numbers Only


In [18]:
def plot_graph_data(in_data, data_type, color, G=nx.Graph(), do_plot=True):
    ''' Plot graph '''
    out = []
    for a, b in itertools.product(in_data, in_data):
        out.append((a, b, {'type': data_type, 'color': color, 
                           'd3color': '#%02x%02x%02x' % tuple(c*255 for c in color)}))
    G.add_edges_from(out)
    if do_plot:
        pos = nx.circular_layout(G)
        colors = [G[u][v]['color'] for u, v in G.edges()]
        nx.draw(G, pos, node_color='k', edge_color=colors, width=1,node_size=15)
    return G

In [8]:
G_samp = plot_graph_data(ph_sample.post_id, 'phone', [0,0,1], G=nx.Graph());


Better Plot

  • That doesn't look great
  • Let's try with D3.

In [9]:
from networkx.readwrite import json_graph
from IPython.display import Javascript
import json

def draw_d3graph(G, link_distance=200, charge=-1000,
                node_radius = 5, stroke_width=1):
    variables = """
               window.graph={};
               window.link_distance={};
               window.charge={};
               window.node_radius={};
               window.stroke_width={};
               """.format(json.dumps(json_graph.node_link_data(G)),
                         link_distance,
                         charge,
                         node_radius,
                         stroke_width)
    meat = """

    require.config({
        paths: {
            d3: '//cdnjs.cloudflare.com/ajax/libs/d3/3.4.8/d3.min'
        }
    });
               require(['d3'], function(d3){
      //a weird idempotency thing
      $("#chart1").remove();

      //create canvas
      element.append("<div id='chart1'></div>");

      var margin = {top: 20, right: 20, bottom: 30, left: 40};
      var width = 960 - margin.left - margin.right;
      var height = 600 - margin.top - margin.bottom;
      var svg = d3.select("#chart1").append("svg")
        .style("position", "relative")
        .style("max-width", "960px")
        .attr("width", width + "px")
        .attr("height", (height + 50) + "px")
        .append("g")
        .attr("transform", "translate(" + margin.left + "," + margin.top + ")");

    var color = d3.scale.category10();
    var force = d3.layout.force()
        .charge(window.charge)
        .linkDistance(window.link_distance)
        .size([width, height]);

          force
          .nodes(window.graph.nodes)
          .links(window.graph.links)
          .start();

      var link = svg.selectAll("line.link")
          .data(window.graph.links)
        .enter().append("line")
          .attr("class", "link")
      .style("stroke", function(d) { return d.d3color; })
      .style("stroke-opacity", .5)
          .style("stroke-width", window.stroke_width);


      var node = svg.selectAll("circle.node")
          .data(window.graph.nodes)
        .enter().append("circle")
          .attr("class", "node")
          .attr("r", window.node_radius)
          .style("fill", "#999")
          .style("opacity", .4)
          .call(force.drag);

      force.on("tick", function() {
        link.attr("x1", function(d) { return d.source.x; })
            .attr("y1", function(d) { return d.source.y; })
            .attr("x2", function(d) { return d.target.x; })
            .attr("y2", function(d) { return d.target.y; });
        node.attr("cx", function(d) { return d.x; })
            .attr("cy", function(d) { return d.y; });
      });
    });
               """

    return Javascript(variables+meat)

In [10]:
draw_d3graph(G_samp,200)


Out[10]:

Email Addresses Only


In [15]:
em_sample = df[df['name'].str.contains('tuc',False)]
em_sample


Out[15]:
post_id name number oid postdate posterage region
2319 2347 tucenicienta360@gmail.com 7866505040 26588406 2015-12-22 10:27:00 22.0 miami
2320 2348 tucenicienta360@gmail.comhref 7866505040 26588406 2015-12-22 10:27:00 22.0 miami
104393 104780 tucenicienta360@gmail.com 7866505040 26577033 2015-12-14 01:38:00 22.0 miami
134390 134885 tucenicienta360@gmail.com 7865032020 26659871 2015-12-02 11:52:00 21.0 miami
220676 221500 tucenicienta360@gmail.com 7865032020 26583449 2015-12-22 10:32:00 21.0 miami

In [21]:
G_samp_em = plot_graph_data(em_sample.post_id, 'email', [1,0,0], G=nx.Graph(), do_plot=False)

draw_d3graph(G_samp_em,200)


Out[21]:

Combined Graph with Email and Phone Numbers


In [23]:
out = []
for a, b in itertools.product(em_sample.post_id, em_sample.post_id):
    out.append((a, b, {'type': 'email', 'color': 'r', 'd3color': '#f00'}))
    
G_samp.add_edges_from(out)
pos = nx.spring_layout(G_samp)
colors = [G_samp[u][v]['color'] for u, v in G_samp.edges()]
# nx.draw(G_samp, pos, node_color='k', edge_color=colors, width=1,node_size=15)
draw_d3graph(G_samp,200)


Out[23]:

Simplifying The Graph

  • This works, but having a fully connected set of graphs ends up taking a bunch of Memory.
  • To simplify, we only need each network of posts to be connected--not fully connected.
  • Create a sub-graph that is a loosely connected loop instead.

In [24]:
G_samp_loop = nx.Graph()

# No product for loop
v = ph_sample.post_id.values.tolist()
v_right = v[1:]
if len(v) == 1:
    v_right = v
else:
    v_right[-1] = v[0]
out = [(a, b,{'type':'phone','color':'b', 'd3color': '#00f'}) for a, b in zip(v, v_right)]

G_samp_loop.add_edges_from(out)
pos = nx.spectral_layout(G_samp_loop)


colors = [G_samp_loop[u][v]['color'] for u,v in G_samp_loop.edges()]

In [25]:
# nx.draw(G_samp_loop,pos,node_color='k',edge_color=colors,width=2,node_size=15)
draw_d3graph(G_samp_loop,20,-100)


Out[25]:

In [26]:
v = em_sample.post_id.values.tolist()
v_right = v[1:]
if len(v) == 1:
    v_right = v
else:
    v_right[-1] = v[0]
out += [(a, b,{'type':'phone','color':'r', 'd3color': '#f00'}) for a, b in zip(v, v_right)]

G_samp_loop.add_edges_from(out)
pos = nx.spring_layout(G_samp_loop)

colors = [G_samp_loop[u][v]['color'] for u,v in G_samp_loop.edges()]
# nx.draw(G_samp_loop,pos,node_color='k',edge_color=colors,width=2,node_size=15)
draw_d3graph(G_samp_loop,20,-100)


Out[26]:

Graph Clusters

When viewed this way, a set of connected posts (vertices) and poster attributes (edges) constitute an entity.

Approach

  1. Make a graph out of the data using these ideas
  2. Find all of the disjoint subgraphs and designate those as entities

In [27]:
def make_graph(df, color, data_type):
    '''
    Makes a list of tuple lists for each node-edge-node segment in the graph
    '''
    out = []
    for i, (k, v) in enumerate(df.groupby(df.columns[-1])):
        
        v = v.values.tolist()
        v = [x[0] for x in v]
        v_right = v[1:]
        if len(v) == 1:
            v_right = v
        else:
            v_right[-1] = v[0]
        out.append([(a, b, {'type': data_type,
                            'color': color, 
                           'd3color': '#%02x%02x%02x' % tuple(c*255 for c in color)}) for a, b in zip(v, v_right)])
    out = [item for sublist in out for item in sublist]
    return out

Add Graphs for Each Type of Connection


In [28]:
out = make_graph(df[df.name!=''][['post_id','name']],[1,0,0],'email')
out += make_graph(df[df.number!=''][['post_id','number']],[0,0,1],'number')
out += make_graph(df[df.oid!=''][['post_id','oid']],[0,1,0],'oid')

Use NetworkX to Find Disjoint SubGraphs


In [29]:
G = nx.Graph()
G.add_edges_from(out)

sub_graphs = []
for i, x in enumerate(nx.connected_component_subgraphs(G)):
    nodes = nx.nodes(x)
    sub_graphs.append(list(zip([i] * len(nodes), nodes)))

sub_graphs = [item for sublist in sub_graphs for item in sublist]

Check Entity Data


In [30]:
df_out = pd.DataFrame(sub_graphs,
                      columns=['entity_id',
                               'post_id'])
df_out.head(10)


Out[30]:
entity_id post_id
0 0 228056
1 1 228051
2 2 228050
3 3 228053
4 4 228052
5 5 90828
6 5 134739
7 5 228059
8 5 65347
9 5 6603

Merge With Original Data

And we are done...


In [31]:
df_out = df_out.merge(df,on='post_id')
df_out.set_index(['entity_id','number','name','oid'],inplace=True)
df_out.head(10)


Out[31]:
post_id postdate posterage region
entity_id number name oid
0 19170225 228056 2016-02-27 12:45:00 23.0 tampa
1 8132700180 19122423 228051 2016-02-24 07:44:00 26.0 tampa
2 7277127655 14116170 228050 2016-02-24 12:18:00 62.0 tampa
3 8136662555 17199891 228053 2016-02-25 03:22:00 22.0 tampa
4 7273419926 11032214 228052 2016-02-27 10:25:00 99.0 tampa
5 8133476501 14028137 90828 2015-12-22 09:27:00 25.0 tampa
16283312 134739 2015-12-23 01:10:00 25.0 tampa
14028137 228059 2016-02-27 03:14:00 26.0 tampa
16283312 65347 2016-02-02 02:10:00 26.0 tampa
14028137 6603 2016-02-02 12:54:00 26.0 tampa

Check Results

Check Email


In [32]:
df_out.xs('tucenicienta360@gmail.com',level='name')


Out[32]:
post_id postdate posterage region
entity_id number oid
560 7866505040 26588406 2347 2015-12-22 10:27:00 22.0 miami
7865032020 26659871 134885 2015-12-02 11:52:00 21.0 miami
26583449 221500 2015-12-22 10:32:00 21.0 miami
7866505040 26577033 104780 2015-12-14 01:38:00 22.0 miami

In [33]:
G['104780']


Out[33]:
{'104780': {'color': [0, 1, 0], 'd3color': '#00ff00', 'type': 'oid'},
 '104876': {'color': [0, 0, 1], 'd3color': '#0000ff', 'type': 'number'},
 '134885': {'color': [1, 0, 0], 'd3color': '#ff0000', 'type': 'email'},
 '2347': {'color': [1, 0, 0], 'd3color': '#ff0000', 'type': 'email'},
 '46482': {'color': [0, 0, 1], 'd3color': '#0000ff', 'type': 'number'}}

Check Entity


In [34]:
df_out.loc[560].sort_index()


Out[34]:
post_id postdate posterage region
number name oid
7865032020 26577332 90113 2015-12-08 05:25:00 24.0 miami
27978380 31818 2015-12-22 12:56:00 22.0 miami
28372705 20101 2016-01-08 02:16:00 22.0 miami
28561650 52095 2016-01-25 10:17:00 24.0 miami
28561650 147990 2015-12-22 10:29:00 24.0 tampa
31300988 122688 2016-01-18 10:59:00 23.0 miami
31300988 207134 2015-12-24 10:34:00 23.0 miami
31467113 2406 2015-12-24 10:27:00 22.0 miami
31810842 2409 2015-12-24 10:32:00 24.0 miami
31811642 2304 2015-12-19 10:44:00 25.0 miami
31814749 163166 2015-12-02 11:54:00 24.0 miami
31831488 134051 2015-12-18 12:52:00 23.0 miami
31837041 221367 2015-12-05 04:52:00 25.0 miami
31837101 119466 2015-12-17 02:56:00 24.0 miami
31837906 163315 2015-12-18 11:00:00 22.0 miami
31838170 46451 2015-12-22 10:30:00 23.0 miami
31854569 119535 2015-12-23 11:52:00 23.0 miami
31855380 75795 2015-12-22 01:12:00 24.0 miami
31856096 163422 2015-12-24 10:27:00 23.0 miami
31856922 31784 2015-12-19 11:52:00 25.0 miami
31857356 178083 2015-12-16 09:53:00 24.0 miami
31862500 195857 2016-01-15 05:24:00 24.0 miami
31862500 221467 2015-12-19 02:31:00 24.0 miami
31864686 124252 2016-02-13 03:57:00 24.0 miami
31864686 221544 2015-12-24 10:33:00 24.0 miami
31878332 134159 2015-12-24 10:33:00 24.0 miami
33495618 206457 2015-12-08 05:23:00 22.0 miami
33889430 138814 2016-02-13 03:58:00 23.0 miami
33889430 148610 2015-12-23 11:48:00 23.0 miami
33889585 61000 2015-12-21 10:35:00 22.0 miami
... ... ... ... ...
... 192610 2015-12-08 05:24:00 21.0 miami
34172650 60936 2015-12-16 04:29:00 22.0 miami
34172870 181047 2016-01-16 10:03:00 22.0 miami
35289414 45955 2015-12-24 10:32:00 22.0 miami
35289414 22751 2016-02-10 01:38:00 23.0 miami
36808973 16500 2015-12-05 04:57:00 22.0 miami
dallaz360@hotmail.com 28354889 221500 2015-12-22 10:32:00 21.0 miami
26583449 134885 2015-12-02 11:52:00 21.0 miami
7865032020 tucenicienta360@gmail.com 26659871 20100 2016-01-08 02:16:00 22.0 miami
28372705 16562 2015-12-17 11:28:00 23.0 miami
28384098 119485 2015-12-19 11:51:00 22.0 miami
31331761 206968 2015-12-09 10:51:00 24.0 miami
31811398 207132 2015-12-24 10:35:00 23.0 miami
31828798 148590 2015-12-22 10:30:00 24.0 miami
31831778 46453 2015-12-22 12:56:00 24.0 miami
31835789 46482 2015-12-23 02:42:00 23.0 miami
31840012 104876 2015-12-23 02:47:00 22.0 miami
31859247 163356 2015-12-21 02:21:00 23.0 miami
31860900 46455 2015-12-22 01:16:00 23.0 miami
31861445 8089 2016-02-15 10:00:00 24.0 miami
31878332 207109 2015-12-23 02:46:00 22.0 miami
33889131 154122 2016-02-09 10:27:00 24.0 miami
34148294 134114 2015-12-22 12:57:00 24.0 miami
34148294 2332 2015-12-21 04:59:00 20.0 miami
34148521 2287 2015-12-17 03:30:00 21.0 miami
34155477 207033 2015-12-18 10:52:00 23.0 miami
34172765 207092 2015-12-22 10:28:00 24.0 miami
34172970 104780 2015-12-14 01:38:00 22.0 miami
26577033 2347 2015-12-22 10:27:00 22.0 miami
tucenicienta360@gmail.com 26588406 2348 2015-12-22 10:27:00 22.0 miami

67 rows × 4 columns


In [35]:
G_check = G.subgraph(df_out.loc[560:610].post_id.values)

pos = nx.spring_layout(G_check)
colors = [G_check[u][v]['color'] for u,v in G_check.edges()]

In [38]:
draw_d3graph(G_check,2,-5,3,3)


Out[38]:

In [37]:
nx.draw(G_check,pos,node_color='k',edge_color=colors,width=2,node_size=5)