In [8]:
%matplotlib inline
import os
import sys
import random
import networkx as nx
## Paths from the file
# PROJECT is the parent of the notebook's working directory; the fixtures
# directory and the activity CSV are resolved relative to it.
PROJECT = os.path.join(os.getcwd(), os.pardir)
FIXTURES = os.path.join(PROJECT, "fixtures")
DATASET = os.path.join(FIXTURES, 'activity.csv')

## Seed the global RNG so the random.shuffle() samples printed further down
## are reproducible on Restart & Run All (set SEED = None for a fresh sample).
SEED = 42
random.seed(SEED)

## Append the path for the logbook utilities
# Guarded so that re-running this cell does not pile up duplicate entries.
if PROJECT not in sys.path:
    sys.path.append(PROJECT)
from logbook.reader import LogReader
from logbook.graph import *
from logbook.compare import *
In [9]:
# Actions to exclude from our graph (set `exclude = None` to keep every action).
exclude = [
    'Subscribed to DDL blog',
    'Signed up for new course notifications',
]

# Load dataset and generate graph
dataset = LogReader(DATASET, exclude=exclude)
G = graph_from_triples(dataset)

# Single-argument print(...) emits the same text on Python 2 and also parses
# on Python 3, unlike the bare print statement.
print(info(G))
In [10]:
# Render the graph built above.
# NOTE(review): `connected` and `iterations` are passed straight through to
# logbook's draw_activity_graph; their exact meaning is defined there - confirm.
draw_activity_graph(
    G,
    connected=True,
    iterations=100,
)
By implementing a graph where each person entity node is a (name, email) tuple (an immutable, hashable data structure), we get structure right off the bat by direct comparison.
The number of pairwise comparisons is computed as:
$$c = \frac{n(n-1)}{2}$$ where $n$ is the number of nodes in the graph. The graph can be further filtered on entity type as well. Here is a random sample of 5 pairwise node-to-node comparisons:
In [ ]:
# Headline figure for the whole graph.
# NOTE(review): the second positional argument presumably makes
# pairwise_comparisons return a count rather than pairs - confirm in logbook.compare.
print("Pairwise Comparisons: {}\n\n".format(pairwise_comparisons(G, True)))

# Materialise the person-to-person pairs so they can be shuffled in place.
combos = list(pairwise_comparisons(G, entity='person'))
random.shuffle(combos)

# Show the first five shuffled pairs (fewer if the graph yields fewer).
# print(...) with a single argument prints the same on Python 2 and Python 3.
for idx, pair in enumerate(combos[:5]):
    print("Pair {}:".format(idx + 1))
    print(" {}\n -- vs --\n {}".format(*pair))
    print("")
Edge-structured comparisons only yield a pair of nodes when the intersection of the two nodes' neighborhoods is empty (that is, two entities can't have an action to the same detail).
In [ ]:
# Headline figure after edge blocking.
# NOTE(review): the second positional argument presumably makes
# edge_blocked_comparisons return a count rather than pairs - confirm in logbook.compare.
print("Edge Blocked Pairwise Comparisons: {}\n\n".format(edge_blocked_comparisons(G, True)))

# Materialise the person-to-person pairs so they can be shuffled in place.
combos = list(edge_blocked_comparisons(G, entity='person'))
random.shuffle(combos)

# Show the first five shuffled pairs; for each side print the node followed by
# every neighbor that G.neighbors() reports for it.
# print(...) with a single argument prints the same on Python 2 and Python 3.
for idx, pair in enumerate(combos[:5]):
    print("Pair {}:".format(idx + 1))
    for position, node in enumerate((pair[0], pair[1])):
        if position:
            # Separator goes between the two sides, not before the first.
            print(" -- vs --")
        print(" {}".format(node))
        for detail in G.neighbors(node):
            print(" {}".format(detail))
    print("")
Other structural blocking can then be applied.
With some blocking in the data structure, we can now begin to do pairwise comparisons. Here, I'll use the fuzzywuzzy tool to produce comparisons for the annotator such that the mean of the fuzzy score for both email and name meets a certain threshold.
In [ ]:
# Start from the edge-blocked person pairs and keep only those fuzzblock accepts.
combos = list(edge_blocked_comparisons(G, entity='person'))

# A list comprehension instead of filter(): on Python 3 filter() returns a lazy
# iterator, which would break both len(combos) and random.shuffle(combos) below.
combos = [pair for pair in combos if fuzzblock(*pair)]
print("Fuzz/Edge Blocked Pairwise Comparisons: {}\n\n".format(len(combos)))
random.shuffle(combos)

# NOTE(review): the original cut-off (`idx >= 100`) shows up to 101 pairs, not
# 100; that count is preserved here - confirm which was intended.
# print(...) with a single argument prints the same on Python 2 and Python 3.
for idx, pair in enumerate(combos[:101]):
    print("Pair {}:".format(idx + 1))
    for position, node in enumerate((pair[0], pair[1])):
        if position:
            # Separator goes between the two sides, not before the first.
            print(" -- vs --")
        print(" {}".format(node))
        for detail in G.neighbors(node):
            print(" {}".format(detail))
    print("")
In [16]:
from collections import Counter
def count_email_domains(triples=None):
    """
    Count how many triples fall under each email domain.

    The domain is whatever follows the last "@" of `triple.entity.email`,
    stripped of surrounding whitespace and lower-cased so that spellings
    differing only in case (domain names are case-insensitive) share one
    bucket. An address without "@" is counted under its whole (normalised) text.

    Parameters:
        triples: iterable of objects exposing `.entity.email` as a string.
            Defaults to the notebook-level `dataset` when omitted, which
            keeps the original zero-argument call working.

    Returns:
        collections.Counter mapping domain -> number of triples.
    """
    if triples is None:
        # Fall back to the dataset loaded earlier in the notebook.
        triples = dataset

    counter = Counter()
    for triple in triples:
        email = triple.entity.email
        domain = email.split("@")[-1].strip().lower()
        counter[domain] += 1
    return counter
# Tally the domains for the loaded dataset and list them, most frequent first.
domains = count_email_domains()

# most_common() yields (domain, count) pairs; unpack them under honest names.
# print(...) with a single argument prints the same on Python 2 and Python 3.
for name, count in domains.most_common():
    print("{}: {}".format(name, count))