In [1]:
%matplotlib inline
Build an artificial dataset: starting from the string 'abcdefghijklmnopqrstuvwxyz', iteratively generate new strings by swapping two characters chosen at random. In this way the instances become progressively more dissimilar.
In [2]:
import random
def make_data(size, seed=None):
    """Generate progressively shuffled variants of the lowercase alphabet.

    Starting from 'abcdefghijklmnopqrstuvwxyz', repeatedly swap two
    characters at distinct random positions. Every intermediate string is
    printed and collected.

    Args:
        size: number of strings to generate.
        seed: optional seed for a private random generator, which makes the
            output reproducible. When None (the default) the module-level
            ``random`` state is used, so a prior ``random.seed(...)`` call
            still controls the result.

    Returns:
        A list of ``size`` strings, each exactly one swap away from the
        previous one.
    """
    rng = random if seed is None else random.Random(seed)
    # chr() yields the same ASCII letters on Python 2 and 3 (unichr is 2-only).
    text = ''.join(chr(ord('a') + i) for i in range(26))

    def swap_two_characters(seq):
        """Return a copy of seq with two distinct random positions swapped."""
        line = list(seq)
        # sample() guarantees id_i != id_j, so every step really changes the
        # string; two independent randint() draws could coincide and make the
        # swap a no-op.
        id_i, id_j = rng.sample(range(len(line)), 2)
        line[id_i], line[id_j] = line[id_j], line[id_i]
        return ''.join(line)

    seqs = []
    for _ in range(size):
        text = swap_two_characters(text)
        seqs.append(text)
        # A parenthesised single-argument print behaves the same on Python 2 and 3.
        print(text)
    return seqs
In [3]:
# Seed the global RNG so that the generated sequences (and every result
# derived from them) are reproducible under Restart & Run All.
random.seed(42)
seqs = make_data(25)
Define a function that builds a graph from a string, i.e. the path graph with the characters as node labels.
In [4]:
import networkx as nx
def sequence_to_graph(seq):
    """Turn a sequence into a path graph usable by EDeN.

    Each element of ``seq`` becomes a node (keyed by its position) whose
    'label' attribute is the element itself; consecutive nodes are joined by
    an edge whose 'label' attribute is '-'.
    """
    graph = nx.Graph()
    previous = None
    for position, symbol in enumerate(seq):
        graph.add_node(position, label=symbol)
        # Link every node except the first one to its predecessor.
        if previous is not None:
            graph.add_edge(previous, position, label='-')
        previous = position
    return graph
Make a generator that yields graphs: generators are 'good' because they allow functional composition.
In [5]:
def pre_process(iterable):
    """Lazily yield the path graph of every sequence in ``iterable``.

    Being a generator, nothing is converted until the result is iterated,
    and the result can be iterated only once.
    """
    for sequence in iterable:
        graph = sequence_to_graph(sequence)
        yield graph
Initialize the vectorizer object with the desired 'resolution' (set here through the `complexity` argument).
In [6]:
%%time
from eden.graph import Vectorizer
# `complexity` is the only knob set here; presumably it controls the
# 'resolution' of the encoding mentioned above -- confirm in the EDeN docs.
vectorizer = Vectorizer(complexity=4)
Obtain an iterator over the sequences processed into graphs.
In [7]:
%%time
# pre_process is a generator function, so this only creates the (lazy)
# iterator; no graph is built until it is consumed.
graphs = pre_process(seqs)
Compute the vector encoding of each instance and collect the encodings in a sparse data matrix.
In [8]:
%%time
# `graphs` is a one-shot generator, presumably consumed by transform():
# re-run the pre_process cell before re-running this one.
X = vectorizer.transform(graphs)
# Parenthesised print and floor division give the same output on Python 2 and 3.
print('Instances: %d ; Features: %d with an avg of %d features per instance'
      % (X.shape[0], X.shape[1], X.getnnz() // X.shape[0]))
Compute the pairwise similarity as the dot product between the vector representations of each pair of sequences.
In [9]:
from sklearn import metrics
# The 'linear' metric gives the dot product between the rows of X.
K = metrics.pairwise.pairwise_kernels(X, metric='linear')
# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(K)
Visualize it, as a picture is worth a thousand words...
In [10]:
import pylab as plt
# Draw the kernel matrix K as an 8x8-inch heat map with the 'YlOrRd'
# colormap and no interpolation between cells.
plt.figure(figsize=(8, 8))
colormap = plt.get_cmap('YlOrRd')
img = plt.imshow(K, interpolation='none', cmap=colormap)
plt.show()