In [141]:
%matplotlib inline
import pylab
import random
import numpy as np
import pandas as pd
from collections import defaultdict
from pysurvey.plot import setup, density, icolorbar
In [140]:
# Development aid: re-execute the already-imported pysurvey.plot module in
# this kernel (presumably to pick up local edits without a restart).
# NOTE(review): `reload` is a builtin only on Python 2.
import pysurvey.plot
reload(pysurvey.plot)
Out[140]:
In [16]:
# Load the profile records into a DataFrame; the location is kept in a
# named constant so it is easy to spot and change.
OKCUPID_JSON = '/Users/ajmendez/data/okcupid/random_v2.json'
people = pd.read_json(OKCUPID_JSON)
In [53]:
# Simple Markov chain generator.
# Modified from:
# http://alexeymk.com/2012/07/15/weekend-hack--a-markov-baby-name-generator.html
WORD_SEP = ' '


class MarkovName(object):
    '''First-order, character-level Markov chain that generates names.

    ``chain`` maps each character to the list of characters observed right
    after it (duplicates are kept, so a uniform pick from the list is
    frequency weighted).  WORD_SEP doubles as the start state and the end
    state of every name.
    '''

    def __init__(self, names):
        '''names is an iterable of strings sampling some population.

        Sets ``chain`` (the transition table) and ``mean_length`` (mean
        length of the names as given, 0.0 when there are none).
        '''
        # Materialise once: the input is walked twice and may be a pandas
        # Series or a generator.
        names = list(names)
        self.chain = defaultdict(list)
        # Derived from the argument itself rather than from a notebook-level
        # global, so the model always describes the data it was built from.
        total_length = sum(len(name) for name in names)
        self.mean_length = total_length / float(len(names)) if names else 0.0
        for name in names:
            proper_name = name.lower().strip()
            if not proper_name:
                # Blank entries have no first/last character to record.
                continue
            for first, second in zip(proper_name, proper_name[1:]):
                self.chain[first].append(second)
            self.chain[proper_name[-1]].append(WORD_SEP)
            self.chain[WORD_SEP].append(proper_name[0])

    def __call__(self):
        '''Return one randomly generated, capitalised name.

        Returns '' when the chain holds no usable training names.
        '''
        if not self.chain.get(WORD_SEP):
            return ''
        name = []
        current = WORD_SEP  # used to mark both first and last character
        # Walk until the separator is drawn again after at least one
        # character has been emitted.
        while not (current == WORD_SEP and name):
            current = random.choice(self.chain[current])
            name.append(current)
        return ''.join(name).strip().capitalize()
# Train the character chain on usernames of profiles with match > 90 and
# draw 20 generated names.
high_match_rows = people[people['match'] > 90]
usernames = high_match_rows['username']
highmatch = MarkovName(usernames)
[highmatch() for i in range(20)]
Out[53]:
In [54]:
# Histogram of username lengths for the current `usernames` selection.
# `.values` replaces Series.as_matrix(), which newer pandas no longer has.
v, l = np.histogram(usernames.apply(len).values)
pylab.xlabel('username length (characters)')
pylab.ylabel('number of usernames')
# Bin edges are passed positionally with align='edge': the `left=` keyword
# was renamed in matplotlib 2.0 and the default alignment became 'center',
# so being explicit keeps each bar spanning its own bin on every version.
pylab.bar(l[:-1], v, width=np.diff(l), align='edge')
Out[54]:
In [70]:
from random import choice
import sys
def generateModel(text, order):
    """Tabulate which character follows each `order`-long fragment of `text`.

    Returns a dict mapping fragment -> {following character: count}.
    """
    model = {}
    for start in range(len(text) - order):
        end = start + order
        followers = model.setdefault(text[start:end], {})
        following = text[end]
        followers[following] = followers.get(following, 0) + 1
    return model
def getNextCharacter(model, fragment):
    """Draw a successor of `fragment`, weighted by how often it was counted.

    Raises KeyError when `fragment` is not a key of `model`.
    """
    followers = model[fragment]
    # Repeat every letter `count` times so a uniform pick is count-weighted.
    weighted = [letter
                for letter, count in followers.items()
                for _ in range(count)]
    return choice(weighted)
def generateText(text, order, length):
    """Generate space-separated tokens from an order-`order` character model.

    A model is built from `text`, the walk is seeded with the first `order`
    characters of `text`, and up to `length - order` characters are drawn.
    Every time a space is drawn, the characters accumulated since the
    previous space are stripped and appended to the result.

    Notes:
      * The seed fragment itself is not part of the first token, and any
        characters drawn after the last space are not returned.
      * The walk stops early when it reaches a fragment that has no
        recorded successor (for example the tail of `text`, or a `text`
        no longer than `order`).

    Returns the list of completed tokens.
    """
    model = generateModel(text, order)
    currentFragment = text[0:order]
    output = []
    username = ''
    for i in range(0, length - order):
        if currentFragment not in model:
            # Dead end: nothing was ever observed after this fragment.
            break
        newCharacter = getNextCharacter(model, currentFragment)
        username += newCharacter
        if newCharacter == ' ':
            output.append(username.strip())
            username = ''
        # Slide the window forward by one character.
        currentFragment = currentFragment[1:] + newCharacter
    return output
# Join the usernames of profiles with match > 90 into one space-separated
# training string, then generate from an order-20 model.
high_match_usernames = people[people['match'] > 90]['username']
tmp = ' '.join(high_match_usernames)
generateText(tmp, order=20, length=200)
Out[70]:
In [73]:
# Same generator, trained on usernames of profiles with 40 < match < 70.
mid_match = (people['match'] > 40) & (people['match'] < 70)
tmp = ' '.join(people[mid_match]['username'])
# Function-call form prints the same single value on Python 2 and also
# parses on Python 3.
print(len(tmp))
generateText(tmp, order=20, length=200)
Out[73]:
In [75]:
import string
In [83]:
def make_words(n=5, words=None):
    """Build an n-gram Markov chain over boundary-marked words.

    Parameters:
      n     -- the "n" in n-grams: number of characters per state.
      words -- optional iterable of strings to train on.  When omitted, the
               usernames of profiles with match > 90 are taken from the
               notebook-level `people` frame (the selection the original
               code ended up using).

    Each non-empty word is wrapped as "^word$".  Transitions go from one
    n-gram to the n-gram shifted one character to the right; the ""
    state holds the distribution of starting n-grams.

    Returns (marked_words, transitions) where transitions[gram][following]
    is a probability and every row sums to 1.
    """
    if words is None:
        words = people[(people['match'] > 90)]['username']
    # Optional filter, kept from the original as a hint:
    # words = [w for w in words
    #          if all([c in string.ascii_lowercase for c in w])]
    words = ["^" + w + "$" for w in words if w != ""]

    # construct a discrete-time markov chain of n-grams
    transitions = defaultdict(lambda: defaultdict(float))
    for word in words:
        if len(word) >= n:
            transitions[""][word[:n]] += 1.0
        for i in range(len(word) - n):
            gram = word[i:i + n]
            following = word[i + 1:i + n + 1]
            transitions[gram][following] += 1.0

    # normalize the counts of every row into probabilities
    for successors in transitions.values():
        total = sum(successors.values())
        for following in successors:
            successors[following] /= total
    return words, transitions
# sample a probability mass function (dict from elements to probabilities)
def sample(pmf):
    """Draw one element from a probability mass function.

    `pmf` is a dict from elements to probabilities.  A uniform number is
    drawn and the first element at which the running sum reaches it is
    returned.

    Raises IndexError when `pmf` is empty.
    """
    threshold = random.random()
    cdf = 0.0
    for e in pmf:
        cdf += pmf[e]
        if cdf >= threshold:
            return e
    # Only reached when the probabilities add up to less than the drawn
    # number (e.g. floating-point rounding): fall back to a uniform pick.
    # list() is needed because dict views cannot be indexed on Python 3.
    return random.choice(list(pmf))
# generate a word according to the markov chain
def gen_word(n=5):
    """Generate one word from the notebook-level `transitions` chain.

    `n` is the n-gram size used to look up the current state; it should
    match the `n` the chain was built with.

    Returns the generated word with the "^" / "$" boundary markers removed.
    """
    # start with a prefix
    word = sample(transitions[""])
    # wait until the markov chain adds a terminator to the word
    while word[-1] != "$":
        # append a new letter chosen according to the markov chain
        gram = word[-n:]
        if gram in transitions:
            word += sample(transitions[gram])[-1:]
        else:
            # Unknown state: pick any lowercase letter or the terminator.
            # Qualified through the modules so the name actually resolves.
            word += random.choice(string.ascii_lowercase + "$")
        # optional: allow multi-word domains -- a short word that just ended
        # gets a fresh starting n-gram appended, which keeps the loop going
        if word[-1] == "$" and random.random() > 0.7 and len(word) < 8:
            word += sample(transitions[""])
    # remove the boundary markers and return the word
    return word.replace("^", "").replace("$", "")
# Build the n-gram chain (gen_word reads the module-level `transitions`)
# and display 20 generated words.
words, transitions = make_words()
[gen_word() for i in range(20)]
Out[83]:
In [82]:
# Rebuild the chain and display another 20 generated words.
# NOTE(review): these are the same two statements as the cell before it.
words, transitions = make_words()
[gen_word() for i in range(20)]
Out[82]: