Comparing collections (Part One)

  • Set comparison
  • Ordered collections
  • Ranked collections
  • Collection transformation

In [22]:
import random
import collections
import operator
import time

import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
import twitter
%matplotlib inline

import count_min

# some matplotlib color-mapping 
cmap = plt.get_cmap('viridis')
c_space = np.linspace(0,99,100)

Set comparison


In [4]:
# create two sets of data scientists
a = {'josh','fiona','scotty','skippy'}
b = {'jeff','whitney','fiona'}

In [5]:
a.intersection(b)


Out[5]:
{'fiona'}

In [6]:
c = set(range(10))
d = set(range(8,15))

In [7]:
c.intersection(d)


Out[7]:
{8, 9}

In [8]:
c.union(d)


Out[8]:
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}
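
A natural way to collapse these set operations into a single similarity score is the Jaccard index: the size of the intersection divided by the size of the union. The helper below is a minimal sketch, not part of the notebook's existing code.

In [ ]:
def jaccard(s1, s2):
    """Jaccard similarity: |intersection| / |union|, in [0, 1]"""
    if not s1 and not s2:
        return 1.0  # convention: two empty sets are identical
    return len(s1 & s2) / len(s1 | s2)

# e.g. for the sets above: 2 shared elements, 15 in the union
jaccard(c, d)  # ~0.133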

List comparison

Number sequences


In [9]:
# create an ordered sequence of evenly-spaced integers
# create a second ordered sequence of integers, which differs from the first only by noise
a = np.array([[i,i+np.random.normal()] for i in range(100)])

In [10]:
# the color variation shows the sequence order
plt.scatter(a[:,0],a[:,1],c=c_space,cmap=cmap)


Out[10]:
<matplotlib.collections.PathCollection at 0x114d83fd0>

In [11]:
# check the correlation between the sequences (off-diagonal elements), 
# which will be high for small noise
np.corrcoef(a,rowvar=0)


Out[11]:
array([[ 1.        ,  0.99951139],
       [ 0.99951139,  1.        ]])

In [12]:
# now create two similarly related sequences, but with non-even spacing and larger noise
_ = [1,4,5,8,15,45,48,50,55,60,88,89,90,93,99]
b = np.array([[i,i+np.random.normal()*5] for i in _])

In [13]:
plt.scatter(b[:,0],b[:,1],c=np.linspace(0,99,len(b)),cmap=cmap)


Out[13]:
<matplotlib.collections.PathCollection at 0x114e399b0>

In [14]:
# check the correlation
np.corrcoef(b,rowvar=0)


Out[14]:
array([[ 1.        ,  0.99154274],
       [ 0.99154274,  1.        ]])

In [15]:
# now create randomly-ordered sequences with larger noise
_ = np.array([random.random()*100 for _ in range(100)])
c = np.array([[i,i+np.random.normal()*10] for i in _])

In [16]:
plt.scatter(c[:,0],c[:,1],c=c_space,cmap=cmap)


Out[16]:
<matplotlib.collections.PathCollection at 0x114f3d278>

In [17]:
# the correlation coefficient is still relatively large
np.corrcoef(c,rowvar=0)


Out[17]:
array([[ 1.        ,  0.94772484],
       [ 0.94772484,  1.        ]])

Try relating the noise scale and sparsity of the sequences to the correlation coefficient.
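
As a starting point, here is a minimal sketch of that experiment (the variable names are only illustrative): sweep the noise scale and watch the off-diagonal correlation decay.

In [ ]:
# sketch: off-diagonal correlation as a function of noise scale
for scale in [1, 5, 10, 25, 50]:
    seq = np.array([[i, i + np.random.normal()*scale] for i in range(100)])
    r = np.corrcoef(seq, rowvar=0)[0, 1]
    print(scale, round(r, 4))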

Ordinal comparison

e.g. comparing rank


In [50]:
"""
Get Tweets from the Twitter public API
"""
import yaml

# Get your Twitter API tokens
# this is specific to my computer; modify for yours
my_creds_file = '/Users/jkolb/.twitter_api_creds'
creds = yaml.safe_load(open(my_creds_file))
consumer_key = creds['audience']['consumer_key']
consumer_secret = creds['audience']['consumer_secret']
access_token_key = creds['audience']['token']
access_token_secret = creds['audience']['token_secret']

In [ ]:
api = twitter.Api(consumer_key=consumer_key,
                 consumer_secret=consumer_secret,
                 access_token_key=access_token_key,
                 access_token_secret=access_token_secret
                 )

mom_tweets = []
for _ in range(20):
    mom_tweets.extend( api.GetSearch("mom",count=100) )
    time.sleep(1)

In [ ]:
"""
Get Tweets from the Gnip Search API
"""

"""
from search.api import Query
import json
import yaml
creds = yaml.load(open('/Users/jkolb/.creds.yaml'))

# set up a query to the Gnip Search API
q = Query(creds['username'],
          creds['password'],
          creds['search_endpoint'],
          paged=True,
          hard_max = 1000,
          search_v2 = True
          )

# query parameters
start_date = '2016-08-01T00:00'
end_date = '2016-09-01T00:00'
rule = 'mom'

# get the tweet data
q.execute(rule,start=start_date,end=end_date)
mom_tweets = list(q.get_activity_set())
""""""

In [44]:
def token_frequency(tweets, num_top_terms=30, delta=10**-5, epsilon=0.001):
    """
    Space-tokenize tweet bodies and 
    return exact and approximate 1-gram counts
    
    Approximate the counts with a count-min sketch
    """
    
    terms = collections.defaultdict(int)
    sketch = count_min.Sketch(dict(delta=delta,epsilon=epsilon,k=num_top_terms))
    for tweet in tweets:
        try:
            tokens = tweet['body'].split()
        except TypeError:
            tokens = tweet.AsDict()['text'].split()
        for token in tokens:
            terms[token.lower()] += 1
            sketch.update(token.lower(),1)
    return (terms,sketch)
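
The `count_min` module is local to this notebook, so its internals aren't shown here. For reference, the core of a count-min sketch is small: hash each item into one counter per row, increment all of those counters on update, and take the minimum at query time (hash collisions can only inflate counts, never deflate them). The class below is an illustrative sketch under those assumptions, not the module's actual implementation, and it omits the top-k tracking used above.

In [ ]:
class SimpleCountMin:
    """minimal count-min sketch: depth rows of width counters"""
    def __init__(self, width=1000, depth=5):
        self.width = width
        self.table = np.zeros((depth, width), dtype=int)
        # per-row salts so each row acts as a different hash function
        self.salts = [str(i) for i in range(depth)]
    def update(self, item, count=1):
        for row, salt in enumerate(self.salts):
            self.table[row, hash(salt + item) % self.width] += count
    def query(self, item):
        # the minimum over rows is the least-inflated estimate
        return min(self.table[row, hash(salt + item) % self.width]
                   for row, salt in enumerate(self.salts))

# cm = SimpleCountMin()
# cm.update('mom'); cm.query('mom')  # -> 1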

In [45]:
def display_two_lists(list_1,list_2):
    """helper function"""
    for x,y in zip(list_1,list_2):
        print(x,y)

In [46]:
num_top_terms = 30
# accuracy parameters for CM sketch
delta = 10**-4
epsilon = 0.01

In [47]:
# get exact and approximate top terms and counts
mom_terms,mom_sketch = token_frequency(mom_tweets,num_top_terms,delta,epsilon)
exact_top_mom_terms = list(reversed(sorted(mom_terms.items(),key = operator.itemgetter(1))))[:num_top_terms]
approx_top_mom_terms = [(term,count) for count,term in reversed(sorted(mom_sketch.top_k.values(),key = operator.itemgetter(0)))]

Kendall's tau coefficient is a rank-correlation coefficient: it is proportional to the difference between the number of concordant pairs and the number of discordant pairs, normalized by the total number of pairs.
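
For intuition, the statistic can be computed by brute force: over all pairs of positions, count those ordered the same way in both sequences (concordant) and those ordered oppositely (discordant). The sketch below is the tie-ignoring variant (tau-a); scipy's implementation also corrects for ties.

In [ ]:
from itertools import combinations

def kendall_tau_a(x, y):
    """brute-force Kendall's tau-a for two equal-length numeric sequences"""
    pairs = list(combinations(range(len(x)), 2))
    concordant = discordant = 0
    for i, j in pairs:
        sign = (x[i] - x[j]) * (y[i] - y[j])
        if sign > 0:
            concordant += 1
        elif sign < 0:
            discordant += 1
        # tied pairs (sign == 0) count toward neither
    return (concordant - discordant) / len(pairs)

# kendall_tau_a([1,2,3,4], [1,3,2,4])  # -> 0.667: one discordant pair out of six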


In [48]:
kt_result = scipy.stats.kendalltau(exact_top_mom_terms,approx_top_mom_terms)
kt_result.correlation


/Users/jkolb/pyenvs/3.5.1/lib/python3.5/site-packages/scipy/stats/stats.py:250: RuntimeWarning: The input array could not be properly checked for nan values. nan values will be ignored.
  "values. nan values will be ignored.", RuntimeWarning)
Out[48]:
0.78551229330666639

In [49]:
display_two_lists(exact_top_mom_terms,approx_top_mom_terms)


('mom', 1507) ('mom', 1512)
('rt', 1236) ('rt', 1248)
('my', 941) ('my', 946)
('to', 901) ('to', 911)
('and', 745) ('and', 783)
('your', 710) ('your', 720)
('i', 648) ('i', 648)
('you', 551) ('you', 551)
('a', 431) ('mom:', 453)
('mom:', 424) ('a', 431)
('me', 415) ('the', 425)
('the', 413) ('me', 419)
('this', 283) ('is', 299)
('is', 279) ('this', 286)
('me:', 231) ('me:', 247)
('when', 229) ('when', 236)
('how', 219) ('her', 219)
('her', 205) ('how', 219)
('for', 201) ('for', 201)
('of', 185) ('of', 185)
('she', 175) ('she', 183)
('it', 160) ('been', 178)
('been', 157) ('through', 177)
(':', 156) ('@pearcegodeep:', 172)
('many', 156) ('"', 167)
('"', 156) ('it', 164)
('through', 150) ('https://t.co/esugkx28mv', 164)
('@pearcegodeep:', 142) (':', 160)
('https://t.co/esugkx28mv', 142) ('many', 156)
('do', 132) ('do', 154)

A harder problem: how to account for the varying importance of rank?
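
One standard answer (not used in this notebook) is rank-biased overlap (Webber et al., 2010), which compares the lists' overlap at every depth and discounts deeper ranks geometrically, so disagreement near the top costs more. A minimal truncated sketch:

In [ ]:
def rank_biased_overlap(list_1, list_2, p=0.9):
    """truncated rank-biased overlap of two ranked lists;
    smaller p concentrates the weight nearer the top"""
    depth = min(len(list_1), len(list_2))
    seen_1, seen_2 = set(), set()
    rbo = 0.0
    for d in range(1, depth + 1):
        seen_1.add(list_1[d - 1])
        seen_2.add(list_2[d - 1])
        agreement = len(seen_1 & seen_2) / d
        rbo += (1 - p) * p**(d - 1) * agreement
    return rbo  # truncation means the maximum is slightly below 1

# e.g. on the ranked terms, ignoring the counts:
# rank_biased_overlap([t for t, c in exact_top_mom_terms],
#                     [t for t, c in approx_top_mom_terms])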

Over/under-indexing


In [51]:
# get some data around the term 'dad'

# public API version
dad_tweets = []
for _ in range(20):
    dad_tweets.extend( api.GetSearch("dad",count=100) )
    time.sleep(1)

In [53]:
"""
# Gnip API versions
rule = 'dad'
q.execute(rule,start=start_date,end=end_date)
dad_tweets = list(q.get_activity_set())
"""


Out[53]:
"\n# Gnip API verions\nrule = 'dad'\nq.execute(rule,start=start_date,end=end_date)\ndad_tweets = list(q.get_activity_set())\n"

In [54]:
dad_terms,dad_sketch = token_frequency(dad_tweets,num_top_terms,delta,epsilon)
exact_top_dad_terms = list(reversed(sorted(dad_terms.items(),key = operator.itemgetter(1))))[:num_top_terms]
approx_top_dad_terms = [(term,count) for count,term in reversed(sorted(dad_sketch.top_k.values(),key = operator.itemgetter(0)))]

In [55]:
display_two_lists(exact_top_dad_terms,exact_top_mom_terms)


('dad', 1549) ('mom', 1507)
('rt', 1127) ('rt', 1236)
('to', 980) ('my', 941)
('i', 860) ('to', 901)
('the', 844) ('and', 745)
('me', 714) ('your', 710)
('you', 690) ('i', 648)
('your', 675) ('you', 551)
('on', 675) ('a', 431)
('he', 655) ('mom:', 424)
('my', 655) ('me', 415)
('before', 538) ('the', 413)
('please', 526) ('this', 283)
('yo!', 510) ('is', 279)
('planet.', 510) ('me:', 231)
('@donaldjtrumpjr!', 510) ('when', 229)
('destroys', 510) ('how', 219)
('ask', 510) ('her', 205)
('twitter.', 510) ('for', 201)
('follow', 510) ('of', 185)
('resign', 510) ('she', 175)
('noticed', 510) ('it', 160)
('thanks…', 490) ('been', 157)
('@sethrogen:', 490) (':', 156)
('and', 376) ('many', 156)
('is', 346) ('"', 156)
('a', 339) ('through', 150)
('for', 231) ('@pearcegodeep:', 142)
('are', 193) ('https://t.co/esugkx28mv', 142)
('this', 182) ('do', 132)

But we don't really care about rank here. We care about removing the effect of a baseline.


In [56]:
def normalize_by_difference(term_counts,baseline_counts):
    """define a normalized term frequency that subtracts off a baseline count"""
    normed_term_counts = {}
    for term,count in term_counts.items():
        try:
            normed_term_counts[term] = count - baseline_counts[term]
        except KeyError:
            normed_term_counts[term] = count
    return normed_term_counts

In [57]:
normalized_results = normalize_by_difference(dad_terms,mom_terms)

# look at top of list to see most "dad"-like terms
list(reversed(sorted(normalized_results.items(), key = operator.itemgetter(1))))[:20]


Out[57]:
[('dad', 1462),
 ('on', 590),
 ('he', 563),
 ('before', 538),
 ('yo!', 510),
 ('planet.', 510),
 ('resign', 510),
 ('@donaldjtrumpjr!', 510),
 ('destroys', 510),
 ('twitter.', 510),
 ('noticed', 510),
 ('ask', 494),
 ('please', 492),
 ('thanks…', 490),
 ('@sethrogen:', 490),
 ('follow', 484),
 ('the', 431),
 ('me', 299),
 ('i', 212),
 ('you', 139)]

In [58]:
# and look at the bottom of the list to see the mom-like terms
list(reversed(sorted(normalized_results.items(), key = operator.itemgetter(1))))[-20:]


Out[58]:
[('go', -77),
 ('at', -77),
 ('did', -86),
 ('a', -92),
 ('been', -95),
 ('times', -97),
 ('do', -99),
 ('this', -101),
 ('an', -102),
 ('someone', -105),
 ('rt', -109),
 ('her', -114),
 ('many', -153),
 ('"', -155),
 ('when', -160),
 ('how', -191),
 ('me:', -227),
 ('my', -286),
 ('and', -369),
 ('mom', -1412)]

But this normalization-by-difference only works if the two term frequency distributions have the same scale of counts.


In [59]:
def normalize_by_fraction(term_counts,baseline_counts):
    """normalize counts by the difference of the term-fractions for each distribution"""
    normed_term_counts = {}
    B = sum(baseline_counts.values())
    A = sum(term_counts.values())
    for term,count in term_counts.items():
        try:
            # fraction of baseline distribution for this term
            b_frac = baseline_counts[term]/B
        except KeyError:
            b_frac = 0
        
        # fraction of primary term frequency distribution for this term
        a_frac = count/A
        
        factor = (a_frac-b_frac)
        normed_term_counts[term] = count * factor 
    return normed_term_counts

In [60]:
normalized_results = normalize_by_fraction(dad_terms,mom_terms)
sorted_list = list(reversed(sorted(normalized_results.items(), key = operator.itemgetter(1))))
sorted_list[:20]


Out[60]:
[('dad', 65.04625878816911),
 ('on', 11.35391535337112),
 ('he', 10.49533948411985),
 ('the', 9.62448294280985),
 ('before', 8.356979933593186),
 ('yo!', 7.50974447812906),
 ('planet.', 7.50974447812906),
 ('resign', 7.50974447812906),
 ('@donaldjtrumpjr!', 7.50974447812906),
 ('destroys', 7.50974447812906),
 ('twitter.', 7.50974447812906),
 ('noticed', 7.50974447812906),
 ('please', 7.42691656924851),
 ('ask', 7.25358374982895),
 ('follow', 7.0934832946413815),
 ('thanks…', 6.932293922332899),
 ('@sethrogen:', 6.932293922332899),
 ('me', 5.417262730735218),
 ('i', 3.85985063825802),
 ('you', 1.8111924301860725)]

Well, this sort of works. We could also combine the term-fractions in other ways, e.g. as a ratio (a_frac/b_frac) or as a relative difference ((a_frac - b_frac)/a_frac).
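
For example, a ratio version might look like the sketch below; the smoothing constant is an assumption added here so terms absent from the baseline don't divide by zero.

In [ ]:
def normalize_by_ratio(term_counts, baseline_counts, smoothing=1e-6):
    """score each term by the ratio of its term-fractions,
    with a small additive smoothing on both fractions"""
    A = sum(term_counts.values())
    B = sum(baseline_counts.values())
    normed_term_counts = {}
    for term, count in term_counts.items():
        a_frac = count / A
        b_frac = baseline_counts.get(term, 0) / B
        normed_term_counts[term] = (a_frac + smoothing) / (b_frac + smoothing)
    return normed_term_counts

# normalized_results = normalize_by_ratio(dad_terms, mom_terms)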

In the end, we need to think harder about what differences and similarities we want to be able to highlight and ignore. See Part Two...