Today's topic is term frequency–inverse document frequency (TF-IDF), a statistic for estimating how important a word or phrase is to a document within a set of documents.


As an example, I've been tracking ...

In [164]:
%matplotlib inline
from matplotlib import pyplot as plt
import os
import random
import numpy as np
import pandas as pd
import twython
import time
import ConfigParser
from collections import defaultdict

In [165]:
# Load the Twitter API credentials from an INI-style properties file that has
# a [Params] section, then build the API client used by the download cells.
# NOTE(review): propertiesFile is an empty placeholder here -- point it at the
# real credentials file before running.
propertiesFile = ""
cp = ConfigParser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so fail loudly here instead of with a confusing
# NoSectionError on the first cp.get() below.
if not cp.read(propertiesFile):
    raise IOError("could not read properties file: %r" % (propertiesFile,))

APP_KEY            = cp.get('Params', 'app_key')
APP_SECRET         = cp.get('Params', 'app_secret')
OAUTH_TOKEN        = cp.get('Params', 'oauth_token')
OAUTH_TOKEN_SECRET = cp.get('Params', 'oauth_token_secret')

# Authenticated client; later cells call twitter.get_user_timeline(...).
twitter = twython.Twython(APP_KEY, APP_SECRET, OAUTH_TOKEN, OAUTH_TOKEN_SECRET)


In [166]:
# Build {snapshot name: set of follower names} from every file in the current
# directory whose name contains "_followers.txt"; the snapshot name is the
# part of the file name in front of that suffix.
files = os.listdir('.')
followers = {}
for filename in files:
    suffix_at = filename.find('_followers.txt')
    # > 0 (not >= 0): a file named exactly "_followers.txt" would give an
    # empty snapshot name, so it is skipped.
    if suffix_at > 0:
        # The file holds one follower per line; close it deterministically.
        with open(filename, 'r') as f:
            lines = f.read().split('\n')
        # Drop blank lines (e.g. from a trailing newline) so the empty string
        # is never treated as a follower.
        followers[filename[0:suffix_at]] = set(line.strip() for line in lines if line.strip())

In [167]:
# Walk consecutive follower snapshots and collect everybody who is present in
# one snapshot but missing from the next one.
# Dict keys come back in arbitrary order, so sort them to make the
# "today"/"tomorrow" pairs consecutive.
# NOTE(review): assumes the snapshot names sort chronologically (e.g. a
# date prefix) -- confirm against the actual file names.
keys = sorted(followers.keys())
unfollowers = []
allfollowers = set()
for i in range(len(keys)-1):
    today = followers[keys[i]]
    allfollowers = allfollowers.union(today)
    tomorrow = followers[keys[i+1]]
    unfollow = today.difference(tomorrow)
    # Record the accounts lost between these two snapshots; without this the
    # unfollowers list stays empty and the later sampling has nothing to use.
    unfollowers.extend(unfollow)

In [5]:
# Draw a random comparison group of followers, the same size as the
# unfollower group.
# random.sample() needs a sequence (sets are rejected by newer Pythons), and
# sorting first makes the draw reproducible for a given random seed.
# min() guards against unfollowers containing duplicates and therefore being
# longer than the population.
# NOTE(review): allfollowers also contains accounts that later unfollowed, so
# fol and nfol may overlap.
fol = random.sample(sorted(allfollowers), min(len(unfollowers), len(allfollowers)))
nfol = unfollowers

In [6]:
# Two separate, initially empty caches; the download loops below store one
# entry per account name in them.
follower_tweets, unfollower_tweets = {}, {}

In [21]:
# Download the timeline of every sampled follower into follower_tweets
# (screen name -> list of tweet texts) and tally API errors by message.
# `twitter` must be an authenticated twython.Twython client.
errors = defaultdict(int)
for u in fol:
    # Already fetched on a previous run of this cell: do not hit the API again.
    if u in follower_tweets:
        continue
    try:
        tweets = twitter.get_user_timeline(screen_name=u)
        # Keep only the text; downstream cells lower-case and split each entry,
        # so every stored item has to be a string.
        # Each timeline entry is expected to be a status dict with a 'text' field.
        follower_tweets[u] = [tweet['text'] for tweet in tweets]
    except twython.TwythonError as e:
        # A 404 means the account no longer exists; ignore it quietly and
        # count every other failure (e.g. 401 for protected accounts).
        if not e.msg.startswith('Twitter API returned a 404 (Not Found)'):
            errors[e.msg] += 1
print(errors)

defaultdict(<type 'int'>, {'Twitter API returned a 401 (Unauthorized), An error occurred processing your request.': 9})

In [23]:
# Download the timeline of every unfollower into unfollower_tweets
# (screen name -> list of tweet texts) and tally API errors by message.
# `twitter` must be an authenticated twython.Twython client.
errors = defaultdict(int)
for u in nfol:
    # Already fetched on a previous run of this cell: do not hit the API again.
    if u in unfollower_tweets:
        continue
    try:
        tweets = twitter.get_user_timeline(screen_name=u)
        # Keep only the text; downstream cells lower-case and split each entry,
        # so every stored item has to be a string.
        # Each timeline entry is expected to be a status dict with a 'text' field.
        unfollower_tweets[u] = [tweet['text'] for tweet in tweets]
    except twython.TwythonError as e:
        # A 404 means the account no longer exists; ignore it quietly and
        # count every other failure (e.g. 401 for protected accounts).
        if not e.msg.startswith('Twitter API returned a 404 (Not Found)'):
            errors[e.msg] += 1
print(errors)

defaultdict(<type 'int'>, {'Twitter API returned a 401 (Unauthorized), An error occurred processing your request.': 30})

In [26]:
import pickle
# Checkpoint both tweet caches to disk so the slow download cells do not have
# to be re-run. The with-block guarantees the file is flushed and closed.
with open('tfidfbackup.pkl', 'wb') as backup_file:
    pickle.dump({'unfollower_tweets': unfollower_tweets, 'follower_tweets': follower_tweets}, backup_file)

In [168]:
import pickle
# Restore both tweet caches from the checkpoint written by the dump cell.
# NOTE: pickle.load can execute arbitrary code -- only load a backup file
# that this notebook produced itself.
with open('tfidfbackup.pkl', 'rb') as backup_file:
    x = pickle.load(backup_file)
unfollower_tweets = x['unfollower_tweets']
follower_tweets = x['follower_tweets']

In [177]:

In [180]:
from gensim import corpora, models, similarities
from collections import defaultdict

In [183]:
# Flatten the per-user tweet lists into one list of documents (one tweet each).
list_of_lists = unfollower_tweets.values()
documents = [val for sublist in list_of_lists for val in sublist]

# Lower-case and whitespace-tokenise every document.
texts = [document.lower().split() for document in documents]

# Count how often each token occurs across the whole collection.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Drop tokens that occur only once. This has to run after the counting loop
# has finished: filtering inside the loop uses partial counts and ends up
# emptying every document.
texts = [[token for token in text if frequency[token] > 1] for text in texts]

from pprint import pprint

In [169]:
In [6]:
import gensim

# Load a saved gensim dictionary and Matrix Market corpus, weight the corpus
# with TF-IDF, project it onto 2 LSI topics and dump the coordinates.
# NOTE(review): MODELS_DIR is not defined anywhere in the visible notebook and
# both file names are empty placeholders -- fill them in before running.
dictionary = gensim.corpora.Dictionary.load(os.path.join(MODELS_DIR, ""))
corpus = gensim.corpora.MmCorpus(os.path.join(MODELS_DIR, ""))

tfidf = gensim.models.TfidfModel(corpus, normalize=True)
corpus_tfidf = tfidf[corpus]

# project to 2 dimensions for visualization
lsi = gensim.models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)

# write out coordinates to file
# Text mode, because formatted strings are written; the with-block closes the
# file even if the loop fails.
with open(os.path.join(MODELS_DIR, "coords.csv"), 'w') as fcoords:
    # The LSI model was trained on TF-IDF vectors, so it has to be applied to
    # the TF-IDF corpus as well, not to the raw bag-of-words corpus.
    for vector in lsi[corpus_tfidf]:
        # Documents without a value for both topics cannot be plotted; skip them.
        if len(vector) != 2:
            continue
        fcoords.write("%6.4f\t%6.4f\n" % (vector[0][1], vector[1][1]))

In [ ]:
# Scratch cell (never executed: its prompt is "In [ ]").
# NOTE(review): PorterStemmer, TfidfVectorizer and tokenize are not imported
# or defined in the visible part of this notebook, and token_dict is still
# empty when it is vectorised -- confirm what this cell was meant to do.
token_dict = dict()
stemmer = PorterStemmer()

# Fitting the vectoriser can take some time.
# This rebinds the name `tfidf`, which an earlier cell uses for a gensim model.
tfidf = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
tfs = tfidf.fit_transform(token_dict.values())

In [21]:
import random


In [110]:
import random
import math
# Synthetic demo data: a handful of made-up tokens with random weights.
tokens = ['adf', 'klj', 'iuaod', 'ssfle', 'edfel', 'egfel', 'efhlef', 'efjlef', 'sokeof', 'adf', 'adffd', 'f', 'ff', 'fff', 'ffff', 'fffff', 'fa', 'faa', 'faaa']
# One weight per token. Cubing a uniform(0, 1) draw skews the weights towards
# zero, so a few tokens dominate.
w = np.array([math.pow(random.random(), 3) for t in tokens])
# Normalise so the weights sum to 1.
w = w / w.sum()
df = pd.DataFrame({'token': tokens, 'w': w})
# sort_values replaces the removed DataFrame.sort; smallest weight first.
df = df.sort_values('w', ascending=True)
# Re-label the rows 1..N in ascending-weight order.
df.index = np.arange(df.shape[0]) + 1

In [130]:
# Build `rem`, the table that gets plotted: the heaviest terms in descending
# weight order, followed by one '--[other terms]--' row that aggregates the
# weight of the terms left out. Expects df to be sorted by ascending 'w'.
# NOTE(review): this lumps the 10 *lightest* terms together and keeps all the
# others, as the original code did -- confirm that "keep the top N" was not
# the intent.
if df.shape[0] > 10:
    # Positional slicing replaces the removed .ix indexer.
    tail = df['w'].iloc[:10].sum()
    rem = df.iloc[10:].sort_values('w', ascending=False)
    # Append the aggregate row with concat; assigning to a computed label can
    # silently overwrite an existing row when that label is already in use.
    other = pd.DataFrame({'token': ['--[other terms]--'], 'w': [tail]})
    rem = pd.concat([rem, other], ignore_index=True)
else:
    # Nothing to aggregate: keep every term so `rem` is always defined.
    rem = df.sort_values('w', ascending=False).reset_index(drop=True)

In [131]:

token w
0 fa 0.182045
1 faa 0.173771
2 adf 0.148007
3 ssfle 0.139892
4 ff 0.101003
5 klj 0.058918
6 ffff 0.055602
7 egfel 0.032608
8 fffff 0.029047
9 --[other terms]-- 0.079108

In [163]:
# Horizontal bar chart of the term weights in `rem`, heaviest term on top.
# The aggregated '--[other terms]--' bar gets its own colour so it is not
# mistaken for a real term.
colors = ['gray' if token == '--[other terms]--' else 'b' for token in rem['token']]
# Negated positions put row 0 at the top. align='edge' is spelled out because
# the tick offset and y-limits below assume bars that start at their y value.
plt.barh(rem.index * -1, rem['w'], color=colors, align='edge')
# +0.4 centres each label on its 0.8-high bar.
plt.yticks(rem.index * -1 + 0.4, rem['token'])
plt.ylim(-1 * rem.shape[0] + 0.8, 1)
# The weights run along the horizontal axis of a barh chart.
plt.xlabel('phrase weight')

