In [1]:
import pandas as pd
import ujson
import codecs
import cPickle
import nltk
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
import math
from collections import Counter

# Load the pickled tweet dataset and keep only the first 100k records.
# Fix: open in binary mode ('rb') — pickle data is binary — and use a
# context manager so the file handle is closed even if loading fails
# (the original leaked the handle and relied on text mode).
# NOTE(review): hardcoded absolute path — consider a configurable DATA_DIR.
# Caution: unpickling a file is arbitrary code execution; only load trusted data.
with open('/home/paul/backup/2014-09-18.pickle', 'rb') as pi:
    data = cPickle.load(pi)
print("opened.")

data = data[0:100000]


opened.

In [2]:
# Tokenize every text with NLTK and flatten all tokens into one list.
# Fixes: Python-2-only `print i` replaced with `print(i)` (consistent with
# the `print("opened.")` call above and valid in both Python 2 and 3);
# manual counter replaced with enumerate; per-token append replaced with
# extend; unused `vocab` set and the `word_dict = {}` that the next cell
# immediately overwrites are removed.
texts = [d[0] for d in data]  # d[0] presumably holds the tweet text — TODO confirm pickle schema

all_toks = []
for i, t in enumerate(texts, start=1):
    if i % 2000 == 0:
        print(i)  # progress indicator for the slow tokenization loop
    all_toks.extend(nltk.word_tokenize(t))


2000
4000
6000
8000

In [3]:
# Build the term-frequency table from the flattened token list and
# preview the 20 most frequent tokens (cell output shows stopwords,
# punctuation, and campaign hashtags dominating, as expected).
word_dict = Counter()
word_dict.update(all_toks)
word_dict.most_common(20)


Out[3]:
[('the', 4556),
 ('!', 4267),
 (',', 4026),
 ('hashsymbindyref', 3788),
 ('to', 3451),
 ('Scotland', 3117),
 ('hashsymbbettertogether', 2885),
 ('hashsymbVoteYes', 2690),
 ('a', 2258),
 ('I', 2226),
 ('and', 2046),
 ('vote', 1974),
 ('of', 1950),
 ('for', 1756),
 ("'s", 1751),
 ('in', 1702),
 ('is', 1630),
 ('you', 1427),
 ('be', 1303),
 ('it', 1281)]

In [5]:
# NOTE(review): redundant cell — every one of these modules is already
# imported in the first cell, so re-importing (and re-running rcdefaults)
# is a harmless no-op; the cell could simply be deleted.
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
import math

In [6]:
# Rank-frequency (Zipf) plot: one point per vocabulary term,
# x = frequency rank (0 = most common), y = the term's raw count.
#
# Fix: the original passed the counts through set(), which collapsed every
# group of terms sharing a frequency into a single point and broke the
# rank<->frequency pairing — it plotted distinct count VALUES against their
# index rather than term ranks. The unexplained "+1" added to every count
# is dropped as well; raw counts are plotted.
freqs = sorted(word_dict.values(), reverse=True)  # one count per term, descending
ranks = np.arange(len(freqs))                     # 0 = most frequent term

plt.scatter(ranks, freqs)
plt.xlabel('Rank', fontsize=28)
plt.ylabel('Frequency', fontsize=28)
plt.title('Highly skewed term-frequency distribution')

plt.show()

In [48]:
# Log-log rank-frequency plot — a Zipfian distribution appears roughly linear.
# Fix: the original called scatter(lfreqs, lranks) while labelling the x-axis
# 'Log rank' and the y-axis 'Log frequency', so the axes were swapped
# relative to their labels; rank now correctly goes on x, frequency on y.
# The "1 + value" shift guards against log(0) for rank 0.
# NOTE(review): log base 3 is unusual (base 10 or natural log is the
# convention); kept as-is since changing base only rescales, not reshapes.
lranks = [math.log(1 + r, 3) for r in ranks]
lfreqs = [math.log(1 + f, 3) for f in freqs]
plt.scatter(lranks, lfreqs)
plt.xlabel('Log rank', fontsize=28)
plt.ylabel('Log frequency', fontsize=28)
plt.title('Log-log rank frequency plot')

plt.show()