In [1]:
import pandas as pd
import ujson
import codecs
import cPickle
import nltk
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
import math
from collections import Counter
# Load the pickled corpus from disk.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source. The absolute path is machine-specific; consider
# moving it to a configurable constant.
# Pickle data is binary, so the file is opened in 'rb' mode, and the `with`
# block closes the handle as soon as loading finishes.
with open('/home/paul/backup/2014-09-18.pickle', 'rb') as pi:
    data = cPickle.load(pi)
print("opened.")
# Keep only the first 100,000 records.
data = data[0:100000]
In [2]:
# Tokenise every text and collect all tokens into one flat list.
vocab = set()    # NOTE(review): never populated in the cells visible here
word_dict = {}   # placeholder; rebound to a Counter in the following cell
# Presumably element 0 of each record is the raw text -- confirm against the
# code that produced the pickle.
texts = [d[0] for d in data]
all_toks = []
for i, t in enumerate(texts, 1):
    # Progress marker after every 2000 texts; print(i) behaves the same on
    # Python 2 and Python 3.
    if i % 2000 == 0:
        print(i)
    # extend() appends every token of this text in a single call.
    all_toks.extend(nltk.word_tokenize(t))
In [3]:
# Tally how often each token occurs across the collected token list.
word_dict = Counter()
word_dict.update(all_toks)
# Bare last expression so the notebook displays the 20 most frequent tokens.
word_dict.most_common(n=20)
Out[3]:
In [5]:
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
import math
In [6]:
# Rank-frequency scatter plot: one point per distinct token, most frequent
# token first.
keys = word_dict.keys()
# Use the raw occurrence counts, sorted in descending order. Counts are not
# shifted, so the y-axis shows the true frequency, and tied counts are kept
# (no de-duplication) so each position really is a token's rank.
freqs = sorted((word_dict[k] for k in keys), reverse=True)
# Ranks are 0-based here; the log-log cell adds 1 before taking logarithms.
ranks = np.arange(len(freqs))
plt.scatter(ranks, freqs)
plt.xlabel('Rank', fontsize=28)
plt.ylabel('Frequency', fontsize=28)
plt.title('Highly skewed term-frequency distribution')
plt.show()
In [48]:
# log-log data
# Base-3 logarithms of the 1-shifted values; the shift keeps the 0-based
# first rank finite.
lranks = [math.log(1 + r, 3) for r in ranks]
lfreqs = [math.log(1 + f, 3) for f in freqs]
# Rank goes on x and frequency on y so the plotted data match the axis
# labels below and the orientation of the linear-scale plot.
plt.scatter(lranks, lfreqs)
plt.xlabel('Log rank', fontsize=28)
plt.ylabel('Log frequency', fontsize=28)
plt.title('Log-log rank frequency plot')
plt.show()