Word counts


In [46]:
import matplotlib as mpl
import matplotlib.pyplot as plt

%matplotlib inline
mpl.style.use('bmh')

In [2]:
import pandas as pd

In [22]:
# Read the token-count table and order its rows by ascending `token_count`,
# so everything downstream sees the lengths from smallest to largest.
df = (
    pd.read_csv('data/token-counts.csv')
    .sort_values(by='token_count')
)

In [45]:
# Plot coordinates: `token_count` goes on the horizontal axis and `count` on
# the vertical one (presumably document length vs. number of documents with
# that length; confirm against the CSV schema).
x, y = df['token_count'], df['count']

In [44]:
# Scatter plot of `y` against `x` with a logarithmic y axis, drawn through the
# explicit Figure/Axes interface so the cell never depends on whichever
# figure pyplot currently considers "active".

# Right edge of the visible x range; points beyond it are not drawn.
MAX_WORD_COUNT = 200000

fig, ax = plt.subplots(figsize=(20, 10))

ax.set_title('Document lengths')
ax.set_xlabel('Word count')
ax.set_ylabel('Number of documents')
ax.set_yscale('log')
# Fix the x window before plotting so the scatter does not rescale the axis.
ax.set_xlim(0, MAX_WORD_COUNT)
ax.scatter(x, y, s=5, c='r')
plt.show()