In [46]:
import matplotlib as mpl
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline, directly below the cell.
%matplotlib inline
# Apply matplotlib's built-in 'bmh' style sheet to every figure created afterwards.
mpl.style.use('bmh')
In [2]:
import pandas as pd
In [22]:
# Load the token-count table and order its rows by ascending `token_count`.
df = (
    pd.read_csv('data/token-counts.csv')
    .sort_values(by='token_count')
)
In [45]:
# Pull out the two plotted columns as Series.
# Bracket access is required for 'count': `df.count` is the DataFrame method,
# not the column.
x, y = df['token_count'], df['count']
In [44]:
# Scatter plot of `y` against `x` on a single, explicitly created Axes.
fig, ax = plt.subplots(figsize=(20, 10))

# Labelling.
ax.set_title('Document lengths')
ax.set_xlabel('Word count')
ax.set_ylabel('Number of documents')

# Logarithmic y-axis; the x-axis is clipped to [0, 200000], so any points
# beyond that range are not shown.
ax.set_yscale('log')
ax.set_xlim((0, 200000))

# Small red markers, one per row of the data.
ax.scatter(x, y, s=5, c='r')
plt.show()