In [1]:
## imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Column labels applied to the frequency table. They are supplied explicitly,
# and skiprows=1 drops the file's first line (presumably its own header row).
names = ['term', 'counts', 'idf', 'percentage_of_documents']

# Forward slashes are used instead of backslashes so the relative path
# resolves on POSIX systems as well as on Windows (where both forms work).
df = pd.read_csv(
    '../../data/text/bitcoin/src/corpus/by_day/frequencies.csv',
    names=names,
    skiprows=1,
    index_col='term',  # rows are keyed by the term itself
)

In [3]:
# Preview the first five rows of the frequency table.
df.head(n=5)


Out[3]:
counts idf percentage_of_documents
term
a!list!of!servers 1.0 7.847372 0.000391
newcomertrader 1.0 7.847372 0.000391
newcycle 1.0 7.847372 0.000391
newcryptos 1.0 7.847372 0.000391
newcrypto 1.0 7.847372 0.000391

In [4]:
# Histogram of the idf column across every term in the table.
df['idf'].hist()


Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x298c2225dd8>

In [5]:
# Boolean mask: True where idf lies strictly between 0.5 and 2.
q1 = (df['idf'] > .5) & (df['idf'] < 2)

# Rows inside the window versus everything else (the mask's complement).
df_accepted_subset = df.loc[q1]
df_cleaned_subset = df.loc[~q1]

Dirty terms: rows whose idf falls outside the open interval (0.5, 2), held in `df_cleaned_subset`.


In [6]:
# Number of rows in the subset, followed by a histogram of its idf values.
print(df_cleaned_subset.shape[0])
df_cleaned_subset['idf'].hist()


561913
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x298bd4edb38>

Usable terms: rows whose idf lies strictly between 0.5 and 2, held in `df_accepted_subset`.


In [7]:
# Number of rows in the subset, followed by a histogram of its idf values.
print(df_accepted_subset.shape[0])

df_accepted_subset['idf'].hist()


8075
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x298bd7e46a0>

In [ ]: