In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# Column labels applied to the loaded table. The first row of the file is
# skipped (skiprows=1) and these names are used instead -- presumably that row
# is the file's own header; confirm against the CSV.
names = ['term', 'counts', 'idf', 'percentage_of_documents']

# Forward slashes are used as separators because they resolve on both Windows
# and POSIX systems; backslashes are only path separators on Windows, so the
# previous raw-string path could not be opened elsewhere. The path is relative
# to the notebook's working directory.
df = pd.read_csv(
    '../../data/text/bitcoin/src/corpus/by_day/frequencies.csv',
    names=names,
    skiprows=1,
    index_col='term',  # rows are keyed by the 'term' column
)
In [3]:
# Preview the first five rows of the loaded frequency table.
df.head(n=5)
Out[3]:
In [4]:
# Histogram of the 'idf' column across all terms.
df['idf'].hist()
Out[4]:
In [5]:
# Boolean mask: True for terms whose idf lies strictly between 0.5 and 2.
q1 = (df['idf'] > .5) & (df['idf'] < 2)

# Split the table by the mask: rows outside the band go to the "cleaned"
# subset, rows inside the band go to the "accepted" subset.
df_cleaned_subset = df[~q1]
df_accepted_subset = df[q1]
In [6]:
# Number of rows that fell outside the idf band, followed by the histogram of
# their idf values.
print(df_cleaned_subset.shape[0])
df_cleaned_subset['idf'].hist()
Out[6]:
In [7]:
# Number of rows that fell inside the idf band, followed by the histogram of
# their idf values.
print(df_accepted_subset.shape[0])
df_accepted_subset['idf'].hist()
Out[7]:
In [ ]: