In [29]:
## Trying my column categorization code out on the Canada data set
In [30]:
import numpy as np
import pandas as pd
# Load the Canada example dataset into a DataFrame. To analyse a different
# file (e.g. 'Georgia_AllTenders_D1strow_.csv'), change the path below.
data = pd.read_csv(filepath_or_buffer='canada_example.csv')
In [31]:
# Preview the data: restrict to the first ten columns, then show the first
# five rows (head() defaults to 5) so the rendered table stays readable.
data[data.columns[:10]].head()
Out[31]:
In [32]:
# Count how many distinct (non-null) values each column holds.
#
# Produces three notebook-level names used by later cells:
#   col_name_list                     -- list of all column labels in `data`
#   col_name_frequency_dict           -- {column label: number of distinct values}
#   series_Column_Entities_Frequency_ -- the same mapping as a pandas Series
col_name_list = list(data.columns.values)

# Series.nunique() ignores NaN by default, exactly like len(value_counts()),
# but does not build the full table of counts just to measure its length.
col_name_frequency_dict = {
    column: data[column].nunique() for column in col_name_list
}

# Wrap the dict in a Series so it can be filtered and sorted with pandas.
series_Column_Entities_Frequency_ = pd.Series(col_name_frequency_dict)
In [33]:
# Interactive query: ask for a threshold and report how many columns have
# fewer distinct values than that threshold.
# NOTE: the prompt blocks "Restart & Run All" until someone answers it, and
# int() raises ValueError if the answer is not a whole number.
try:
    prompt_for_text = raw_input  # Python 2
except NameError:
    prompt_for_text = input  # Python 3 renamed raw_input to input

values = int(prompt_for_text("How many columns have less then this number of entities:"))

# Comparing the Series with a scalar yields a boolean mask; summing the mask
# counts the True entries, i.e. the columns below the threshold.
col_query = (series_Column_Entities_Frequency_ < values).sum()
print(col_query)
In [34]:
# Report the number of columns and their names on a single line.
# The pieces are joined with single spaces, which reproduces what the
# Python 2 print statement emitted while also running on Python 3.
summary_parts = ("There are", len(col_name_list),
                 "columns listed and their names are:", col_name_list)
print(" ".join(str(part) for part in summary_parts))
In [36]:
# Order the per-column distinct-value counts from fewest to most and display
# them. The in-place Series.sort() is no longer available in current pandas;
# sort_values() returns a new Series, which is rebound to the same name so
# the variable still ends up sorted for anything that runs afterwards.
series_Column_Entities_Frequency_ = series_Column_Entities_Frequency_.sort_values()
series_Column_Entities_Frequency_
Out[36]:
In [35]:
In [35]: