In [29]:
##  Trying my column categorization code out on the Canada data set

In [30]:
import numpy as np
import pandas as pd

##  Input file for this run. An alternative input is kept below, disabled:
##data = pd.read_csv('Georgia_AllTenders_D1strow_.csv')
csv_path = 'canada_example.csv'
data = pd.read_csv(csv_path)

In [31]:
##print data[:10]  ##It drives me nuts that this does not work, something about a header?

##  Preview instead: first five rows, limited to the first ten columns so the
##  table stays narrow enough to read.
preview_columns = data.columns[:10]
data[preview_columns].head(5)


Out[31]:
language title reference_number solicitation_number amendment_number publication_date date_closing amendment_date publishing_status gsin
0 English SPARES FOR VICTORIA CLASS SUBMARINE (GRK(W8482... PW-14-00632891 GRK(W8482-146269/000/A) 0 2014-04-29 2014-06-09 14:00 Eastern Standard Time (EST) NaN Active N3120: Bearings, Plain, Unmounted, N4720: Hose...
1 French PIÈCES DE RECHANGE POUR SOUS-MARIN DE CLASSE V... PW-14-00632891 GRK(W8482-146269/000/A) 0 2014-04-29 2014-06-09 14:00 Eastern Standard Time (EST) NaN Active N3120: Bearings, Plain, Unmounted, N4720: Hose...
2 English Paving Parking Lot – St Anthony, 65 West Stree... PW-14-00632902 M1010-5-0187 0 2014-04-29 2014-05-20 14:00 Atlantic Daylight Time (ADT) NaN Active 51: Construction Services
3 French Paver le stationnement du détachement de la GR... PW-14-00632902 M1010-5-0187 0 2014-04-29 2014-05-20 14:00 Atlantic Daylight Time (ADT) NaN Active 51: Construction Services
4 English Plumbing Repair and Maintenance Services on Gr... PW-14-00632910 201401212 0 2014-04-29 2014-05-20 14:00 Eastern Daylight Time (EDT) NaN Active 5161AB: PLUMBING CONTRACTOR SERVICES

In [32]:
##  Count the distinct (non-null) values -- "entities" -- in every column of `data`.
col_name_list = list(data.columns.values)

##  Maps column name -> number of distinct entities in that column.
##  Series.nunique() ignores NaN by default, exactly like value_counts() does,
##  but avoids building the whole frequency table just to take its length.
col_name_frequency_dict = {
    item: data[item].nunique() for item in col_name_list
}

##  Load the column dict into a Series so it can be masked and sorted later.
series_Column_Entities_Frequency_ = pd.Series(col_name_frequency_dict)

In [33]:
##  Ask for a threshold and report how many columns have fewer distinct
##  entities than that threshold.
##  The prompt reader is picked at runtime so the cell runs on Python 2 and 3.
try:
    read_line = raw_input  ## Python 2
except NameError:
    read_line = input  ## Python 3: input() returns the typed text unevaluated

##  int() raises ValueError if the reply is not a whole number.
values = int(read_line("How many columns have less then this number of entities:"))

##  Boolean mask: True where a column's entity count is below the threshold;
##  summing the mask counts the True entries.
col_query = (series_Column_Entities_Frequency_ < values).sum()
print(col_query)


How many columns have less then this number of entities:6
10

In [34]:
##  Report how many columns there are and list their names.
##  Built as one formatted string so the output is identical under the
##  Python 2 print statement and the Python 3 print() function.
print("There are %d columns listed and their names are: %s" % (len(col_name_list), col_name_list))


There are 23 columns listed and their names are: ['language', 'title', 'reference_number', 'solicitation_number', 'amendment_number', 'publication_date', 'date_closing', 'amendment_date', 'publishing_status', 'gsin', 'region_opportunity', 'region_delivery', 'notice_type', 'trade_agreement', 'tendering_procedure', 'competitive_procurement_strategy', 'non_competitive_procurement_strategy', 'procurement_entity', 'end_user_entity', 'description', 'contact', 'document', 'attachment']

In [36]:
##  Order the columns from fewest to most distinct entities and display them.
##  The in-place Series.sort() has been removed from pandas; sort_values()
##  returns a new Series, which is bound back to the same name so the variable
##  still ends up sorted after this cell runs.
series_Column_Entities_Frequency_ = series_Column_Entities_Frequency_.sort_values()
series_Column_Entities_Frequency_


Out[36]:
amendment_date                           1
publishing_status                        1
publication_date                         1
non_competitive_procurement_strategy     1
amendment_number                         1
competitive_procurement_strategy         2
language                                 2
attachment                               3
tendering_procedure                      5
region_opportunity                       5
notice_type                              6
trade_agreement                          6
procurement_entity                       8
end_user_entity                         13
document                                13
region_delivery                         13
date_closing                            14
gsin                                    19
reference_number                        19
solicitation_number                     19
contact                                 19
description                             38
title                                   38
dtype: int64

In [35]:


In [35]: