In [38]:
'''
Takes a csv file, and creates a dataframe, and a list of the column names.

Generates a frequency list for each column.  

For example the number of bidders for a Georgia dataset looks like this:

num_bidders   counters
1             37525
0             34447
2             15830
3             6847


Finally gets the length of that value counts, which should be the number of distinct entities within that column.

Smaller numbers are more frequently repeated.

'''


Out[38]:
'\nTakes a csv file, and creates a dataframe, and a list of the column names.\n\nGenerates a frequency list for each column.  \n\nFor example the number of bidders for a Georgia dataset looks like this:\n\nnum_bidders   counters\n1             37525\n0             34447\n2             15830\n3             6847\n\n\nFinally gets the length of that value counts, which should be the number of distinct entities within that column.\n\nSmaller numbers are more frequently repeated.\n\n'

In [39]:
import numpy as np
import pandas as pd

##  This is the csv file where I deleted the first row
data = pd.read_csv('Georgia_AllTenders_D1strow_.csv')

In [40]:
##  This format seems to work, which Sarah told me about and I found on stack overflow
##  http://stackoverflow.com/questions/16958513/showing-pandas-data-frame-as-a-table

data.head(3)[data.columns[0:20]]
##data.tender_registration_number


Out[40]:
id procurring_entity_id tender_type tender_registration_number tender_status tender_announcement_date bid_start_date bid_end_date estimated_value include_vat cpv_code offer_step guarantee_amount guarantee_period created_at updated_at dataset_id url_id num_bids num_bidders
0 1 1 ელექტრონული ტენდერი SPA130012785 ხელშეკრულება დადებულია 2013-06-11 2013-06-28 2013-07-03 22110 False 66500000 90.00 GEL 90.00 GEL 221 GEL 2013-06-26 11:41:52 UTC 2013-08-01 13:24:38 UTC 1 85103 1 1
1 2 2 ელექტრონული ტენდერი SPA130012786 ხელშეკრულება დადებულია 2013-06-11 2013-06-28 2013-07-03 2870 False 39200000 15.00 GEL 15.00 GEL 29 GEL 2013-06-26 11:41:52 UTC 2013-08-06 23:10:30 UTC 1 85104 1 1
2 3 3 გამარტივებული ელექტრონული ტენდერი SPA130012787 ხელშეკრულება დადებულია 2013-06-11 2013-06-13 2013-06-17 4935 False 22600000 50.00 GEL 50.00 GEL 49 GEL 2013-06-26 11:41:52 UTC 2013-07-19 23:03:30 UTC 1 85106 1 1

In [101]:
col_name_list = list(data.columns.values)
len(col_name_list)


Out[101]:
175

In [42]:
print col_name_list


['\xef\xbb\xbfid', 'procurring_entity_id', 'tender_type', 'tender_registration_number', 'tender_status', 'tender_announcement_date', 'bid_start_date', 'bid_end_date', 'estimated_value', 'include_vat', 'cpv_code', 'offer_step', 'guarantee_amount', 'guarantee_period', 'created_at', 'updated_at', 'dataset_id', 'url_id', 'num_bids', 'num_bidders', 'contract_value', 'winning_org_id', 'risk_indicators', 'procurer_name', 'supplier_name', 'sub_codes', 'inProgress', 'updated', 'is_new', 'procurer_code', 'winner_code', 'bidder_0_name', 'bidder_0_id', 'bidder_0_lowest_bid', 'bidder_0_black_or_white', 'bidder_1_name', 'bidder_1_id', 'bidder_1_lowest_bid', 'bidder_1_black_or_white', 'bidder_2_name', 'bidder_2_id', 'bidder_2_lowest_bid', 'bidder_2_black_or_white', 'bidder_3_name', 'bidder_3_id', 'bidder_3_lowest_bid', 'bidder_3_black_or_white', 'bidder_4_name', 'bidder_4_id', 'bidder_4_lowest_bid', 'bidder_4_black_or_white', 'bidder_5_name', 'bidder_5_id', 'bidder_5_lowest_bid', 'bidder_5_black_or_white', 'bidder_6_name', 'bidder_6_id', 'bidder_6_lowest_bid', 'bidder_6_black_or_white', 'bidder_7_name', 'bidder_7_id', 'bidder_7_lowest_bid', 'bidder_7_black_or_white', 'bidder_8_name', 'bidder_8_id', 'bidder_8_lowest_bid', 'bidder_8_black_or_white', 'bidder_9_name', 'bidder_9_id', 'bidder_9_lowest_bid', 'bidder_9_black_or_white', 'bidder_10_name', 'bidder_10_id', 'bidder_10_lowest_bid', 'bidder_10_black_or_white', 'bidder_11_name', 'bidder_11_id', 'bidder_11_lowest_bid', 'bidder_11_black_or_white', 'bidder_12_name', 'bidder_12_id', 'bidder_12_lowest_bid', 'bidder_12_black_or_white', 'bidder_13_name', 'bidder_13_id', 'bidder_13_lowest_bid', 'bidder_13_black_or_white', 'bidder_14_name', 'bidder_14_id', 'bidder_14_lowest_bid', 'bidder_14_black_or_white', 'bidder_15_name', 'bidder_15_id', 'bidder_15_lowest_bid', 'bidder_15_black_or_white', 'bidder_16_name', 'bidder_16_id', 'bidder_16_lowest_bid', 'bidder_16_black_or_white', 'bidder_17_name', 'bidder_17_id', 'bidder_17_lowest_bid', 'bidder_17_black_or_white', 'bidder_18_name', 'bidder_18_id', 'bidder_18_lowest_bid', 'bidder_18_black_or_white', 'bidder_19_name', 'bidder_19_id', 'bidder_19_lowest_bid', 'bidder_19_black_or_white', 'bidder_20_name', 'bidder_20_id', 'bidder_20_lowest_bid', 'bidder_20_black_or_white', 'bidder_21_name', 'bidder_21_id', 'bidder_21_lowest_bid', 'bidder_21_black_or_white', 'bidder_22_name', 'bidder_22_id', 'bidder_22_lowest_bid', 'bidder_22_black_or_white', 'bidder_23_name', 'bidder_23_id', 'bidder_23_lowest_bid', 'bidder_23_black_or_white', 'bidder_24_name', 'bidder_24_id', 'bidder_24_lowest_bid', 'bidder_24_black_or_white', 'bidder_25_name', 'bidder_25_id', 'bidder_25_lowest_bid', 'bidder_25_black_or_white', 'bidder_26_name', 'bidder_26_id', 'bidder_26_lowest_bid', 'bidder_26_black_or_white', 'bidder_27_name', 'bidder_27_id', 'bidder_27_lowest_bid', 'bidder_27_black_or_white', 'bidder_28_name', 'bidder_28_id', 'bidder_28_lowest_bid', 'bidder_28_black_or_white', 'bidder_29_name', 'bidder_29_id', 'bidder_29_lowest_bid', 'bidder_29_black_or_white', 'bidder_30_name', 'bidder_30_id', 'bidder_30_lowest_bid', 'bidder_30_black_or_white', 'bidder_31_name', 'bidder_31_id', 'bidder_31_lowest_bid', 'bidder_31_black_or_white', 'bidder_32_name', 'bidder_32_id', 'bidder_32_lowest_bid', 'bidder_32_black_or_white', 'bidder_33_name', 'bidder_33_id', 'bidder_33_lowest_bid', 'bidder_33_black_or_white', 'bidder_34_name', 'bidder_34_id', 'bidder_34_lowest_bid', 'bidder_34_black_or_white', 'bidder_35_name', 'bidder_35_id', 'bidder_35_lowest_bid', 'bidder_35_black_or_white']

In [44]:
col_name_frequency_dict = {}

for item in col_name_list:
    vc_f_item = data[item].value_counts()  ##value counts for item
    ##print vc_f_item
    
    length_entities_item = len(vc_f_item)  ## the length of the value counts or the number of entities for an item
    ##print length_entities_item
    
    col_name_frequency_dict[item] = length_entities_item

##print col_name_frequency_dict

In [73]:
##  Now I want to import that colum dict into a data

series_Column_Entities_Frequency_ = pd.Series(col_name_frequency_dict)

In [107]:
values = int(raw_input("How many columns have less then this number of entities:"))
##  This is a boolean mask looking for values less then the number
##print (series_Column_Entities_Frequency_ < values).sum()
col_query = (series_Column_Entities_Frequency_ < values).sum()
print col_query


How many columns have less then this number of entities:10
84

In [ ]: