notebook.community

Edit and run



In [38]:

    
'''
Takes a csv file, and creates a dataframe, and a list of the column names.

Generates a frequency list for each column.  

For example the number of bidders for a Georgia dataset looks like this:

num_bidders   counters
1             37525
0             34447
2             15830
3             6847


Finally gets the length of that value counts, which should be the number of distinct entities within that column.

Smaller numbers are more frequently repeated.

'''









    Out[38]:





'\nTakes a csv file, and creates a dataframe, and a list of the column names.\n\nGenerates a frequency list for each column.  \n\nFor example the number of bidders for a Georgia dataset looks like this:\n\nnum_bidders   counters\n1             37525\n0             34447\n2             15830\n3             6847\n\n\nFinally gets the length of that value counts, which should be the number of distinct entities within that column.\n\nSmaller numbers are more frequently repeated.\n\n'



In [39]:

    
import numpy as np
import pandas as pd

##  This is the csv file where I deleted the first row
data = pd.read_csv('Georgia_AllTenders_D1strow_.csv')



In [40]:

    
##  This format seems to work, which Sarah told me about and I found on stack overflow
##  http://stackoverflow.com/questions/16958513/showing-pandas-data-frame-as-a-table

data.head(3)[data.columns[0:20]]
##data.tender_registration_number









    Out[40]:






  
    
      
      id
      procurring_entity_id
      tender_type
      tender_registration_number
      tender_status
      tender_announcement_date
      bid_start_date
      bid_end_date
      estimated_value
      include_vat
      cpv_code
      offer_step
      guarantee_amount
      guarantee_period
      created_at
      updated_at
      dataset_id
      url_id
      num_bids
      num_bidders
    
  
  
    
      0
       1
       1
                     ელექტრონული ტენდერი
       SPA130012785
       ხელშეკრულება დადებულია
       2013-06-11
       2013-06-28
       2013-07-03
       22110
       False
       66500000
       90.00 GEL
       90.00 GEL
       221 GEL
       2013-06-26 11:41:52 UTC
       2013-08-01 13:24:38 UTC
       1
       85103
       1
       1
    
    
      1
       2
       2
                     ელექტრონული ტენდერი
       SPA130012786
       ხელშეკრულება დადებულია
       2013-06-11
       2013-06-28
       2013-07-03
        2870
       False
       39200000
       15.00 GEL
       15.00 GEL
        29 GEL
       2013-06-26 11:41:52 UTC
       2013-08-06 23:10:30 UTC
       1
       85104
       1
       1
    
    
      2
       3
       3
       გამარტივებული ელექტრონული ტენდერი
       SPA130012787
       ხელშეკრულება დადებულია
       2013-06-11
       2013-06-13
       2013-06-17
        4935
       False
       22600000
       50.00 GEL
       50.00 GEL
        49 GEL
       2013-06-26 11:41:52 UTC
       2013-07-19 23:03:30 UTC
       1
       85106
       1
       1



In [101]:

    
col_name_list = list(data.columns.values)
len(col_name_list)









    Out[101]:





175



In [42]:

    
print col_name_list









    



['\xef\xbb\xbfid', 'procurring_entity_id', 'tender_type', 'tender_registration_number', 'tender_status', 'tender_announcement_date', 'bid_start_date', 'bid_end_date', 'estimated_value', 'include_vat', 'cpv_code', 'offer_step', 'guarantee_amount', 'guarantee_period', 'created_at', 'updated_at', 'dataset_id', 'url_id', 'num_bids', 'num_bidders', 'contract_value', 'winning_org_id', 'risk_indicators', 'procurer_name', 'supplier_name', 'sub_codes', 'inProgress', 'updated', 'is_new', 'procurer_code', 'winner_code', 'bidder_0_name', 'bidder_0_id', 'bidder_0_lowest_bid', 'bidder_0_black_or_white', 'bidder_1_name', 'bidder_1_id', 'bidder_1_lowest_bid', 'bidder_1_black_or_white', 'bidder_2_name', 'bidder_2_id', 'bidder_2_lowest_bid', 'bidder_2_black_or_white', 'bidder_3_name', 'bidder_3_id', 'bidder_3_lowest_bid', 'bidder_3_black_or_white', 'bidder_4_name', 'bidder_4_id', 'bidder_4_lowest_bid', 'bidder_4_black_or_white', 'bidder_5_name', 'bidder_5_id', 'bidder_5_lowest_bid', 'bidder_5_black_or_white', 'bidder_6_name', 'bidder_6_id', 'bidder_6_lowest_bid', 'bidder_6_black_or_white', 'bidder_7_name', 'bidder_7_id', 'bidder_7_lowest_bid', 'bidder_7_black_or_white', 'bidder_8_name', 'bidder_8_id', 'bidder_8_lowest_bid', 'bidder_8_black_or_white', 'bidder_9_name', 'bidder_9_id', 'bidder_9_lowest_bid', 'bidder_9_black_or_white', 'bidder_10_name', 'bidder_10_id', 'bidder_10_lowest_bid', 'bidder_10_black_or_white', 'bidder_11_name', 'bidder_11_id', 'bidder_11_lowest_bid', 'bidder_11_black_or_white', 'bidder_12_name', 'bidder_12_id', 'bidder_12_lowest_bid', 'bidder_12_black_or_white', 'bidder_13_name', 'bidder_13_id', 'bidder_13_lowest_bid', 'bidder_13_black_or_white', 'bidder_14_name', 'bidder_14_id', 'bidder_14_lowest_bid', 'bidder_14_black_or_white', 'bidder_15_name', 'bidder_15_id', 'bidder_15_lowest_bid', 'bidder_15_black_or_white', 'bidder_16_name', 'bidder_16_id', 'bidder_16_lowest_bid', 'bidder_16_black_or_white', 'bidder_17_name', 'bidder_17_id', 'bidder_17_lowest_bid', 'bidder_17_black_or_white', 'bidder_18_name', 'bidder_18_id', 'bidder_18_lowest_bid', 'bidder_18_black_or_white', 'bidder_19_name', 'bidder_19_id', 'bidder_19_lowest_bid', 'bidder_19_black_or_white', 'bidder_20_name', 'bidder_20_id', 'bidder_20_lowest_bid', 'bidder_20_black_or_white', 'bidder_21_name', 'bidder_21_id', 'bidder_21_lowest_bid', 'bidder_21_black_or_white', 'bidder_22_name', 'bidder_22_id', 'bidder_22_lowest_bid', 'bidder_22_black_or_white', 'bidder_23_name', 'bidder_23_id', 'bidder_23_lowest_bid', 'bidder_23_black_or_white', 'bidder_24_name', 'bidder_24_id', 'bidder_24_lowest_bid', 'bidder_24_black_or_white', 'bidder_25_name', 'bidder_25_id', 'bidder_25_lowest_bid', 'bidder_25_black_or_white', 'bidder_26_name', 'bidder_26_id', 'bidder_26_lowest_bid', 'bidder_26_black_or_white', 'bidder_27_name', 'bidder_27_id', 'bidder_27_lowest_bid', 'bidder_27_black_or_white', 'bidder_28_name', 'bidder_28_id', 'bidder_28_lowest_bid', 'bidder_28_black_or_white', 'bidder_29_name', 'bidder_29_id', 'bidder_29_lowest_bid', 'bidder_29_black_or_white', 'bidder_30_name', 'bidder_30_id', 'bidder_30_lowest_bid', 'bidder_30_black_or_white', 'bidder_31_name', 'bidder_31_id', 'bidder_31_lowest_bid', 'bidder_31_black_or_white', 'bidder_32_name', 'bidder_32_id', 'bidder_32_lowest_bid', 'bidder_32_black_or_white', 'bidder_33_name', 'bidder_33_id', 'bidder_33_lowest_bid', 'bidder_33_black_or_white', 'bidder_34_name', 'bidder_34_id', 'bidder_34_lowest_bid', 'bidder_34_black_or_white', 'bidder_35_name', 'bidder_35_id', 'bidder_35_lowest_bid', 'bidder_35_black_or_white']



In [44]:

    
col_name_frequency_dict = {}

for item in col_name_list:
    vc_f_item = data[item].value_counts()  ##value counts for item
    ##print vc_f_item
    
    length_entities_item = len(vc_f_item)  ## the length of the value counts or the number of entities for an item
    ##print length_entities_item
    
    col_name_frequency_dict[item] = length_entities_item

##print col_name_frequency_dict



In [73]:

    
##  Now I want to import that colum dict into a data

series_Column_Entities_Frequency_ = pd.Series(col_name_frequency_dict)



In [107]:

    
values = int(raw_input("How many columns have less then this number of entities:"))
##  This is a boolean mask looking for values less then the number
##print (series_Column_Entities_Frequency_ < values).sum()
col_query = (series_Column_Entities_Frequency_ < values).sum()
print col_query









    



How many columns have less then this number of entities:10
84



In [ ]:

	id	procurring_entity_id	tender_type	tender_registration_number	tender_status	tender_announcement_date	bid_start_date	bid_end_date	estimated_value	include_vat	cpv_code	offer_step	guarantee_amount	guarantee_period	created_at	updated_at	dataset_id	url_id	num_bids	num_bidders
0	1	1	ელექტრონული ტენდერი	SPA130012785	ხელშეკრულება დადებულია	2013-06-11	2013-06-28	2013-07-03	22110	False	66500000	90.00 GEL	90.00 GEL	221 GEL	2013-06-26 11:41:52 UTC	2013-08-01 13:24:38 UTC	1	85103	1	1
1	2	2	ელექტრონული ტენდერი	SPA130012786	ხელშეკრულება დადებულია	2013-06-11	2013-06-28	2013-07-03	2870	False	39200000	15.00 GEL	15.00 GEL	29 GEL	2013-06-26 11:41:52 UTC	2013-08-06 23:10:30 UTC	1	85104	1	1
2	3	3	გამარტივებული ელექტრონული ტენდერი	SPA130012787	ხელშეკრულება დადებულია	2013-06-11	2013-06-13	2013-06-17	4935	False	22600000	50.00 GEL	50.00 GEL	49 GEL	2013-06-26 11:41:52 UTC	2013-07-19 23:03:30 UTC	1	85106	1	1