notebook.community

Edit and run



In [29]:

    
##  Trying my column categorization code out on the Canada data set



In [30]:

    
import numpy as np
import pandas as pd

# Input file for this run. An alternative input, 'Georgia_AllTenders_D1strow_.csv',
# was left commented out in the original cell and is noted here for reference.
csv_path = 'canada_example.csv'
data = pd.read_csv(csv_path)



In [31]:

    
# Preview: first five rows, restricted to the first ten columns, selected by
# position in one step.
# NOTE(review): the original author reported that `print data[:10]` "does not
# work" (possibly header-related) -- not verified here.
data.iloc[:5, :10]









    Out[31]:






  
    
      
      language
      title
      reference_number
      solicitation_number
      amendment_number
      publication_date
      date_closing
      amendment_date
      publishing_status
      gsin
    
  
  
    
      0
       English
       SPARES FOR VICTORIA CLASS SUBMARINE (GRK(W8482...
       PW-14-00632891
       GRK(W8482-146269/000/A)
       0
       2014-04-29
        2014-06-09 14:00 Eastern Standard Time (EST)
       NaN
       Active
       N3120: Bearings, Plain, Unmounted, N4720: Hose...
    
    
      1
        French
       PIÈCES DE RECHANGE POUR SOUS-MARIN DE CLASSE V...
       PW-14-00632891
       GRK(W8482-146269/000/A)
       0
       2014-04-29
        2014-06-09 14:00 Eastern Standard Time (EST)
       NaN
       Active
       N3120: Bearings, Plain, Unmounted, N4720: Hose...
    
    
      2
       English
       Paving Parking Lot – St Anthony, 65 West Stree...
       PW-14-00632902
                  M1010-5-0187
       0
       2014-04-29
       2014-05-20 14:00 Atlantic Daylight Time (ADT)
       NaN
       Active
                               51: Construction Services
    
    
      3
        French
       Paver le stationnement du détachement de la GR...
       PW-14-00632902
                  M1010-5-0187
       0
       2014-04-29
       2014-05-20 14:00 Atlantic Daylight Time (ADT)
       NaN
       Active
                               51: Construction Services
    
    
      4
       English
       Plumbing Repair and Maintenance Services on Gr...
       PW-14-00632910
                     201401212
       0
       2014-04-29
        2014-05-20 14:00 Eastern Daylight Time (EDT)
       NaN
       Active
                    5161AB: PLUMBING CONTRACTOR SERVICES



In [32]:

    
# Build a per-column cardinality table: for every column of `data`, count how
# many distinct non-null values it holds.
col_name_list = list(data.columns.values)

# Series.nunique() gives the same number as len(value_counts()) -- both skip
# NaN by default -- without building the full frequency table first.
col_name_frequency_dict = {
    column: data[column].nunique() for column in col_name_list
}

# Same mapping as a Series (index = column name, value = distinct count) so it
# can be filtered and sorted with pandas operations.
series_Column_Entities_Frequency_ = pd.Series(col_name_frequency_dict)



In [33]:

    
# Ask for a cardinality threshold, then report how many columns have fewer
# distinct values than that threshold.
try:
    prompt_for_line = raw_input  # Python 2 built-in
except NameError:
    prompt_for_line = input  # Python 3: input() already returns the raw string

# int() raises ValueError if the reply is not a whole number.
values = int(prompt_for_line("How many columns have less then this number of entities:"))

# The comparison yields a boolean mask over the columns; summing it counts the
# True entries.
col_query = (series_Column_Entities_Frequency_ < values).sum()
print(col_query)









    



How many columns have less then this number of entities:6
10



In [34]:

    
# Report the column count and names. A single %-formatted string is used so the
# line prints the same text under both the Python 2 print statement and the
# Python 3 print() function.
print("There are %d columns listed and their names are: %s" % (len(col_name_list), col_name_list))









    



There are 23 columns listed and their names are: ['language', 'title', 'reference_number', 'solicitation_number', 'amendment_number', 'publication_date', 'date_closing', 'amendment_date', 'publishing_status', 'gsin', 'region_opportunity', 'region_delivery', 'notice_type', 'trade_agreement', 'tendering_procedure', 'competitive_procurement_strategy', 'non_competitive_procurement_strategy', 'procurement_entity', 'end_user_entity', 'description', 'contact', 'document', 'attachment']



In [36]:

    
# Order the columns from fewest to most distinct values.
# The in-place Series.sort() was deprecated and later removed from pandas;
# sort_values() returns a new ascending-sorted Series, which is rebound to the
# same name so later cells still see the sorted data.
series_Column_Entities_Frequency_ = series_Column_Entities_Frequency_.sort_values()
series_Column_Entities_Frequency_









    Out[36]:





amendment_date                           1
publishing_status                        1
publication_date                         1
non_competitive_procurement_strategy     1
amendment_number                         1
competitive_procurement_strategy         2
language                                 2
attachment                               3
tendering_procedure                      5
region_opportunity                       5
notice_type                              6
trade_agreement                          6
procurement_entity                       8
end_user_entity                         13
document                                13
region_delivery                         13
date_closing                            14
gsin                                    19
reference_number                        19
solicitation_number                     19
contact                                 19
description                             38
title                                   38
dtype: int64



In [35]:



In [35]:

	language	title	reference_number	solicitation_number	publication_date	date_closing	amendment_date	publishing_status	gsin
0	English	SPARES FOR VICTORIA CLASS SUBMARINE (GRK(W8482...	PW-14-00632891	GRK(W8482-146269/000/A)	2014-04-29	2014-06-09 14:00 Eastern Standard Time (EST)	NaN	Active	N3120: Bearings, Plain, Unmounted, N4720: Hose...
1	French	PIÈCES DE RECHANGE POUR SOUS-MARIN DE CLASSE V...	PW-14-00632891	GRK(W8482-146269/000/A)	2014-04-29	2014-06-09 14:00 Eastern Standard Time (EST)	NaN	Active	N3120: Bearings, Plain, Unmounted, N4720: Hose...
2	English	Paving Parking Lot – St Anthony, 65 West Stree...	PW-14-00632902	M1010-5-0187	2014-04-29	2014-05-20 14:00 Atlantic Daylight Time (ADT)	NaN	Active	51: Construction Services
3	French	Paver le stationnement du détachement de la GR...	PW-14-00632902	M1010-5-0187	2014-04-29	2014-05-20 14:00 Atlantic Daylight Time (ADT)	NaN	Active	51: Construction Services
4	English	Plumbing Repair and Maintenance Services on Gr...	PW-14-00632910	201401212	2014-04-29	2014-05-20 14:00 Eastern Daylight Time (EDT)	NaN	Active	5161AB: PLUMBING CONTRACTOR SERVICES