In [1]:
import pandas as pd
import numpy as np
df=pd.read_csv('../data/projects.csv')

df.secondary_focus_area = df.secondary_focus_area.fillna(df.primary_focus_area)
df.secondary_focus_subject = df.secondary_focus_subject.fillna(df.primary_focus_subject)
totalCount = df.shape[0]

for i in range(1,df.shape[1]):
    nullcount = df[df[df.columns[i]].isnull()].shape[0]
    percentage=float(nullcount)/float(totalCount) *100
    if(percentage>0):
        print df.columns[i],percentage,'%'


school_ncesid 6.43519480559 %
school_zip 0.000602320741818 %
school_metro 12.3337218302 %
school_district 0.142750015811 %
school_county 0.00255986315273 %
teacher_prefix 0.000602320741818 %
primary_focus_subject 0.00587262723273 %
primary_focus_area 0.00587262723273 %
secondary_focus_subject 0.00587262723273 %
secondary_focus_area 0.00587262723273 %
resource_type 0.00677610834546 %
grade_level 0.00135522166909 %
fulfillment_labor_materials 5.28265406612 %
students_reached 0.0219847070764 %

In [4]:
df.describe()


Out[4]:
school_ncesid school_latitude school_longitude school_zip fulfillment_labor_materials total_price_excluding_optional_support total_price_including_optional_support students_reached
count 6.213620e+05 664098.000000 664098.000000 664094.000000 629016.000000 664098.000000 664098.000000 663952.000000
mean 2.608577e+11 37.240948 -93.344760 52926.452644 27.495822 542.041520 644.712845 97.233547
std 1.589754e+11 4.668155 17.761908 31193.738658 8.663046 12677.383883 15458.631886 2283.327661
min 1.000050e+10 18.249140 -176.634989 410.000000 9.000000 0.000000 0.000000 0.000000
25% 1.000230e+11 34.038082 -112.003213 27589.000000 17.000000 267.200000 317.500000 22.000000
50% 2.615730e+11 37.662476 -87.681219 50702.000000 30.000000 411.000000 487.840000 30.000000
75% 3.704720e+11 40.758827 -79.193091 85043.000000 35.000000 586.490000 696.387500 100.000000
max 6.100009e+11 67.258157 -66.628036 99950.000000 35.000000 10250017.000000 12500020.730000 999999.000000

In [72]:
plt.hist(df.total_price_excluding_optional_support[df.total_price_excluding_optional_support< 1000],bins = 100)


Out[72]:
(array([   319.,     39.,     19.,     32.,     64.,     90.,    162.,
          194.,    261.,    398.,   1754.,   2444.,   3920.,   7678.,
        12237.,  13826.,  12657.,  11499.,  11548.,  11473.,  10627.,
        11247.,  10778.,  12142.,  12074.,  11129.,  10308.,  10194.,
        10811.,  10780.,  10679.,  11480.,  11701.,  14878.,  10778.,
        11660.,  11391.,  12190.,  11704.,  12120.,  11308.,  12769.,
        13244.,  17142.,  15300.,  13335.,  14845.,  14819.,  11069.,
         7219.,   7570.,   6559.,   5160.,   5335.,   6376.,   4739.,
         4243.,   4592.,   4917.,   4656.,   4224.,   3402.,   3994.,
         4216.,   3822.,   4830.,   3428.,   4097.,   3427.,   3781.,
         3569.,   3658.,   2859.,   3153.,   3125.,   3341.,   3063.,
         3229.,   3405.,   3356.,   3707.,   3562.,   3659.,   4453.,
         4373.,   3951.,   2451.,   2574.,   2535.,   2707.,   2805.,
         3519.,   2432.,   1693.,   1352.,   1195.,    908.,    956.,
          847.,    854.]),
 array([   0.    ,    9.9999,   19.9998,   29.9997,   39.9996,   49.9995,
         59.9994,   69.9993,   79.9992,   89.9991,   99.999 ,  109.9989,
        119.9988,  129.9987,  139.9986,  149.9985,  159.9984,  169.9983,
        179.9982,  189.9981,  199.998 ,  209.9979,  219.9978,  229.9977,
        239.9976,  249.9975,  259.9974,  269.9973,  279.9972,  289.9971,
        299.997 ,  309.9969,  319.9968,  329.9967,  339.9966,  349.9965,
        359.9964,  369.9963,  379.9962,  389.9961,  399.996 ,  409.9959,
        419.9958,  429.9957,  439.9956,  449.9955,  459.9954,  469.9953,
        479.9952,  489.9951,  499.995 ,  509.9949,  519.9948,  529.9947,
        539.9946,  549.9945,  559.9944,  569.9943,  579.9942,  589.9941,
        599.994 ,  609.9939,  619.9938,  629.9937,  639.9936,  649.9935,
        659.9934,  669.9933,  679.9932,  689.9931,  699.993 ,  709.9929,
        719.9928,  729.9927,  739.9926,  749.9925,  759.9924,  769.9923,
        779.9922,  789.9921,  799.992 ,  809.9919,  819.9918,  829.9917,
        839.9916,  849.9915,  859.9914,  869.9913,  879.9912,  889.9911,
        899.991 ,  909.9909,  919.9908,  929.9907,  939.9906,  949.9905,
        959.9904,  969.9903,  979.9902,  989.9901,  999.99  ]),
 <a list of 100 Patch objects>)

In [73]:
plt.show()

In [15]:
int(df.students_reached[3])/5


Out[15]:
2

resources


In [1]:
import pandas as pd
import numpy as np
df=pd.read_csv('../data/resources.csv')

totalCount = df.shape[0]

for i in range(1,df.shape[1]):
    nullcount = df[df[df.columns[i]].isnull()].shape[0]
    percentage=float(nullcount)/float(totalCount) *100
    if(percentage>0):
        print df.columns[i],percentage,'%'


vendorid 2.95234778853 %
vendor_name 2.95308404166 %
project_resource_type 0.00534465236172 %
item_name 0.373852978976 %
item_number 0.698158849067 %
item_unit_price 0.378870407723 %
item_quantity 0.226602352683 %

In [4]:
df.describe()


Out[4]:
vendorid item_unit_price item_quantity
count 3558948.000000 3.653323e+06 3658907.000000
mean 423.671676 1.798358e+02 3.305985
std 354.632748 2.346667e+05 519.314756
min 1.000000 -9.960000e+01 0.000000
25% 27.000000 6.290000e+00 1.000000
50% 752.000000 1.398000e+01 1.000000
75% 767.000000 3.600000e+01 2.000000
max 1020.000000 4.484216e+08 993108.000000

In [ ]: