Notebook for Data exploration

In [28]:
import numpy as np
import sklearn
import csv
from sklearn.feature_extraction import DictVectorizer

In [4]:
train_file = "WaterPump-training-values.csv"
train_labels = "WaterPump-training-labels.csv"
test_file = "WaterPump-test-values.csv"

def getData(lines=None, step=1, fileName="WaterPump-training-values.csv"):
    #method to import part or all of data from CSV into list
    #lines: how many lines to read; None means read to end of file
    #step: step size, so step=2 would read every 2nd line
    reader = csv.DictReader(open(fileName, 'rU'))
    result = []
    
    i=0
    #read to end of file, skipping lines if necessary
    if lines==None:
        for row in reader:
            if i%step==0:
                result.append(row)
            i += 1
    
    #read until specified number of lines are stored
    for row in reader:
        if i>=lines*step:
            break
        if i%step==0:
            result.append(row)
        i += 1
    
    return result

In [20]:
sampleData = getData(5)
print sampleData[:2]


[{'extraction_type': 'gravity', 'region_code': '11', 'gps_height': '1390', 'recorded_by': 'GeoData Consultants Ltd', 'longitude': '34.93809275', 'construction_year': '1999', 'installer': 'Roman', 'id': '69572', 'scheme_management': 'VWC', 'scheme_name': 'Roman', 'management': 'vwc', 'quantity_group': 'enough', 'source_class': 'groundwater', 'source': 'spring', 'subvillage': 'Mnyusi B', 'public_meeting': 'True', 'num_private': '0', 'latitude': '-9.85632177', 'waterpoint_type_group': 'communal standpipe', 'basin': 'Lake Nyasa', 'extraction_type_class': 'gravity', 'waterpoint_type': 'communal standpipe', 'wpt_name': 'none', 'management_group': 'user-group', 'lga': 'Ludewa', 'source_type': 'spring', 'district_code': '5', 'ward': 'Mundindi', 'payment': 'pay annually', 'population': '109', 'date_recorded': '2011-03-14', 'extraction_type_group': 'gravity', 'region': 'Iringa', 'amount_tsh': '6000.0', 'quality_group': 'good', 'payment_type': 'annually', 'water_quality': 'soft', 'permit': 'False', 'funder': 'Roman', 'quantity': 'enough'}, {'extraction_type': 'gravity', 'region_code': '20', 'gps_height': '1399', 'recorded_by': 'GeoData Consultants Ltd', 'longitude': '34.6987661', 'construction_year': '2010', 'installer': 'GRUMETI', 'id': '8776', 'scheme_management': 'Other', 'scheme_name': '', 'management': 'wug', 'quantity_group': 'insufficient', 'source_class': 'surface', 'source': 'rainwater harvesting', 'subvillage': 'Nyamara', 'public_meeting': '', 'num_private': '0', 'latitude': '-2.14746569', 'waterpoint_type_group': 'communal standpipe', 'basin': 'Lake Victoria', 'extraction_type_class': 'gravity', 'waterpoint_type': 'communal standpipe', 'wpt_name': 'Zahanati', 'management_group': 'user-group', 'lga': 'Serengeti', 'source_type': 'rainwater harvesting', 'district_code': '2', 'ward': 'Natta', 'payment': 'never pay', 'population': '280', 'date_recorded': '2013-03-06', 'extraction_type_group': 'gravity', 'region': 'Mara', 'amount_tsh': '0.0', 'quality_group': 'good', 'payment_type': 'never pay', 'water_quality': 'soft', 'permit': 'True', 'funder': 'Grumeti', 'quantity': 'insufficient'}]

In [18]:
def quantData(data):
    #getData() reads everything as string, so this changes appropriate continuous variables to ints/floats
    #TODO: how to deal with dates? ('date_recorded')
    for row in data:
        row['longitude'] = float(row['longitude'])
        row['latitude'] = float(row['latitude'])
        row['gps_height'] = int(row['gps_height'])
        row['region_code'] = int(row['region_code'])
        row['district_code'] = int(row['district_code'])
        row['amount_tsh'] = float(row['amount_tsh'])
        row['population'] = float(row['population'])
        row['construction_year'] = float(row['construction_year'])
        row['num_private'] = int(row['num_private'])  #what the heck is this??
    return data

def vectorizeData(data):
    #vectorize data, data should be dictionary
    #http://nbviewer.ipython.org/gist/sarguido/7423289
    pass

In [21]:
sampleData = quantData(sampleData)
print sampleData[:2]


[{'extraction_type': 'gravity', 'region_code': 11, 'gps_height': 1390, 'recorded_by': 'GeoData Consultants Ltd', 'longitude': 34.93809275, 'construction_year': '1999', 'installer': 'Roman', 'id': '69572', 'scheme_management': 'VWC', 'scheme_name': 'Roman', 'management': 'vwc', 'quantity_group': 'enough', 'source_class': 'groundwater', 'source': 'spring', 'subvillage': 'Mnyusi B', 'public_meeting': 'True', 'num_private': 0, 'latitude': -9.85632177, 'waterpoint_type_group': 'communal standpipe', 'basin': 'Lake Nyasa', 'extraction_type_class': 'gravity', 'waterpoint_type': 'communal standpipe', 'wpt_name': 'none', 'management_group': 'user-group', 'lga': 'Ludewa', 'source_type': 'spring', 'district_code': 5, 'ward': 'Mundindi', 'payment': 'pay annually', 'population': 109.0, 'date_recorded': '2011-03-14', 'extraction_type_group': 'gravity', 'region': 'Iringa', 'amount_tsh': 6000.0, 'quality_group': 'good', 'payment_type': 'annually', 'water_quality': 'soft', 'permit': 'False', 'funder': 'Roman', 'quantity': 'enough'}, {'extraction_type': 'gravity', 'region_code': 20, 'gps_height': 1399, 'recorded_by': 'GeoData Consultants Ltd', 'longitude': 34.6987661, 'construction_year': '2010', 'installer': 'GRUMETI', 'id': '8776', 'scheme_management': 'Other', 'scheme_name': '', 'management': 'wug', 'quantity_group': 'insufficient', 'source_class': 'surface', 'source': 'rainwater harvesting', 'subvillage': 'Nyamara', 'public_meeting': '', 'num_private': 0, 'latitude': -2.14746569, 'waterpoint_type_group': 'communal standpipe', 'basin': 'Lake Victoria', 'extraction_type_class': 'gravity', 'waterpoint_type': 'communal standpipe', 'wpt_name': 'Zahanati', 'management_group': 'user-group', 'lga': 'Serengeti', 'source_type': 'rainwater harvesting', 'district_code': 2, 'ward': 'Natta', 'payment': 'never pay', 'population': 280.0, 'date_recorded': '2013-03-06', 'extraction_type_group': 'gravity', 'region': 'Mara', 'amount_tsh': 0.0, 'quality_group': 'good', 'payment_type': 'never pay', 'water_quality': 'soft', 'permit': 'True', 'funder': 'Grumeti', 'quantity': 'insufficient'}]

I think the Pandas library might be a better fit for exploring the data and has some powerful features for manipulation - here's a little example of it. However, feel free to use Numpy or whatever you are most comfortable with - I guess I'm just lazy :)


In [32]:
from datetime import datetime, date, time
import sys

import pandas as pd
from pandas import Series, DataFrame, Panel

data = pd.read_csv(train_file, parse_dates=True,index_col='id') #read into dataframe, parse dates, and set ID as index
data.head()


Out[32]:
amount_tsh date_recorded funder gps_height installer longitude latitude wpt_name num_private basin ... payment_type water_quality quality_group quantity quantity_group source source_type source_class waterpoint_type waterpoint_type_group
id
69572 6000 2011-03-14 Roman 1390 Roman 34.938093 -9.856322 none 0 Lake Nyasa ... annually soft good enough enough spring spring groundwater communal standpipe communal standpipe
8776 0 2013-03-06 Grumeti 1399 GRUMETI 34.698766 -2.147466 Zahanati 0 Lake Victoria ... never pay soft good insufficient insufficient rainwater harvesting rainwater harvesting surface communal standpipe communal standpipe
34310 25 2013-02-25 Lottery Club 686 World vision 37.460664 -3.821329 Kwa Mahundi 0 Pangani ... per bucket soft good enough enough dam dam surface communal standpipe multiple communal standpipe
67743 0 2013-01-28 Unicef 263 UNICEF 38.486161 -11.155298 Zahanati Ya Nanyumbu 0 Ruvuma / Southern Coast ... never pay soft good dry dry machine dbh borehole groundwater communal standpipe multiple communal standpipe
19728 0 2011-07-13 Action In A 0 Artisan 31.130847 -1.825359 Shuleni 0 Lake Victoria ... never pay soft good seasonal seasonal rainwater harvesting rainwater harvesting surface communal standpipe communal standpipe

5 rows × 39 columns


In [17]:
data.columns


Out[17]:
Index([u'id', u'amount_tsh', u'date_recorded', u'funder', u'gps_height', u'installer', u'longitude', u'latitude', u'wpt_name', u'num_private', u'basin', u'subvillage', u'region', u'region_code', u'district_code', u'lga', u'ward', u'population', u'public_meeting', u'recorded_by', u'scheme_management', u'scheme_name', u'permit', u'construction_year', u'extraction_type', u'extraction_type_group', u'extraction_type_class', u'management', u'management_group', u'payment', u'payment_type', u'water_quality', u'quality_group', u'quantity', u'quantity_group', u'source', u'source_type', u'source_class', u'waterpoint_type', u'waterpoint_type_group'], dtype='object')

In [33]:
data.funder.str.lower() #set installer and funder to lower case if we want to do NLP
data.installer.str.lower()
#pd.isnull(data.date_recorded) #look for missing data


Out[33]:
id
69572                   roman
8776                  grumeti
34310            world vision
67743                  unicef
19728                 artisan
9944                      dwe
19816                    dwsp
54551                     dwe
53934               water aid
46144                 artisan
49056                 private
50409                  danida
36957            world vision
50495    lawatefuka water sup
53752                  wedeco
61848                     dwe
48451                     dwe
58155                     dwe
34169                     dwe
18274                   danid
48375                     twe
6091                      dwe
58500                     dwe
37862                     isf
51058                     dwe
22308                     dwe
55012             kilolo star
20145                     dwe
19685        district council
69124    lawatefuka water sup
                 ...         
14796                     NaN
20387                     dwe
29940                     tlc
15233                  distri
49651                     dwe
50998                     dwe
34716                     NaN
43986              government
38067                    acra
58255                      do
30647                   roman
67885                     dwe
47002                     dwe
44616              world bank
72148                 concern
34473                jaica co
34952                     dwe
26640                       0
72559                     dwe
30410                     dwe
13677                     dwe
44885              government
40607              government
48348                 private
11164                ml appro
60739                     ces
27263                    cefa
37057                     NaN
31282                    musa
26348                   world
Name: installer, dtype: object

In [ ]: