In [28]:
import numpy as np
import sklearn
import csv
from sklearn.feature_extraction import DictVectorizer
In [4]:
# CSV file names for the water pump data set: training values, training labels
# and test values. They are relative paths, so the files are looked up in the
# current working directory.
train_file = "WaterPump-training-values.csv"
train_labels = "WaterPump-training-labels.csv"
test_file = "WaterPump-test-values.csv"
def getData(lines=None, step=1, fileName="WaterPump-training-values.csv"):
    """Read some or all rows of a CSV file into a list of dicts.

    Every value is returned as a string, exactly as csv.DictReader yields it.

    lines: how many rows to return; None means read to the end of the file.
    step: stride between returned rows, so step=2 keeps every 2nd row
        (rows 0, 2, 4, ...).
    fileName: path of the CSV file to read; its first line is the header.

    Returns a list with one dict per selected row, keyed by column name.
    """
    import sys  # local import: only needed to pick the open() mode below

    # 'rU' is the universal-newline mode on Python 2, but it was deprecated in
    # Python 3 and removed in 3.11; on Python 3 the csv module wants newline=''.
    if sys.version_info[0] >= 3:
        handle = open(fileName, 'r', newline='')
    else:
        handle = open(fileName, 'rU')

    # Rows at index >= limit are never needed; None means "no limit".
    limit = None if lines is None else lines * step

    result = []
    # The with-block guarantees the file is closed, even if reading raises.
    with handle:
        for i, row in enumerate(csv.DictReader(handle)):
            if limit is not None and i >= limit:
                break
            if i % step == 0:
                result.append(row)
    return result
In [20]:
# Load the first 5 rows as a quick sanity check and show two of them.
sampleData = getData(5)
# print(...) with a single argument prints the same text on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(sampleData[:2])
In [18]:
def quantData(data):
    """Cast the continuous columns of every row from str to int/float.

    getData() reads every value as a string, so the numeric columns are
    converted here. Each row dict is modified in place and the same list is
    returned.

    TODO: how to deal with dates? ('date_recorded')
    """
    # (column, caster) pairs, applied to each row in this order.
    casts = (
        ('longitude', float),
        ('latitude', float),
        ('gps_height', int),
        ('region_code', int),
        ('district_code', int),
        ('amount_tsh', float),
        ('population', float),
        ('construction_year', float),
        ('num_private', int),  # what the heck is this??
    )
    for record in data:
        for column, cast in casts:
            record[column] = cast(record[column])
    return data
def vectorizeData(data):
    """Placeholder for vectorizing the data; not implemented yet.

    data should be a dictionary. For now the function does nothing and
    returns None.

    Reference: http://nbviewer.ipython.org/gist/sarguido/7423289
    """
    return None
In [21]:
# Convert the numeric columns of the sample rows (quantData also modifies the
# rows in place) and show two of them.
sampleData = quantData(sampleData)
# print(...) with a single argument prints the same text on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(sampleData[:2])
I think the pandas library might be a better fit for exploring the data, and it has some powerful features for data manipulation - here's a small example of it. However, feel free to use NumPy or whatever you are most comfortable with - I guess I'm just lazy :)
In [32]:
from datetime import datetime, date, time
import sys
import pandas as pd
from pandas import Series, DataFrame, Panel
# Read the training values into a DataFrame indexed by 'id'.
# parse_dates=True only asks pandas to try parsing the index (here the ids) as
# dates, so the date column is named explicitly to get date_recorded parsed.
data = pd.read_csv(train_file, parse_dates=['date_recorded'], index_col='id')
data.head()
Out[32]:
In [17]:
# Show the column labels of the DataFrame read from the training CSV.
data.columns
Out[17]:
In [33]:
# Set installer and funder to lower case if we want to do NLP.
# Series.str.lower() returns a new Series instead of changing the column, so
# the result has to be assigned back for the DataFrame to actually change.
data['funder'] = data.funder.str.lower()
data['installer'] = data.installer.str.lower()
#pd.isnull(data.date_recorded) #look for missing data
data[['funder', 'installer']].head()
Out[33]:
In [ ]: