Objective for training dataframe:
Objective for test dataframe:
In [50]:
import pandas as pd
import numpy as np
In [2]:
df = pd.read_csv('preparation.csv',delimiter=',')
In [3]:
df.shape
Out[3]:
In [4]:
!ls
In [5]:
59400*0.8
Out[5]:
In [6]:
dftrain = pd.read_csv('preparation.csv',delimiter=',',nrows=47520)
In [131]:
dfpayment = pd.read_csv('trainingset.csv',delimiter=',',nrows=47520)
In [132]:
dftrain.tail()
Out[132]:
In [133]:
dftrain['terrain'] = 'dataran tinggi'
In [139]:
dftrain['terrain'][dftrain['gps_height']<=500] = 'dataran rendah'
In [140]:
dftrain['terrain'][(dftrain['gps_height']>500) & (dftrain['gps_height']<=1000)] = 'dataran sedang'
In [141]:
dftrain['terrain'].replace('dataran rendah',0,inplace=True)
dftrain['terrain'].replace('dataran sedang',1,inplace=True)
dftrain['terrain'].replace('dataran tinggi',2,inplace=True)
In [144]:
dftrain.head()
Out[144]:
In [145]:
print dftrain.columns
In [146]:
dftrain['water_availability'] = dftrain.apply(lambda row: 0 if row['amount_tsh']==0.0 else 1,axis=1)
In [147]:
dftrain['payment'] = dfpayment['payment']
In [148]:
dftrain['payment'].unique()
Out[148]:
In [149]:
dftrain['payment'].replace('pay annually',1,inplace=True)
dftrain['payment'].replace('never pay',2,inplace=True)
dftrain['payment'].replace('pay per bucket',3,inplace=True)
dftrain['payment'].replace('unknown',0,inplace=True)
dftrain['payment'].replace('pay when scheme fails',4,inplace=True)
dftrain['payment'].replace('other',5,inplace=True)
dftrain['payment'].replace('pay monthly',6,inplace=True)
In [150]:
dftrain2 = dftrain[['id','water_availability','terrain','num_private','region_code','district_code','population','water_quality','quality_group','quantity_group','source_type','source_class','waterpoint_type','wp_age','daysdiff','payment','status_group']].copy()
In [151]:
dftrain2.tail()
Out[151]:
In [152]:
dftrain2.to_csv('dftrain.csv',sep=',',header=True,index=False)
The validation dataset must have same table structure with the training dataset so that we can calculate the accuracy.
In [153]:
dfvalidation = pd.read_csv('preparation.csv',delimiter=',',skiprows=range(1,47520),nrows=11880)
In [154]:
dfv_payment = pd.read_csv('trainingset.csv',delimiter=',',skiprows=range(1,47520),nrows=11880)
In [156]:
print dfvalidation.shape
dfvalidation.tail()
Out[156]:
In [157]:
dfvalidation['terrain'] = 'dataran tinggi'
In [158]:
dfvalidation['terrain'][dftrain['gps_height']<=500] = 'dataran rendah'
dfvalidation['terrain'][(dfvalidation['gps_height']>500) & (dfvalidation['gps_height']<=1000)] = 'dataran sedang'
In [159]:
dfvalidation['terrain'].replace('dataran rendah',0,inplace=True)
dfvalidation['terrain'].replace('dataran sedang',1,inplace=True)
dfvalidation['terrain'].replace('dataran tinggi',2,inplace=True)
In [160]:
dfvalidation.head()
Out[160]:
In [161]:
dfvalidation['water_availability'] = dfvalidation.apply(lambda row: 0 if row['amount_tsh']==0.0 else 1,axis=1)
In [162]:
dfvalidation['payment'] = dfv_payment['payment']
In [163]:
dfvalidation['payment'].replace('pay annually',1,inplace=True)
dfvalidation['payment'].replace('never pay',2,inplace=True)
dfvalidation['payment'].replace('pay per bucket',3,inplace=True)
dfvalidation['payment'].replace('unknown',0,inplace=True)
dfvalidation['payment'].replace('pay when scheme fails',4,inplace=True)
dfvalidation['payment'].replace('other',5,inplace=True)
dfvalidation['payment'].replace('pay monthly',6,inplace=True)
In [177]:
dfvalidation2 = dfvalidation[['id','water_availability','terrain','num_private','region_code','district_code','population','water_quality','quality_group','quantity_group','source_type','source_class','waterpoint_type','wp_age','daysdiff','payment','status_group']].copy()
In [178]:
dfvalidation2.head()
Out[178]:
In [179]:
dfvalidation2.to_csv('validation.csv',sep=',',header=True,index=False)
In [167]:
!ls -l
In [168]:
print "done"
In [169]:
dftest = pd.read_csv('testset.csv',delimiter=',')
In [170]:
dftest.head()
Out[170]:
In [171]:
dftest.isnull().sum().sum()
Out[171]:
In [172]:
print list(dftest.columns)
In [96]:
dftest['source_class'].isnull().sum().sum()
Out[96]:
In [80]:
dftest['public_meeting'].fillna('unknown',inplace=True)
In [83]:
dftest['payment'].unique()
Out[83]:
In [84]:
dftest['wp_age'] = dftest.apply(lambda row: (2017-row['construction_year']),axis=1)
In [85]:
dftest['water_quality'].replace('soft',1,inplace=True)
dftest['water_quality'].replace('salty',2,inplace=True)
dftest['water_quality'].replace('milky',3,inplace=True)
dftest['water_quality'].replace('fluoride',4,inplace=True)
dftest['water_quality'].replace('coloured',5,inplace=True)
dftest['water_quality'].replace('salty abandoned',6,inplace=True)
dftest['water_quality'].replace('fluoride abandoned',7,inplace=True)
dftest['water_quality'].replace('unknown',0,inplace=True)
In [89]:
dftest['quantity_group'].replace('enough',1,inplace=True)
dftest['quantity_group'].replace('insufficient',2,inplace=True)
dftest['quantity_group'].replace('dry',3,inplace=True)
dftest['quantity_group'].replace('seasonal',4,inplace=True)
dftest['quantity_group'].replace('unknown',0,inplace=True)
In [91]:
dftest['waterpoint_type'].replace('communal standpipe',1,inplace=True)
dftest['waterpoint_type'].replace('communal standpipe multiple',2,inplace=True)
dftest['waterpoint_type'].replace('hand pump',3,inplace=True)
dftest['waterpoint_type'].replace('other',0,inplace=True)
dftest['waterpoint_type'].replace('improved spring',4,inplace=True)
dftest['waterpoint_type'].replace('cattle trough',5,inplace=True)
dftest['waterpoint_type'].replace('dam',6,inplace=True)
In [93]:
dftest['source_type'].replace('spring',1,inplace=True)
dftest['source_type'].replace('rainwater harvesting',2,inplace=True)
dftest['source_type'].replace('dam',3,inplace=True)
dftest['source_type'].replace('borehole',4,inplace=True)
dftest['source_type'].replace('other',0,inplace=True)
dftest['source_type'].replace('shallow well',5,inplace=True)
dftest['source_type'].replace('river/lake',6,inplace=True)
In [95]:
dftest['quality_group'].replace('good',1,inplace=True)
dftest['quality_group'].replace('salty',2,inplace=True)
dftest['quality_group'].replace('milky',3,inplace=True)
dftest['quality_group'].replace('unknown',0,inplace=True)
dftest['quality_group'].replace('fluoride',4,inplace=True)
dftest['quality_group'].replace('colored',5,inplace=True)
In [97]:
dftest['source_class'].replace('groundwater',1,inplace=True)
dftest['source_class'].replace('surface',2,inplace=True)
dftest['source_class'].replace('unknown',0,inplace=True)
In [98]:
import datetime as dt
In [99]:
dftest['today'] = dftest.apply(lambda row: dt.datetime.today().strftime('%Y-%m-%d'),axis=1)
In [100]:
dftest['date_recorded'] = [time.date() for time in dftest['date_recorded'].astype('datetime64[ns]')]
dftest['today'] = [time.date() for time in dftest['today'].astype('datetime64[ns]')]
In [101]:
dftest['lastcheckdays'] = abs(dftest['date_recorded'].sub(dftest['today'], axis=0))
In [102]:
dftest['daysdiff'] = dftest['lastcheckdays'] / np.timedelta64(1, 'D')
In [114]:
dftest['terrain'] = 'dataran tinggi'
In [115]:
dftest['terrain'][dftest['gps_height']<=500] = 'dataran rendah'
dftest['terrain'][(dftest['gps_height']>500) & (dftest['gps_height']<=1000)] = 'dataran sedang'
In [116]:
dftest['terrain'].replace('dataran rendah',0,inplace=True)
dftest['terrain'].replace('dataran sedang',1,inplace=True)
dftest['terrain'].replace('dataran tinggi',2,inplace=True)
In [117]:
dftest['water_availability'] = dftest.apply(lambda row: 0 if row['amount_tsh']==0.0 else 1,axis=1)
In [118]:
dftest2 = dftest._get_numeric_data()
In [119]:
dftest2.head()
Out[119]:
In [120]:
dftest3 = dftest2[['id','water_availability','terrain','num_private','region_code','district_code','population','water_quality','quality_group','quantity_group','source_type','source_class','waterpoint_type','wp_age','daysdiff']].copy()
In [121]:
dftest3.head()
Out[121]:
In [122]:
dftest3.to_csv('testsetfix.csv',sep=',',header=True,index=False)
In [173]:
dftest3 = pd.read_csv('testsetfix.csv',delimiter=',')
In [174]:
dftest3['payment'] = dftest['payment']
In [175]:
dftest3['payment'].unique()
Out[175]:
In [176]:
dftest3['payment'].replace('pay annually',1,inplace=True)
dftest3['payment'].replace('never pay',2,inplace=True)
dftest3['payment'].replace('pay per bucket',3,inplace=True)
dftest3['payment'].replace('unknown',0,inplace=True)
dftest3['payment'].replace('pay when scheme fails',4,inplace=True)
dftest3['payment'].replace('other',5,inplace=True)
dftest3['payment'].replace('pay monthly',6,inplace=True)
In [180]:
dftest3 = dftest3[['id','water_availability','terrain','num_private','region_code','district_code','population','water_quality','quality_group','quantity_group','source_type','source_class','waterpoint_type','wp_age','daysdiff','payment']].copy()
In [181]:
dftest3.head()
Out[181]:
In [182]:
dftest3.to_csv('testsetfix.csv',sep=',',header=True,index=False)
In [ ]: