In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import os
import graphlab as gl
In [2]:
# Load the training set from data/train_2013.csv into an SFrame.
# (The na_values list is defined in a later cell, right before split_row uses it,
# so the duplicate definition that used to sit here was dropped.)
train = gl.SFrame.read_csv(os.path.join("data", "train_2013.csv"))
In [4]:
# Preview the first rows of the raw training SFrame.
train.head()
Out[4]:
In [5]:
# Tokens that split_row skips when scanning a row's readings.
na_values = [
    '-99900.0',
    '-99901.0',
    '-99903.0',
    '999.0',
    'nan',
]
In [6]:
def split_row(x):
    """Return the last valid reading from a whitespace-separated string of numbers.

    Tokens listed in the module-level ``na_values`` are skipped.  A token is
    also skipped when its canonical float spelling (``str(float(token))``) is
    listed, so differently formatted sentinels such as ``'999'``, ``'-99900'``
    or ``'NaN'`` are dropped just like ``'999.0'``, ``'-99900.0'`` and ``'nan'``.

    Parameters
    ----------
    x : str
        Readings separated by whitespace; leading/trailing whitespace is ignored.

    Returns
    -------
    float
        The last token that survives the filter, converted with ``float``;
        ``np.nan`` when no token survives (including an empty string).

    Raises
    ------
    ValueError
        If a token that is not listed in ``na_values`` cannot be parsed as a float.
    """
    last_valid = np.nan
    for token in x.strip().split():
        # Exact-match check first so non-numeric markers never reach float().
        if token in na_values:
            continue
        value = float(token)
        # Normalise the spelling so '999' is recognised as the '999.0' sentinel.
        if str(value) in na_values:
            continue
        last_valid = value
    return last_valid
In [7]:
# List the column names of the raw training table.
features = train.column_names()
# Parenthesised form prints the same under Python 2 and is valid on Python 3.
print(features)
In [8]:
# Collapse each listed column to one value per row with split_row.
split_columns = [
    'TimeToEnd', 'DistanceToRadar', 'Composite', 'HybridScan',
    'HydrometeorType', 'Kdp', 'RR1', 'RR2', 'RR3', 'RadarQualityIndex',
    'Reflectivity', 'ReflectivityQC', 'RhoHV', 'Velocity', 'Zdr',
    'LogWaterVolume', 'MassWeightedMean', 'MassWeightedSD',
]
train_splitted = gl.SFrame()
for column in split_columns:
    train_splitted[column] = train[column].apply(split_row)
In [10]:
# Carry the Id column over unchanged from the raw training table.
train_splitted["Id"] = train["Id"]
In [11]:
# Copy the Expected column as-is from the raw training table.
train_splitted["Expected"] = train["Expected"]
In [12]:
# Preview the first rows of the split training table.
train_splitted.head()
Out[12]:
In [15]:
def bin_expected(x):
    """Map a value to an integer bin, sending everything from 69 upward to 70.

    Values below 69 are truncated with ``int``; values of 69 or more all
    return 70.

    NOTE(review): as written, 69 itself is never returned (68.x -> 68,
    69 and above -> 70) -- confirm this gap is intended.
    """
    return 70 if x >= 69 else int(x)
In [25]:
# NOTE(review): this assigns the column to itself, which looks like a no-op
# left over from editing -- confirm and remove if nothing depends on it.
train_splitted["Expected"] = train_splitted["Expected"]
In [30]:
# Replace Expected with its binned value (see bin_expected).
train_splitted["Expected"] = train_splitted["Expected"].apply(bin_expected)
In [31]:
# Save train_splitted to data/train_last.csv via a pandas DataFrame, without the index column.
train_splitted.to_dataframe().to_csv(os.path.join("data", "train_last.csv"), index=False)
In [32]:
# Load the test set from data/test_2014.csv into an SFrame.
test = gl.SFrame.read_csv(os.path.join("data", "test_2014.csv"))
In [33]:
# Collapse each listed column of the test table to one value per row with split_row.
split_columns = [
    'TimeToEnd', 'DistanceToRadar', 'Composite', 'HybridScan',
    'HydrometeorType', 'Kdp', 'RR1', 'RR2', 'RR3', 'RadarQualityIndex',
    'Reflectivity', 'ReflectivityQC', 'RhoHV', 'Velocity', 'Zdr',
    'LogWaterVolume', 'MassWeightedMean', 'MassWeightedSD',
]
test_splitted = gl.SFrame()
for column in split_columns:
    test_splitted[column] = test[column].apply(split_row)
In [34]:
# Carry the Id column over unchanged from the raw test table.
test_splitted["Id"] = test["Id"]
In [36]:
# Save test_splitted to data/test_last.csv via a pandas DataFrame, without the index column.
test_splitted.to_dataframe().to_csv(os.path.join("data", "test_last.csv"), index=False)
In [ ]: