In [2]:
from __future__ import division
import pandas as pd
import numpy as np
import os
import graphlab as gl
In [3]:
# String tokens that split_row() replaces with NaN instead of parsing as a number.
# They are compared as raw strings, so the spelling must match the tokens in the CSV exactly.
na_values = ['-99900.0','-99901.0','-99903.0','999.0','nan']
# Load the training CSV into an SFrame; the path is relative to the working directory.
train = gl.SFrame.read_csv(os.path.join("data", "train_2013.csv"))
[INFO] Start server at: ipc:///tmp/graphlab_server-5561 - Server binary: /usr/local/lib/python2.7/dist-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1428708432.log
[INFO] GraphLab Server Version: 1.3.0
PROGRESS: Finished parsing file /media/vladimir/1ab2d5e6-a134-47e7-ba27-b2d70ac5ffc5/workspace/kaggle_rain/data/train_2013.csv
PROGRESS: Parsing completed. Parsed 100 lines in 6.03064 secs.
PROGRESS: Read 55681 lines. Lines per second: 62515.5
PROGRESS: Read 613141 lines. Lines per second: 99579.2
PROGRESS: Finished parsing file /media/vladimir/1ab2d5e6-a134-47e7-ba27-b2d70ac5ffc5/workspace/kaggle_rain/data/train_2013.csv
PROGRESS: Parsing completed. Parsed 1126694 lines in 10.6648 secs.
------------------------------------------------------
Inferred types from first line of file as
column_type_hints=[int,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
In [5]:
# Keep the first rows of the training frame as a small sample for trying out split_row().
a = train.head()
# Bare expression: displays the sample as the cell output.
a
Out[5]:
Id
TimeToEnd
DistanceToRadar
Composite
1
56.0 37.0 31.0 25.0 19.0
13.0 7.0 2.0 ...
30.0 30.0 30.0 30.0 30.0
30.0 30.0 30.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
2
58.0 48.0 38.0 29.0 19.0
9.0 ...
77.0 77.0 77.0 77.0 77.0
77.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
3
59.0 20.0
75.0 75.0
-99900.0 -99900.0
4
53.0 43.0 34.0 24.0 14.0
5.0 ...
21.0 21.0 21.0 21.0 21.0
21.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
5
56.0 52.0 43.0 59.0 54.0
48.0 42.0 36.0 31.0 57.0 ...
69.0 69.0 69.0 83.0 83.0
83.0 83.0 83.0 83.0 54.0 ...
23.0 24.0 22.0 15.5 14.5
16.0 15.0 18.5 12.5 16.0 ...
6
56.0 47.0 37.0 27.0 18.0
8.0 ...
1.0 1.0 1.0 1.0 1.0 1.0
-99900.0 -99900.0
-99900.0 -4.0 -99900.0 ...
7
59.0 55.0 51.0 46.0 42.0
38.0 33.0 29.0 25.0 20.0 ...
42.0 42.0 42.0 42.0 42.0
42.0 42.0 42.0 42.0 42.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
8
57.0 54.0 51.0 48.0 44.0
41.0 38.0 35.0 22.0 43.0 ...
10.0 10.0 10.0 10.0 10.0
10.0 10.0 10.0 10.0 8.0 ...
26.0 38.0 39.5 36.0 33.0
37.0 37.5 36.5 -99900.0 ...
9
36.0 26.0
92.0 92.0
-99900.0 -99900.0
10
15.0 5.0 53.0 43.0 33.0
14.0 9.0 3.0 ...
90.0 90.0 63.0 63.0 63.0
12.0 12.0 12.0 ...
13.0 12.0 -99900.0
-99900.0 -99900.0 9.5 ...
HybridScan
HydrometeorType
Kdp
RR1
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0
8.0 8.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0
-99900.0 -99900.0
8.0 8.0
0.0 0.0
0.0 0.0
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0
13.5 15.5 19.0 -99900.0
-99900.0 -99900.0 ...
9.0 9.0 9.0 8.0 8.0 8.0
8.0 9.0 9.0 9.0 9.0 9.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 1.27899 0.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0
8.0 8.0 8.0 8.0 8.0 8.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 -99900.0 ...
47.0 44.0 42.0 32.0 41.5
26.5 30.5 -99900.0 ...
9.0 13.0 13.0 13.0 9.0
9.0 13.0 9.0 8.0 8.0 8.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0 ...
0.0 6.048 4.66107 4.46988
3.07344 4.99969 4.3752 ...
-99900.0 -99900.0
8.0 8.0
0.0 0.0
0.0 0.0
11.0 13.5 -99900.0
-99900.0 -5.0 9.5 11.5 ...
8.0 9.0 8.0 8.0 8.0 8.0
8.0 8.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 ...
RR2
RR3
RadarQualityIndex
Reflectivity
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.006246 0.0200476
0.0113924 0.217157 ...
13.0 17.5 14.0 8.5 7.0
11.0 9.0 9.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
999.0 999.0 999.0 999.0
999.0 999.0 ...
15.0 18.5 10.5 3.0 0.5
-3.0 ...
-99900.0 -99900.0
-99900.0 -99900.0
999.0 999.0
6.5 4.0
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
11.0 14.0 12.0 11.0 13.0
15.5 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
1.0 1.0 1.0 0.0 0.0 0.0
0.0 1.0 0.0 0.996433 0.0 ...
14.0 14.0 17.0 24.5 23.5
21.5 25.0 16.0 21.0 16.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
999.0 999.0 999.0 999.0
999.0 999.0 ...
-13.5 -8.5 9.5 14.0 13.0
15.5 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.0483593 0.0583249
0.103542 0.0419694 ...
15.5 16.0 3.5 -6.5 13.5
15.0 18.0 3.0 17.5 21.0 ...
-99900.0 9.72906 6.48322
6.28992 2.42506 4.78497 ...
-99900.0 -1.31778 18.6753
-6.69155 12.8562 -8.9 ...
1.0 1.0 1.0 1.0 1.0 1.0
1.0 0.948379 0.407035 ...
26.0 38.0 37.0 36.0 32.5
37.0 37.5 8.5 11.0 17.5 ...
-99900.0 -99900.0
-99900.0 -99900.0
999.0 999.0
17.0 5.5
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.0789454 0.765964
0.178427 0.0386251 ...
13.0 12.0 4.5 16.0 2.5
9.5 11.5 12.0 ...
ReflectivityQC
RhoHV
Velocity
Zdr
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.865 0.841667 0.765
0.985 0.768333 0.491667 ...
-99901.0 -99901.0
-99901.0 -99901.0 ...
7.9375 4.5 4.1875 5.5625
3.375 7.0625 5.3125 6 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.635 0.851667 0.891667
0.638333 0.791667 ...
-4.0 -3.0 -2.0 -0.5 -4.0
3.0 ...
2.6875 3.0 2.375 6.25
3.125 6.0625 ...
-99900.0 -99900.0
0.998333 0.891667
-99900.0 -3.5
-6.5 -4.6875
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.688333 0.518333
0.708333 0.805 0.708333 ...
-7.0 -12.0 -11.5 -8.5
-8.0 -13.0 ...
-0.375 5.0625 1.1875 2.0
2.0625 0.3125 ...
14.0 14.0 17.0 -99900.0
-99900.0 -99900.0 ...
1.01833 1.01167 0.991667
1.015 1.015 1.005 1.0 ...
14.0 13.5 12.5 -13.5
-19.5 -16.0 -15.0 -14.0 ...
0.9375 -0.875 -0.75 0.0
0.0625 0.3125 0.5625 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
1.05167 0.988333 0.298333
0.215 0.301667 0.235 ...
15.0 8.0 5.5 7.5 7.0 7.0
-0.9375 0.8125 3.1875
3.3125 -1.1875 1.6875 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.901667 0.661667
0.688333 0.465 0.845 ...
-2.0 13.5 -3.5 5.5 -4.5
3.0 -1.5 1.0 0.5 3.5 7.5 ...
7.5 -0.0625 5.5625
-4.5625 1.5 1.5625 2. ...
26.0 38.0 37.0 36.0 32.5
37.0 37.5 8.5 -99900.0 ...
0.958333 0.978333
0.988333 0.991667 0.995 ...
10.5 10.0 10.0 12.0 12.5
12.5 14.5 -99900.0 16.0 ...
0.375 -0.3125 0.5625
1.0625 1.5625 0.8125 ...
-99900.0 -99900.0
0.948333 0.641667
-10.0 -99900.0
0.5 -5.8125
13.0 12.0 -99900.0
-99900.0 -99900.0 9.5 ...
0.971667 1.05167 0.915
0.208333 0.888333 1.0 ...
-12.0 -9.0 -18.5 -8.5
-99900.0 13.5 14.0 13.0 ...
2.375 0.0 0.25 4.1875
4.0625 0.625 0.875 -1.5 ...
LogWaterVolume
MassWeightedMean
MassWeightedSD
Expected
nan nan nan nan nan nan
nan nan ...
nan nan nan nan nan nan
nan nan ...
nan nan nan nan nan nan
nan nan ...
0.0
nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan nan nan nan
0.0
nan nan
nan nan
nan nan
0.0
nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan nan nan nan
0.0
-13.4793885769
-12.1370512402 ...
1.86413642918
1.27740873124 ...
0.755068594278
0.502681241559 ...
0.0
nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan nan nan nan
0.0
nan nan nan nan nan nan
nan nan nan nan nan nan ...
nan nan nan nan nan nan
nan nan nan nan nan nan ...
nan nan nan nan nan nan
nan nan nan nan nan nan ...
0.0
-10.3712052779
-7.19622143405 ...
1.73805086725
1.51845864912 ...
0.653683396104
0.585207001984 ...
5.6
nan nan
nan nan
nan nan
0.0
-14.7984398705
-13.315794926 nan nan ...
2.54496998173
1.57016791355 nan nan ...
1.05628634415
0.639012953381 nan nan ...
0.0
[10 rows x 20 columns]
In [74]:
def split_row(row, missing=None):
    """Expand one packed row into a list of per-reading rows.

    Every column other than "Id" and "Expected" holds a whitespace-separated
    string of readings. The row is unpacked into one output row per reading;
    "Id" and "Expected" are repeated on each output row.

    Parameters
    ----------
    row : dict
        Maps column name to value. Must contain "TimeToEnd"; its token count
        decides how many times "Id" / "Expected" are repeated.
    missing : container of str, optional
        Tokens that are emitted as NaN instead of being parsed. Defaults to
        the notebook-level ``na_values`` list.

    Returns
    -------
    list of list of float
        One inner list per reading, with values ordered by sorted column
        name. The column list handed to ``flat_map`` must therefore be sorted
        the same way.

    Raises
    ------
    ValueError
        If a token is not numeric, or if the packed columns do not all hold
        the same number of tokens.
    """
    if missing is None:
        # Fall back to the sentinel list defined at notebook level.
        missing = na_values

    n_readings = len(row["TimeToEnd"].strip().split())

    series = []
    # sorted() accepts both a Python 2 list and a Python 3 keys view; the
    # previous keys()-then-.sort() form only worked on a list.
    for column in sorted(row.keys()):
        if column in ("Id", "Expected"):
            # Scalar columns: repeat the value once per reading.
            series.append([row[column]] * n_readings)
        else:
            series.append([
                np.nan if token in missing else float(token)
                for token in row[column].strip().split()
            ])

    # Transpose column-major data into rows. dtype=float keeps the original
    # all-float output (Id included) and makes ragged input fail loudly
    # instead of silently producing an object array.
    return np.array(series, dtype=float).T.tolist()
In [75]:
# Output column names for flat_map. split_row() emits values in sorted-key order,
# so the names are sorted here to line up with those values.
c=a.column_names()
c.sort()
In [76]:
# Try the expansion on the sample: each packed row becomes one row per reading.
b = a.flat_map(c, split_row)
In [77]:
# Print the first 10 expanded rows to check the result by eye.
b.print_rows(num_rows=10)
+-----------+-----------------+----------+------------+-----------------+-----+
| Composite | DistanceToRadar | Expected | HybridScan | HydrometeorType | Id |
+-----------+-----------------+----------+------------+-----------------+-----+
| nan | 30.0 | 0.0 | nan | 8.0 | 1.0 |
| nan | 30.0 | 0.0 | nan | 8.0 | 1.0 |
| nan | 30.0 | 0.0 | nan | 8.0 | 1.0 |
| nan | 30.0 | 0.0 | nan | 8.0 | 1.0 |
| nan | 30.0 | 0.0 | nan | 8.0 | 1.0 |
| nan | 30.0 | 0.0 | nan | 8.0 | 1.0 |
| nan | 30.0 | 0.0 | nan | 8.0 | 1.0 |
| nan | 30.0 | 0.0 | nan | 8.0 | 1.0 |
| nan | 77.0 | 0.0 | nan | 8.0 | 2.0 |
| nan | 77.0 | 0.0 | nan | 8.0 | 2.0 |
+-----------+-----------------+----------+------------+-----------------+-----+
+-----+----------------+------------------+----------------+-----+-----+-----+
| Kdp | LogWaterVolume | MassWeightedMean | MassWeightedSD | RR1 | RR2 | RR3 |
+-----+----------------+------------------+----------------+-----+-----+-----+
| 0.0 | nan | nan | nan | 0.0 | nan | nan |
| 0.0 | nan | nan | nan | 0.0 | nan | nan |
| 0.0 | nan | nan | nan | 0.0 | nan | nan |
| 0.0 | nan | nan | nan | 0.0 | nan | nan |
| 0.0 | nan | nan | nan | 0.0 | nan | nan |
| 0.0 | nan | nan | nan | 0.0 | nan | nan |
| 0.0 | nan | nan | nan | 0.0 | nan | nan |
| 0.0 | nan | nan | nan | 0.0 | nan | nan |
| 0.0 | nan | nan | nan | 0.0 | nan | nan |
| 0.0 | nan | nan | nan | 0.0 | nan | nan |
+-----+----------------+------------------+----------------+-----+-----+-----+
+-------------------+--------------+----------------+----------+-----------+
| RadarQualityIndex | Reflectivity | ReflectivityQC | RhoHV | TimeToEnd |
+-------------------+--------------+----------------+----------+-----------+
| 0.006246 | 13.0 | nan | 0.865 | 56.0 |
| 0.0200476 | 17.5 | nan | 0.841667 | 37.0 |
| 0.0113924 | 14.0 | nan | 0.765 | 31.0 |
| 0.217157 | 8.5 | nan | 0.985 | 25.0 |
| 0.0285667 | 7.0 | nan | 0.768333 | 19.0 |
| 0.00374591 | 11.0 | nan | 0.491667 | 13.0 |
| 0.0214067 | 9.0 | nan | 0.775 | 7.0 |
| 0.147393 | 9.0 | nan | 1.05167 | 2.0 |
| nan | 15.0 | nan | 0.635 | 58.0 |
| nan | 18.5 | nan | 0.851667 | 48.0 |
+-------------------+--------------+----------------+----------+-----------+
+----------+--------+
| Velocity | Zdr |
+----------+--------+
| nan | 7.9375 |
| nan | 4.5 |
| nan | 4.1875 |
| nan | 5.5625 |
| nan | 3.375 |
| nan | 7.0625 |
| nan | 5.3125 |
| nan | 6.125 |
| -4.0 | 2.6875 |
| -3.0 | 3.0 |
| ... | ... |
+----------+--------+
[98 rows x 20 columns]
In [78]:
# Apply the same expansion to the full training frame.
# `c` still holds the sorted column names taken from the training sample.
train_modified = train.flat_map(c, split_row)
In [79]:
# Preview the first rows of the expanded training frame.
train_modified.head()
Out[79]:
Composite
DistanceToRadar
Expected
HybridScan
HydrometeorType
Id
Kdp
LogWaterVolume
MassWeightedMean
nan
30.0
0.0
nan
8.0
1.0
0.0
nan
nan
nan
30.0
0.0
nan
8.0
1.0
0.0
nan
nan
nan
30.0
0.0
nan
8.0
1.0
0.0
nan
nan
nan
30.0
0.0
nan
8.0
1.0
0.0
nan
nan
nan
30.0
0.0
nan
8.0
1.0
0.0
nan
nan
nan
30.0
0.0
nan
8.0
1.0
0.0
nan
nan
nan
30.0
0.0
nan
8.0
1.0
0.0
nan
nan
nan
30.0
0.0
nan
8.0
1.0
0.0
nan
nan
nan
77.0
0.0
nan
8.0
2.0
0.0
nan
nan
nan
77.0
0.0
nan
8.0
2.0
0.0
nan
nan
MassWeightedSD
RR1
RR2
RR3
RadarQualityIndex
Reflectivity
ReflectivityQC
RhoHV
TimeToEnd
Velocity
nan
0.0
nan
nan
0.006246
13.0
nan
0.865
56.0
nan
nan
0.0
nan
nan
0.0200476
17.5
nan
0.841667
37.0
nan
nan
0.0
nan
nan
0.0113924
14.0
nan
0.765
31.0
nan
nan
0.0
nan
nan
0.217157
8.5
nan
0.985
25.0
nan
nan
0.0
nan
nan
0.0285667
7.0
nan
0.768333
19.0
nan
nan
0.0
nan
nan
0.00374591
11.0
nan
0.491667
13.0
nan
nan
0.0
nan
nan
0.0214067
9.0
nan
0.775
7.0
nan
nan
0.0
nan
nan
0.147393
9.0
nan
1.05167
2.0
nan
nan
0.0
nan
nan
nan
15.0
nan
0.635
58.0
-4.0
nan
0.0
nan
nan
nan
18.5
nan
0.851667
48.0
-3.0
Zdr
7.9375
4.5
4.1875
5.5625
3.375
7.0625
5.3125
6.125
2.6875
3.0
[10 rows x 20 columns]
In [81]:
# Write the expanded training frame to CSV; the path is relative to the working directory.
train_modified.save('data/train_splitted.csv', format='csv')
In [83]:
# Load the test CSV into an SFrame; the path is relative to the working directory.
test = gl.SFrame.read_csv(os.path.join("data", "test_2014.csv"))
PROGRESS: Finished parsing file /media/vladimir/1ab2d5e6-a134-47e7-ba27-b2d70ac5ffc5/workspace/kaggle_rain/data/test_2014.csv
PROGRESS: Parsing completed. Parsed 100 lines in 5.10212 secs.
PROGRESS: Read 52033 lines. Lines per second: 68736.7
PROGRESS: Finished parsing file /media/vladimir/1ab2d5e6-a134-47e7-ba27-b2d70ac5ffc5/workspace/kaggle_rain/data/test_2014.csv
PROGRESS: Parsing completed. Parsed 630452 lines in 5.32314 secs.
------------------------------------------------------
Inferred types from first line of file as
column_type_hints=[int,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
In [84]:
# Rebind `c` to the test frame's column names, sorted to match split_row()'s output order.
# NOTE: this overwrites the training column list built earlier, so re-running the
# training flat_map cell after this one would use the test columns.
c=test.column_names()
c.sort()
In [86]:
# Expand the test frame into one row per reading, using the test column list in `c`.
test_modified = test.flat_map(c, split_row)
In [87]:
# Write the expanded test frame to CSV; the path is relative to the working directory.
test_modified.save('data/test_splitted.csv', format='csv')
In [ ]:
Content source: ternaus/kaggle_rain
Similar notebooks: