In [2]:
from __future__ import division
import pandas as pd
import numpy as np
import os
import graphlab as gl

In [3]:
# String tokens that split_row treats as missing readings and replaces with NaN.
na_values = ['-99900.0','-99901.0','-99903.0','999.0','nan']
# Load the raw training set. Per the parser log printed below, every column
# except Id (int) and Expected (float) is read as a str.
train = gl.SFrame.read_csv(os.path.join("data", "train_2013.csv"))


[INFO] Start server at: ipc:///tmp/graphlab_server-5561 - Server binary: /usr/local/lib/python2.7/dist-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1428708432.log
[INFO] GraphLab Server Version: 1.3.0
PROGRESS: Finished parsing file /media/vladimir/1ab2d5e6-a134-47e7-ba27-b2d70ac5ffc5/workspace/kaggle_rain/data/train_2013.csv
PROGRESS: Parsing completed. Parsed 100 lines in 6.03064 secs.
PROGRESS: Read 55681 lines. Lines per second: 62515.5
PROGRESS: Read 613141 lines. Lines per second: 99579.2
PROGRESS: Finished parsing file /media/vladimir/1ab2d5e6-a134-47e7-ba27-b2d70ac5ffc5/workspace/kaggle_rain/data/train_2013.csv
PROGRESS: Parsing completed. Parsed 1126694 lines in 10.6648 secs.
------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[int,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------

In [5]:
# Keep the first rows of train as a small sample for trying out the row split.
a = train.head()
a


Out[5]:
Id TimeToEnd DistanceToRadar Composite
1 56.0 37.0 31.0 25.0 19.0
13.0 7.0 2.0 ...
30.0 30.0 30.0 30.0 30.0
30.0 30.0 30.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
2 58.0 48.0 38.0 29.0 19.0
9.0 ...
77.0 77.0 77.0 77.0 77.0
77.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
3 59.0 20.0 75.0 75.0 -99900.0 -99900.0
4 53.0 43.0 34.0 24.0 14.0
5.0 ...
21.0 21.0 21.0 21.0 21.0
21.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
5 56.0 52.0 43.0 59.0 54.0
48.0 42.0 36.0 31.0 57.0 ...
69.0 69.0 69.0 83.0 83.0
83.0 83.0 83.0 83.0 54.0 ...
23.0 24.0 22.0 15.5 14.5
16.0 15.0 18.5 12.5 16.0 ...
6 56.0 47.0 37.0 27.0 18.0
8.0 ...
1.0 1.0 1.0 1.0 1.0 1.0 -99900.0 -99900.0
-99900.0 -4.0 -99900.0 ...
7 59.0 55.0 51.0 46.0 42.0
38.0 33.0 29.0 25.0 20.0 ...
42.0 42.0 42.0 42.0 42.0
42.0 42.0 42.0 42.0 42.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
8 57.0 54.0 51.0 48.0 44.0
41.0 38.0 35.0 22.0 43.0 ...
10.0 10.0 10.0 10.0 10.0
10.0 10.0 10.0 10.0 8.0 ...
26.0 38.0 39.5 36.0 33.0
37.0 37.5 36.5 -99900.0 ...
9 36.0 26.0 92.0 92.0 -99900.0 -99900.0
10 15.0 5.0 53.0 43.0 33.0
14.0 9.0 3.0 ...
90.0 90.0 63.0 63.0 63.0
12.0 12.0 12.0 ...
13.0 12.0 -99900.0
-99900.0 -99900.0 9.5 ...
HybridScan HydrometeorType Kdp RR1
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0
8.0 8.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
-99900.0 -99900.0 8.0 8.0 0.0 0.0 0.0 0.0
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
13.5 15.5 19.0 -99900.0
-99900.0 -99900.0 ...
9.0 9.0 9.0 8.0 8.0 8.0
8.0 9.0 9.0 9.0 9.0 9.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 1.27899 0.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
-99900.0 -99900.0
-99900.0 -99900.0 ...
8.0 8.0 8.0 8.0 8.0 8.0
8.0 8.0 8.0 8.0 8.0 8.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 -99900.0 ...
47.0 44.0 42.0 32.0 41.5
26.5 30.5 -99900.0 ...
9.0 13.0 13.0 13.0 9.0
9.0 13.0 9.0 8.0 8.0 8.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 0.0 0.0 0.0 0.0 ...
0.0 6.048 4.66107 4.46988
3.07344 4.99969 4.3752 ...
-99900.0 -99900.0 8.0 8.0 0.0 0.0 0.0 0.0
11.0 13.5 -99900.0
-99900.0 -5.0 9.5 11.5 ...
8.0 9.0 8.0 8.0 8.0 8.0
8.0 8.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 ...
0.0 0.0 0.0 0.0 0.0 0.0
0.0 0.0 ...
RR2 RR3 RadarQualityIndex Reflectivity
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.006246 0.0200476
0.0113924 0.217157 ...
13.0 17.5 14.0 8.5 7.0
11.0 9.0 9.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
999.0 999.0 999.0 999.0
999.0 999.0 ...
15.0 18.5 10.5 3.0 0.5
-3.0 ...
-99900.0 -99900.0 -99900.0 -99900.0 999.0 999.0 6.5 4.0
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.0 0.0 0.0 0.0 0.0 0.0 11.0 14.0 12.0 11.0 13.0
15.5 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
1.0 1.0 1.0 0.0 0.0 0.0
0.0 1.0 0.0 0.996433 0.0 ...
14.0 14.0 17.0 24.5 23.5
21.5 25.0 16.0 21.0 16.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
999.0 999.0 999.0 999.0
999.0 999.0 ...
-13.5 -8.5 9.5 14.0 13.0
15.5 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.0483593 0.0583249
0.103542 0.0419694 ...
15.5 16.0 3.5 -6.5 13.5
15.0 18.0 3.0 17.5 21.0 ...
-99900.0 9.72906 6.48322
6.28992 2.42506 4.78497 ...
-99900.0 -1.31778 18.6753
-6.69155 12.8562 -8.9 ...
1.0 1.0 1.0 1.0 1.0 1.0
1.0 0.948379 0.407035 ...
26.0 38.0 37.0 36.0 32.5
37.0 37.5 8.5 11.0 17.5 ...
-99900.0 -99900.0 -99900.0 -99900.0 999.0 999.0 17.0 5.5
-99900.0 -99900.0
-99900.0 -99900.0 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.0789454 0.765964
0.178427 0.0386251 ...
13.0 12.0 4.5 16.0 2.5
9.5 11.5 12.0 ...
ReflectivityQC RhoHV Velocity Zdr
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.865 0.841667 0.765
0.985 0.768333 0.491667 ...
-99901.0 -99901.0
-99901.0 -99901.0 ...
7.9375 4.5 4.1875 5.5625
3.375 7.0625 5.3125 6 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.635 0.851667 0.891667
0.638333 0.791667 ...
-4.0 -3.0 -2.0 -0.5 -4.0
3.0 ...
2.6875 3.0 2.375 6.25
3.125 6.0625 ...
-99900.0 -99900.0 0.998333 0.891667 -99900.0 -3.5 -6.5 -4.6875
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.688333 0.518333
0.708333 0.805 0.708333 ...
-7.0 -12.0 -11.5 -8.5
-8.0 -13.0 ...
-0.375 5.0625 1.1875 2.0
2.0625 0.3125 ...
14.0 14.0 17.0 -99900.0
-99900.0 -99900.0 ...
1.01833 1.01167 0.991667
1.015 1.015 1.005 1.0 ...
14.0 13.5 12.5 -13.5
-19.5 -16.0 -15.0 -14.0 ...
0.9375 -0.875 -0.75 0.0
0.0625 0.3125 0.5625 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
1.05167 0.988333 0.298333
0.215 0.301667 0.235 ...
15.0 8.0 5.5 7.5 7.0 7.0 -0.9375 0.8125 3.1875
3.3125 -1.1875 1.6875 ...
-99900.0 -99900.0
-99900.0 -99900.0 ...
0.901667 0.661667
0.688333 0.465 0.845 ...
-2.0 13.5 -3.5 5.5 -4.5
3.0 -1.5 1.0 0.5 3.5 7.5 ...
7.5 -0.0625 5.5625
-4.5625 1.5 1.5625 2. ...
26.0 38.0 37.0 36.0 32.5
37.0 37.5 8.5 -99900.0 ...
0.958333 0.978333
0.988333 0.991667 0.995 ...
10.5 10.0 10.0 12.0 12.5
12.5 14.5 -99900.0 16.0 ...
0.375 -0.3125 0.5625
1.0625 1.5625 0.8125 ...
-99900.0 -99900.0 0.948333 0.641667 -10.0 -99900.0 0.5 -5.8125
13.0 12.0 -99900.0
-99900.0 -99900.0 9.5 ...
0.971667 1.05167 0.915
0.208333 0.888333 1.0 ...
-12.0 -9.0 -18.5 -8.5
-99900.0 13.5 14.0 13.0 ...
2.375 0.0 0.25 4.1875
4.0625 0.625 0.875 -1.5 ...
LogWaterVolume MassWeightedMean MassWeightedSD Expected
nan nan nan nan nan nan
nan nan ...
nan nan nan nan nan nan
nan nan ...
nan nan nan nan nan nan
nan nan ...
0.0
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan 0.0
nan nan nan nan nan nan 0.0
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan 0.0
-13.4793885769
-12.1370512402 ...
1.86413642918
1.27740873124 ...
0.755068594278
0.502681241559 ...
0.0
nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan 0.0
nan nan nan nan nan nan
nan nan nan nan nan nan ...
nan nan nan nan nan nan
nan nan nan nan nan nan ...
nan nan nan nan nan nan
nan nan nan nan nan nan ...
0.0
-10.3712052779
-7.19622143405 ...
1.73805086725
1.51845864912 ...
0.653683396104
0.585207001984 ...
5.6
nan nan nan nan nan nan 0.0
-14.7984398705
-13.315794926 nan nan ...
2.54496998173
1.57016791355 nan nan ...
1.05628634415
0.639012953381 nan nan ...
0.0
[10 rows x 20 columns]


In [74]:
def split_row(row, na_tokens=None):
    """Expand one packed row into one output row per time step.

    Every column except ``Id`` and ``Expected`` holds a whitespace-separated
    string of readings. Each string is split into tokens; tokens listed in
    ``na_tokens`` become NaN and all others are parsed with ``float``.
    ``Id`` and ``Expected`` are scalars and are repeated on every output row.

    Parameters
    ----------
    row : dict
        Maps column name to value. Must contain ``"TimeToEnd"``, whose token
        count sets the number of output rows.
    na_tokens : iterable of str, optional
        Tokens to treat as missing. Defaults to the notebook-level
        ``na_values`` list.

    Returns
    -------
    list of list
        One inner list per time step, with values ordered by sorted column
        name. Callers must therefore pass the sorted column names to
        ``flat_map``.

    Raises
    ------
    ValueError
        If a packed column has a different token count than ``TimeToEnd``.
    """
    if na_tokens is None:
        na_tokens = na_values
    missing = frozenset(na_tokens)

    # split() with no argument already ignores leading/trailing whitespace.
    n_steps = len(row["TimeToEnd"].split())

    columns = []
    # sorted() works on both Python 2 lists and Python 3 dict views.
    for name in sorted(row.keys()):
        if name in ("Id", "Expected"):
            scalar = row[name]
            # The previous np.matrix round-trip coerced these to float; keep
            # that so the output column types are unchanged.
            if scalar is not None:
                scalar = float(scalar)
            columns.append([scalar] * n_steps)
            continue

        values = [np.nan if token in missing else float(token)
                  for token in row[name].split()]
        if len(values) != n_steps:
            # Fail loudly instead of producing a misaligned transpose.
            raise ValueError(
                "column %r has %d values but TimeToEnd has %d (Id=%r)"
                % (name, len(values), n_steps, row.get("Id")))
        columns.append(values)

    # Transpose column-major lists into one list per time step.
    return [list(step) for step in zip(*columns)]

In [75]:
# Sample column names in sorted order, the same order split_row emits values in.
c = sorted(a.column_names())

In [76]:
# Trial run of split_row on the sample. c lists the column names in sorted
# order, which is the order split_row emits each row's values in.
b = a.flat_map(c, split_row)

In [77]:
# Show the first 10 expanded rows to check the split.
b.print_rows(num_rows=10)


+-----------+-----------------+----------+------------+-----------------+-----+
| Composite | DistanceToRadar | Expected | HybridScan | HydrometeorType |  Id |
+-----------+-----------------+----------+------------+-----------------+-----+
|    nan    |       30.0      |   0.0    |    nan     |       8.0       | 1.0 |
|    nan    |       30.0      |   0.0    |    nan     |       8.0       | 1.0 |
|    nan    |       30.0      |   0.0    |    nan     |       8.0       | 1.0 |
|    nan    |       30.0      |   0.0    |    nan     |       8.0       | 1.0 |
|    nan    |       30.0      |   0.0    |    nan     |       8.0       | 1.0 |
|    nan    |       30.0      |   0.0    |    nan     |       8.0       | 1.0 |
|    nan    |       30.0      |   0.0    |    nan     |       8.0       | 1.0 |
|    nan    |       30.0      |   0.0    |    nan     |       8.0       | 1.0 |
|    nan    |       77.0      |   0.0    |    nan     |       8.0       | 2.0 |
|    nan    |       77.0      |   0.0    |    nan     |       8.0       | 2.0 |
+-----------+-----------------+----------+------------+-----------------+-----+
+-----+----------------+------------------+----------------+-----+-----+-----+
| Kdp | LogWaterVolume | MassWeightedMean | MassWeightedSD | RR1 | RR2 | RR3 |
+-----+----------------+------------------+----------------+-----+-----+-----+
| 0.0 |      nan       |       nan        |      nan       | 0.0 | nan | nan |
| 0.0 |      nan       |       nan        |      nan       | 0.0 | nan | nan |
| 0.0 |      nan       |       nan        |      nan       | 0.0 | nan | nan |
| 0.0 |      nan       |       nan        |      nan       | 0.0 | nan | nan |
| 0.0 |      nan       |       nan        |      nan       | 0.0 | nan | nan |
| 0.0 |      nan       |       nan        |      nan       | 0.0 | nan | nan |
| 0.0 |      nan       |       nan        |      nan       | 0.0 | nan | nan |
| 0.0 |      nan       |       nan        |      nan       | 0.0 | nan | nan |
| 0.0 |      nan       |       nan        |      nan       | 0.0 | nan | nan |
| 0.0 |      nan       |       nan        |      nan       | 0.0 | nan | nan |
+-----+----------------+------------------+----------------+-----+-----+-----+
+-------------------+--------------+----------------+----------+-----------+
| RadarQualityIndex | Reflectivity | ReflectivityQC |  RhoHV   | TimeToEnd |
+-------------------+--------------+----------------+----------+-----------+
|      0.006246     |     13.0     |      nan       |  0.865   |    56.0   |
|     0.0200476     |     17.5     |      nan       | 0.841667 |    37.0   |
|     0.0113924     |     14.0     |      nan       |  0.765   |    31.0   |
|      0.217157     |     8.5      |      nan       |  0.985   |    25.0   |
|     0.0285667     |     7.0      |      nan       | 0.768333 |    19.0   |
|     0.00374591    |     11.0     |      nan       | 0.491667 |    13.0   |
|     0.0214067     |     9.0      |      nan       |  0.775   |    7.0    |
|      0.147393     |     9.0      |      nan       | 1.05167  |    2.0    |
|        nan        |     15.0     |      nan       |  0.635   |    58.0   |
|        nan        |     18.5     |      nan       | 0.851667 |    48.0   |
+-------------------+--------------+----------------+----------+-----------+
+----------+--------+
| Velocity |  Zdr   |
+----------+--------+
|   nan    | 7.9375 |
|   nan    |  4.5   |
|   nan    | 4.1875 |
|   nan    | 5.5625 |
|   nan    | 3.375  |
|   nan    | 7.0625 |
|   nan    | 5.3125 |
|   nan    | 6.125  |
|   -4.0   | 2.6875 |
|   -3.0   |  3.0   |
|   ...    |  ...   |
+----------+--------+
[98 rows x 20 columns]

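Now let's apply the same split to the full training set: each original row is expanded into one row per time step (one per value in `TimeToEnd`).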


In [78]:
# Expand every training row into one row per time step. The column names are
# taken from train itself (sorted, to match split_row's output order) rather
# than from the shared name `c`, which is rebound to the test columns later in
# the notebook and would break this cell if it were re-run after that point.
train_modified = train.flat_map(sorted(train.column_names()), split_row)

In [79]:
# Preview the expanded training data.
train_modified.head()


Out[79]:
Composite DistanceToRadar Expected HybridScan HydrometeorType Id Kdp LogWaterVolume MassWeightedMean
nan 30.0 0.0 nan 8.0 1.0 0.0 nan nan
nan 30.0 0.0 nan 8.0 1.0 0.0 nan nan
nan 30.0 0.0 nan 8.0 1.0 0.0 nan nan
nan 30.0 0.0 nan 8.0 1.0 0.0 nan nan
nan 30.0 0.0 nan 8.0 1.0 0.0 nan nan
nan 30.0 0.0 nan 8.0 1.0 0.0 nan nan
nan 30.0 0.0 nan 8.0 1.0 0.0 nan nan
nan 30.0 0.0 nan 8.0 1.0 0.0 nan nan
nan 77.0 0.0 nan 8.0 2.0 0.0 nan nan
nan 77.0 0.0 nan 8.0 2.0 0.0 nan nan
MassWeightedSD RR1 RR2 RR3 RadarQualityIndex Reflectivity ReflectivityQC RhoHV TimeToEnd Velocity
nan 0.0 nan nan 0.006246 13.0 nan 0.865 56.0 nan
nan 0.0 nan nan 0.0200476 17.5 nan 0.841667 37.0 nan
nan 0.0 nan nan 0.0113924 14.0 nan 0.765 31.0 nan
nan 0.0 nan nan 0.217157 8.5 nan 0.985 25.0 nan
nan 0.0 nan nan 0.0285667 7.0 nan 0.768333 19.0 nan
nan 0.0 nan nan 0.00374591 11.0 nan 0.491667 13.0 nan
nan 0.0 nan nan 0.0214067 9.0 nan 0.775 7.0 nan
nan 0.0 nan nan 0.147393 9.0 nan 1.05167 2.0 nan
nan 0.0 nan nan nan 15.0 nan 0.635 58.0 -4.0
nan 0.0 nan nan nan 18.5 nan 0.851667 48.0 -3.0
Zdr
7.9375
4.5
4.1875
5.5625
3.375
7.0625
5.3125
6.125
2.6875
3.0
[10 rows x 20 columns]


In [81]:
# Persist the expanded training rows as CSV.
train_modified.save('data/train_splitted.csv', format='csv')

In [83]:
# Load the raw test set. The parser log printed below lists 19 column types
# (int + 18 str), so it apparently lacks the float Expected column of train.
test = gl.SFrame.read_csv(os.path.join("data", "test_2014.csv"))


PROGRESS: Finished parsing file /media/vladimir/1ab2d5e6-a134-47e7-ba27-b2d70ac5ffc5/workspace/kaggle_rain/data/test_2014.csv
PROGRESS: Parsing completed. Parsed 100 lines in 5.10212 secs.
PROGRESS: Read 52033 lines. Lines per second: 68736.7
PROGRESS: Finished parsing file /media/vladimir/1ab2d5e6-a134-47e7-ba27-b2d70ac5ffc5/workspace/kaggle_rain/data/test_2014.csv
PROGRESS: Parsing completed. Parsed 630452 lines in 5.32314 secs.
------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[int,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------

In [84]:
# Test-set column names in sorted order, the same order split_row emits values in.
c = sorted(test.column_names())

In [86]:
# Expand every test row into one row per time step. The column names are taken
# from test itself (sorted, to match split_row's output order) so this cell
# does not depend on which frame the shared name `c` was last built from.
test_modified = test.flat_map(sorted(test.column_names()), split_row)

In [87]:
# Persist the expanded test rows as CSV.
test_modified.save('data/test_splitted.csv', format='csv')

In [ ]: