In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os.path

from include.dataset_fnames import generate_station_data_fname, generate_data_fname
from include.feature_lists import numeric_features, numeric_missing_features_list, numeric_features_to_work_on, categoric_features

In [2]:
station_id = 'L0S01'
prev_id = 'L0S00'

In [3]:
fname = generate_station_data_fname(station_id, sample_type='train', data_type='numeric', use_product=False, allow_nan_values=False)
print fname
station_features = numeric_features[station_id]    
features = ['Id'] + station_features + ['time']
station_df = pd.read_csv(fname, usecols=features, index_col=['Id'])


d:/Kaggle_ws/Bosch/data/stations/train\L0S01_notnull_numeric.csv

In [4]:
fname2 = generate_station_data_fname(prev_id, sample_type='train', data_type='numeric', use_product=False, allow_nan_values=False)
print fname2
station_features = numeric_features[prev_id]    
features = ['Id'] + station_features
station_df2 = pd.read_csv(fname2, usecols=features, index_col=['Id'])


d:/Kaggle_ws/Bosch/data/stations/train\L0S00_notnull_numeric.csv

In [ ]:


In [5]:
station_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 673904 entries, 4 to 2367495
Data columns (total 3 columns):
L0_S1_F24    673902 non-null float64
L0_S1_F28    673904 non-null float64
time         673904 non-null float64
dtypes: float64(3)
memory usage: 20.6 MB

In [6]:
station_df2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 673862 entries, 4 to 2367495
Data columns (total 12 columns):
L0_S0_F0     673862 non-null float64
L0_S0_F2     673862 non-null float64
L0_S0_F4     673862 non-null float64
L0_S0_F6     673862 non-null float64
L0_S0_F8     673862 non-null float64
L0_S0_F10    673862 non-null float64
L0_S0_F12    673862 non-null float64
L0_S0_F14    673862 non-null float64
L0_S0_F16    673862 non-null float64
L0_S0_F18    673862 non-null float64
L0_S0_F20    673862 non-null float64
L0_S0_F22    673862 non-null float64
dtypes: float64(12)
memory usage: 66.8 MB

In [36]:
indices = station_df.index
indices


Out[36]:
Int64Index([      4,       7,       9,      11,      13,      18,      26,
                 27,      28,      31,
            ...
            2367469, 2367472, 2367473, 2367476, 2367480, 2367487, 2367490,
            2367492, 2367493, 2367495],
           dtype='int64', name=u'Id', length=673904)

In [40]:
%%time
station_merged = pd.concat([station_df, station_df2], axis=1)
station_merged = station_merged.loc[indices]
station_merged.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 673904 entries, 4 to 2367495
Data columns (total 15 columns):
L0_S1_F24    673902 non-null float64
L0_S1_F28    673904 non-null float64
time         673904 non-null float64
L0_S0_F0     673063 non-null float64
L0_S0_F2     673063 non-null float64
L0_S0_F4     673063 non-null float64
L0_S0_F6     673063 non-null float64
L0_S0_F8     673063 non-null float64
L0_S0_F10    673063 non-null float64
L0_S0_F12    673063 non-null float64
L0_S0_F14    673063 non-null float64
L0_S0_F16    673063 non-null float64
L0_S0_F18    673063 non-null float64
L0_S0_F20    673063 non-null float64
L0_S0_F22    673063 non-null float64
dtypes: float64(15)
memory usage: 102.3 MB
Wall time: 409 ms

In [41]:
%%time
station_merged = pd.merge(station_df, station_df2, how='left', left_index=True, right_index=True)
m
station_merged = station_merged.loc[indices]
station_merged.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 673904 entries, 4 to 2367495
Data columns (total 15 columns):
L0_S1_F24    673902 non-null float64
L0_S1_F28    673904 non-null float64
time         673904 non-null float64
L0_S0_F0     673063 non-null float64
L0_S0_F2     673063 non-null float64
L0_S0_F4     673063 non-null float64
L0_S0_F6     673063 non-null float64
L0_S0_F8     673063 non-null float64
L0_S0_F10    673063 non-null float64
L0_S0_F12    673063 non-null float64
L0_S0_F14    673063 non-null float64
L0_S0_F16    673063 non-null float64
L0_S0_F18    673063 non-null float64
L0_S0_F20    673063 non-null float64
L0_S0_F22    673063 non-null float64
dtypes: float64(15)
memory usage: 102.3 MB
Wall time: 245 ms

In [9]:
print station_df.head(10)
print "----------------------------------------------------------"
print station_df2.head(10)
print "----------------------------------------------------------"
print station_merged.head(10)


    L0_S1_F24  L0_S1_F28     time
Id                               
4      -0.271      0.167    82.24
7       0.057     -0.079  1618.70
9       0.147     -0.007  1149.20
11     -0.012     -0.046   602.64
13     -0.051     -0.074  1331.66
18      0.001     -0.014   517.64
26      0.170      0.010  1104.78
27     -0.194      0.116   392.85
28     -0.194      0.069    55.44
31     -0.045      0.125    98.99
----------------------------------------------------------
    L0_S0_F0  L0_S0_F2  L0_S0_F4  L0_S0_F6  L0_S0_F8  L0_S0_F10  L0_S0_F12  \
Id                                                                           
4      0.030    -0.034    -0.197    -0.179     0.118      0.116     -0.015   
7      0.088     0.086     0.003    -0.052     0.161      0.025     -0.015   
9     -0.036    -0.064     0.294     0.330     0.074      0.161      0.022   
11    -0.055    -0.086     0.294     0.330     0.118      0.025      0.030   
13     0.003     0.019     0.294     0.312     0.031      0.161      0.022   
18    -0.016    -0.041    -0.179    -0.179    -0.056      0.161     -0.007   
26     0.016     0.093    -0.015    -0.016     0.031      0.116     -0.007   
27    -0.062    -0.153    -0.197    -0.179    -0.187     -0.384      0.000   
28    -0.075    -0.093     0.367     0.348    -0.056      0.025      0.008   
31    -0.003    -0.093    -0.161    -0.216     0.118     -0.021     -0.015   

    L0_S0_F14  L0_S0_F16  L0_S0_F18  L0_S0_F20  L0_S0_F22  
Id                                                         
4      -0.032      0.020      0.083     -0.273     -0.273  
7      -0.072     -0.225     -0.147      0.250      0.250  
9       0.128     -0.026     -0.046     -0.253     -0.253  
11      0.168     -0.169     -0.099      0.042      0.042  
13      0.088     -0.005     -0.003     -0.035     -0.035  
18     -0.032     -0.082      0.163     -0.220     -0.220  
26     -0.072      0.209      0.035      0.202      0.202  
27      0.088     -0.199      0.013     -0.285     -0.285  
28      0.048     -0.031      0.077     -0.303     -0.303  
31      0.048     -0.031      0.051      0.152      0.152  
----------------------------------------------------------
    L0_S1_F24  L0_S1_F28     time  L0_S0_F0  L0_S0_F2  L0_S0_F4  L0_S0_F6  \
Id                                                                          
4      -0.271      0.167    82.24     0.030    -0.034    -0.197    -0.179   
7       0.057     -0.079  1618.70     0.088     0.086     0.003    -0.052   
9       0.147     -0.007  1149.20    -0.036    -0.064     0.294     0.330   
11     -0.012     -0.046   602.64    -0.055    -0.086     0.294     0.330   
13     -0.051     -0.074  1331.66     0.003     0.019     0.294     0.312   
18      0.001     -0.014   517.64    -0.016    -0.041    -0.179    -0.179   
26      0.170      0.010  1104.78     0.016     0.093    -0.015    -0.016   
27     -0.194      0.116   392.85    -0.062    -0.153    -0.197    -0.179   
28     -0.194      0.069    55.44    -0.075    -0.093     0.367     0.348   
31     -0.045      0.125    98.99    -0.003    -0.093    -0.161    -0.216   

    L0_S0_F8  L0_S0_F10  L0_S0_F12  L0_S0_F14  L0_S0_F16  L0_S0_F18  \
Id                                                                    
4      0.118      0.116     -0.015     -0.032      0.020      0.083   
7      0.161      0.025     -0.015     -0.072     -0.225     -0.147   
9      0.074      0.161      0.022      0.128     -0.026     -0.046   
11     0.118      0.025      0.030      0.168     -0.169     -0.099   
13     0.031      0.161      0.022      0.088     -0.005     -0.003   
18    -0.056      0.161     -0.007     -0.032     -0.082      0.163   
26     0.031      0.116     -0.007     -0.072      0.209      0.035   
27    -0.187     -0.384      0.000      0.088     -0.199      0.013   
28    -0.056      0.025      0.008      0.048     -0.031      0.077   
31     0.118     -0.021     -0.015      0.048     -0.031      0.051   

    L0_S0_F20  L0_S0_F22  
Id                        
4      -0.273     -0.273  
7       0.250      0.250  
9      -0.253     -0.253  
11      0.042      0.042  
13     -0.035     -0.035  
18     -0.220     -0.220  
26      0.202      0.202  
27     -0.285     -0.285  
28     -0.303     -0.303  
31      0.152      0.152  

In [10]:
station_merged.index


Out[10]:
Int64Index([      4,       7,       9,      11,      13,      18,      26,
                 27,      28,      31,
            ...
            2367469, 2367472, 2367473, 2367476, 2367480, 2367487, 2367490,
            2367492, 2367493, 2367495],
           dtype='int64', name=u'Id', length=673904)

In [11]:
data = np.resize(range(25),(5,5))
data


Out[11]:
array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [12]:
df1 = pd.DataFrame(data, columns=['A', 'B', 'C', 'D', 'E'])
df1.index = df1.A
df1.index.rename('Id', inplace=True)
df1


Out[12]:
A B C D E
Id
0 0 1 2 3 4
5 5 6 7 8 9
10 10 11 12 13 14
15 15 16 17 18 19
20 20 21 22 23 24

In [13]:
df2 = pd.DataFrame(2*data, columns=['F', 'G', 'H', 'I', 'J'])
df2.index = df2.F
df2.index.rename('Id', inplace=True)
df2


Out[13]:
F G H I J
Id
0 0 2 4 6 8
10 10 12 14 16 18
20 20 22 24 26 28
30 30 32 34 36 38
40 40 42 44 46 48

In [14]:
merged = pd.concat([df1, df2], axis=1)
merged


Out[14]:
A B C D E F G H I J
Id
0 0.0 1.0 2.0 3.0 4.0 0.0 2.0 4.0 6.0 8.0
5 5.0 6.0 7.0 8.0 9.0 NaN NaN NaN NaN NaN
10 10.0 11.0 12.0 13.0 14.0 10.0 12.0 14.0 16.0 18.0
15 15.0 16.0 17.0 18.0 19.0 NaN NaN NaN NaN NaN
20 20.0 21.0 22.0 23.0 24.0 20.0 22.0 24.0 26.0 28.0
30 NaN NaN NaN NaN NaN 30.0 32.0 34.0 36.0 38.0
40 NaN NaN NaN NaN NaN 40.0 42.0 44.0 46.0 48.0

In [32]:
%%time
indices = df1.index
merged =  merged.loc[indices]
merged


Wall time: 1e+03 µs

In [33]:
%%time
m = pd.merge(df1, df2, how='left', left_index=True, right_index=True)
m


Wall time: 45 ms

In [ ]: