In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import os.path
from include.dataset_fnames import generate_station_data_fname, generate_data_fname
from include.feature_lists import numeric_features, numeric_missing_features_list, numeric_features_to_work_on, categoric_features
In [2]:
# Station under study, and the second station (prev_id) whose numeric features are joined onto it below.
station_id, prev_id = 'L0S01', 'L0S00'
In [3]:
# Resolve the CSV path of the numeric training data for `station_id`.
fname = generate_station_data_fname(
    station_id,
    sample_type='train',
    data_type='numeric',
    use_product=False,
    allow_nan_values=False,
)
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(fname)
# Read only the Id key, this station's numeric feature columns and the 'time' column;
# rows are indexed by Id.
station_features = numeric_features[station_id]
features = ['Id'] + station_features + ['time']
station_df = pd.read_csv(fname, usecols=features, index_col=['Id'])
In [4]:
# Resolve the CSV path of the numeric training data for `prev_id`.
fname2 = generate_station_data_fname(
    prev_id,
    sample_type='train',
    data_type='numeric',
    use_product=False,
    allow_nan_values=False,
)
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(fname2)
# NOTE: `station_features` and `features` are rebound here, so after this cell they
# describe `prev_id`, not `station_id`. No 'time' column is requested for this frame.
station_features = numeric_features[prev_id]
features = ['Id'] + station_features
station_df2 = pd.read_csv(fname2, usecols=features, index_col=['Id'])
In [ ]:
In [5]:
# Summary (columns, dtypes, non-null counts) of the frame loaded for `station_id`.
station_df.info()
In [6]:
# Summary (columns, dtypes, non-null counts) of the frame loaded for `prev_id`.
station_df2.info()
In [36]:
# Keep the Id index of the `station_id` frame; the merge cells below use it to
# select rows from the combined frame.
indices = station_df.index
indices
Out[36]:
In [40]:
%%time
# Approach 1: column-wise concat of both station frames (aligned on Id), then
# keep only the rows whose Id belongs to `station_df`.
station_merged = pd.concat([station_df, station_df2], axis=1).loc[indices]
station_merged.info()
In [41]:
%%time
# Approach 2: left join on the Id index, attaching `station_df2`'s columns to
# `station_df`'s rows.
station_merged = pd.merge(station_df, station_df2, how='left', left_index=True, right_index=True)
# Same row selection as the concat-based cell, so both approaches end with the
# same steps and their timings are comparable.
station_merged = station_merged.loc[indices]
station_merged.info()
In [9]:
# Text preview of the first 10 rows of each input frame and of the merged result,
# separated by rule lines. Single-argument parenthesised prints behave the same on
# Python 2 and Python 3.
print(station_df.head(10))
print("----------------------------------------------------------")
print(station_df2.head(10))
print("----------------------------------------------------------")
print(station_merged.head(10))
In [10]:
# Inspect the index of the merged frame.
station_merged.index
Out[10]:
In [11]:
# 5x5 integer grid holding 0..24 in row-major order; input for the toy frames below.
data = np.arange(25).reshape(5, 5)
data
Out[11]:
In [12]:
# Toy frame with columns A..E. Its index is built from column 'A' and labelled 'Id';
# 'A' itself stays in the frame as a regular column.
df1 = pd.DataFrame(data, columns=list('ABCDE'))
df1.index = pd.Index(df1['A'], name='Id')
df1
Out[12]:
In [13]:
# Second toy frame with columns F..J holding doubled values. Its index is built from
# column 'F' and labelled 'Id', so it only partly overlaps df1's index.
df2 = pd.DataFrame(2 * data, columns=list('FGHIJ'))
df2.index = pd.Index(df2['F'], name='Id')
df2
Out[13]:
In [14]:
# Column-wise concat aligns both frames on their index and keeps the union of labels
# (outer join, the default), so labels present in only one frame yield NaN cells.
merged = pd.concat([df1, df2], axis=1, join='outer')
merged
Out[14]:
In [32]:
%%time
# Keep only the rows labelled by df1's index, discarding labels that came from df2 alone.
# NOTE: this rebinds the notebook-level `indices` to the toy frame's index.
indices = df1.index
merged = merged.loc[indices, :]
merged
In [33]:
%%time
# The same alignment expressed as a left join on the index: df1's rows are kept and
# df2's columns are attached where the index labels match.
m = df1.merge(df2, how='left', left_index=True, right_index=True)
m
In [ ]: