In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from include.feature_lists import L0_stations, L1_stations, L2_stations, L3_stations
from include.dataset_fnames import train_numeric_fname
%matplotlib inline
In [2]:
# Load item_station_date.csv with its 'Id' column as the row index.
df = pd.read_csv(
    "item_station_date.csv",
    index_col=['Id'],
)
In [145]:
# Print the frame's column names, dtypes, non-null counts and memory usage.
df.info()
In [62]:
# Read only the 'Id' and 'Response' columns from the file named by
# train_numeric_fname, parse them as int and index the result by 'Id'.
# The %time magic reports how long the load takes.
%time response_df = pd.read_csv(train_numeric_fname, usecols=['Id', 'Response'], index_col=['Id'], dtype=int)
# Preview the first rows of the loaded frame.
response_df.head()
Out[62]:
In [63]:
# Attach 'Response' to df by index. DataFrame.join defaults to a left join, so
# every row of df is kept and any Id missing from response_df gets NaN.
full_df = df.join(response_df)
In [70]:
# Keep only the rows that have a 'Response' value; rows whose Id was not found
# in response_df carry NaN there and are excluded.
train_df = full_df[full_df['Response'].notnull()]
In [71]:
# Preview the first rows of the rows that have a 'Response'.
train_df.head()
Out[71]:
In [157]:
# Take the first 1,000,000 rows of train_df as an explicit copy: head() returns
# a slice of train_df, and adding a column to that slice triggers pandas'
# SettingWithCopyWarning.
sample_df = train_df.head(1000000).copy()
# Positional row number, used as the x coordinate in the scatter plots below.
sample_df['idx'] = range(len(sample_df))
# p_df: rows with Response > 0; n_df: every remaining row of the sample.
p_df = sample_df[sample_df['Response'] > 0]
n_df = sample_df.drop(p_df.index)
In [158]:
# Total of 'Response' over the sample rows with Response > 0.
p_df['Response'].sum()
Out[158]:
In [ ]:
In [167]:
# Scatter every 'Lx_Minimum' column against the row position 'idx'.
# n_df rows get one colour per line; p_df rows (Response > 0) are red crosses.
line_colours = (
    ('L0_Minimum', 'black'),
    ('L1_Minimum', 'yellow'),
    ('L2_Minimum', 'green'),
    ('L3_Minimum', 'cyan'),
)

plt.figure(figsize=(20, 20))
# Draw all n_df series first so the red p_df markers end up on top of them.
for column, colour in line_colours:
    plt.scatter(n_df['idx'], n_df[column], c=colour, marker='.')
for column, _ in line_colours:
    plt.scatter(p_df['idx'], p_df[column], c='red', marker='x')
plt.xlabel('idx')
plt.ylabel('Lx_Minimum')
# plt.show() renders the figure; the previous bare plt.plot() drew nothing and
# only left an empty list as the cell output.
plt.show()
Out[167]:
In [160]:
# Scatter 'Total_Duration' against the row position 'idx': n_df rows as blue
# dots, p_df rows (Response > 0) as red crosses drawn on top.
plt.figure(figsize=(20, 20))
plt.scatter(n_df['idx'], n_df['Total_Duration'], c='blue', marker='.')
plt.scatter(p_df['idx'], p_df['Total_Duration'], c='red', marker='x')
plt.xlabel('idx')
plt.ylabel('Total_Duration')
# plt.show() renders the figure; the previous bare plt.plot() drew nothing and
# only left an empty list as the cell output.
plt.show()
Out[160]:
In [159]:
# Largest 'Total_Duration' among the sample rows with Response > 0.
p_df['Total_Duration'].max()
Out[159]:
In [166]:
# Histogram of 'Total_Duration' over the sample: 100 bins, log-scaled counts.
# Binding the return value to `a` keeps the (counts, edges, patches) tuple out
# of the cell output.
a = plt.hist(
    sample_df['Total_Duration'],
    bins=100,
    log=True,
)
In [138]:
Out[138]:
In [ ]:
In [33]:
# Make sure df carries no 'Next' column (the net effect of creating the column
# and deleting it straight away).
if 'Next' in df.columns:
    del df['Next']
In [35]:
# For each of the first 10 rows, store the 'Minimum' of the row that follows it
# (by position) in a 'Next' column. Rows beyond the first 10 are not written.
# The loop body is indented so the cell parses, the loop variable no longer
# shadows the builtin `id`, and only the 'Minimum' value is read instead of
# materialising the whole following row.
for pos, item_id in enumerate(df.index[:10]):
    df.loc[item_id, 'Next'] = df['Minimum'].iloc[pos + 1]
In [45]:
# Preview the first rows, including the 'Next' column if it has been filled.
df.head()
Out[45]:
In [46]:
# One frame per line: that line's station columns followed by its
# '<line>_Minimum', '<line>_Maximum' and '<line>_Duration' columns.
_SUMMARY_SUFFIXES = ('Minimum', 'Maximum', 'Duration')
L0_df, L1_df, L2_df, L3_df = (
    df[stations + [line + '_' + suffix for suffix in _SUMMARY_SUFFIXES]]
    for line, stations in (
        ('L0', L0_stations),
        ('L1', L1_stations),
        ('L2', L2_stations),
        ('L3', L3_stations),
    )
)
In [49]:
# Rows whose 'L0_Minimum' differs from 'Minimum'. NaN never compares equal, so
# rows where 'L0_Minimum' is missing are selected as well.
n = df[df['L0_Minimum'].ne(df['Minimum'])]
In [53]:
# Show the per-line minima next to 'Minimum' for the rows where
# 'L0_Minimum' and 'Minimum' disagree.
n[['L0_Minimum', 'L1_Minimum', 'L2_Minimum', 'L3_Minimum', 'Minimum']]
Out[53]:
In [54]:
# Inspect the single row whose index label ('Id') is 340159.
# NOTE(review): the reason this particular Id was picked is not recorded here.
df.loc[340159]
Out[54]:
In [ ]:
# Pairwise correlation between all L0_df columns (DataFrame.corr uses Pearson
# by default).
L0_df_corr = L0_df.corr()
In [ ]:
# Annotated heatmap of the L0 correlation matrix on a 20x20 inch figure.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(L0_df_corr, annot=True, ax=ax)
plt.show()
In [ ]:
# Pairwise correlation between all L1_df columns (DataFrame.corr uses Pearson
# by default).
L1_df_corr = L1_df.corr()
In [ ]:
# Annotated heatmap of the L1 correlation matrix on a 20x20 inch figure.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(L1_df_corr, annot=True, ax=ax)
plt.show()
In [ ]:
In [ ]:
# Pairwise correlation between all L2_df columns (DataFrame.corr uses Pearson
# by default).
L2_df_corr = L2_df.corr()
In [ ]:
# Annotated heatmap of the L2 correlation matrix on a 20x20 inch figure.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(L2_df_corr, annot=True, ax=ax)
plt.show()
In [ ]:
In [ ]:
# Pairwise correlation between all L3_df columns (DataFrame.corr uses Pearson
# by default).
L3_df_corr = L3_df.corr()
In [ ]:
# Annotated heatmap of the L3 correlation matrix on a 20x20 inch figure.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(L3_df_corr, annot=True, ax=ax)
plt.show()
In [ ]:
# Preview the first 10 rows of the L3 frame.
L3_df.head(10)
In [130]:
# Select the per-line Minimum/Maximum/Duration columns together with the
# overall timing columns.
time_df = df[[
    'L0_Minimum', 'L0_Maximum', 'L0_Duration',
    'L1_Minimum', 'L1_Maximum', 'L1_Duration',
    'L2_Minimum', 'L2_Maximum', 'L2_Duration',
    'L3_Minimum', 'L3_Maximum', 'L3_Duration',
    'Minimum', 'Maximum',
    'Total_Duration', 'Duration_Sum', 'Duration_Lag',
]]
In [107]:
# Select only the duration columns.
# NOTE(review): this rebinds time_df, replacing the wider selection made in the
# preceding cell; on a top-to-bottom run the cells below see only these six
# columns. Confirm which selection the correlation heatmap is meant to use.
time_df = df[['L0_Duration', 'L1_Duration', 'L2_Duration', 'L3_Duration', 'Duration_Sum', 'Total_Duration']]
In [ ]:
# Display the pairwise correlation matrix of the selected timing columns.
time_df.corr()
In [132]:
# Keep the correlation matrix of the timing columns for the heatmap below.
time_df_corr = time_df.corr()
In [133]:
# Annotated heatmap of the timing-column correlations on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(time_df_corr, annot=True, ax=ax)
plt.show()
In [131]:
# Preview the first 10 rows of the timing columns.
time_df.head(10)
Out[131]:
In [33]:
# Reorder the rows by every station column, taking the L0 stations as the
# leading sort keys, then L1, L2 and L3.
station_columns = L0_stations + L1_stations + L2_stations + L3_stations
df = df.sort_values(by=station_columns)
In [34]:
# Keep only the station columns of all four lines, in line order.
df_flow = df[
    L0_stations + L1_stations + L2_stations + L3_stations
]
In [35]:
# Row count divided by 4000, the stride used further down to thin the rows
# before plotting.
df_flow.shape[0] / 4000
Out[35]:
In [36]:
# Turn the station columns into a presence map: 1.0 where a value was recorded,
# 0.0 where it was missing. Building a new frame from the notnull() mask
# replaces two masked in-place writes on a frame that was sliced out of df,
# which is what triggers pandas' SettingWithCopyWarning.
df_flow = df_flow.notnull().astype(float)
In [14]:
# Keep the rows whose index value is a multiple of 4000. The previous list
# comprehension had a dangling `else` and did not parse; a boolean mask
# expresses the same filter and yields a DataFrame, which the imshow cell
# needs for `img.values`.
# NOTE(review): at this point the index still holds 'Id' labels, so this
# selects by Id value rather than by row position — confirm that is intended.
img = df_flow[df_flow.index % 4000 == 0]
In [37]:
# Replace the 'Id' index with a plain 0..n-1 positional index. drop=True
# discards the old index directly instead of materialising it as an 'Id'
# column and deleting it afterwards, and it also survives a re-run of this
# cell (the two-step form raises KeyError once 'Id' is gone).
df_flow = df_flow.reset_index(drop=True)
In [38]:
# Keep the rows whose index value is a multiple of 4000. With the positional
# index produced by the preceding reset_index cell this is every 4000th row.
img = df_flow[df_flow.index % 4000 == 0]
In [39]:
# Render the thinned presence map as an image: one pixel row per sampled item,
# one pixel column per station.
fig, ax = plt.subplots(figsize=(20, 20))
ax.imshow(img.values)
Out[39]:
In [ ]: