In [5]:
import os
import math
import pandas as pd
import numpy as np
from scipy import stats
# print stats.hmean([ -50.2 , 100.5 ])

In [6]:
path = '/home/zongyi/bimbo_data/submission/'
os.listdir(path)


Out[6]:
['submission3_04481.csv',
 'w9_n1000_45133.csv',
 'w9_n500_45423.csv',
 'w8_f14_n1000_c0.7_45736.csv',
 'ensemble_fulll.csv',
 'ensemble_w8_2.csv',
 'ensemble-kele-0442.csv',
 'w9_full1200_47233.csv',
 'w8_f15_n1000_c0.7_mean_46121.csv',
 'w9_f14_n1200_c0.7_44987.csv',
 'w9_n500_45282.csv',
 'w9_f25_n1500_c0.5_46191.csv',
 'w8_n500_45918.csv',
 'w8_f14_n1200_c0.7.csv',
 'ensemble_full_04387.csv',
 'submission_448.csv',
 'ensemble_w9_3_44613.csv',
 'w9_full_47401.csv',
 'ensemble_448_449.csv',
 'w8_n500_45986.csv',
 'ensemble_448_446.csv',
 'ensemble_w8_1.csv',
 'ensemble-452-474-45034.csv',
 'w8_f26_n1000_c0.5_46928.csv']

In [15]:
'''111111111111111111111111'''
def corr(a,b):
    corr_df = pd.DataFrame(columns=('corr', 'sub'))
    
    df2 = pd.read_csv(path + b)
    df2.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd2'}, inplace=True)    
    i=0    
    for f in a:

        df1 = pd.read_csv(path + f)
        df1.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd1'}, inplace=True)        
        df = pd.merge(df1, df2, how='left', on=['id'])
        corr_df.loc[i] = [stats.pearsonr(df['d1'], df['d2'])[0], f]
        i=i+1
#         print stats.pearsonr(df['d1'], df['d2'])[0], f

    corr_df.sort_values(by=['corr'], ascending=True, inplace=True)

    return corr_df
    
# corr(['ensemble-452-474-45034.csv'],'submission3_04481.csv')
corr(os.listdir(path),'ensemble_w8_3.csv')


Out[15]:
corr sub
18 0.867686 submission_448.csv
0 0.868805 submission3_04481.csv
2 0.884868 w9_n1000_45133.csv
20 0.895226 w9_full_47401.csv
13 0.898021 w9_n500_45282.csv
9 0.902306 w9_full1200_47233.csv
4 0.904556 w9_n500_45423.csv
25 0.909982 ensemble-452-474-45034.csv
17 0.915628 ensemble_full_04387.csv
14 0.919632 w9_f25_n1500_c0.5_46191.csv
21 0.922719 ensemble_448_449.csv
8 0.928814 ensemble-kele-0442.csv
23 0.933655 ensemble_448_446.csv
12 0.938900 w9_f14_n1200_c0.7_44987.csv
19 0.943600 ensemble_w9_3_44613.csv
15 0.943662 w8_n500_45918.csv
10 0.955491 w8_f15_n1000_c0.7_mean_46121.csv
11 0.956544 full_ensemble1.csv
1 0.959973 ensemble_w89_hmean.csv
6 0.960842 ensemble_fulll.csv
5 0.961220 w8_f14_n1000_c0.7_45736.csv
26 0.961475 w8_f26_n1000_c0.5_46928.csv
22 0.969054 w8_n500_45986.csv
16 0.974775 w8_f14_n1200_c0.7.csv
7 0.990446 ensemble_w8_2.csv
24 0.991481 ensemble_w8_1.csv
3 1.000000 ensemble_w8_3.csv

In [9]:
def corr(a,b):
    corr_df = pd.DataFrame(columns=('corr', 'sub'))
    
    df2 = pd.read_csv(path + b)
    df2.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd2'}, inplace=True)    
    i=0    
    for f in a:

        df1 = pd.read_csv(path + f)
        df1.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd1'}, inplace=True)        
        df = pd.merge(df1, df2, how='left', on=['id'])
        corr_df.loc[i] = [stats.pearsonr(df['d1'], df['d2'])[0], f]
        i=i+1
#         print stats.pearsonr(df['d1'], df['d2'])[0], f

    corr_df.sort_values(by=['corr'], ascending=True, inplace=True)

    return corr_df
    
# corr(['ensemble-452-474-45034.csv'],'submission3_04481.csv')
corr(os.listdir(path),'ensemble_448_446.csv')


Out[9]:
corr sub
4 0.862485 w8_f14_n1000_c0.7_45736.csv
13 0.881927 w8_n500_45918.csv
11 0.919127 w9_n500_45282.csv
1 0.919986 w9_n1000_45133.csv
3 0.920603 w9_n500_45423.csv
20 0.923707 w8_n500_45986.csv
22 0.925075 ensemble_w8_1.csv
14 0.927284 w8_f14_n1200_c0.7.csv
9 0.928828 w8_f15_n1000_c0.7_mean_46121.csv
24 0.932939 w8_f26_n1000_c0.5_46928.csv
2 0.933655 ensemble_w8_3.csv
18 0.934701 w9_full_47401.csv
6 0.935353 ensemble_w8_2.csv
23 0.937098 ensemble-452-474-45034.csv
12 0.940893 w9_f25_n1500_c0.5_46191.csv
8 0.941164 w9_full1200_47233.csv
16 0.967772 submission_448.csv
0 0.968101 submission3_04481.csv
10 0.979604 w9_f14_n1200_c0.7_44987.csv
17 0.981210 ensemble_w9_3_44613.csv
7 0.988718 ensemble-kele-0442.csv
5 0.990798 ensemble_fulll.csv
15 0.993198 ensemble_full_04387.csv
19 0.996194 ensemble_448_449.csv
21 1.000000 ensemble_448_446.csv

In [17]:
def corr(a,b):
    corr_df = pd.DataFrame(columns=('corr', 'sub'))
    
    df2 = pd.read_csv(path + b)
    df2.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd2'}, inplace=True)    
    i=0    
    for f in a:

        df1 = pd.read_csv(path + f)
        df1.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd1'}, inplace=True)        
        df = pd.merge(df1, df2, how='left', on=['id'])
        corr_df.loc[i] = [stats.pearsonr(df['d1'], df['d2'])[0], f]
        i=i+1
#         print stats.pearsonr(df['d1'], df['d2'])[0], f

    corr_df.sort_values(by=['corr'], ascending=True, inplace=True)

    return corr_df
    
# corr(['ensemble-452-474-45034.csv'],'submission3_04481.csv')
corr(os.listdir(path),'w9_f14_n1200_c0.7_44987.csv')


Out[17]:
corr sub
5 0.872798 w8_f14_n1000_c0.7_45736.csv
15 0.886107 w8_n500_45918.csv
18 0.909359 submission_448.csv
2 0.909703 w9_n1000_45133.csv
0 0.910303 submission3_04481.csv
13 0.915589 w9_n500_45282.csv
4 0.916780 w9_n500_45423.csv
10 0.926178 w8_f15_n1000_c0.7_mean_46121.csv
16 0.929386 w8_f14_n1200_c0.7.csv
22 0.929458 w8_n500_45986.csv
24 0.929819 ensemble_w8_1.csv
20 0.930186 w9_full_47401.csv
25 0.933206 ensemble-452-474-45034.csv
26 0.933990 w8_f26_n1000_c0.5_46928.csv
9 0.936211 w9_full1200_47233.csv
14 0.938021 w9_f25_n1500_c0.5_46191.csv
7 0.938067 ensemble_w8_2.csv
3 0.938900 ensemble_w8_3.csv
17 0.953344 ensemble_full_04387.csv
8 0.960103 ensemble-kele-0442.csv
1 0.961204 ensemble_w89_hmean.csv
6 0.974454 ensemble_fulll.csv
21 0.974840 ensemble_448_449.csv
23 0.979604 ensemble_448_446.csv
11 0.981431 full_ensemble1.csv
19 0.990829 ensemble_w9_3_44613.csv
12 1.000000 w9_f14_n1200_c0.7_44987.csv

In [ ]:


In [ ]:


In [14]:
df1 = pd.read_csv(path+'ensemble_448_446.csv')
df2 = pd.read_csv(path+'ensemble_w8_2.csv')
df1.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd1'}, inplace=True)
df2.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd2'}, inplace=True)
df = pd.merge(df1, df2, how='left', on=['id'])
df['Demanda_uni_equil'] = 0.8*df['d1'] + 0.2*df['d2']
sub = df[['id','Demanda_uni_equil']]
sub.to_csv(path+'full_ensemble1.csv', index=False)

In [ ]:


In [29]:
df1 = pd.read_csv(path+'submission_448.csv')
df2 = pd.read_csv(path+'ensemble_w9_3_44613.csv')
df1.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd1'}, inplace=True)
df2.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd2'}, inplace=True)
df = pd.merge(df1, df2, how='left', on=['id'])
df['Demanda_uni_equil'] = 0.55*df['d1'] + 0.45*df['d2']
sub = df[['id','Demanda_uni_equil']]
sub.to_csv(path+'ensemble_448_446.csv', index=False)

In [ ]:


In [14]:
df1 = pd.read_csv(path+'w9_f14_n1200_c0.7_44987.csv')
df2 = pd.read_csv(path+'w9_n1000_45133.csv')
df3 = pd.read_csv(path+'w9_f25_n1500_c0.5_46191.csv')
df1.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd1'}, inplace=True)
df2.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd2'}, inplace=True)
df3.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd3'}, inplace=True)
df = pd.merge(df1, df2, how='left', on=['id'])
df = pd.merge(df, df3, how='left', on=['id'])
df['Demanda_uni_equil'] = 0.6*df['d1'] + 0.28*df['d2'] + 0.12*df['d3']
sub = df[['id','Demanda_uni_equil']]
sub.to_csv(path+'ensemble_w9_3.csv', index=False)

In [8]:
df1 = pd.read_csv(path+'w8_f14_n1200_c0.7.csv')
df2 = pd.read_csv(path+'w8_f14_n1000_c0.7_45736.csv')
df3 = pd.read_csv(path+'w8_f26_n1000_c0.5_46928.csv')
df1.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd1'}, inplace=True)
df2.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd2'}, inplace=True)
df3.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd3'}, inplace=True)
df = pd.merge(df1, df2, how='left', on=['id'])
df = pd.merge(df, df3, how='left', on=['id'])
df['Demanda_uni_equil'] = 0.5*df['d1'] + 0.35*df['d2'] + 0.15*df['d3']
sub = df[['id','Demanda_uni_equil']]
sub.to_csv(path+'ensemble_w8_3.csv', index=False)

In [ ]:
df1 = pd.read_csv(path+'w9_f14_n1200_c0.7_44987.csv')
df2 = pd.read_csv(path+'w9_n1000_45133.csv')
df3 = pd.read_csv(path+'w8_f14_n1000_c0.7_45736.csv')
df4 = pd.read_csv(path+'w9_f25_n1500_c0.5_46191.csv')
df1.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd1'}, inplace=True)
df2.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd2'}, inplace=True)
df3.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd3'}, inplace=True)
df4.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd4'}, inplace=True)
df = pd.merge(df1, df2, how='left', on=['id'])
df = pd.merge(df, df3, how='left', on=['id'])
df = pd.merge(df, df4, how='left', on=['id'])

df['Demanda_uni_equil'] = 0.56*df['d1'] + 0.23*df['d2'] + 0.12*df['d3'] + 0.09*df['d4']
sub = df[['id','Demanda_uni_equil']]
sub.to_csv(path+'ensemble_w89_4.csv', index=False)

In [12]:
df1 = pd.read_csv(path+'w9_f14_n1200_c0.7_44987.csv')
df2 = pd.read_csv(path+'w9_n1000_45133.csv')
df3 = pd.read_csv(path+'w9_f25_n1500_c0.5_46191.csv')
df4 = pd.read_csv(path+'w8_f14_n1200_c0.7.csv')
df5 = pd.read_csv(path+'w8_f14_n1000_c0.7_45736.csv')
df1.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd1'}, inplace=True)
df2.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd2'}, inplace=True)
df3.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd3'}, inplace=True)
df4.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd4'}, inplace=True)
df5.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd5'}, inplace=True)
df = pd.merge(df1, df2, how='left', on=['id'])
df = pd.merge(df, df3, how='left', on=['id'])
df = pd.merge(df, df4, how='left', on=['id'])
df = pd.merge(df, df5, how='left', on=['id'])
df['d1'] = df['d1'].apply(lambda x: max(x,0.000000001))    
df['d2'] = df['d2'].apply(lambda x: max(x,0.000000001))
df['d3'] = df['d3'].apply(lambda x: max(x,0.000000001))  
df['d4'] = df['d4'].apply(lambda x: max(x,0.000000001))  
df['d5'] = df['d5'].apply(lambda x: max(x,0.000000001)) 
def f(x):
    return stats.hmean(x)
df['Demanda_uni_equil'] = df[['d1', 'd2', 'd3','d4','d5']].apply(f, axis=1)
sub = df[['id','Demanda_uni_equil']]
sub.to_csv(path+'ensemble_w89_hmean.csv', index=False)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [70]:
df1.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd1'}, inplace=True)
df2.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd2'}, inplace=True)
df3.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd3'}, inplace=True)
df4.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd4'}, inplace=True)
df5.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd5'}, inplace=True)

In [71]:
df = pd.merge(df1, df2, how='left', on=['id'])
df = pd.merge(df, df3, how='left', on=['id'])
df = pd.merge(df, df4, how='left', on=['id'])
df = pd.merge(df, df5, how='left', on=['id'])

In [74]:
print stats.pearsonr(df['d1'], df['d2'])
print stats.pearsonr(df['d1'], df['d3'])
print stats.pearsonr(df['d1'], df['d4'])
print stats.pearsonr(df['d1'], df['d5'])
print stats.pearsonr(df['d2'], df['d3'])
print stats.pearsonr(df['d2'], df['d4'])
print stats.pearsonr(df['d2'], df['d5'])
print stats.pearsonr(df['d3'], df['d4'])
print stats.pearsonr(df['d3'], df['d5'])
print stats.pearsonr(df['d4'], df['d5'])


(0.90859839944163689, 0.0)
(0.92805488735048192, 0.0)
(0.88624683036048535, 0.0)
(0.87027126832752355, 0.0)
(0.92529220805822276, 0.0)
(0.92288602605536907, 0.0)
(0.90392357835496917, 0.0)
(0.91427252279179827, 0.0)
(0.89463221754723543, 0.0)
(0.93983579944164275, 0.0)

In [77]:
df[:3]


Out[77]:
d1 id d2 d3 d4 d5 Demanda_uni_equil
0 3.32189 4721633 3.43315 3.22524 3.24967 3.23493 3.315742
1 2.08302 6035419 2.29651 2.08906 2.01552 1.89152 2.093581
2 1.62305 3462602 1.70056 1.65728 1.52238 1.61837 1.627146

In [ ]:
def f(x):
    return 0.3 * (0.7 * x['d1'] + 0.3 * x['d2']) + 0.7 * x['d3']
df['Demanda_uni_equil'] = df[['d1', 'd2','d3','d4','d5']].apply(f, axis=1)

In [84]:
df['Demanda_uni_equil'] = 0.5*df['d1'] + 0.25*(0.55*df['d4']+0.45*df['d5']) + 0.25*(0.65*df['d2']+0.35*df['d3'])

In [ ]:


In [ ]:


In [78]:
path = '/home/zongyi/bimbo_data/submission/'
for f in os.listdir(path):
    print f


w9_f14_3xn500_c0.7_mean_45482.csv
w9_n1000_45133.csv
w9_n500_45423.csv
w8_f14_n1000_c0.7_45736.csv
ensemble-kele-0442.csv
w9_full1200_47233.csv
w8_f15_n1000_c0.7_mean_46121.csv
w8_f15_n1000_c0.7_48181.csv
w9_n500_45282.csv
w9_f25_n1500_c0.5_46191.csv
w9_full1.csv
w8_n500_45918.csv
w9_f14_2n1000_c0.7_mean_4539.csv
w9_f17_1500_49xx.csv
ensemble_full_04387.csv
w9_full_47401.csv
w8_f14_n1200_c0.7.3_47317.csv
w8_n500_45986.csv
w8_n1000_48381.csv
ensemble-452-474-45034.csv
w9_f17_n500_c0.6_48153.csv
w8_f26_n1000_c0.5_46928.csv

In [81]:
df6 = pd.read_csv(path+'w8_f14_n1000_c0.7_45736.csv')
df7 = pd.read_csv(path+'w8_f15_n1000_c0.7_mean_46121.csv')
df8 = pd.read_csv(path+'w8_f26_n1000_c0.5_46928.csv')

In [82]:
df6.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd6'}, inplace=True)
df7.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd7'}, inplace=True)
df8.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd8'}, inplace=True)
df = pd.merge(df, df6, how='left', on=['id'])
df = pd.merge(df, df7, how='left', on=['id'])
df = pd.merge(df, df8, how='left', on=['id'])

In [83]:
print stats.pearsonr(df['d6'], df['d7'])
print stats.pearsonr(df['d6'], df['d8'])
print stats.pearsonr(df['d7'], df['d8'])


(0.87384838745332227, 0.0)
(0.90843548653792394, 0.0)
(0.93853149179902695, 0.0)

In [85]:
df['Demanda_uni_equil1'] = 0.6*df['d6'] + 0.2*df['d7'] + 0.2*df['d8']

In [90]:
df[:3]


Out[90]:
d1 id d2 d3 d4 d5 Demanda_uni_equil d6 d7 d8 Demanda_uni_equil1 d10
0 3.32189 4721633 3.43315 3.22524 3.24967 3.23493 3.554216 3.75740 3.58134 3.59559 3.689826 3.615020
1 2.08302 6035419 2.29651 2.08906 2.01552 1.89152 2.055526 1.94513 1.81329 2.16537 1.962810 2.067495
2 1.62305 3462602 1.70056 1.65728 1.52238 1.61837 1.631257 1.71653 1.69398 1.75552 1.719818 1.610435

In [ ]:


In [87]:
df10 = pd.read_csv(path+'ensemble-kele-0442.csv')
df10.rename(columns={'id': 'id', 'Demanda_uni_equil': 'd10'}, inplace=True)
df = pd.merge(df, df10, how='left', on=['id'])

In [89]:
df['Demanda_uni_equil'] = 0.4*(0.6*df['Demanda_uni_equil'] + 0.4*df['Demanda_uni_equil1']) + 0.6*df['d10']

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [18]:
# result['d1'] = result['d1'].apply(lambda x: max(x,0.000000001))    
# result['d2'] = result['d2'].apply(lambda x: max(x,0.000000001))
# result['d3'] = result['d3'].apply(lambda x: max(x,0.000000001))

In [19]:
def f(x):
    return stats.hmean(x)
result['Demanda_uni_equil'] = result[['d1', 'd2', 'd3']].apply(f, axis=1)

In [11]:
def f(x):
    return max(math.expm1(0.25 * math.log1p(x['d1']) + 0.5 * math.log1p(x['d2']) + 0.25 * math.log1p(x['d3'])),0)
result['Demanda_uni_equil'] = result[['d1', 'd2', 'd3']].apply(f, axis=1)

In [9]:
def f(x):
    return 0.1 * x['d1'] + 0.6 * x['d2'] + 0.3 * x['d3']
result['Demanda_uni_equil'] = result[['d1', 'd2', 'd3']].apply(f, axis=1)

In [11]:
def f(x):
    return 0.6 * x['d2'] + 0.4 * x['d3']
result['Demanda_uni_equil'] = result[['d2', 'd3']].apply(f, axis=1)

先log1p 再 hmean 再 expm1????


In [91]:
sub = df[['id','Demanda_uni_equil']]
sub.to_csv(path+'ensemble_fulll.csv', index=False)