The final submission is the simple (unweighted) average of all fold predictions from the two input prediction files.


In [1]:
import zipfile

import matplotlib.pyplot as plt
import pandas as pd

In [3]:
# Read the two prediction files whose fold columns are averaged further down.
sub1, sub2 = (pd.read_csv(csv_name) for csv_name in ('final.csv', 'xgb_final.csv'))

In [7]:
# Show (rows, columns) for both prediction frames as a pair of tuples.
# NOTE(review): the execution counts suggest this cell ran after the test_id
# drop below, so the recorded shapes appear to exclude test_id -- confirm.
tuple(frame.shape for frame in (sub1, sub2))


Out[7]:
((2345796, 7), (2345796, 6))

In [6]:
# Remove the identifier column so that only prediction columns remain.
# errors='ignore' keeps this cell idempotent: re-running it after test_id has
# already been dropped is a no-op instead of raising KeyError.
# NOTE(review): rows of sub1 and sub2 are presumably in the same test_id order;
# nothing here verifies that before the ids are discarded -- confirm.
sub1 = sub1.drop('test_id', axis=1, errors='ignore')
sub2 = sub2.drop('test_id', axis=1, errors='ignore')

In [18]:
# Fold prediction columns to blend: fold1..fold7 from sub1 and fold0..fold5
# from sub2.
fold_columns = (
    [sub1['fold{}'.format(i)] for i in range(1, 8)]
    + [sub2['fold{}'.format(i)] for i in range(0, 6)]
)
# Unweighted mean over all listed columns. The divisor is derived from the list
# so it cannot drift from the number of columns actually summed (it used to be
# a hard-coded 13). Columns are added left to right in the same order as
# before, so the numeric result is unchanged.
dup = sum(fold_columns[1:], fold_columns[0]) / len(fold_columns)

In [19]:
# Sanity check: look at the distribution of the averaged predictions.
# matplotlib.pyplot is imported as `plt` in the notebook's top import cell.
fig, ax = plt.subplots()
ax.hist(dup, bins=50)
ax.set(
    title='Distribution of averaged fold predictions',
    xlabel='Averaged prediction (dup)',
    ylabel='Number of rows',
)
plt.show()



In [3]:
# Load an existing submission file to use as the output frame; its
# is_duplicate column is overwritten with the averaged predictions further down.
sub = pd.read_csv(filepath_or_buffer='sub_av.csv')

In [23]:
# Assigning a Series aligns on index, so a length mismatch would silently
# leave NaNs (or drop values) instead of raising; fail loudly instead.
assert len(dup) == len(sub), "averaged predictions and submission frame differ in length"
# Bracket assignment always writes the column. Attribute-style assignment
# (sub.is_duplicate = ...) would only set a Python attribute, not a column,
# if 'is_duplicate' were ever missing from the frame.
sub['is_duplicate'] = dup

In [24]:
# Preview the first five rows of the submission frame after the overwrite.
sub.head(n=5)


Out[24]:
is_duplicate test_id
0 0.000206 0
1 0.130279 1
2 0.188597 2
3 0.000090 3
4 0.075666 4

In [25]:
# Write the final submission without the DataFrame index column.
sub.to_csv('sub_final.csv', index=False)
# Package the CSV into test.zip. Mode 'w' replaces any existing archive, so no
# separate delete step is needed, and the stdlib zipfile module avoids relying
# on external `rm` / `zip` binaries being installed.
with zipfile.ZipFile('test.zip', 'w', zipfile.ZIP_DEFLATED) as archive:
    archive.write('sub_final.csv')


  adding: sub_final.csv (deflated 59%)