In [ ]:
import pandas as pd
import numpy as np
import os
import scipy.io

In [ ]:
# Which kind of segment files to process. The value is inserted into the .mat
# file names and into the name of the variable read from each file.
data_type = 'test'

In [ ]:
# Subjects to process: five dogs followed by two patients. Each name is used
# both as a directory under seizure-data/ and as the prefix of the clip names.
_dog_names = ['Dog_%d' % number for number in range(1, 6)]
_patient_names = ['Patient_%d' % number for number in range(1, 3)]
targets = _dog_names + _patient_names

In [ ]:
def mydist(j1, j2):
    """Return a similarity-based distance between two 1-D vectors.

    The value is ``-2 * (j1 . j2) / (j1 . j1 + j2 . j2)``: it is -1 when the
    two vectors are identical, 0 when they are orthogonal and +1 when one is
    the exact negation of the other. Smaller therefore means "more alike".

    Parameters
    ----------
    j1, j2 : 1-D array-likes of equal length.

    Returns
    -------
    float
        The distance. When both vectors are all zeros the ratio is undefined
        (0/0); 0.0 is returned in that case instead of NaN so the result can
        still be sorted and compared against a threshold.
    """
    denominator = np.dot(j1, j1) + np.dot(j2, j2)
    if denominator == 0:
        # Both vectors are all zeros: report "no similarity" rather than NaN.
        return 0.0
    return -2. * np.dot(j1, j2) / denominator

In [ ]:
# Path of the submission CSV to read. It is loaded with its 'clip' column as
# the index.
FNAME_IN = 'submissions/SUBMISSON.csv'

In [ ]:
# Path the adjusted scores are written to as CSV.
FNAME_OUT = 'submissions/IMPROVED_SUBMISSON.csv'

In [ ]:
# Load the submission as a Series of scores indexed by clip name.
# The ``squeeze=True`` keyword of read_csv is rejected by current pandas, so
# the single data column is squeezed into a Series explicitly instead.
scores = pd.read_csv(FNAME_IN, index_col='clip').squeeze('columns')
# Adjusted scores are written into a copy so the loaded values stay available
# unchanged while they are being blended.
out_scores = scores.copy()

In [ ]:
# Tunable parameters of the sequence-based smoothing.
W = 0.25  # weight kept on a segment's own score; 1 - W goes to its sequence's pooled score
T = 0.1   # temperature of the softmax used to pool the scores of one sequence
D = -0.5  # link threshold: two segments are only chained while mydist(...) <= D
MAX_SEQUENCE_LEN = 6  # a chain is never allowed to grow beyond this many segments

for target in targets:
    # --- load the first and last sample column of every segment of THIS target ---
    # edges[s] = (first column, last column), where s + 1 is the segment number.
    edges = []
    segment = 1
    while True:
        fname = 'seizure-data/%s/%s_%s_segment_%04d.mat' % (target, target, data_type, segment)
        if not os.path.exists(fname):
            # Segments are numbered consecutively from 1; the first missing
            # file marks the end. Any other failure (corrupt file, unexpected
            # layout) is allowed to raise instead of silently truncating.
            break
        data = scipy.io.loadmat(fname)
        key = '%s_segment_%d' % (data_type, segment)
        samples = data[key]['data'][0, 0]
        edges.append((samples[:, :1].astype(float), samples[:, -1:].astype(float)))
        segment += 1

    N = len(edges)
    print('')
    print('%s %d' % (target, N))
    if N == 0:
        # No files for this target: nothing to link or adjust.
        continue

    # --- pairwise distances ---
    # dist_matrix[i, j] compares the END of segment i with the START of
    # segment j, i.e. it is small when j looks like the continuation of i.
    dist_matrix = np.zeros((N, N))
    for i in range(N):
        ei = edges[i]
        for j in range(i + 1, N):
            ej = edges[j]
            dist_matrix[i, j] = mydist(ei[1][:, -1], ej[0][:, 0])
            dist_matrix[j, i] = mydist(ej[1][:, -1], ei[0][:, 0])
    # A segment can never follow itself: give the diagonal the largest distance.
    np.fill_diagonal(dist_matrix, 1)

    # --- greedily link pairs of segments that are likely consecutive in time ---
    # Candidate (s1 -> s2) pairs are visited from the smallest distance up.
    order = np.unravel_index(dist_matrix.ravel().argsort(), dist_matrix.shape)
    next_segment = [-1] * N
    previous_segment = [-1] * N
    for s1, s2 in np.array(order).T:
        if dist_matrix[s1, s2] > D:
            # Sorted order: every remaining pair is even further apart.
            break
        if next_segment[s1] != -1:
            continue  # s1 already has a successor
        if previous_segment[s2] != -1:
            continue  # s2 already has a predecessor

        # Length the merged chain would have: the chain ending at s1 ...
        c = 1
        j = s1
        while previous_segment[j] != -1:
            j = previous_segment[j]
            c += 1
        if j == s2:
            # s2 is the head of the chain that ends at s1; linking them would
            # close a loop, which cannot be a sequence in time.
            continue
        # ... plus the chain starting at s2.
        j = s2
        c += 1
        while next_segment[j] != -1:
            j = next_segment[j]
            c += 1
        if c > MAX_SEQUENCE_LEN:
            continue
        next_segment[s1] = s2
        previous_segment[s2] = s1

    # Sanity check: the forward and backward links must agree.
    for i in range(N):
        if next_segment[i] != -1:
            assert previous_segment[next_segment[i]] == i

    # --- collect the chains, each starting at a segment without predecessor ---
    sequences = []
    for i in range(N):
        if previous_segment[i] == -1 and next_segment[i] != -1:
            j = i
            sequence = [j]
            while next_segment[j] != -1:
                j = next_segment[j]
                sequence.append(j)
            sequences.append(sequence)
    len_sequences = [len(sequence) for sequence in sequences]
    longest = max(len_sequences) if len_sequences else 0  # max() of an empty list raises
    summary = ('#sequences', len(sequences),
               '%segments that was sequenced', sum(len_sequences) / float(N),
               'longest sequence', longest)
    print(' '.join(str(item) for item in summary))
    print(sequences)

    # --- pool the scores of each sequence and blend them into its segments ---
    for sequence in sequences:
        clips = ['%s_%s_segment_%04d.mat' % (target, data_type, s + 1) for s in sequence]
        probs = np.array([scores[clip] for clip in clips])
        # Softmax-weighted mean of the sequence's scores. Subtracting the
        # maximum before exponentiating leaves the weights unchanged but
        # avoids overflow for large scores.
        wgts = np.exp((probs - probs.max()) / T)
        wgts /= np.sum(wgts)
        p = np.dot(wgts, probs)
        # Each segment keeps a fraction W of its own score and takes the
        # remaining 1 - W from the pooled score of its sequence.
        for clip, own_score in zip(clips, probs):
            out_scores[clip] = W * own_score + (1. - W) * p

In [ ]:
# Rescale so that the largest score becomes 1. The rescaling is skipped when
# the maximum is not positive: dividing by zero would turn every score into
# NaN/inf, and dividing by a negative maximum would reverse the ranking.
peak_score = out_scores.max()
if peak_score > 0:
    out_scores = out_scores / peak_score

In [ ]:
# Write the adjusted scores to FNAME_OUT as CSV, including a header row.
out_scores.to_csv(FNAME_OUT, header=True)