In [ ]:
import glob
import os
import itertools
import time

import numpy as np
import pandas as pd
import scipy.stats as sps

import seaborn
import matplotlib.pyplot as plt
%matplotlib inline

In [ ]:
def IndivStatTest(original_data,test_data, model, sigma, filename_out):
# IN: 3D np array, list of strings with length=arr[X,:,:] (array axis 0), name of csv file
    test_data = test_data[np.nonzero(test_data)]

    test_ks = sps.ks_2samp(original_data, test_data)
    # outputs [ks-statistic, p-value]
    # If the K-S statistic is small or the p-value is high, then we cannot reject the hypothesis that the distributions of the two samples are the same.
    test_mwu = sps.mannwhitneyu(original_data[np.nonzero(original_data)], test_data)
    # returns [mwu-statistic, p-value];  One-sided p-value assuming a asymptotic normal distribution.
    # Use only when the number of observation in each sample is > 20 and you have 2 independent samples of ranks. Mann-Whitney U is significant if the u-obtained is LESS THAN or equal to the critical value of U.
    # This test corrects for ties and by default uses a continuity correction. The reported p-value is for a one-sided hypothesis, to get the two-sided p-value multiply the returned p-value by 2.

    with open(filename_out, 'a') as f:
        csv.writer(f).writerows([[model+'_'+sigma,  test_ks[0], test_ks[1], TestPasses(test_ks[1], 0.05), test_mwu[0], test_mwu[1], TestPasses(test_mwu[1], 0.05)]])
                      

def TestPasses(pval, cutoff):
    if pval < cutoff: 
        return 'different'
    elif pval >= cutoff: 
        return 'same'

In [ ]:
os.path.join(os.getcwd(),'ValTest.txt')

In [ ]:
#  Import 

starttime = time.clock()   # returns time in seconds

# # for cmd line call
# script, input_file, compare_to_sim, val_alpha, val_sigma = argv

# invivo_data = pd.read_csv(input_file)
# simulation_data = pd.read_csv(compare_to_sim)


# for jupyter notebook
simnumber_to_test = '1'

input_file = glob.glob('*iteration' + simnumber_to_test + '_random_distances*.csv')[0]

simulation_data = pd.read_csv(input_file, index_col=(0,1,2))

val_alpha = 'twothirdsD'
val_sigma = 'six'

In [ ]:
# get genotype from filename
genotype = input_file.split('_')[0]  # split the string at the underscore, returns a list of the parts (no underscores), take the first/before part

In [ ]:
simuldata = simulation_data.T
simuldata = simuldata[val_alpha][val_sigma].values.flatten()
simuldata = simuldata[np.nonzero(simuldata)]

In [ ]:


In [ ]:
# Constants and basic indicies

alphas = (5/10, 6/10, 2/3, 7/10, 3/4, 8/10, 9/10, 1)
idx_alphas = ['halfD','sixtenthsD','twothirdsD','sevententhsD','threequartersD','eighttenthsD','ninetenthsD','oneD']
basicinfo = ['dvlength','dentincell']
idx_positions = basicinfo + idx_alphas

numberofstdevs = np.array([[5], [6], [7], [8], [9]])
idx_sigmas = ['five','six','seven','eight','nine']

In [ ]:
# calculate statistics of datasets

# establish basics again
# TODO this can be merged with the initial things above 'MAIN'
# idx_alphas =  ['halfD','sixtenthsD','twothirdsD','sevententhsD','threequartersD','eighttenthsD','ninetenthsD','oneD']
# idx_sigmas = ['five','six','seven','eight','nine']


# 
# example filename: yw_iteration9940_random_distances_20160127_192532_iterationtime=24.116169999993872.csv
names = []
for fname in glob.glob('*iteration*distances*.csv')[:20]:
    names.append(fname)
    
now = time.strftime("%Y%m%d_%H%M%S")


for name in names:
    frame = pd.read_csv(name, index_col=(0,1,2))
    # read in csv file, with columns indices specificed
    frame = frame.T 
    frame.columns.names = [None, None, None]
    # flip, because it's easier to work with columns than with rows indexed... and clear out names

    for elem in itertools.product(idx_alphas,idx_sigmas): 
        model,sigma = elem
        subframe = frame[model][sigma].values.flatten()
        csvname = 'Validation_against_'+ val_alpha +'_'+ val_sigma +'_' + genotype +'_statistics_' + sigma +'_'+ now +'.csv'
        IndivStatTest(simuldata, subframe, model, sigma, csvname)

In [ ]:


In [ ]:
# summarize statistics data 
names = []
for fname in glob.glob('*_statistics_*.csv'):
    names.append(fname)

idx_stats = ['iter','model','ks_stat','ks_pvalue','ks_sameness','mwu_stat','mwu_pvalue','mwu_sameness']
terms = {'same':1,'different':0}
splitter = 'ks'

for name in names: 
#     genotype = name.split('_')[0]
    frame = pd.read_csv(name,header=None, names=idx_stats[1:])
    frame = frame.sort_values(by='model')

    frame.index = list(range(0,int(len(frame)/len(idx_alphas))))*len(idx_alphas)
    
    stdev = name.split('_')[6]

    fr2 = pd.DataFrame()
    fr3 = pd.DataFrame()
    fr4 = pd.DataFrame()
    
#     for modelname in (y+z for y,z in itertools.product((x+'_' for x in idx_alphas),stdev)):
    for modelname in (model + '_' + stdev for model in idx_alphas):
        temp = frame[frame['model'] == modelname]
        
        fr2[modelname.split('2')[0].split('_')[0]] = frame[frame['model'] == modelname][splitter + '_pvalue']
        fr3[modelname.split('2')[0].split('_')[0]] = frame[frame['model'] == modelname][splitter + '_sameness']
        fr3[modelname.split('2')[0].split('_')[0]] = fr3[modelname.split('2')[0].split('_')[0]].map(terms)
    
    # save
    fr2.to_csv(genotype + '_' + stdev + '_' + splitter + '_pvalue' + '.csv')
    fr3.to_csv(genotype + '_' + stdev + '_' + splitter + '_sameness' + '.csv')

    # calculate fractions that pass/fail, summarize, and make plots
    fr4['p<0.05'] = (1-(fr3.sum()/len(fr3)))*100
    fr4['p>0.05'] = (fr3.sum()/len(fr3))*100

    myplot = fr4.plot(kind='bar',stacked=True,color=['b','y'])
    myplot.legend(bbox_to_anchor=(.55,1.1),loc=2,ncol=2,borderaxespad=0.5)
    plt.title('s = ' + stdev, loc='left')
    fig = myplot.get_figure()
    fig.savefig('Validation_against_'+ val_alpha +'_'+ val_sigma +'_' + genotype+'_'+stdev+'.png',bbox_inches='tight')
#     plt.close(fig)

    fr4.to_csv('Validation_against_'+ val_alpha +'_'+ val_sigma +'_' + genotype + '_' + stdev + '_' + splitter + '_fractions' + '.csv')

In [ ]:
list(model + '_' + stdev for model in idx_alphas)

In [ ]:
modelname

In [ ]:


In [ ]: