In [1]:
from __future__ import division
from sys import argv
import glob
import os
import itertools
import time
import string
import csv
import numpy as np
import pandas as pd
import scipy.stats as sps
import seaborn
import matplotlib.pyplot as plt
from num2words import num2words
In [3]:
os.getcwd()
os.chdir('MonteCarlo_uniformdist_Output_SummedDVlength_20160220_152731/')
In [6]:
terms = {'same':1,'different':0}
In [8]:
stats = pd.read_csv('MonteCarlo_absolute_distances_statistics.csv',header=None)
In [15]:
stats.columns = ['a','ksstat','pval','same','b','c','d']
In [ ]:
In [16]:
st = stats.same.map(terms)
In [18]:
st.sum()
Out[18]:
In [9]:
stats
Out[9]:
In [ ]:
In [ ]:
In [5]:
stats.ix[1:.sum()
Out[5]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# for jupyter notebook
input_file, cells_to_simulate = 'yw_CellbyCell.csv', 1
data = pd.read_csv(input_file)
# get genotype from filename
# split the string at the underscore, returns a list of the parts (no
# underscores), take the first/before part
genotype = input_file.split('_')[0]
# clear out missing data
data = data.replace(0, np.nan) # turn zeros into NaNs
# drop any column (axis=0) or row (axis=1) where ALL values are NaN
data = data.dropna(how='all')
# turn NaNs back into zeros, so arithmetic can be normal for 'true-zero' values
data = data.replace(np.nan, 0)
# data = data[data.dentincell != 1] # drop columns where the cell has
# only 1 denticle
# save the invivo denticle separation distances to a 1xN matrix
invivo_d = data.as_matrix(columns=data.columns[10:]).flatten()
invivo_d = invivo_d[np.nonzero(invivo_d)]
# using the 'absolute' DV length (dist between edge markers):
# dvlen_type = 'AbsoluteDVlength'
# invivo_ln = pd.DataFrame(data,columns=['dentincell','Dvlen'])
# invivo_ln.columns = ['dentincell','dvlength']
# using the summed DVlength (sum of dent-edge, dent-dent ... dent-dent,
# dent-edge); sum to get the additive, rather than absolute, DV length:
dvlen_type = 'SummedDVlength'
invivo_ln = pd.DataFrame(data, columns=['dentincell'])
invivo_ln['dvlength'] = data['dentEdgeL'] + data['dentEdgeR'] + (data[data.columns[10:]]).sum(axis=1, skipna=True, numeric_only=True)
invivo_ln = invivo_ln.T
In [369]:
def IndivStatTest(simdata, model, filename_out):
# IN: 3D np array, list of strings with length=arr[X,:,:] (array axis 0), name of csv file
simdata = simdata[np.nonzero(simdata)]
test_ks = sps.ks_2samp(invivo_d, simdata)
# outputs [ks-statistic, p-value]
# If the K-S statistic is small or the p-value is high, then we cannot reject the hypothesis that the distributions of the two samples are the same.
test_mwu = sps.mannwhitneyu(invivo_d[np.nonzero(invivo_d)], simdata)
# returns [mwu-statistic, p-value]; One-sided p-value assuming a asymptotic normal distribution.
# Use only when the number of observation in each sample is > 20 and you have 2 independent samples of ranks. Mann-Whitney U is significant if the u-obtained is LESS THAN or equal to the critical value of U.
# This test corrects for ties and by default uses a continuity correction. The reported p-value is for a one-sided hypothesis, to get the two-sided p-value multiply the returned p-value by 2.
with open(filename_out, 'a') as f:
csv.writer(f).writerows([[model, test_ks[0], test_ks[1], TestPasses(test_ks[1], 0.05), test_mwu[0], test_mwu[1], TestPasses(test_mwu[1], 0.05)]])
def TestPasses(pval, cutoff):
if pval < cutoff:
return 'different'
elif pval >= cutoff:
return 'same'
In [458]:
# maxDent = int(max(invivo_ln.ix['dentincell']))
maxDent = 6
cells_to_simulate = 4
iteration = 0
randompositions = np.zeros((len(invivo_ln.T), (cells_to_simulate), maxDent))
randomdistances_rel = np.zeros((len(invivo_ln.T), (cells_to_simulate), (maxDent-1)))
randomdistances_abs = np.zeros((len(invivo_ln.T), (cells_to_simulate), (maxDent-1)))
# randompositions[:,0:2,0] = invivo_ln.T
info = invivo_ln.T
# generate random positions
for iteration in range(0, int(cells_to_simulate)):
ittime = time.clock()
positions = np.zeros((len(invivo_ln.T),maxDent))
for dex, cellinfo in enumerate(invivo_ln.T.values):
denticlenumber, celllength = cellinfo
denticlenumber = int(denticlenumber)
positions[dex,:denticlenumber] = np.sort(np.random.rand(int(denticlenumber)),axis=0)
# sort smallest to largest
relativedistances = positions[:,1:] - positions[:, 0:-1]
relativedistances[relativedistances<0] = 0
# calculate the distance between the points
absolutedistances =(info.dvlength.values * relativedistances.T).T
IndivStatTest(absolutedistances.reshape(absolutedistances.size,1), 'AbsoluteDistances','MonteCarlo_absdist.csv')
np.savetxt(genotype +'_'+str(iteration) +'_relativepositions_' + dvlen_type +'_'+ time.strftime("%Y%m%d_%H%M%S") + '_iterationtime=' + str(elapsedtime) + '.csv',positions,delimiter=',')
np.savetxt(genotype +'_'+str(iteration) +'_'+'_relativedistances_' + dvlen_type +'_'+ time.strftime("%Y%m%d_%H%M%S") + '_iterationtime=' + str(elapsedtime) + '.csv',relativedistances,delimiter=',')
adframe = pd.DataFrame(np.concatenate([info.values, absolutedistances], axis=1),columns=[['dentincell','celllength']+list(string.ascii_lowercase[:maxDent-1])])
adframe['dentword'] = ad_frame.dentincell.map(lambda x: num2words.num2words(x))
adframe = adframe.drop('dentincell',axis=1)
adframe = adframe.replace(0,np.nan)
adframe = adframe.set_index(['dentword','celllength'])
adframe = adframe.sort_index(axis=0)
adframe = adframe.stack()
adframe.to_csv(genotype +'_'+ str(iteration) + '_MonteCarlo_absolute_distances_' + dvlen_type +'_'+ time.strftime("%Y%m%d_%H%M%S") + '_iterationtime=' + str(elapsedtime) + '.csv')
for number in (num2words.num2words(x) for x in range(1,maxDent)):
dentnumb = adframe[number]
dentnumb.to_csv(genotype+'_'+number+'_absolute_distances.csv')
In [465]:
for number in (num2words.num2words(x) for x in range(1,maxDent)):
dentnumb = adframe[number]
dentnumb.to_csv(genotype+'_'+number+'_absolute_distances.csv')
In [466]:
dentnumb
Out[466]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [463]:
a = list(num2words.num2words(x) for x in range(1,maxDent))
a
Out[463]:
In [373]:
absolutedistances
Out[373]:
In [386]:
ad_frame = pd.DataFrame(np.concatenate([info.values, absolutedistances], axis=1),columns=[['dentincell','celllength']+list(string.ascii_lowercase[:maxDent-1])])
In [385]:
['dentincell','celllength']+list(string.ascii_lowercase[:maxDent-1])
Out[385]:
In [ ]:
In [377]:
info.values.shape, absolutedistances.shape
Out[377]:
In [424]:
ad_frame['dentword'] = ad_frame.dentincell.map(lambda x: num2words.num2words(x))
ad_frame = ad_frame.replace(0,np.nan)
ad_frame2 = ad_frame
ad_frame.columns
ad_frame2 = ad_frame2.set_index(['dentword','celllength'])
ad_frame2 = ad_frame2.sort_index(axis=0)
ad_frame2.drop('dentincell',axis=1, inplace=True)
In [425]:
ad_frame2
Out[425]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
`
In [270]:
a = b.T
In [275]:
a['dvlength'] = info['dvlength']
In [284]:
a.columns
Out[284]:
In [ ]:
In [250]:
b.ix[:6].T
Out[250]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
info = invivo_ln.T
info.columns
In [ ]:
info['dicword'] = info.dentincell.map(lambda x: num2words.num2words(x))
In [ ]:
info.ix[:5]
In [ ]:
randomdistance_abs_frame['basicinfo','dentword'] = randomdistance_abs_frame.basicinfo.dentincell.map(lambda x: num2words.num2words(x))
In [ ]:
randomdistance_abs_frame.columns
In [ ]:
randomdistance_abs_frame.drop(['basicinfo','dentword'],level='B')
In [ ]:
a = randomdistance_rel_frame.ix[:5]
a
In [ ]:
a.columns
In [ ]:
In [ ]:
In [ ]:
In [ ]:
randomdistance_rel_frame.ix[:5].
In [ ]:
In [ ]:
a = pd.read_csv('yw_MonteCarlo_relative_distances_SummedDVlength20160219_180731_iterationtime=0.9710289999999979.csv')
In [ ]:
a
In [ ]:
rl = pd.read_csv('./yw_iteration0_random_distances_20160218_161159_iterationtime=23.862026999999998.csv')
In [ ]:
rl
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
maxDent = int(max(invivo_ln.ix['dentincell']))
for dentno in range(1,maxDent):
celldata = invivo_ln[invivo_ln['dentincell']==dentno].T
for dex, row in enumerate(celldata):
celllength = celldata[row]['dvlength']
positions = np.zeros(((cells * dentno), iterations))
fname = str(dent) + '_montecarlo_positions_replicates.csv'
for it in range(0, iterations):
this = np.reshape(np.random.rand(cells, dent), (1, -1))
positions[:, it] = this
np.savetxt(fname, positions, delimiter=',')
distances = positions[1:, :] - positions[0:-1,:]
np.savetxt(fname2, distances, delimiter=',')
In [ ]:
invivo_ln.columns
In [ ]:
a = invivo_ln[invivo_ln['dentincell']==1]['dvlength']
a.repeat(2)
In [ ]:
cellstosim = [(2,12)] #,(3,476),(4,130)] (1,1284),
iterations = 10
for elem in cellstosim:
dent, cells = elem
positions = np.zeros(((cells*dent),iterations))
fname = str(dent)+'_montecarlo_positions_replicates.csv'
for it in range(0,iterations):
this = np.reshape(np.random.rand(cells, dent),(1,-1))
positions[:,it] = this
In [ ]:
np.random.rand(5)
In [ ]:
enumerate
In [ ]:
invivo_ln
In [ ]:
invivo_ln.T
In [ ]:
for dex, cellinfo in enumerate(invivo_ln.values):
1
In [ ]:
a = np.zeros((5,6))
a
In [ ]:
a[0,:] = [1,1,1,1] + [0,0]
In [ ]:
def GenerateDist_Uniformrandom(celllength, denticlenumber):
positions = np.random.rand(denticlenumber)
positions = np.sort(positions,axis=0)
# sort smallest to largest
relativedistances = positions[1:] - positions[0:-1]
absolutedistances = relativedistances * celllength
# calculate the distance between the points
return positions, relativedistances, absolutedistances
In [ ]:
denticlenumber
In [ ]:
invivo_ln = invivo_ln.T
In [ ]:
invivo_ln.shape
In [ ]:
invivo_ln.T.values
In [ ]:
type(cellinfo)
In [ ]:
cellinfo.shape, positions.shape, np.zeros(maxDent - denticlenumber).shape
In [ ]:
randompositions.shape
In [ ]:
np.concatenate((cellinfo, positions, np.zeros(maxDent - denticlenumber)))
In [ ]:
randompositions[:,0:2,0] = invivo_ln.T
In [ ]:
randompositionpanel #.T.index
In [ ]:
# maxDent = int(max(invivo_ln.ix['dentincell']))
maxDent = 6
cells_to_simulate = 1
iteration = 0
randompositions = np.zeros((len(invivo_ln.T), (cells_to_simulate), maxDent))
randomdistances_rel = np.zeros((len(invivo_ln.T), (cells_to_simulate), (maxDent-1)))
randomdistances_abs = np.zeros((len(invivo_ln.T), (cells_to_simulate), (maxDent-1)))
# randompositions[:,0:2,0] = invivo_ln.T
# generate random positions
for iteration in range(0, int(cells_to_simulate)):
ittime = time.clock()
for dex, cellinfo in enumerate(invivo_ln.T.values):
denticlenumber, cellsize = cellinfo
denticlenumber = int(denticlenumber)
positions = np.sort(np.random.rand(int(denticlenumber)),axis=0)
# sort smallest to largest
relativedistances = positions[1:] - positions[0:-1]
absolutedistances = relativedistances * celllength
# calculate the distance between the points
randompositions[dex, iteration, :] = np.concatenate((positions, np.zeros(maxDent - denticlenumber)))
randomdistances_rel[dex, iteration, :] = np.concatenate((relativedistances, np.zeros(maxDent - denticlenumber)))
randomdistances_abs[dex, iteration, :] = np.concatenate((absolutedistances, np.zeros(maxDent - denticlenumber)))
randompositionpanel = pd.Panel(randompositions,
minor_axis=[list(string.ascii_lowercase[:maxDent])]).to_frame()
randomdist_relpanel = pd.Panel(randomdistances_rel,
minor_axis=[list(string.ascii_lowercase[:maxDent-1])]).to_frame()
randomdist_abspanel = pd.Panel(randomdistances_abs,
minor_axis=[list(string.ascii_lowercase[:maxDent-1])]).to_frame()
info = invivo_ln.T
info.columns = [['basicinfo','basicinfo'], ['dentincell', 'dvlength']]
info.index = randompositionpanel.columns
randompositionframe = pd.concat([info.T,randompositionpanel]).T
randomdistance_rel_frame = pd.concat([info.T,randomdist_relpanel]).T
randomdistance_abs_frame = pd.concat([info.T, randomdist_abspanel]).T
elapsedtime = time.clock() - ittime
randompositionframe.to_csv(genotype + '_' + str(iteration) + '_MonteCarlo_positions_' + dvlen_type + time.strftime("%Y%m%d_%H%M%S") + '_iterationtime=' + str(elapsedtime) + '.csv')
randomdistance_rel_frame.to_csv(genotype + '_' + str(iteration) + '_MonteCarlo_relative_distances_' + dvlen_type + time.strftime("%Y%m%d_%H%M%S") + '_iterationtime=' + str(elapsedtime) + '.csv')
randomdistance_abs_frame.to_csv(genotype + '_' + str(iteration) + '_MonteCarlo_absolute_distances_' + dvlen_type + time.strftime("%Y%m%d_%H%M%S") + '_iterationtime=' + str(elapsedtime) + '.csv')
In [ ]:
randomdistance_abs_frame
In [ ]:
In [ ]:
In [ ]:
a = randompositionpanel.to_frame()
In [ ]:
a = a.replace(0,np.nan)
b = a.T
b
# b['basicinfo']['dentnumber','celllength'] = invivo_ln.T
b.ix[:5]
In [ ]:
b.columns.names
In [ ]:
d = invivo_ln.T
In [ ]:
d.shape
d.ix[:5]
In [ ]:
d.columns
In [ ]:
d = invivo_ln.T
d.columns = [['basicinfo','basicinfo'], ['dentincell', 'dvlength']]
d.index = b.index
f = pd.concat([b.T,d.T])
f.T
In [ ]:
d.columns = [['basicinfo','basicinfo'], ['dentincell', 'dvlength']]
d[:5]
In [ ]:
d.columns.names = b.columns.names
In [ ]:
d.shape
In [ ]:
b.shape
In [ ]:
d.ix[:5]
In [ ]:
b.index
In [ ]:
d.index = b.index
In [ ]:
f = pd.concat([b.T,d.T])
f.T
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: