In [1]:
#import library
import numpy as np
import pylab as pl
%matplotlib inline
import scipy as sp
import xlrd as xl
import scipy.stats as sps

In [2]:
#Load book, define sheet as X1, print the number of rows, set n to number of rows
#of sheet 1 column 1, numbers is column,row format.  List is X1a, creates array z.
X = xl.open_workbook('avpfloat.xlsx')
X1 = X.sheet_by_name('NoFormulas')

In [3]:
#id test, should be 34730 x 36
m =X1.ncols
n =X1.nrows
print 'cols',m,'rows',n


cols 36 rows 34730

In [4]:
#test and append if AD shows significant regulation. Append ratios of AD (WT), BD(CTD), CD(core)
#pad-> pvalue AD
#rad-> ratio AD
#rbd-> ratio BD
#rcd-> ratio CD
#a-> RPKM b_mean
#d-> RPKM D_mean
#AD-> list of AD induction values when logic test is true
#BD-> list of BD induction values when logic test is true
#CD-> list of CD induction values when logic test is true
#MD-> list of multiplied doain induction values when logic test is true
#GD-> difference between the combined domains and WT if WT effect is positive
#LD-> difference between the combined domains and WT if WT effect is negative

AD = []
BD = []
CD = []
MD = []
DD = []
GD = []
LD = []
for i in range (1,n):
    pad= X1.cell_value(rowx=i, colx=5)
    rad= X1.cell_value(rowx=i, colx=4)
    rbd= X1.cell_value(rowx=i, colx=8)
    rcd= X1.cell_value(rowx=i, colx=12)
    a= X1.cell_value(rowx=i, colx=3)
    d= X1.cell_value(rowx=i, colx=6)
    if (pad<=0.05 and (rad>=2 or rad<=0.5) and (a>=3 or d>=3) and rad!=0):
        AD.append(rad)
        BD.append(rbd)
        CD.append(rcd)
        MD.append(rcd*rbd)
        if rad>=2:
            GD.append((rcd*rbd)/rad)
        if rad<=0.5:
            LD.append((rcd*rbd)/rad)

In [5]:
import matplotlib.pyplot as plt
#Scatter plot generation
wild_type = plt.scatter(AD, AD, s=1, alpha=1, color='k', linewidth=None)
ctd = plt.scatter(AD, BD, s=1, alpha=1, color='b', linewidth=None)
core = plt.scatter(AD, CD, s=1, alpha=1, color='y', linewidth=None)

#Logarithmic scales with grid
plt.xscale('log', basex=2)
plt.yscale('log', basey=2)
plt.grid(True)

#Font size
font = {'family' : 'Arial','weight' : 'normal','size'   :  14}

plt.rc('font', **font)

axis_font = {'fontname':'Serif', 'size':'14', 'weight':'bold'}
title_font = {'fontname': 'Serif', 'size':'16', 'weight':'bold'}

#set x-axis ticks
xtck = [0.00390625, 0.0078125, 0.015625, 0.03125, .0625, .125, .25, .5, 1, 2, 4, 8, 16, 32]
xlbl = ['1/256', '1/128', '1/64', '1/32', '1/16', '1/8', '1/4', '1/2', '1', '2', '4', '8', '16', '32']
plt.xticks(xtck,xlbl,y=-0.02, rotation=270)

#set y-axis ticks
ytck= [0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, .0625, .125, .25, .5, 1, 2, 4, 8, 16, 32, 64, 128]
ylbl= ['1/512', '1/256', '1/128', '1/64', '1/32', '1/16', '1/8', '1/4', '1/2', '1', '2', '4', '8', '16', '32', '64', '128']
plt.yticks(ytck,ylbl, x=-0.02)

#Axis labels

plt.ylabel('Sample RPKM / CD$^{-}$/CTD$^{-}$ RPKM', **axis_font)
plt.xlabel('WT AvrPto RPKM / CD$^{-}$/CTD$^{-}$ RPKM', **axis_font)

#Reference lines

plt.axhline(y=1, color='k')
plt.axvline(x=1, color='k')

#Legend
plt.legend((wild_type, core, ctd),('WT AvrPto', 'CD', 'CTD'),scatterpoints=1,markerscale=4,loc='upper left',ncol=1,fontsize=14)

#Plot size and settings
fig = plt.gcf()
fig.set_size_inches(8,8)
fig.set_dpi(300)
plt.tight_layout()
fig.savefig('WT vs core and CTD.png',bbox_inches='tight',dpi=300)
plt.show()



In [6]:
import matplotlib.pyplot as plt
#Scatter plot generation
wild_type = plt.scatter(AD, AD, s=1, alpha=1, color='k', linewidth=None)
mult = plt.scatter(AD, MD, s=1, alpha=1, color='r', linewidth=None)

#Logarithmic scales with grid
plt.xscale('log', basex=2)
plt.yscale('log', basey=2)
plt.grid(True)

#Font size
font = {'family' : 'Arial',
        'weight' : 'normal',
        'size'   :  14}

plt.rc('font', **font)

axis_font = {'fontname':'Serif', 'size':'14', 'weight':'bold'}
title_font = {'fontname': 'Serif', 'size':'16', 'weight':'bold'}

#set x-axis ticks
xtck = [0.00390625, 0.0078125, 0.015625, 0.03125, .0625, .125, .25, .5, 1, 2, 4, 8, 16, 32]
xlbl = ['1/256', '1/128', '1/64', '1/32', '1/16', '1/8', '1/4', '1/2', '1', '2', '4', '8', '16', '32']
plt.xticks(xtck,xlbl,y=-0.02, rotation=270)

#set y-axis ticks
ytck= [0.001953125, 0.00390625, 0.0078125, 0.015625, 0.03125, .0625, .125, .25, .5, 1, 2, 4, 8, 16, 32, 64, 128]
ylbl= ['1/512', '1/256', '1/128', '1/64', '1/32', '1/16', '1/8', '1/4', '1/2', '1', '2', '4', '8', '16', '32', '64', '128']
plt.yticks(ytck,ylbl, x=-0.02)

#Axis labels

plt.ylabel('Sample RPKM / CD$^{-}$/CTD$^{-}$ RPKM', **axis_font)
plt.xlabel('WT AvrPto RPKM / CD$^{-}$/CTD$^{-}$ RPKM', **axis_font)

#Reference lines

plt.axhline(y=1, color='k')
plt.axvline(x=1, color='k')

#Legend
plt.legend((wild_type, mult),('WT AvrPto', 'CD x CTD'),scatterpoints=1,markerscale=4,loc='upper left',ncol=1,fontsize=14)

#Plot size and settings
fig = plt.gcf()
fig.set_size_inches(8,8)
fig.set_dpi(300)
plt.tight_layout()
fig.savefig('WT vs mult.png',bbox_inches='tight',dpi=300)
plt.show()



In [7]:
#Median and standard deviation values
from scipy import stats
print len(LD)
print len(GD)
logLD = np.log(LD)
logGD = np.log(GD)
MLD = np.median(logLD)
MGD = np.median(logGD)
SLD = sp.stats.tstd(logLD, limits = ((MLD-np.log(2)), (MLD+np.log(2))))
SGD = sp.stats.tstd(logGD, limits = ((MGD-np.log(2)), (MGD+np.log(2))))
print MLD
print SLD
print MGD
print SGD
print np.exp(MLD)
print np.exp(MGD)
print np.exp(SLD)
print np.exp(SGD)


1281
1472
0.0330898528282
0.263981770508
0.290877884586
0.206539798099
1.03364341085
1.33760123225
1.30210445938
1.22941666169

In [44]:
#Histogram of the domain residuals, ND is a normal distribution. For down-regulated genes.
from scipy.stats import norm
ND = np.random.lognormal(mean=MLD, sigma=SLD, size=len(LD))
import matplotlib.pyplot as plt
#set bins to 2^n's
newbins=[]
for n in np.arange(-2,2,0.0625):
    newbins.append(2**n)

#Font size
font = {'family' : 'Arial',
        'weight' : 'normal',
        'size'   :  18}

plt.rc('font', **font)

#Weight so that the sum of all bars will be 1 (unity)
#Creates an array of ones, then divides by the total number of values
#weight is for the dataset, weightND is for the normal distribution
weight = np.ones_like(LD)/len(LD)
weightND = np.ones_like(ND)/len(ND)



axis_font = {'fontname':'Serif', 'size':'18', 'weight':'bold'}
title_font = {'fontname': 'Serif', 'size':'12', 'weight':'bold'}
plt.xscale('log')   
n, bins, boxes = plt.hist(LD, newbins, weights = weight, normed=False, histtype='stepfilled', color='gray', linewidth=2, edgecolor='k', label = 'Observed distribution')
nND, binsND, boxesND = sim = plt.hist(ND, newbins, weights = weightND, normed=False, histtype='step', color = 'r', linewidth=2, label = 'Simulated lognormal distribution')

plt.xlabel('difference in fold induction', **axis_font)
plt.ylabel('probability', **axis_font)
plt.ylim(0,0.11)
plt.xlim(0.25,4)
plt.axvline(x=1, color='k', linewidth=2)

#Legend
plt.legend(loc = 'upper left', fontsize = 16)

#set x-axis ticks
xtck = [.25, .5, 1, 2, 4]
xlbl = ['1/4', '1/2', '1', '2', '4']
plt.xticks(xtck,xlbl, rotation=270)

plt.tight_layout()
fig = plt.gcf()
fig.set_size_inches(12, 6)
fig.savefig('HistDown.png',bbox_inches='tight',dpi=300)

#Checksum - sum of all probabilities should be 1 or nearly 1. Some values are cut off of the plot,
#and I think this is why the data returns slightly less than 1, but it is extremely close.
print(sum(n))
print(sum(nND))


0.9968774395
1.0

In [47]:
#Histogram of the domain residuals, ND is a normal distribution. For up-regulated genes.
from scipy.stats import norm
ND = np.random.lognormal(mean=MGD, sigma=SGD, size=len(GD))
import matplotlib.pyplot as plt
#set bins to 2^n's
newbins=[]
for n in np.arange(-2,2,0.0625):
    newbins.append(2**n)
    
#Font size
font = {'family' : 'Arial',
        'weight' : 'normal',
        'size'   :  18}

plt.rc('font', **font)

#Weight so that the sum of all bars will be 1 (unity)
#Creates an array of ones, then divides by the total number of values
#weight is for the dataset, weightND is for the normal distribution
weight = np.ones_like(GD)/len(GD)
weightND = np.ones_like(ND)/len(ND)


axis_font = {'fontname':'Serif', 'size':'18', 'weight':'bold'}
title_font = {'fontname': 'Serif', 'size':'12', 'weight':'bold'}
plt.xscale('log')    
n, bins, boxes = plt.hist(GD, newbins, weights=weight, normed=False, histtype='stepfilled', color='gray', linewidth=2, edgecolor='k', label = 'Observed distribution')
n, binsND, boxesND = plt.hist(ND, newbins, weights=weightND, normed=False, histtype='step', color = 'r', linewidth=2, label = 'Simulated lognormal distribution')

plt.xlabel('difference in fold induction', **axis_font)
plt.ylabel('probability', **axis_font)
plt.ylim(0,0.11)
plt.xlim(0.25,4)
plt.axvline(x=1, color='k', linewidth=2)

#Legend
plt.legend(loc = 'upper left', fontsize = 16)

#set x-axis ticks
xtck = [.25, .5, 1, 2, 4]
xlbl = ['1/4', '1/2', '1', '2', '4']
plt.xticks(xtck,xlbl,y=-0.02, rotation=270)

plt.tight_layout()
fig = plt.gcf()
fig.set_size_inches(12,6)
fig.savefig('HistUp.png',bbox_inches='tight',dpi=300)

#Checksum - sum of all probabilities should be 1 or nearly 1. Some values are cut off of the plot,
#and I think this is why the data returns slightly less than 1, but it is extremely close.
print(sum(n))
print(sum(nND))


1.0
1.0

In [28]:
#probability plot for log-transformed down-regulated data set normality

import matplotlib.pyplot as plt
res = stats.probplot(np.log2(LD), plot=plt)
plt.tight_layout()
plt.title('Down-regulated WT vs. model probability plot')
plt.ylabel('Ordered log2 values')
fig = plt.gcf()
fig.set_size_inches(12,6)
fig.savefig('probplot_downreg.png',bbox_inches='tight',dpi=300)



In [29]:
#probability plot for log-transformed up-regulated data set normality

res2 = stats.probplot(np.log2(GD), plot=plt)
plt.tight_layout()
plt.ylabel('Ordered log2 values')
plt.title('Up-regulated WT vs. model probability plot')
fig = plt.gcf()
fig.set_size_inches(12,6)
fig.savefig('probplot_upreg.png',bbox_inches='tight',dpi=300)



In [30]:
#probability plot for up-regulated genes, not log-transformed, for comparison here only.

res3 = stats.probplot(GD, plot=plt)
plt.tight_layout()
plt.ylabel('Ordered values')
plt.title('Up-regulated WT vs. model probability plot, non logarithmic')
fig = plt.gcf()
fig.set_size_inches(12,6)
fig.savefig('nonlog_probplot_upreg.png',bbox_inches='tight',dpi=300)



In [31]:
#probability plot for down-regulated genes, not log-transformed, for comparison here only.

res3 = stats.probplot(LD, plot=plt)
plt.tight_layout()
plt.ylabel('Ordered values')
plt.title('Down-regulated WT vs. model probability plot, non logarithmic')
fig = plt.gcf()
fig.set_size_inches(12,6)
fig.savefig('nonlog_probplot_downreg.png',bbox_inches='tight',dpi=300)



In [ ]: