I need to install qiime on the cluster, then I go:
In [1]:
# Identify the cluster's Linux distribution before installing anything.
!cat /etc/*-release
To install QIIME, I am following instructions from here
In [5]:
# Load the cluster's Python environment module (R is loaded separately when needed).
!module load Python
#!module load R
In [7]:
# Install QIIME into user site-packages; -I ignores any already-installed copy.
# NOTE(review): version is unpinned — a re-run later may install a different QIIME.
!pip install --user qiime -I
In [1]:
# Smoke-test the QIIME installation (-t runs its built-in test suite).
!print_qiime_config.py -t
Goal: To match IDs to OTUs - this will enable me to select OTUs differentially enriched based on the LEfSe analysis,
and return transcripts, based on the reference correlation performed here: /Volumes/group_dv/personal/DValenzano/papers/uBiome0/data/regr.csv
First, I take the OTU genus reference table, normalise it and save it as a csv
In [2]:
#!export PATH=/beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages/qiime/:$PATH
#!export PATH=/export/apps/qiime/1.8.0/bin:$PATH
#!export PATH=$PATH:/software/R/3.2.4/bin/
#!export PATH=$PATH:/software/R/3.2.4/bin
#!echo $PATH
#ls ~/.bashrc
To install all the required R packages, follow instructions from http://qiime.org/install/install.html
In [1]:
#!module load R
#!which R
# CSS-normalise the genus-level (L6) OTU table with QIIME's normalize_table.py.
get_ipython().system(u'normalize_table.py -i /beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6.biom -a CSS -o /beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.biom')
This file I just normalized has the OTUs at the genus level for all the individuals for which we also have the rnaseq data
In [2]:
# Export the normalised biom table to TSV for downstream text processing.
get_ipython().system(u'biom convert -i /beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.biom -o /beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.txt --to-tsv')
In [ ]:
# NOTE(review): exact duplicate of the previous cell — harmless (idempotent), could be deleted.
get_ipython().system(u'biom convert -i /beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.biom -o /beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.txt --to-tsv')
In [45]:
# Also export the 16-week OTU table to TSV (not used further in this notebook's visible cells).
get_ipython().system(u'biom convert -i /beegfs/group_dv/home/DValenzano/uBiome/data/16wk_otu_table.biom -o /beegfs/group_dv/home/DValenzano/uBiome/data/16wk_otu_table.txt --to-tsv')
In [10]:
# Convert the normalised OTU table from TSV to CSV.
# Use context managers so both file handles are closed deterministically
# (the original `open(...).read()` left the read handle dangling).
with open('/beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.txt', 'rU') as src:
    t = src.read()
t = t.replace('\t', ',')  # `t` is reused by the next cell, so keep it bound here
with open('/beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.csv', 'w') as dst:
    dst.write(t)
In [62]:
#t = open('/beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.csv', 'rU').read()
In [63]:
# Harmonise OTU-table sample IDs with the RNA-seq table ('_' -> '.', group
# label 'YI' -> 'WT'), drop the first line (biom comment header), and remove
# OTUs that are zero in every sample; finally transpose to one row per sample.
t1 = t.replace('_','.').replace('YI', 'WT')
t2 = t1.split('\n')[1:]
t3 = [ i.split(',') for i in t2 ]
t4 = [t3[0]]+[i for i in t3[1:] if sum(map(float, (i[1:]))) != 0.0] #this removes OTUs that are zero across all samples
tt = zip(*t4)  # transpose (Python 2: zip returns a list)
tT = [ list(i) for i in tt ]
In [64]:
# Load the RNA-seq FPKM matrix and apply the same zero-row filter + transpose.
u = open('/beegfs/group_dv/home/DValenzano/uBiome/data/rnaseq_fpkm_genes.csv', 'rU').read()
u2 = u.split('\n')
u3 = [ i.split(',') for i in u2]
u4 = [u3[0]]+[i for i in u3[1:] if sum(map(float, (i[1:]))) != 0.0] #this removes transcripts that are zero across all samples
ut = zip(*u4)  # transpose: one row per sample
uT = [ list(i) for i in ut ]
In [65]:
from sets import Set  # Python 2 only module; the builtin `set` works on both 2 and 3
# Keep only samples present in BOTH the RNA-seq and OTU tables.
uh = [i[0] for i in uT[1:]]   # RNA-seq sample IDs (skip header row)
th = [i[0] for i in tT[1:]]   # OTU-table sample IDs (skip header row)
h = list(Set(uh) & Set(th))   # shared sample IDs
# FIX: iterate over the data rows only (tT[1:]/uT[1:]) instead of the whole
# table, so the header row can never be duplicated should its first field
# ever match an ID; matches how uh/th are built above.
tT1 = [tT[0]] + [i for i in tT[1:] if i[0] in h]
uT1 = [uT[0]] + [i for i in uT[1:] if i[0] in h]
In [69]:
# Sort the data rows of both matrices by sample ID (first field) so the two
# tables line up row-for-row, keeping each header row in front; then render
# each matrix back to CSV text (no trailing newline).
uTs = [uT1[0]] + sorted(uT1[1:], key=lambda row: row[0])
tTs = [tT1[0]] + sorted(tT1[1:], key=lambda row: row[0])
#[i[0] for i in uTs][1:] == [i[0] for i in tTs][1:] # sanity check: same sample order in both matrices
uj = '\n'.join(','.join(row) for row in uTs)
tj = '\n'.join(','.join(row) for row in tTs)
In [63]:
# Persist the matched, sample-sorted matrices.
zu = open('/beegfs/group_dv/home/DValenzano/uBiome/data/rnaseq_fpkm_genes_sorted.csv', 'w')
zu.write(uj)
zu.close()
zt = open('/beegfs/group_dv/home/DValenzano/uBiome/data/rnaseq_merged_otu_table_sorted.csv', 'w')
zt.write(tj)
zt.close()
# Sanity check (cell output): sample IDs appear in the same order in both files.
[i.split(',')[0] for i in tj.split('\n')[1:-1]] == [i.split(',')[0] for i in uj.split('\n')[1:-1]]
Out[67]:
There is a difference from the previous analysis (March 17-18 on README) since 3 samples were removed due to low depth
Good - Now continuing from here
In [1]:
import numpy as np
from scipy.stats.stats import pearsonr
from scipy import stats
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
#import seaborn as sns
In [16]:
# Work on the in-memory CSV strings produced above.
rna = uj
otu = tj
In [44]:
# Re-transpose to one row per transcript/OTU, drop features with 7+ zero
# samples, and build name -> values dicts for plotting.
rnal = [i.split(',') for i in rna.split('\n')] #that's a "l" as in "L"
otul = [i.split(',') for i in otu.split('\n')]
rnat = [ list(i) for i in zip(*rnal)]
otut = [list(i) for i in zip(*otul) ]
# NOTE: zeros are written as '0' in the fpkm table but as '0.0' in the OTU table.
rnat_z = [ map(float, i[1:]) for i in rnat[1:] if i.count('0') < 7]
otut_z = [ map(float, i[1:]) for i in otut[1:] if i.count('0.0') < 7]
# In[10]:
rna_key = [i[0] for i in rnat[1:] if i.count('0') < 7]
# FIX: was i.count('0') < 7, which did not match the otut_z filter above, so
# otu_key and otu_value had different lengths/order and dict(zip(...)) paired
# OTU names with the wrong value rows. (The Apr-26 rerun below already uses '0.0'.)
otu_key = [ i[0] for i in otut[1:] if i.count('0.0') < 7]
rna_value = rnat_z
otu_value = otut_z
rna_d = dict(zip(rna_key, rna_value))
otu_d = dict(zip(otu_key, otu_value))
In [50]:
def loop_one(m1,otu): # this returns all the correlation coefficients between m1 and a given otu (otu)
ls = []
for i in m1:
ls.append(stats.linregress(i, otu)) #this function returns slope, intercept, r_value, p_value, std_err
return ','.join(map(str, ls))+'\n'
def loop_two(m1, m2):
    """Run `loop_one(m1, ...)` for every OTU row of `m2`.

    Produces one line per OTU; the '\n,' -> '\n' replacement removes the
    stray comma that the outer join inserts after each newline.
    """
    lines = [loop_one(m1, otu_row) for otu_row in m2]
    return ','.join(lines).replace('\n,', '\n')
# Regress every transcript against every OTU and save the raw results.
fpkmvsotu = loop_two(rnat_z, otut_z)
# Strip scipy's LinregressResult field names so each record becomes a bare
# "(slope,intercept,rvalue,pvalue,stderr)" tuple string that `reg` can parse.
fpkmvsotu1 = fpkmvsotu.replace('LinregressResult', '').replace('slope=','').replace('intercept=', '').replace('rvalue=', '').replace('pvalue=', '').replace('stderr=', '')
z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/corr.csv','w')
z.write(fpkmvsotu1)
z.close()
In [52]:
class reg(object):
    """
    Parse the text produced by `loop_two` (one line per OTU; each line a
    comma-joined sequence of "(slope,intercept,rvalue,pvalue,stderr)" tuple
    strings, one tuple per transcript) and expose the statistics as
    per-OTU lists of floats: `slope`, `intercept`, `rvalue`,
    `rsquare` (rvalue**2), `pvalue`, `stderr`.
    """
    version = 0.2

    def __init__(self, inp):
        self.inp = inp
        # Split rows on '('; element 0 of each row is an empty prefix,
        # elements 1..k are the per-transcript stat tuples.
        self.inp2 = [row.split('(') for row in self.inp.split('\n')[:-1]]

        def field(row, n):
            # n-th comma-separated field of every tuple in `row`, ')' stripped.
            vals = [chunk.split(',')[n] for chunk in row[1:]]
            return ','.join(vals).replace(')', '')

        def column(rows, n):
            # `field` applied to every OTU row.
            return [field(row, n) for row in rows]

        def as_floats(strings):
            # Explicit list comprehensions (instead of map()) so the result is
            # a real list of lists on Python 3 as well, not one-shot iterators.
            return [[float(x) for x in s.split(',')] for s in strings]

        self.slope = as_floats(column(self.inp2, 0))
        self.intercept = as_floats(column(self.inp2, 1))
        self.rvalue = as_floats(column(self.inp2, 2))
        self.rsquare = [[r ** 2 for r in row] for row in self.rvalue]
        self.pvalue = as_floats(column(self.inp2, 3))
        self.stderr = as_floats(column(self.inp2, 4))
In [53]:
def filt(inp, thr, sign): #filter p-values, r-values and so forth based on a chosen threshold
ar = np.array(inp)
ls = []
if sign == '>':
ls = zip(*np.where(ar > thr))
elif sign == '<':
ls = zip(*np.where(ar < thr))
else:
print "Error: check your input file"
return ls
def filt_(inp, thr, sign): #filter p-values, r-values and so forth based on a chosen threshold
    """
    Like `filt`, but return a CSV string "row,col,value\n..." of the matching
    entries, sorted by ascending value (no trailing newline).

    An unrecognised `sign` prints an error and returns ''.
    """
    ar = np.array(inp)
    lp = []
    if sign == '>':
        lp = np.where(ar > thr)
    elif sign == '<':
        lp = np.where(ar < thr)
    else:
        print("Error: check your input file")  # parenthesized: valid on Python 2 and 3
    p1_l = [ list(i) for i in lp]
    # FIX: index with the np.where tuple itself; ar[p1_l] (a list of lists)
    # relied on numpy's deprecated list-as-tuple indexing, removed in modern numpy.
    la = list(ar[lp])
    p2 = p1_l+[la]                       # [row indices, col indices, values]
    p3 = [ list(i) for i in zip(*p2)]    # -> [row, col, value] triples
    p4 = sorted(p3, key=lambda x: x[2])  # ascending by value
    return ','.join([ ','.join(map(str, i))+'\n' for i in p4]).replace('\n,','\n')[:-1]
In [33]:
#fpkmvsotu = open('/beegfs/group_dv/home/DValenzano/uBiome/data/corr.csv', 'rU').read()
#fpkmvsotu1 = fpkmvsotu.replace('LinregressResult', '').replace('slope=','').replace('intercept=', '').replace('rvalue=', '').replace('pvalue=', '').replace('stderr=', '')
# Parse the regression output into per-statistic matrices.
rf = reg(fpkmvsotu1)
In [43]:
# OTU/transcript pairs with p < 0.001, mapped from indices back to names
# (filt_ rows are "otu_index,rna_index,value"); the 100 smallest are saved.
rpval = filt_(rf.pvalue, 0.001, '<')
P2 = [otu_key[int(i.split(',')[0])]+','+rna_key[int(i.split(',')[1])] +','+i.split(',')[2] for i in rpval.split('\n') ]
P3 = ','.join([ i+'\n' for i in P2]).replace('\n,','\n')[:-1]
len(P3.split('\n'))
zpvalue = 'OTU,transcr,value\n'+','.join([i+'\n'for i in P3.split('\n')[:100]]).replace('\n,','\n')
z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/TOP_otu-tr_pvalue.csv', 'w')
z.write(zpvalue)
z.close()
Below the instructions to plot
In [53]:
# Unfiltered, value-sorted rankings (thresholds chosen to pass everything).
Rf = reg(fpkmvsotu1)
Rpval = filt_(Rf.pvalue, 1.0, '<') #all the pvals, sorted ascending
Rrsq = filt_(Rf.rsquare, 0.0, '>') #all the rsquare, sorted ascending
In [48]:
# This won't work from the server - for now
# Xp = []
# Yp = []
# Otu = []
# Rna = []
# for i in range(12):
# Otu.append(Rpval.split('\n')[i].split(',')[0])
# Rna.append(Rpval.split('\n')[i].split(',')[1])
# Xp.append(otu_d[otu_key[int(Rpval.split('\n')[i].split(',')[0])]])
# Yp.append(rna_d[rna_key[int(Rpval.split('\n')[i].split(',')[1])]])
# rng = zip(range(10), range(331,340))
# fig = plt.figure()
# fig.suptitle('Top 9 regressions based on p value ', fontsize=14, fontweight='bold')
# for i,j in rng:
# ax = fig.add_subplot(j)
# fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
# #ax.set_title('axes title')
# ax.set_xlabel(Otu[i])
# ax.set_ylabel(Rna[i])
# plt.scatter(Xp[i],Yp[i])
# plt.show()
In [49]:
# This won't work from the server - for now
# Xr = []
# Yr = []
# Otu = []
# Rna = []
# for i in range((len(Rrsq.split('\n')) - 12),len(Rrsq.split('\n'))):
# Otu.append(Rrsq.split('\n')[i].split(',')[0])
# Rna.append(Rrsq.split('\n')[i].split(',')[1])
# Xr.append(otu_d[otu_key[int(Rrsq.split('\n')[i].split(',')[0])]])
# Yr.append(rna_d[rna_key[int(Rrsq.split('\n')[i].split(',')[1])]])
# fig = plt.figure()
# fig.suptitle('Top 9 regressions based on R2 ', fontsize=14, fontweight='bold')
# for i,j in rng:
# ax = fig.add_subplot(j)
# fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
# #ax.set_title('axes title')
# ax.set_xlabel(Otu[i])
# ax.set_ylabel(Rna[i])
# plt.scatter(Xr[i],Yr[i])
# plt.show()
Below what I ran on Apr 26 2016
In [49]:
# Apr 26 2016 rerun: reload the saved correlation and matrix files from disk
# and rebuild the feature dicts. Note both OTU filters here consistently use
# '0.0', fixing the earlier '0' vs '0.0' mismatch in the otu_key cell above.
fpkmvsotu0 = open('/beegfs/group_dv/home/DValenzano/uBiome/data/corr.csv', 'rU').read()
uj = open('/beegfs/group_dv/home/DValenzano/uBiome/data/rnaseq_fpkm_genes_sorted.csv', 'rU').read()
tj = open('/beegfs/group_dv/home/DValenzano/uBiome/data/rnaseq_merged_otu_table_sorted.csv','rU').read()
rna = uj
otu = tj
rnal = [i.split(',') for i in rna.split('\n')] #that's a "l" as in "L"
otul = [i.split(',') for i in otu.split('\n')]
rnat = [ list(i) for i in zip(*rnal)]
otut = [list(i) for i in zip(*otul) ]
rnat_z = [ map(float, i[1:]) for i in rnat[1:] if i.count('0') < 7]
otut_z = [ map(float, i[1:]) for i in otut[1:] if i.count('0.0') < 7]
# In[10]:
rna_key = [i[0] for i in rnat[1:] if i.count('0') < 7]
otu_key = [ i[0] for i in otut[1:] if i.count('0.0') < 7]
rna_value = rnat_z
otu_value = otut_z
rna_d = dict(zip(rna_key, rna_value))
otu_d = dict(zip(otu_key, otu_value))
In [50]:
# Strip field names from the reloaded results, as when they were first computed.
fpkmvsotu1 = fpkmvsotu0.replace('LinregressResult', '').replace('slope=','').replace('intercept=', '').replace('rvalue=', '').replace('pvalue=', '').replace('stderr=', '')
In [59]:
Rf = reg(fpkmvsotu1)
Rpval001 = filt_(Rf.pvalue, 0.001, '<') #all the pvals < 0.001 (comment previously said 0.00001)
Rrsq06 = filt_(Rf.rsquare, 0.6, '>') #all the rsquare > 0.6 (comment previously said 0.8)
In [54]:
# Unfiltered, value-sorted rankings (same as the earlier cell, rebuilt after reload).
Rf = reg(fpkmvsotu1)
Rpval = filt_(Rf.pvalue, 1.0, '<') #all the pvals, sorted ascending
Rrsq = filt_(Rf.rsquare, 0.0, '>') #all the rsquare, sorted ascending
In [55]:
# Quick size check of the R-squared matrix ('prova' is Italian for "test").
prova = Rf.rsquare
len(prova)
Out[55]:
In [56]:
# Scatter plots of the best pairs by p value. 12 pairs are collected but only
# 9 are drawn: rng pairs range(10) with the nine 3x3 subplot codes 331..339,
# and zip truncates to the shorter sequence.
Xp = []
Yp = []
Otu = []
Rna = []
for i in range(12):
    Otu.append(Rpval.split('\n')[i].split(',')[0])
    Rna.append(Rpval.split('\n')[i].split(',')[1])
    Xp.append(otu_d[otu_key[int(Rpval.split('\n')[i].split(',')[0])]])
    Yp.append(rna_d[rna_key[int(Rpval.split('\n')[i].split(',')[1])]])
rng = zip(range(10), range(331,340))  # (plot index, subplot code); a list on Python 2, reused by later cells
fig = plt.figure()
fig.suptitle('Top 9 regressions based on p value ', fontsize=14, fontweight='bold')
# Dead draft kept below — it referenced Xr/Yr (R-squared data) by mistake.
# for i,j in rng:
# ax = fig.add_subplot(j)
# fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
# #ax.set_title('axes title')
# ax.set_xlabel(Otu[i])
# ax.set_ylabel(Rna[i])
# plt.scatter(Xr[i],Yr[i])
# plt.show()
for i,j in rng:
    ax = fig.add_subplot(j)
    fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
    #ax.set_title('axes title')
    ax.set_xlabel(Otu[i])
    ax.set_ylabel(Rna[i])
    plt.scatter(Xp[i],Yp[i])
plt.show()
In [57]:
# Same 3x3 plot, ranked by R-squared: Rrsq is sorted ascending, so the last 12
# rows are the largest. Reuses `rng` from the p-value cell (a list on Python 2;
# on Python 3 zip objects are single-use and this reuse would silently plot nothing).
Xr = []
Yr = []
Otu = []
Rna = []
for i in range((len(Rrsq.split('\n')) - 12),len(Rrsq.split('\n'))):
    Otu.append(Rrsq.split('\n')[i].split(',')[0])
    Rna.append(Rrsq.split('\n')[i].split(',')[1])
    Xr.append(otu_d[otu_key[int(Rrsq.split('\n')[i].split(',')[0])]])
    Yr.append(rna_d[rna_key[int(Rrsq.split('\n')[i].split(',')[1])]])
fig = plt.figure()
fig.suptitle('Top 9 regressions based on R2 ', fontsize=14, fontweight='bold')
for i,j in rng:
    ax = fig.add_subplot(j)
    fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
    #ax.set_title('axes title')
    ax.set_xlabel(Otu[i])
    ax.set_ylabel(Rna[i])
    plt.scatter(Xr[i],Yr[i])
plt.show()
In [73]:
# Full tables with indices mapped back to OTU / transcript names.
pval = 'OTU,transcript,pvalue\n'+','.join([ otu_key[int(i.split(',')[0])]+','+rna_key[int(i.split(',')[1])]+','+i.split(',')[2]+'\n' for i in Rpval.split('\n')]).replace('\n,','\n')
rsq = 'OTU,transcript,rsquare\n'+','.join([ otu_key[int(i.split(',')[0])]+','+rna_key[int(i.split(',')[1])]+','+i.split(',')[2]+'\n' for i in Rrsq.split('\n')]).replace('\n,','\n')
In [74]:
# Save the full p-value and R-squared tables.
z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/all_p.csv', 'w')
z.write(pval)
z.close()
z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/all_r2.csv', 'w')
z.write(rsq)
z.close()
In [72]:
#The order of the individuals on each plot is:
#The order of the individuals on each plot is:
oder = [i[0] for i in uTs][1:]  # NOTE(review): 'oder' is presumably a typo for 'order' — kept as-is in case unseen cells reference it
27-Apr-2016 - Goal: to plot correlations with individuals from various groups with different colors
In [86]:
Groups = [i[0][:-2] for i in uTs][1:] # this establishes the groups: sample ID minus its last 2 characters (the replicate suffix — TODO confirm suffix length)
In [62]:
# Rerun of the R-squared top-9 plot (identical to the cell above) to refresh
# Xr/Yr/Otu/Rna before the group-coloured plots below.
Xr = []
Yr = []
Otu = []
Rna = []
for i in range((len(Rrsq.split('\n')) - 12),len(Rrsq.split('\n'))):
    Otu.append(Rrsq.split('\n')[i].split(',')[0])
    Rna.append(Rrsq.split('\n')[i].split(',')[1])
    Xr.append(otu_d[otu_key[int(Rrsq.split('\n')[i].split(',')[0])]])
    Yr.append(rna_d[rna_key[int(Rrsq.split('\n')[i].split(',')[1])]])
fig = plt.figure()
fig.suptitle('Top 9 regressions based on R2 ', fontsize=14, fontweight='bold')
for i,j in rng:
    ax = fig.add_subplot(j)
    fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
    #ax.set_title('axes title')
    ax.set_xlabel(Otu[i])
    ax.set_ylabel(Rna[i])
    plt.scatter(Xr[i],Yr[i])
plt.show()
Below the group coloring works for a single plot
In [136]:
# Scatter plot of one OTU/transcript pair (index 1), coloured by experimental group.
dfR = pd.DataFrame(dict(x=Xr[1], y=Yr[1], label=Groups))
groupsR = dfR.groupby('label')  # FIX: was df.groupby — `df` is never defined; group the frame just built
# Plot
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groupsR:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=name)
ax.legend()
plt.show()
Now I am trying to achieve the same for the subplots with the 9 top correlations - it works, although the legend
does not look too nice
In [137]:
# Group-coloured version of the top-9 R-squared subplots; one legend per axes.
fig = plt.figure()
fig.suptitle('Top 9 regressions based on R2 ', fontsize=14, fontweight='bold')
for i,j in rng:
    ax = fig.add_subplot(j)
    fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
    ax.set_title('axes title')
    ax.set_xlabel(Otu[i])
    ax.set_ylabel(Rna[i])
    dfR = pd.DataFrame(dict(x=Xr[i], y=Yr[i], label=Groups))
    groupsR = dfR.groupby('label')
    for name, group in groupsR:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=6, label=name)
    ax.legend()
# Alternative: one shared figure-level legend instead of nine per-axes ones.
# plt.legend( loc = 'lower center', bbox_to_anchor = (0,-0.1,1,1),
#            bbox_transform = plt.gcf().transFigure )
plt.show()
The decision is that the top plots will be displayed in b&w, while the single ones can be displayed separating group by group
Now I will work on the calculation of the q values
In [2]:
# Reload the saved full tables (fresh-kernel entry point for the q-value work).
pval = open('/beegfs/group_dv/home/DValenzano/uBiome/data/all_p.csv', 'rU').read()
rsq = open('/beegfs/group_dv/home/DValenzano/uBiome/data/all_r2.csv', 'rU').read()
In [3]:
# Last column (the p value) of every data row; header and trailing blank line skipped.
pvals = np.array([ float(i.split(',')[-1]) for i in pval.split('\n')[1:-1]])
In [27]:
from statsmodels.sandbox.stats.multicomp import multipletests #from here: http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.multipletests.html#statsmodels.sandbox.stats.multicomp.multipletests
In [23]:
# Two-stage Benjamini-Krieger-Yekutieli FDR correction. is_sorted=True is
# consistent with filt_ having returned the p values sorted ascending.
prova = multipletests(pvals, alpha=0.5, method='fdr_tsbky', is_sorted=True, returnsorted=False)
In [31]:
import rpy2.robjects as robjects
In [32]:
# Sanity check that the rpy2/R bridge works: fetch R's built-in pi constant.
robjects.r['pi'][0]
Out[32]:
Below the density plot for the pvalue distribution
In [46]:
# Distribution of all regression p values.
plt.hist(pvals, bins = 50, histtype='stepfilled', color= 'grey')
plt.title("p values density histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()
In [47]:
# Distribution of all regression R-squared values.
r2vals = np.array([ float(i.split(',')[-1]) for i in rsq.split('\n')[1:-1]])
plt.hist(r2vals, bins = 50, histtype='stepfilled', color= 'grey')
plt.title("R-squared values density histogram")  # FIX: title said "p values" but this plots R^2
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()
I am now going to log2-transform all the fpkm values and then recalculate the correlation
In [68]:
import math
#[ math.log(i)/math.log(2) for i in rna_d[rna_key[int(Rrsq.split('\n')[-2].split(',')[1])]]]
#rna_d[rna_key[int(Rrsq.split('\n')[-2].split(',')[1])]]
In [78]:
# log2(x+1)-transform the fpkm values and recompute every regression.
rnat_z2 = [ [ math.log(j+1)/math.log(2) for j in i ] for i in rnat_z ]
rna_value2 = rnat_z2
otu_value = otut_z
rna_d2 = dict(zip(rna_key, rna_value2))
otu_d2 = dict(zip(otu_key, otu_value))  # same content as otu_d above
def loop_one(m1,otu): # redefinition, identical to the earlier loop_one
    ls = []
    for i in m1:
        ls.append(stats.linregress(i, otu)) #this function returns slope, intercept, r_value, p_value, std_err
    return ','.join(map(str, ls))+'\n'
def loop_two(m1, m2): # redefinition, identical to the earlier loop_two
    ls = []
    for j in m2:
        ls.append(loop_one(m1, j))
    return ','.join(ls).replace('\n,','\n')
fpkmvsotu2 = loop_two(rnat_z2, otut_z)
fpkmvsotu2_1 = fpkmvsotu2.replace('LinregressResult', '').replace('slope=','').replace('intercept=', '').replace('rvalue=', '').replace('pvalue=', '').replace('stderr=', '')
z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/corr_log2.csv','w')
z.write(fpkmvsotu2_1)
z.close()
In [79]:
# Value-sorted rankings for the log2-transformed analysis.
Rf2 = reg(fpkmvsotu2_1)
Rpval2 = filt_(Rf2.pvalue, 1.0, '<') #all the pvals, sorted ascending
Rrsq2 = filt_(Rf2.rsquare, 0.0, '>') #all the rsquare, sorted ascending
In [160]:
# Top pairs by p value after the log2 transform; 12 collected, 9 plotted (3x3).
Xp2 = []
Yp2 = []
Otu2 = []
Rna2 = []
for i in range(12):
    Otu2.append(Rpval2.split('\n')[i].split(',')[0])
    Rna2.append(Rpval2.split('\n')[i].split(',')[1])
    Xp2.append(otu_d2[otu_key[int(Rpval2.split('\n')[i].split(',')[0])]])
    Yp2.append(rna_d2[rna_key[int(Rpval2.split('\n')[i].split(',')[1])]])
rng = zip(range(10), range(331,340))
fig = plt.figure()
fig.suptitle('Top 9 regressions based on p value ', fontsize=14, fontweight='bold')
for i,j in rng:
    ax = fig.add_subplot(j)
    fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
    #ax.set_title('axes title')
    ax.set_xlabel(Otu2[i])
    ax.set_ylabel(Rna2[i])
    plt.scatter(Xp2[i],Yp2[i])
plt.show()
In [161]:
# Top pairs by R-squared after the log2 transform. Note the Xp2/Yp2/... names
# from the p-value cell are overwritten here with R-squared-ranked data.
Xp2 = []
Yp2 = []
Otu2 = []
Rna2 = []
for i in range((len(Rrsq2.split('\n')) - 12),len(Rrsq2.split('\n'))):
    Otu2.append(Rrsq2.split('\n')[i].split(',')[0])
    Rna2.append(Rrsq2.split('\n')[i].split(',')[1])
    Xp2.append(otu_d2[otu_key[int(Rrsq2.split('\n')[i].split(',')[0])]])
    Yp2.append(rna_d2[rna_key[int(Rrsq2.split('\n')[i].split(',')[1])]])
rng = zip(range(10), range(331,340))
fig = plt.figure()
fig.suptitle('Top 9 regressions based on R square value ', fontsize=14, fontweight='bold')
for i,j in rng:
    ax = fig.add_subplot(j)
    fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
    #ax.set_title('axes title')
    ax.set_xlabel(Otu2[i])
    ax.set_ylabel(Rna2[i])
    plt.scatter(Xp2[i],Yp2[i])
plt.show()
In [135]:
# Name-mapped full tables for the log2 analysis, saved to disk.
pval2 = 'OTU,transcript,pvalue\n'+','.join([ otu_key[int(i.split(',')[0])]+','+rna_key[int(i.split(',')[1])]+','+i.split(',')[2]+'\n' for i in Rpval2.split('\n')]).replace('\n,','\n')
rsq2 = 'OTU,transcript,rsquare\n'+','.join([ otu_key[int(i.split(',')[0])]+','+rna_key[int(i.split(',')[1])]+','+i.split(',')[2]+'\n' for i in Rrsq2.split('\n')]).replace('\n,','\n')
z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/all_p2.csv', 'w')
z.write(pval2)
z.close()
z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/all_r2_2.csv', 'w')
z.write(rsq2)
z.close()
In [136]:
# Distribution of p values after the log2 transform.
pvals2 = np.array([ float(i.split(',')[-1]) for i in pval2.split('\n')[1:-1]])
plt.hist(pvals2, bins = 50, histtype='stepfilled', color= 'grey')
plt.title("p values density histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()
In [137]:
# Distribution of R-squared values after the log2 transform.
r2vals2 = np.array([ float(i.split(',')[-1]) for i in rsq2.split('\n')[1:-1]])
plt.hist(r2vals2, bins = 50, histtype='stepfilled', color= 'grey')  # FIX: plotted the stale pre-log2 r2vals instead of r2vals2
plt.title("R-squared values density histogram")  # FIX: title said "p values" but this plots R^2
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()
In [149]:
top_p2 = Rpval2.split('\n')[:100]        # 100 smallest p values (list is sorted ascending)
top_r2 = Rrsq2.split('\n')[-100:][::-1]  # 100 largest R-squared, largest first (this reverses the list)
In [150]:
# Lookup dicts keyed by "otu_index,rna_index" -> value, for both top-100 lists.
p2_k = [','.join(i.split(',')[:2]) for i in top_p2]
p2_v = [ i.split(',')[2] for i in top_p2]
p2_d = dict(zip(p2_k, p2_v))
r2_k = [','.join(i.split(',')[:2]) for i in top_r2]
r2_v = [ i.split(',')[2] for i in top_r2]
r2_d = dict(zip(r2_k, r2_v))
In [155]:
from sets import Set  # Python 2 only module (removed in Python 3; the builtin `set` would suffice)
top = Set(r2_k) & Set(p2_k)  # OTU/transcript pairs present in BOTH top-100 lists
In [ ]: