I need to install QIIME on the cluster. First, I check which OS the cluster is running:


In [1]:
!cat /etc/*-release


CentOS Linux release 7.2.1511 (Core) 
NAME="CentOS Linux"
VERSION="7 (Core)"
ID="centos"
ID_LIKE="rhel fedora"
VERSION_ID="7"
PRETTY_NAME="CentOS Linux 7 (Core)"
ANSI_COLOR="0;31"
CPE_NAME="cpe:/o:centos:centos:7"
HOME_URL="https://www.centos.org/"
BUG_REPORT_URL="https://bugs.centos.org/"

CENTOS_MANTISBT_PROJECT="CentOS-7"
CENTOS_MANTISBT_PROJECT_VERSION="7"
REDHAT_SUPPORT_PRODUCT="centos"
REDHAT_SUPPORT_PRODUCT_VERSION="7"

CentOS Linux release 7.2.1511 (Core) 
CentOS Linux release 7.2.1511 (Core) 

To install QIIME, I am following instructions from here


In [5]:
!module load Python
#!module load R

In [7]:
!pip install --user qiime -I


Collecting qiime
  Using cached qiime-1.9.1.tar.gz
Requirement already satisfied (use --upgrade to upgrade): numpy>=1.9.0 in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from qiime)
Collecting scipy>=0.14.0 (from qiime)
  Using cached scipy-0.17.0-cp27-cp27m-manylinux1_x86_64.whl
Collecting cogent==1.5.3 (from qiime)
  Using cached cogent-1.5.3.tgz
Collecting natsort<4.0.0 (from qiime)
  Using cached natsort-3.5.6-py2.py3-none-any.whl
Requirement already satisfied (use --upgrade to upgrade): matplotlib!=1.4.2,>=1.1.0 in /beegfs/common/software/Python/2.7.9/lib/python2.7/site-packages (from qiime)
Collecting pynast==1.2.2 (from qiime)
  Using cached pynast-1.2.2.tar.gz
Collecting qcli<0.2.0,>=0.1.1 (from qiime)
  Using cached qcli-0.1.1.tar.gz
Collecting gdata (from qiime)
  Using cached gdata-2.0.18.tar.gz
Collecting biom-format<2.2.0,>=2.1.4 (from qiime)
  Downloading biom-format-2.1.5.tar.gz (307kB)
    100% |################################| 317kB 1.7MB/s 
Collecting emperor<1.0.0,>=0.9.51 (from qiime)
  Using cached emperor-0.9.51.tar.gz
Collecting scikit-bio<0.3.0,>=0.2.3 (from qiime)
  Using cached scikit-bio-0.2.3.tar.gz
Collecting burrito-fillings<0.2.0,>=0.1.1 (from qiime)
  Downloading burrito-fillings-0.1.1.tar.gz (214kB)
    100% |################################| 215kB 1.7MB/s 
Requirement already satisfied (use --upgrade to upgrade): pandas>=0.13.1 in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from qiime)
Collecting burrito<1.0.0,>=0.9.1 (from qiime)
  Using cached burrito-0.9.1.tar.gz
Collecting qiime-default-reference<0.2.0,>=0.1.2 (from qiime)
  Using cached qiime-default-reference-0.1.3.tar.gz
Requirement already satisfied (use --upgrade to upgrade): six>=1.4 in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from matplotlib!=1.4.2,>=1.1.0->qiime)
Requirement already satisfied (use --upgrade to upgrade): python-dateutil in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from matplotlib!=1.4.2,>=1.1.0->qiime)
Requirement already satisfied (use --upgrade to upgrade): pytz in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from matplotlib!=1.4.2,>=1.1.0->qiime)
Requirement already satisfied (use --upgrade to upgrade): pyparsing>=1.5.6 in /beegfs/common/software/Python/2.7.9/lib/python2.7/site-packages (from matplotlib!=1.4.2,>=1.1.0->qiime)
Requirement already satisfied (use --upgrade to upgrade): nose>=0.11.1 in /beegfs/common/software/Python/2.7.9/lib/python2.7/site-packages (from matplotlib!=1.4.2,>=1.1.0->qiime)
Requirement already satisfied (use --upgrade to upgrade): mock in /beegfs/common/software/Python/2.7.9/lib/python2.7/site-packages (from matplotlib!=1.4.2,>=1.1.0->qiime)
Collecting click (from biom-format<2.2.0,>=2.1.4->qiime)
  Using cached click-6.6.tar.gz
Collecting future>=0.14.3 (from biom-format<2.2.0,>=2.1.4->qiime)
  Using cached future-0.15.2.tar.gz
Collecting pyqi (from biom-format<2.2.0,>=2.1.4->qiime)
  Using cached pyqi-0.3.2.tar.gz
Requirement already satisfied (use --upgrade to upgrade): IPython in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from scikit-bio<0.3.0,>=0.2.3->qiime)
Requirement already satisfied (use --upgrade to upgrade): funcsigs in /beegfs/common/software/Python/2.7.9/lib/python2.7/site-packages (from mock->matplotlib!=1.4.2,>=1.1.0->qiime)
Requirement already satisfied (use --upgrade to upgrade): pbr>=0.11 in /beegfs/common/software/Python/2.7.9/lib/python2.7/site-packages (from mock->matplotlib!=1.4.2,>=1.1.0->qiime)
Requirement already satisfied (use --upgrade to upgrade): decorator in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from IPython->scikit-bio<0.3.0,>=0.2.3->qiime)
Requirement already satisfied (use --upgrade to upgrade): simplegeneric>0.8 in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from IPython->scikit-bio<0.3.0,>=0.2.3->qiime)
Requirement already satisfied (use --upgrade to upgrade): pexpect in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from IPython->scikit-bio<0.3.0,>=0.2.3->qiime)
Requirement already satisfied (use --upgrade to upgrade): traitlets in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from IPython->scikit-bio<0.3.0,>=0.2.3->qiime)
Requirement already satisfied (use --upgrade to upgrade): pickleshare in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from IPython->scikit-bio<0.3.0,>=0.2.3->qiime)
Requirement already satisfied (use --upgrade to upgrade): ptyprocess>=0.5 in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from pexpect->IPython->scikit-bio<0.3.0,>=0.2.3->qiime)
Requirement already satisfied (use --upgrade to upgrade): ipython-genutils in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from traitlets->IPython->scikit-bio<0.3.0,>=0.2.3->qiime)
Requirement already satisfied (use --upgrade to upgrade): path.py>=6.2 in /beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages (from pickleshare->IPython->scikit-bio<0.3.0,>=0.2.3->qiime)
Installing collected packages: scipy, cogent, natsort, pynast, qcli, gdata, click, future, pyqi, biom-format, scikit-bio, emperor, burrito, burrito-fillings, qiime-default-reference, qiime
  Running setup.py install for cogent ... done
  Running setup.py install for pynast ... done
  Running setup.py install for qcli ... done
  Running setup.py install for gdata ... done
  Running setup.py install for click ... done
  Running setup.py install for future ... done
  Running setup.py install for pyqi ... done
  Running setup.py install for biom-format ... done
  Running setup.py install for scikit-bio ... done
  Running setup.py install for emperor ... done
  Running setup.py install for burrito ... done
  Running setup.py install for burrito-fillings ... done
  Running setup.py install for qiime-default-reference ... done
  Running setup.py install for qiime ... done
Successfully installed biom-format-2.1.5 burrito-0.9.1 burrito-fillings-0.1.1 click-6.6 cogent-1.5.3 emperor-0.9.51 future-0.15.2 gdata-2.0.18 natsort-3.5.6 pynast-1.2.2 pyqi-0.3.2 qcli-0.1.1 qiime-1.9.1 qiime-default-reference-0.1.3 scikit-bio-0.2.3 scipy-0.17.0

In [1]:
!print_qiime_config.py -t


System information
==================
         Platform:	linux2
   Python version:	2.7.9 (default, Apr 10 2015, 15:04:25)  [GCC 4.8.2 20140120 (Red Hat 4.8.2-16)]
Python executable:	/software/Python/2.7.9/bin/python

QIIME default reference information
===================================
For details on what files are used as QIIME's default references, see here:
 https://github.com/biocore/qiime-default-reference/releases/tag/0.1.3

Dependency versions
===================
          QIIME library version:	1.9.1
           QIIME script version:	1.9.1
qiime-default-reference version:	0.1.3
                  NumPy version:	1.10.4
                  SciPy version:	0.17.0
                 pandas version:	0.18.0
             matplotlib version:	1.4.3
            biom-format version:	2.1.5
                   h5py version:	Not installed.
                   qcli version:	0.1.1
                   pyqi version:	0.3.2
             scikit-bio version:	0.2.3
                 PyNAST version:	1.2.2
                Emperor version:	0.9.51
                burrito version:	0.9.1
       burrito-fillings version:	0.1.1
              sortmerna version:	SortMeRNA version 2.0, 29/11/2014
              sumaclust version:	SUMACLUST Version 1.0.00
                  swarm version:	Swarm 1.2.19 [Apr 20 2016 13:38:51]
                          gdata:	Installed.

QIIME config values
===================
For definitions of these settings and to learn how to configure QIIME, see here:
 http://qiime.org/install/qiime_config.html
 http://qiime.org/tutorials/parallel_qiime.html

                     blastmat_dir:	None
      pick_otus_reference_seqs_fp:	/beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages/qiime_default_reference/gg_13_8_otus/rep_set/97_otus.fasta
                         sc_queue:	all.q
      topiaryexplorer_project_dir:	None
     pynast_template_alignment_fp:	/beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages/qiime_default_reference/gg_13_8_otus/rep_set_aligned/85_otus.pynast.fasta
                  cluster_jobs_fp:	start_parallel_jobs.py
pynast_template_alignment_blastdb:	None
assign_taxonomy_reference_seqs_fp:	/beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages/qiime_default_reference/gg_13_8_otus/rep_set/97_otus.fasta
                     torque_queue:	friendlyq
                    jobs_to_start:	1
                       slurm_time:	None
            denoiser_min_per_core:	50
assign_taxonomy_id_to_taxonomy_fp:	/beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages/qiime_default_reference/gg_13_8_otus/taxonomy/97_otu_taxonomy.txt
                         temp_dir:	/tmp/
                     slurm_memory:	None
                      slurm_queue:	None
                      blastall_fp:	blastall
                 seconds_to_sleep:	1

QIIME base install test results
===============================
.........
----------------------------------------------------------------------
Ran 9 tests in 0.145s

OK

Goal: to match IDs to OTUs. This will enable me to select the OTUs differentially enriched according to the LEfSe analysis and return the corresponding transcripts, based on the reference correlation performed here: /Volumes/group_dv/personal/DValenzano/papers/uBiome0/data/regr.csv
First, I take the genus-level OTU reference table, normalise it and save it as a csv.


In [2]:
#!export PATH=/beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages/qiime/:$PATH
#!export PATH=/export/apps/qiime/1.8.0/bin:$PATH
#!export PATH=$PATH:/software/R/3.2.4/bin/
#!export PATH=$PATH:/software/R/3.2.4/bin
#!echo $PATH
#ls ~/.bashrc

To install all the required R packages, follow instructions from http://qiime.org/install/install.html


In [1]:
#!module load R
#!which R
!normalize_table.py -i /beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6.biom -a CSS -o /beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.biom

The file I just normalized contains the genus-level OTUs for all the individuals for which we also have the RNA-seq data


In [2]:
!biom convert -i /beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.biom -o /beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.txt --to-tsv

In [45]:
!biom convert -i /beegfs/group_dv/home/DValenzano/uBiome/data/16wk_otu_table.biom -o /beegfs/group_dv/home/DValenzano/uBiome/data/16wk_otu_table.txt --to-tsv

In [10]:
t = open('/beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.txt', 'rU').read()
t = t.replace('\t',',')
z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.csv', 'w')
z.write(t)
z.close()

In [62]:
#t = open('/beegfs/group_dv/home/DValenzano/uBiome/data/Description_otu_table_sorted_L6n.csv', 'rU').read()

Now I need to generate a new correlation between FPKM and OTU abundance, individual by individual.
The OTU table is the one I just saved, which now needs to be transposed and sorted.
I am following the same guidelines as in here.


In [63]:
t1 = t.replace('_','.').replace('YI', 'WT')
t2 = t1.split('\n')[1:]
t3 = [ i.split(',') for i in t2 ]
t4 = [t3[0]]+[i for i in t3[1:] if sum(map(float, (i[1:]))) != 0.0] #this removes 0-only OTUs
tt = zip(*t4)
tT = [ list(i) for i in tt ]

In [64]:
u = open('/beegfs/group_dv/home/DValenzano/uBiome/data/rnaseq_fpkm_genes.csv', 'rU').read()
u2 = u.split('\n')
u3 = [ i.split(',') for i in u2]
u4 = [u3[0]]+[i for i in u3[1:] if sum(map(float, (i[1:]))) != 0.0] #this removes 0-only transcripts
ut = zip(*u4)
uT = [ list(i) for i in ut ]

In [65]:
from sets import Set
uh = [i[0] for i in uT[1:]]
th = [i[0] for i in tT[1:]]
h = list(Set(uh) & Set(th))
tT1 = [tT[0]] + [i for i in tT if i[0] in h]
uT1 = [uT[0]] + [i for i in uT if i[0] in h]


/beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages/ipykernel/__main__.py:1: DeprecationWarning: the sets module is deprecated
  if __name__ == '__main__':

In [69]:
uTs = [uT1[0]] + sorted(uT1[1:], key = lambda x: (x[0]))
tTs = [tT1[0]] + sorted(tT1[1:], key = lambda x: (x[0]))

#[i[0] for i in uTs][1:] == [i[0] for i in tTs][1:] # this tests whether the two matrices have headers in same order

uj = ','.join([','.join(i)+'\n' for i in uTs]).replace('\n,','\n')[:-1]
tj = ','.join([','.join(i)+'\n' for i in tTs]).replace('\n,','\n')[:-1]

In [63]:
zu = open('/beegfs/group_dv/home/DValenzano/uBiome/data/rnaseq_fpkm_genes_sorted.csv', 'w')
zu.write(uj)
zu.close()

zt = open('/beegfs/group_dv/home/DValenzano/uBiome/data/rnaseq_merged_otu_table_sorted.csv', 'w')
zt.write(tj)
zt.close()

In [67]:
[i.split(',')[0] for i in tj.split('\n')[1:-1]] == [i.split(',')[0] for i in uj.split('\n')[1:-1]]


Out[67]:
True

There is a difference from the previous analysis (March 17-18 in the README), since 3 samples were removed due to low depth.
Good. Now continuing from here.


In [1]:
import numpy as np
from scipy.stats.stats import pearsonr
from scipy import stats
import pandas as pd
from scipy import stats, integrate
import matplotlib.pyplot as plt
#import seaborn as sns

In [16]:
rna = uj
otu = tj

In [44]:
rnal = [i.split(',') for i in rna.split('\n')] #that's a "l" as in "L"
otul = [i.split(',') for i in otu.split('\n')]

rnat = [ list(i) for i in zip(*rnal)]
otut = [list(i) for i in zip(*otul) ]

rnat_z = [ map(float, i[1:]) for i in rnat[1:] if i.count('0') < 7]
otut_z = [ map(float, i[1:]) for i in otut[1:] if i.count('0.0') < 7]


# In[10]:

rna_key = [i[0] for i in rnat[1:] if i.count('0') < 7]
otu_key = [ i[0] for i in otut[1:] if i.count('0.0') < 7] # this filter must match the one used for otut_z above, so that keys and values stay aligned

rna_value = rnat_z
otu_value = otut_z

rna_d = dict(zip(rna_key, rna_value))
otu_d = dict(zip(otu_key, otu_value))

In [50]:
def loop_one(m1,otu): # returns the full linregress result (slope, intercept, r, p, stderr) for every row of m1 against a given otu
    ls = []
    for i in m1:
        ls.append(stats.linregress(i, otu))    #this function returns slope, intercept, r_value, p_value, std_err
    return ','.join(map(str, ls))+'\n'

def loop_two(m1, m2):
    ls = []
    for j in m2:
        ls.append(loop_one(m1, j))
    return ','.join(ls).replace('\n,','\n')

fpkmvsotu = loop_two(rnat_z, otut_z)
fpkmvsotu1 = fpkmvsotu.replace('LinregressResult', '').replace('slope=','').replace('intercept=', '').replace('rvalue=', '').replace('pvalue=', '').replace('stderr=', '') 

z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/corr.csv','w')
z.write(fpkmvsotu1)
z.close()
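
As a sanity check on the nested loop above, the r values alone can also be computed in one shot with numpy (a sketch only, not what was run here; it assumes rnat_z and otut_z are lists of equal-length per-individual vectors, as built above):

import numpy as np
m_rna = np.array(rnat_z)                          # transcripts x individuals
m_otu = np.array(otut_z)                          # OTUs x individuals
full = np.corrcoef(np.vstack([m_rna, m_otu]))     # correlation matrix over all rows
r_cross = full[:len(m_rna), len(m_rna):]          # transcript-vs-OTU block of r values
# r_cross.T has the same OTU-by-transcript orientation as the r values written to corr.csv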

In [52]:
class reg(object):
    
    """
    This class uses as input a linear regression file obtained by the loop_two function and returns 
    regression tables and plots, given specific parameteres, such as R-squared value, p-value, OTU or 
    transcript
    """
            
#     version = 0.1
    
#     def __init__(self, inp):
#         self.inp = inp 
#         self.inp2 = [ i.split('LinregressResult')[1:] for i in self.inp.split('\n')[:-1]] 
#         def loop1(inp, n):
#             ls = []
#             for i in inp:
#                 ls.append(i.split(',')[n].split('=')[1])
#             return ','.join(ls).replace(')', '')
#         def loop2(inp, n):
#             ls = []
#             for i in inp:
#                 ls.append(loop1(i, n))
#             return ls
#         self.slope =  [map(float, i.split(',')) for i in loop2(self.inp2, 0)]
#         self.intercept = [map(float, i.split(',')) for i in loop2(self.inp2, 1)]
#         self.rvalue = [map(float, i.split(',')) for i in loop2(self.inp2, 2)]
#         self.rsquare = [ map(lambda x: x**2, i) for i in self.rvalue]
#         self.pvalue = [map(float, i.split(',')) for i in loop2(self.inp2, 3)]
#         self.stderr0 = [map(float, i.split(',')) for i in loop2(self.inp2, 4)  ]  

    version = 0.2
    
    def __init__(self, inp):
        self.inp = inp 
        self.inp2 = [ i.split('(') for i in self.inp.split('\n')[:-1]] 
    
        def loop1(inp, n):
            ls = []
            for i in inp[1:]:
                ls.append(i.split(',')[n])
            return ','.join(ls).replace(')', '')
        def loop2(inp, n):
            ls = []
            for i in inp:
                ls.append(loop1(i, n))
            return ls
       
        self.slope =  [map(float, i.split(',')) for i in loop2(self.inp2, 0)]
        self.intercept = [map(float, i.split(',')) for i in loop2(self.inp2, 1)]
        self.rvalue = [map(float, i.split(',')) for i in loop2(self.inp2, 2)]
        self.rsquare = [ map(lambda x: x**2, i) for i in self.rvalue]
        self.pvalue = [map(float, i.split(',')) for i in loop2(self.inp2, 3)]
        self.stderr = [map(float, i.split(',')) for i in loop2(self.inp2, 4)  ]

In [53]:
def filt(inp, thr, sign): #filter p-values, r-values and so forth based on a chosen threshold
    ar = np.array(inp)
    ls = []
    if sign == '>':
        ls = zip(*np.where(ar > thr))
    elif sign == '<':
        ls = zip(*np.where(ar < thr))
    else: 
        print "Error: check your input file"
    return ls


def filt_(inp, thr, sign): #filter p-values, r-values and so forth based on a chosen threshold
    ar = np.array(inp)
    li = []
    lp = []
    if sign == '>':
        lp = np.where(ar > thr)
    elif sign == '<':
        lp = np.where(ar < thr)
    else: 
        print "Error: check your input file"
    p1_l = [ list(i) for i in lp]
    la = list(ar[p1_l])
    p2 = p1_l+[la]
    p3 = [ list(i) for i in zip(*p2)]
    p4 = sorted(p3, key=lambda x: x[2])
    return ','.join([ ','.join(map(str, i))+'\n' for i in p4]).replace('\n,','\n')[:-1]
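
A toy illustration of what filt_ returns (made-up numbers, not from the data above): each output line is row index, column index, value, sorted by value.

toy = [[0.5, 0.0005, 0.2],
       [0.01, 0.9, 0.0001]]
print filt_(toy, 0.001, '<')
# 1,2,0.0001
# 0,1,0.0005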

In [33]:
#fpkmvsotu = open('/beegfs/group_dv/home/DValenzano/uBiome/data/corr.csv', 'rU').read()
#fpkmvsotu1 = fpkmvsotu.replace('LinregressResult', '').replace('slope=','').replace('intercept=', '').replace('rvalue=', '').replace('pvalue=', '').replace('stderr=', '') 

rf = reg(fpkmvsotu1)

In [43]:
rpval = filt_(rf.pvalue, 0.001, '<')
P2 = [otu_key[int(i.split(',')[0])]+','+rna_key[int(i.split(',')[1])] +','+i.split(',')[2] for i in rpval.split('\n') ]
P3 = ','.join([ i+'\n' for i in P2]).replace('\n,','\n')[:-1]
len(P3.split('\n'))

zpvalue = 'OTU,transcr,value\n'+','.join([i+'\n'for i in P3.split('\n')[:100]]).replace('\n,','\n')
z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/TOP_otu-tr_pvalue.csv', 'w')
z.write(zpvalue)
z.close()

Below, the plotting code (commented out for now, since plotting does not work from the server)


In [53]:
Rf = reg(fpkmvsotu1)
Rpval = filt_(Rf.pvalue, 1.0, '<') #all the pvals 
Rrsq = filt_(Rf.rsquare, 0.0, '>') #all the rsquare

In [48]:
# This won't work from the server - for now
# Xp = []
# Yp = []
# Otu = []
# Rna = []
# for i in range(12):
#     Otu.append(Rpval.split('\n')[i].split(',')[0])
#     Rna.append(Rpval.split('\n')[i].split(',')[1])
#     Xp.append(otu_d[otu_key[int(Rpval.split('\n')[i].split(',')[0])]])
#     Yp.append(rna_d[rna_key[int(Rpval.split('\n')[i].split(',')[1])]])

# rng = zip(range(10), range(331,340))

# fig = plt.figure()
# fig.suptitle('Top 9 regressions based on p value ', fontsize=14, fontweight='bold')

# for i,j in rng:
#     ax = fig.add_subplot(j)
#     fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
#     #ax.set_title('axes title')
#     ax.set_xlabel(Otu[i])
#     ax.set_ylabel(Rna[i])
#     plt.scatter(Xp[i],Yp[i])
    
# plt.show()

In [49]:
# This won't work from the server - for now
# Xr = []
# Yr = []
# Otu = []
# Rna = []
# for i in range((len(Rrsq.split('\n')) - 12),len(Rrsq.split('\n'))):
#     Otu.append(Rrsq.split('\n')[i].split(',')[0])
#     Rna.append(Rrsq.split('\n')[i].split(',')[1])
#     Xr.append(otu_d[otu_key[int(Rrsq.split('\n')[i].split(',')[0])]])
#     Yr.append(rna_d[rna_key[int(Rrsq.split('\n')[i].split(',')[1])]])
    
# fig = plt.figure()
# fig.suptitle('Top 9 regressions based on R2 ', fontsize=14, fontweight='bold')

# for i,j in rng:
#     ax = fig.add_subplot(j)
#     fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
#     #ax.set_title('axes title')
#     ax.set_xlabel(Otu[i])
#     ax.set_ylabel(Rna[i])
#     plt.scatter(Xr[i],Yr[i])
    
# plt.show()

Below is what I ran on Apr 26, 2016


In [49]:
fpkmvsotu0 = open('/beegfs/group_dv/home/DValenzano/uBiome/data/corr.csv', 'rU').read()

uj = open('/beegfs/group_dv/home/DValenzano/uBiome/data/rnaseq_fpkm_genes_sorted.csv', 'rU').read()

tj = open('/beegfs/group_dv/home/DValenzano/uBiome/data/rnaseq_merged_otu_table_sorted.csv','rU').read()

rna = uj
otu = tj

rnal = [i.split(',') for i in rna.split('\n')] #that's a "l" as in "L"
otul = [i.split(',') for i in otu.split('\n')]

rnat = [ list(i) for i in zip(*rnal)]
otut = [list(i) for i in zip(*otul) ]

rnat_z = [ map(float, i[1:]) for i in rnat[1:] if i.count('0') < 7]
otut_z = [ map(float, i[1:]) for i in otut[1:] if i.count('0.0') < 7]


# In[10]:

rna_key = [i[0] for i in rnat[1:]  if i.count('0') < 7]
otu_key = [ i[0] for i in otut[1:] if i.count('0.0') < 7]

rna_value = rnat_z
otu_value = otut_z

rna_d = dict(zip(rna_key, rna_value))
otu_d = dict(zip(otu_key, otu_value))

In [50]:
fpkmvsotu1 = fpkmvsotu0.replace('LinregressResult', '').replace('slope=','').replace('intercept=', '').replace('rvalue=', '').replace('pvalue=', '').replace('stderr=', '')

In [59]:
Rf = reg(fpkmvsotu1)
Rpval001 = filt_(Rf.pvalue, 0.001, '<') #all the pvals < 0.001
Rrsq06 = filt_(Rf.rsquare, 0.6, '>') #all the rsquare > 0.6

In [54]:
Rf = reg(fpkmvsotu1)
Rpval = filt_(Rf.pvalue, 1.0, '<') #all the pvals 
Rrsq = filt_(Rf.rsquare, 0.0, '>') #all the rsquare

In [55]:
prova = Rf.rsquare
len(prova)


Out[55]:
65

In [56]:
Xp = []
Yp = []
Otu = []
Rna = []
for i in range(12):
    Otu.append(Rpval.split('\n')[i].split(',')[0])
    Rna.append(Rpval.split('\n')[i].split(',')[1])
    Xp.append(otu_d[otu_key[int(Rpval.split('\n')[i].split(',')[0])]])
    Yp.append(rna_d[rna_key[int(Rpval.split('\n')[i].split(',')[1])]])
    
rng = zip(range(10), range(331,340))

fig = plt.figure()
fig.suptitle('Top 9 regressions based on p value ', fontsize=14, fontweight='bold')

for i,j in rng:
    ax = fig.add_subplot(j)
    fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
    #ax.set_title('axes title')
    ax.set_xlabel(Otu[i])
    ax.set_ylabel(Rna[i])
    plt.scatter(Xp[i],Yp[i])
    
plt.show()


/software/Python/2.7.9/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [57]:
Xr = []
Yr = []
Otu = []
Rna = []
for i in range((len(Rrsq.split('\n')) - 12),len(Rrsq.split('\n'))):
    Otu.append(Rrsq.split('\n')[i].split(',')[0])
    Rna.append(Rrsq.split('\n')[i].split(',')[1])
    Xr.append(otu_d[otu_key[int(Rrsq.split('\n')[i].split(',')[0])]])
    Yr.append(rna_d[rna_key[int(Rrsq.split('\n')[i].split(',')[1])]])
    
fig = plt.figure()
fig.suptitle('Top 9 regressions based on R2 ', fontsize=14, fontweight='bold')

for i,j in rng:
    ax = fig.add_subplot(j)
    fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
    #ax.set_title('axes title')
    ax.set_xlabel(Otu[i])
    ax.set_ylabel(Rna[i])
    plt.scatter(Xr[i],Yr[i])
    
plt.show()

In [73]:
pval = 'OTU,transcript,pvalue\n'+','.join([ otu_key[int(i.split(',')[0])]+','+rna_key[int(i.split(',')[1])]+','+i.split(',')[2]+'\n' for i in Rpval.split('\n')]).replace('\n,','\n')
rsq = 'OTU,transcript,rsquare\n'+','.join([ otu_key[int(i.split(',')[0])]+','+rna_key[int(i.split(',')[1])]+','+i.split(',')[2]+'\n' for i in Rrsq.split('\n')]).replace('\n,','\n')

In [74]:
z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/all_p.csv', 'w')
z.write(pval)
z.close()

z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/all_r2.csv', 'w')
z.write(rsq)
z.close()

In [72]:
#The order of the individuals on each plot is:
order = [i[0] for i in uTs][1:]

27-Apr-2016 - Goal: to plot the correlations with the individuals from the various groups shown in different colors


In [86]:
Groups = [i[0][:-2] for i in uTs][1:] # this establishes the groups

In [62]:
Xr = []
Yr = []
Otu = []
Rna = []
for i in range((len(Rrsq.split('\n')) - 12),len(Rrsq.split('\n'))):
    Otu.append(Rrsq.split('\n')[i].split(',')[0])
    Rna.append(Rrsq.split('\n')[i].split(',')[1])
    Xr.append(otu_d[otu_key[int(Rrsq.split('\n')[i].split(',')[0])]])
    Yr.append(rna_d[rna_key[int(Rrsq.split('\n')[i].split(',')[1])]])
    
fig = plt.figure()
fig.suptitle('Top 9 regressions based on R2 ', fontsize=14, fontweight='bold')

for i,j in rng:
    ax = fig.add_subplot(j)
    fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
    #ax.set_title('axes title')
    ax.set_xlabel(Otu[i])
    ax.set_ylabel(Rna[i])
    plt.scatter(Xr[i],Yr[i])
    
plt.show()

Below, the group coloring works for a single plot


In [136]:
dfR = pd.DataFrame(dict(x=Xr[1], y=Yr[1], label=Groups))
groupsR = dfR.groupby('label')

# Plot
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in groupsR:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, label=name)
ax.legend()

plt.show()

Now I am trying to achieve the same for the subplots with the 9 top correlations. It works, although the legend
does not look too nice (a possible cleanup is sketched after the plot).


In [137]:
fig = plt.figure()
fig.suptitle('Top 9 regressions based on R2 ', fontsize=14, fontweight='bold')

for i,j in rng:
    ax = fig.add_subplot(j)
    fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
    ax.set_title('axes title')
    ax.set_xlabel(Otu[i])
    ax.set_ylabel(Rna[i])
    dfR = pd.DataFrame(dict(x=Xr[i], y=Yr[i], label=Groups))
    groupsR = dfR.groupby('label')
    for name, group in groupsR:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=6, label=name)
ax.legend()    
# plt.legend( loc = 'lower center', bbox_to_anchor = (0,-0.1,1,1),
#             bbox_transform = plt.gcf().transFigure )
plt.show()
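
One way to clean up the legend could be a single figure-level legend built from the handles of the last subplot, instead of calling ax.legend() on it (a sketch using matplotlib's get_legend_handles_labels, not run here):

handles, labels = ax.get_legend_handles_labels()   # one handle per group, taken from the last subplot
fig.legend(handles, labels, loc='lower center', ncol=len(set(labels)))
plt.show()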

The decision is that the top plots will be displayed in black and white, while the single ones can be displayed with the groups separated by color

Now I will work on the calculation of the q values


In [2]:
pval = open('/beegfs/group_dv/home/DValenzano/uBiome/data/all_p.csv', 'rU').read()
rsq = open('/beegfs/group_dv/home/DValenzano/uBiome/data/all_r2.csv', 'rU').read()

In [3]:
pvals = np.array([ float(i.split(',')[-1]) for i in pval.split('\n')[1:-1]])

In [27]:
from statsmodels.sandbox.stats.multicomp import multipletests #from here: http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.multipletests.html#statsmodels.sandbox.stats.multicomp.multipletests

In [23]:
prova = multipletests(pvals, alpha=0.5, method='fdr_tsbky', is_sorted=True, returnsorted=False)
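
multipletests returns a tuple (reject flags, corrected p-values, Sidak alpha, Bonferroni alpha); a minimal sketch of how I would read q values (FDR-adjusted p-values) off this result, not run above:

reject, pvals_corrected, alpha_sidak, alpha_bonf = prova
qvals = pvals_corrected                 # FDR-adjusted p-values, same order as the (already sorted) pvals
print sum(reject), 'comparisons pass at alpha = 0.5'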

In [31]:
import rpy2.robjects as robjects

In [32]:
robjects.r['pi'][0]


Out[32]:
3.141592653589793
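
Since rpy2 works, the q values could also be computed on the R side; a minimal sketch assuming R's p.adjust with the BH method is the intended route (my assumption, not stated above):

import rpy2.robjects as robjects
p_adjust = robjects.r['p.adjust']                                 # look up R's p.adjust function
q_r = list(p_adjust(robjects.FloatVector(pvals), method='BH'))    # BH-adjusted p-values as a Python list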

Below, the density plot for the p-value distribution


In [46]:
plt.hist(pvals, bins = 50, histtype='stepfilled', color= 'grey')
plt.title("p values density histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

In [47]:
r2vals = np.array([ float(i.split(',')[-1]) for i in rsq.split('\n')[1:-1]])
plt.hist(r2vals, bins = 50, histtype='stepfilled', color= 'grey')
plt.title("p values density histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

I am now going to log2-transform all the FPKM values and then recalculate the correlation


In [68]:
import math
#[ math.log(i)/math.log(2) for i in rna_d[rna_key[int(Rrsq.split('\n')[-2].split(',')[1])]]]
#rna_d[rna_key[int(Rrsq.split('\n')[-2].split(',')[1])]]

In [78]:
rnat_z2 = [ [ math.log(j+1)/math.log(2) for j in i ] for i in rnat_z ]

rna_value2 = rnat_z2
otu_value = otut_z

rna_d2 = dict(zip(rna_key, rna_value2))
otu_d2 = dict(zip(otu_key, otu_value))

def loop_one(m1,otu): # returns the full linregress result (slope, intercept, r, p, stderr) for every row of m1 against a given otu
    ls = []
    for i in m1:
        ls.append(stats.linregress(i, otu))    #this function returns slope, intercept, r_value, p_value, std_err
    return ','.join(map(str, ls))+'\n'

def loop_two(m1, m2):
    ls = []
    for j in m2:
        ls.append(loop_one(m1, j))
    return ','.join(ls).replace('\n,','\n')

fpkmvsotu2 = loop_two(rnat_z2, otut_z)
fpkmvsotu2_1 = fpkmvsotu2.replace('LinregressResult', '').replace('slope=','').replace('intercept=', '').replace('rvalue=', '').replace('pvalue=', '').replace('stderr=', '') 

z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/corr_log2.csv','w')
z.write(fpkmvsotu2_1)
z.close()

In [79]:
Rf2 = reg(fpkmvsotu2_1)
Rpval2 = filt_(Rf2.pvalue, 1.0, '<') #all the pvals 
Rrsq2 = filt_(Rf2.rsquare, 0.0, '>') #all the rsquare

In [160]:
Xp2 = []
Yp2 = []
Otu2 = []
Rna2 = []
for i in range(12):
    Otu2.append(Rpval2.split('\n')[i].split(',')[0])
    Rna2.append(Rpval2.split('\n')[i].split(',')[1])
    Xp2.append(otu_d2[otu_key[int(Rpval2.split('\n')[i].split(',')[0])]])
    Yp2.append(rna_d2[rna_key[int(Rpval2.split('\n')[i].split(',')[1])]])
    
rng = zip(range(10), range(331,340))

fig = plt.figure()
fig.suptitle('Top 9 regressions based on p value ', fontsize=14, fontweight='bold')

for i,j in rng:
    ax = fig.add_subplot(j)
    fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
    #ax.set_title('axes title')
    ax.set_xlabel(Otu2[i])
    ax.set_ylabel(Rna2[i])
    plt.scatter(Xp2[i],Yp2[i])
    
plt.show()

In [161]:
Xp2 = []
Yp2 = []
Otu2 = []
Rna2 = []
for i in range((len(Rrsq2.split('\n')) - 12),len(Rrsq2.split('\n'))):
    Otu2.append(Rrsq2.split('\n')[i].split(',')[0])
    Rna2.append(Rrsq2.split('\n')[i].split(',')[1])
    Xp2.append(otu_d2[otu_key[int(Rrsq2.split('\n')[i].split(',')[0])]])
    Yp2.append(rna_d2[rna_key[int(Rrsq2.split('\n')[i].split(',')[1])]])
    
rng = zip(range(10), range(331,340))

fig = plt.figure()
fig.suptitle('Top 9 regressions based on R square value ', fontsize=14, fontweight='bold')

for i,j in rng:
    ax = fig.add_subplot(j)
    fig.subplots_adjust(top=0.85, hspace=0.8, wspace=0.8)
    #ax.set_title('axes title')
    ax.set_xlabel(Otu2[i])
    ax.set_ylabel(Rna2[i])
    plt.scatter(Xp2[i],Yp2[i])
    
plt.show()

In [135]:
pval2 = 'OTU,transcript,pvalue\n'+','.join([ otu_key[int(i.split(',')[0])]+','+rna_key[int(i.split(',')[1])]+','+i.split(',')[2]+'\n' for i in Rpval2.split('\n')]).replace('\n,','\n')
rsq2 = 'OTU,transcript,rsquare\n'+','.join([ otu_key[int(i.split(',')[0])]+','+rna_key[int(i.split(',')[1])]+','+i.split(',')[2]+'\n' for i in Rrsq2.split('\n')]).replace('\n,','\n')

z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/all_p2.csv', 'w')
z.write(pval2)
z.close()

z = open('/beegfs/group_dv/home/DValenzano/uBiome/data/all_r2_2.csv', 'w')
z.write(rsq2)
z.close()

In [136]:
pvals2 = np.array([ float(i.split(',')[-1]) for i in pval2.split('\n')[1:-1]])
plt.hist(pvals2, bins = 50, histtype='stepfilled', color= 'grey')
plt.title("p values density histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

In [137]:
r2vals2 = np.array([ float(i.split(',')[-1]) for i in rsq2.split('\n')[1:-1]])
plt.hist(r2vals2, bins = 50, histtype='stepfilled', color= 'grey')
plt.title("R squared values density histogram")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

In [149]:
top_p2 = Rpval2.split('\n')[:100]
top_r2 = Rrsq2.split('\n')[-100:][::-1] #this reverses the list

In [150]:
p2_k = [','.join(i.split(',')[:2]) for i in top_p2]
p2_v = [ i.split(',')[2] for i in top_p2]
p2_d = dict(zip(p2_k, p2_v))

r2_k = [','.join(i.split(',')[:2]) for i in top_r2]
r2_v = [ i.split(',')[2] for i in top_r2]
r2_d = dict(zip(r2_k, r2_v))

In [155]:
from sets import Set
top = Set(r2_k) & Set(p2_k)


/beegfs/group_dv/home/DValenzano/.Python/2.7/lib/python2.7/site-packages/ipykernel/__main__.py:1: DeprecationWarning: the sets module is deprecated
  if __name__ == '__main__':

In [ ]: