The next cell will get a ~65 MB data file 'sequence.index', you only need to run the cell once


In [1]:
!rm sequence.index 2>/dev/null
!wget -nd ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/sequence.index


--2015-05-13 12:58:30--  ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/sequence.index
           => ‘sequence.index’
Resolving ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)... 193.62.192.8
Connecting to ftp.1000genomes.ebi.ac.uk (ftp.1000genomes.ebi.ac.uk)|193.62.192.8|:21... connected.
Logging in as anonymous ... Logged in!
==> SYST ... done.    ==> PWD ... done.
==> TYPE I ... done.  ==> CWD (1) /vol1/ftp ... done.
==> SIZE sequence.index ... 67069489
==> PASV ... done.    ==> RETR sequence.index ... done.
Length: 67069489 (64M) (unauthoritative)

sequence.index      100%[=====================>]  63.96M  2.26MB/s   in 20s    

2015-05-13 12:58:52 (3.16 MB/s) - ‘sequence.index’ saved [67069489]

Interfacing with R


In [1]:
import os

from IPython.display import Image

import rpy2.robjects as robjects
import rpy2.robjects.lib.ggplot2 as ggplot2
from rpy2.robjects.functions import SignatureTranslatedFunction

import pandas as pd
import pandas.rpy.common as pd_common


/home/tra/anaconda3/envs/bio2/lib/python2.7/site-packages/pandas/rpy/__init__.py:8: FutureWarning: The pandas.rpy module is deprecated and will be removed in a future version. We refer to external packages like rpy2, found here: http://rpy.sourceforge.net
  "like rpy2, found here: http://rpy.sourceforge.net", FutureWarning)

In [2]:
read_delim = robjects.r('read.delim')
seq_data = read_delim('sequence.index', header=True, stringsAsFactors=False)
#In R:
#  seq.data <- read.delim('sequence.index', header=TRUE, stringsAsFactors=FALSE)

In [3]:
print('This data frame has %d columns and %d rows' % (seq_data.ncol, seq_data.nrow))
print(seq_data.colnames)
#In R:
#  print(colnames(seq.data))
#  print(nrow(seq.data))
#  print(ncol(seq.data))

print('Columns in Python %d ' % robjects.r.ncol(seq_data)[0])

#access some functions
as_integer = robjects.r('as.integer')
match = robjects.r.match

my_col = match('READ_COUNT', seq_data.colnames)[0] # Vector returned
print('Type of read count before as.integer: %s' % seq_data[my_col - 1].rclass[0])
seq_data[my_col - 1] = as_integer(seq_data[my_col - 1])
print('Type of read count after as.integer: %s' % seq_data[my_col - 1].rclass[0])

my_col = match('BASE_COUNT', seq_data.colnames)[0] # Vector returned
seq_data[my_col - 1] = as_integer(seq_data[my_col - 1])

my_col = match('CENTER_NAME', seq_data.colnames)[0]
seq_data[my_col - 1] = robjects.r.toupper(seq_data[my_col - 1])
robjects.r.assign('seq.data', seq_data)
robjects.r('print(c("Column names in R: ",colnames(seq.data)))')

robjects.r('seq.data <- seq.data[seq.data$WITHDRAWN==0, ]')
#Lets remove all withdrawn sequences

robjects.r("seq.data <- seq.data[, c('STUDY_ID', 'STUDY_NAME', 'CENTER_NAME', 'SAMPLE_ID', 'SAMPLE_NAME', 'POPULATION', 'INSTRUMENT_PLATFORM', 'LIBRARY_LAYOUT', 'PAIRED_FASTQ', 'READ_COUNT', 'BASE_COUNT', 'ANALYSIS_GROUP')]")
#Lets shorten the dataframe

#Population as factor
robjects.r('seq.data$POPULATION <- as.factor(seq.data$POPULATION)')


This data frame has 26 columns and 187720 rows
 [1] "FASTQ_FILE"          "MD5"                 "RUN_ID"             
 [4] "STUDY_ID"            "STUDY_NAME"          "CENTER_NAME"        
 [7] "SUBMISSION_ID"       "SUBMISSION_DATE"     "SAMPLE_ID"          
[10] "SAMPLE_NAME"         "POPULATION"          "EXPERIMENT_ID"      
[13] "INSTRUMENT_PLATFORM" "INSTRUMENT_MODEL"    "LIBRARY_NAME"       
[16] "RUN_NAME"            "RUN_BLOCK_NAME"      "INSERT_SIZE"        
[19] "LIBRARY_LAYOUT"      "PAIRED_FASTQ"        "WITHDRAWN"          
[22] "WITHDRAWN_DATE"      "COMMENT"             "READ_COUNT"         
[25] "BASE_COUNT"          "ANALYSIS_GROUP"     

Columns in Python 26 
Type of read count before as.integer: character
Type of read count after as.integer: integer
 [1] "Column names in R: " "FASTQ_FILE"          "MD5"                
 [4] "RUN_ID"              "STUDY_ID"            "STUDY_NAME"         
 [7] "CENTER_NAME"         "SUBMISSION_ID"       "SUBMISSION_DATE"    
[10] "SAMPLE_ID"           "SAMPLE_NAME"         "POPULATION"         
[13] "EXPERIMENT_ID"       "INSTRUMENT_PLATFORM" "INSTRUMENT_MODEL"   
[16] "LIBRARY_NAME"        "RUN_NAME"            "RUN_BLOCK_NAME"     
[19] "INSERT_SIZE"         "LIBRARY_LAYOUT"      "PAIRED_FASTQ"       
[22] "WITHDRAWN"           "WITHDRAWN_DATE"      "COMMENT"            
[25] "READ_COUNT"          "BASE_COUNT"          "ANALYSIS_GROUP"     
Out[3]:
<FactorVector - Python:0x7f1f9b918f80 / R:0xce97870>
[      27,       27,       27, ...,       25,       25,       25]

In [4]:
ggplot2.theme = SignatureTranslatedFunction(ggplot2.theme,
                                            init_prm_translate = {'axis_text_x': 'axis.text.x'})
bar = ggplot2.ggplot(seq_data) + ggplot2.geom_bar() + ggplot2.aes_string(x='CENTER_NAME') + ggplot2.theme(axis_text_x=ggplot2.element_text(angle=90, hjust=1))
robjects.r.png('out.png')
bar.plot()
dev_off = robjects.r('dev.off')
dev_off()
Image(filename='out.png')


Out[4]:

In [5]:
#Get Yoruba and CEU
robjects.r('yri_ceu <- seq.data[seq.data$POPULATION %in% c("YRI", "CEU") & seq.data$BASE_COUNT < 2E9 & seq.data$READ_COUNT < 3E7, ]')
yri_ceu = robjects.r('yri_ceu')

In [6]:
scatter = ggplot2.ggplot(yri_ceu) + ggplot2.aes_string(x='BASE_COUNT', y='READ_COUNT', shape='factor(POPULATION)', col='factor(ANALYSIS_GROUP)') + ggplot2.geom_point()
robjects.r.png('out.png')
scatter.plot()
dev_off = robjects.r('dev.off')
dev_off()
Image(filename='out.png')


Out[6]:

In [7]:
pd_yri_ceu = pd_common.load_data('yri_ceu')
print(type(pd_yri_ceu))
pd_yri_ceu


<class 'pandas.core.frame.DataFrame'>
Out[7]:
STUDY_ID STUDY_NAME CENTER_NAME SAMPLE_ID SAMPLE_NAME POPULATION INSTRUMENT_PLATFORM LIBRARY_LAYOUT PAIRED_FASTQ READ_COUNT BASE_COUNT ANALYSIS_GROUP
1 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 9280498 334097928 high coverage
2 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 9571982 344591352 high coverage
3 SRP000032 1000Genomes Project Pilot 2 BGI SRS000214 NA19240 YRI ILLUMINA PAIRED 149044 5365584 high coverage
4 SRP000032 1000Genomes Project Pilot 2 BGI SRS000214 NA19240 YRI ILLUMINA PAIRED data/NA19240/sequence_read/ERR000020_2.filt.fa... 2057690 74076840 high coverage
5 SRP000032 1000Genomes Project Pilot 2 BGI SRS000214 NA19240 YRI ILLUMINA PAIRED data/NA19240/sequence_read/ERR000020_1.filt.fa... 2057690 74076840 high coverage
6 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 9388168 337974048 high coverage
7 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 7762958 279466488 high coverage
8 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 9625450 385018000 high coverage
9 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 8808642 317111112 high coverage
10 SRP000032 1000Genomes Project Pilot 2 BGI SRS000214 NA19240 YRI ILLUMINA PAIRED 15187 683415 high coverage
11 SRP000032 1000Genomes Project Pilot 2 BGI SRS000214 NA19240 YRI ILLUMINA PAIRED data/NA19240/sequence_read/ERR000025_2.filt.fa... 2159324 97169580 high coverage
12 SRP000032 1000Genomes Project Pilot 2 BGI SRS000214 NA19240 YRI ILLUMINA PAIRED data/NA19240/sequence_read/ERR000025_1.filt.fa... 2159324 97169580 high coverage
13 SRP000032 1000Genomes Project Pilot 2 BGI SRS000213 NA19239 YRI ILLUMINA PAIRED 59312 2669040 high coverage
14 SRP000032 1000Genomes Project Pilot 2 BGI SRS000213 NA19239 YRI ILLUMINA PAIRED data/NA19239/sequence_read/ERR000027_2.filt.fa... 5080128 228605760 high coverage
15 SRP000032 1000Genomes Project Pilot 2 BGI SRS000213 NA19239 YRI ILLUMINA PAIRED data/NA19239/sequence_read/ERR000027_1.filt.fa... 5080128 228605760 high coverage
16 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 11752662 423095832 high coverage
17 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA PAIRED 229179 10313055 high coverage
18 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA PAIRED data/NA19238/sequence_read/ERR000030_2.filt.fa... 7692812 346176540 high coverage
19 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA PAIRED data/NA19238/sequence_read/ERR000030_1.filt.fa... 7692812 346176540 high coverage
20 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 11402532 410491152 high coverage
21 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 6777368 243985248 high coverage
22 SRP000032 1000Genomes Project Pilot 2 BGI SRS000213 NA19239 YRI ILLUMINA PAIRED 78918 2841048 high coverage
23 SRP000032 1000Genomes Project Pilot 2 BGI SRS000213 NA19239 YRI ILLUMINA PAIRED data/NA19239/sequence_read/ERR000034_2.filt.fa... 1131253 40725108 high coverage
24 SRP000032 1000Genomes Project Pilot 2 BGI SRS000213 NA19239 YRI ILLUMINA PAIRED data/NA19239/sequence_read/ERR000034_1.filt.fa... 1131253 40725108 high coverage
25 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 12013717 432493812 high coverage
26 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 8045886 289651896 high coverage
27 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 9081298 326926728 high coverage
28 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 10130502 364698072 high coverage
29 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 8632879 310783644 high coverage
30 SRP000032 1000Genomes Project Pilot 2 BGI SRS000212 NA19238 YRI ILLUMINA SINGLE 8108919 291921084 high coverage
... ... ... ... ... ... ... ... ... ... ... ... ...
174775 SRP000547 1000 Genomes CEPH (Utah residents with ancestr... BI SRS000037 NA07347 CEU ILLUMINA PAIRED 286010 28887010 low coverage
174776 SRP000547 1000 Genomes CEPH (Utah residents with ancestr... BI SRS000037 NA07347 CEU ILLUMINA PAIRED data/NA07347/sequence_read/SRR793094_2.filt.fa... 11761994 1187961394 low coverage
174777 SRP000547 1000 Genomes CEPH (Utah residents with ancestr... BI SRS000037 NA07347 CEU ILLUMINA PAIRED data/NA07347/sequence_read/SRR793094_1.filt.fa... 11761994 1187961394 low coverage
177994 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000198 NA19153 YRI ILLUMINA PAIRED 821518 82151800 low coverage
178036 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000205 NA19201 YRI ILLUMINA PAIRED 61270 6127000 low coverage
178039 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000208 NA19207 YRI ILLUMINA PAIRED 61280 6128000 low coverage
178054 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000212 NA19238 YRI ILLUMINA PAIRED 55356 5535600 low coverage
178063 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000209 NA19209 YRI ILLUMINA PAIRED 67550 6755000 low coverage
178081 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000213 NA19239 YRI ILLUMINA PAIRED 51922 5192200 low coverage
178084 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000213 NA19239 YRI ILLUMINA PAIRED 56870 5687000 low coverage
178096 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000195 NA19144 YRI ILLUMINA PAIRED 873195 87319500 low coverage
178099 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000195 NA19144 YRI ILLUMINA PAIRED 821999 82199900 low coverage
178117 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000214 NA19240 YRI ILLUMINA PAIRED 50397 5039700 low coverage
178135 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000214 NA19240 YRI ILLUMINA PAIRED 55358 5535800 low coverage
178144 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000207 NA19206 YRI ILLUMINA PAIRED 64122 6412200 low coverage
178171 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000199 NA19159 YRI ILLUMINA PAIRED 72884 7288400 low coverage
178183 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000182 NA19098 YRI ILLUMINA PAIRED 980198 98019800 low coverage
178189 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000182 NA19098 YRI ILLUMINA PAIRED 1062464 106246400 low coverage
178225 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000198 NA19153 YRI ILLUMINA PAIRED 785777 78577700 low coverage
178282 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000212 NA19238 YRI ILLUMINA PAIRED 59886 5988600 low coverage
178312 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000205 NA19201 YRI ILLUMINA PAIRED 63977 6397700 low coverage
178315 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000208 NA19207 YRI ILLUMINA PAIRED 63850 6385000 low coverage
178324 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000209 NA19209 YRI ILLUMINA PAIRED 63639 6363900 low coverage
178327 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000207 NA19206 YRI ILLUMINA PAIRED 67132 6713200 low coverage
178480 SRP000542 1000 Genomes YRI Yoruba population sequencing WUGSC SRS000199 NA19159 YRI ILLUMINA PAIRED 68438 6843800 low coverage
178750 SRP004078 1000 Genomes CEPH (Utah residents with ancestr... WUGSC SRS000075 NA12750 CEU ILLUMINA PAIRED 67944 6794400 exome
178759 SRP004078 1000 Genomes CEPH (Utah residents with ancestr... WUGSC SRS000075 NA12750 CEU ILLUMINA PAIRED 44974 4497400 exome
184746 SRP000542 1000 Genomes YRI Yoruba population sequencing BI SRS000096 NA18499 YRI ILLUMINA PAIRED 871763 88048063 low coverage
184747 SRP000542 1000 Genomes YRI Yoruba population sequencing BI SRS000096 NA18499 YRI ILLUMINA PAIRED data/NA18499/sequence_read/SRR797225_2.filt.fa... 8623980 871021980 low coverage
184748 SRP000542 1000 Genomes YRI Yoruba population sequencing BI SRS000096 NA18499 YRI ILLUMINA PAIRED data/NA18499/sequence_read/SRR797225_1.filt.fa... 8623980 871021980 low coverage

24949 rows × 12 columns


In [8]:
del pd_yri_ceu['PAIRED_FASTQ']
no_paired = pd_common.convert_to_r_dataframe(pd_yri_ceu)
robjects.r.assign('no.paired', no_paired)
robjects.r("print(colnames(no.paired))")


 [1] "STUDY_ID"            "STUDY_NAME"          "CENTER_NAME"        
 [4] "SAMPLE_ID"           "SAMPLE_NAME"         "POPULATION"         
 [7] "INSTRUMENT_PLATFORM" "LIBRARY_LAYOUT"      "READ_COUNT"         
[10] "BASE_COUNT"          "ANALYSIS_GROUP"     
Out[8]:
<StrVector - Python:0x7f1f96169878 / R:0x6362e38>
[str, str, str, ..., str, str, str]

In [ ]: