Trying to generate a plot showing the distribution of metagenome sizes by year, and hoping to do this by pulling data with the EBI restful API.
Some links that work:
the studies I care about - ena.xml
file used below was downloaded from this page.
In [1]:
import numpy as np
import matplotlib
matplotlib.use('agg')
import pandas as pd
%pylab inline
In [2]:
!grep PRIMARY_ID ena.xml | wc -l
In [3]:
!grep PRIMARY_ID ena.xml | head
In [4]:
study_ids = !grep PRIMARY_ID ena.xml
print len(study_ids)
print study_ids[:10]
study_ids = [e.strip()[12:21] for e in study_ids]
print len(study_ids)
print study_ids[:10]
In [5]:
data = []
for e in study_ids:
url = """'http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession="%s"&result=read_run&fields=study_accession,sample_accession,run_accession,instrument_model,read_count,first_public'""" % e
current_data = !curl $url
data += [l.strip().split('\t') for l in current_data if l.startswith(e)]
In [6]:
columns = 'study_accession\tsample_accession\trun_accession\tinstrument_model\tread_count\tfirst_public'.split('\t')
print columns
In [7]:
df = pd.DataFrame(data, columns=columns)
df
Out[7]:
In [8]:
years = []
for d in df['first_public']:
if d:
years.append(int(d.split('-')[0]))
else:
years.append(None)
read_counts = []
for rc in df['read_count']:
if rc:
read_counts.append(int(rc))
else:
read_counts.append(None)
df['year-public'] = years
df['read_count'] = read_counts
df.dropna(subset=('year-public', 'read_count'), inplace=True)
#df.to_csv('read_count_by_sample.csv')
ax = df.boxplot(column="read_count", by="year-public")
ax.set_ylim(0, 1000000)
Out[8]:
In [97]:
s = pd.DataFrame([df.groupby(by="study_accession", as_index=True)['read_count'].mean(),
df.groupby(by="study_accession", as_index=True)['year-public'].max(),
df.groupby(by="study_accession", as_index=True)['sample_accession'].count()])
s.index = ("Mean number of reads per sample (per study)", "Year of study publication", "Number of samples (per study)")
# uncomment to write results to file:
#s.to_csv('run_count_by_study.csv')
In [81]:
s
Out[81]:
In [95]:
ax = s.T.boxplot(column="Number of samples (per study)", by="Year of study publication", grid=False)
ax.set_ylim(0, 250)
ax.set_xlabel("Year of study publication")
ax.set_title("")
ax.set_ylabel("Number of samples per study")
Out[95]:
In [96]:
ax = s.T.boxplot(column="Mean number of reads per sample (per study)", by="Year of study publication", grid=False)
ax.set_ylim(0, 10000000)
ax.set_xlabel("Year of study publication")
ax.set_title("")
ax.set_ylabel("Mean number of reads per sample (per study)")
Out[96]:
In [89]:
s.T.groupby("Year of study publication").max()
Out[89]: