In [1]:
import glob
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
In [2]:
# Collect the per-study sequence-length files.
# glob is used instead of `!ls` so the cell is plain Python, works without a
# POSIX shell, and yields an empty list (rather than a captured `ls` error
# message masquerading as a path) when nothing matches.
# sorted() keeps the listing order deterministic, as `ls` did.
files = sorted(glob.glob('/path/to/metadata-sequences/*_filtered_seqs.length.txt'))
In [4]:
# Build `df`: one column of describe() statistics (count, mean, std, min,
# quartiles, max) per study, with the column named after the numeric study ID
# taken from the file name.
study_path_pattern = re.compile(
    r'/path/to/metadata-sequences/([0-9]*)_filtered_seqs\.length\.txt')

per_study_stats = []
for path in files:
    match = study_path_pattern.match(path)
    if match is None:
        # Fail with a readable message instead of AttributeError on None.
        raise ValueError('cannot extract study ID from path: %s' % path)
    study = match.group(1)

    # Each file is read as a header-less CSV; assigning a single column name
    # requires it to contain exactly one column.
    df_single = pd.read_csv(path, header=None)
    df_single.columns = [study]
    per_study_stats.append(df_single.describe())

# Concatenate once at the end: growing a frame with pd.concat inside the loop
# re-copies all previous columns on every iteration.
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(per_study_stats, axis=1) if per_study_stats else pd.DataFrame()
In [5]:
# Write the summary table to disk; the index column header is 'study_id'.
df.to_csv(
    '/path/to/metadata-sequences/length_filtered_seqs_adaptor_cleanup.csv',
    index_label='study_id',
)
In [6]:
# Display the summary table: rows are describe() statistics, columns are study IDs.
df
Out[6]:
In [8]:
# How many studies share each (rounded) median sequence length.
median_per_study = df.loc['50%']
median_per_study.round().value_counts()
Out[8]:
In [10]:
# The five studies with the smallest median sequence length.
medians_ascending = df.loc['50%'].sort_values()
medians_ascending.head()
Out[10]:
In [11]:
# Histogram of the per-study median sequence length, one unit-wide bin per
# integer length, saved as a PDF.
fig, ax = plt.subplots()
print('sequence length after adaptor clean-up -- %s EMP studies' % df.shape[1])

median_lengths = df.loc['50%'].round()

# Bin edges sit on half-integers so every bar is centred on an integer length.
# The range is taken from the medians being plotted; deriving it from the
# 'mean' row would silently drop any study whose median lies outside the
# range of the means.
bin_edges = np.arange(median_lengths.min() - 0.5, median_lengths.max() + 1.5, 1)

# NOTE(review): bottom=-0.01 presumably nudges the bar bases just below the
# x-axis so they render cleanly -- confirm the intent.
counts = ax.hist(median_lengths, bins=bin_edges, bottom=-0.01)[0]

ax.set_xlabel('Median sequence length of sequences in study (bp)', fontsize=11)
ax.set_ylabel('Number of studies', fontsize=11)
ax.tick_params(axis='both', labelsize=10)

# Keep the original 0-50 y-range, but grow it when a bin holds more than 50
# studies so that tall bars are not clipped.
y_top = 50
if counts.max() > y_top:
    y_top = counts.max() * 1.05
ax.set_ylim([0, y_top])

fig.savefig('/path/to/metadata-sequences/median_length_filtered_seqs_adaptor_cleanup.pdf')