In [1]:
import glob
import matplotlib as mpl
import re

import pandas as pd

In [2]:
import sys
print(sys.version)


3.5.1 (v3.5.1:37a07cee5969, Dec  5 2015, 21:12:44) 
[GCC 4.2.1 (Apple Inc. build 5666) (dot 3)]

In [3]:
%matplotlib inline

In [4]:
!ls ../../unused_reads/


__pycache__               blast_unmapped.py         old                       unmapped-final
analysis.py               downsample_differently.py plots                     unused_reads.py
blast_multiply_mapped.py  multiply_mapped-final     test_run_pipeline.py      unused_reads.pyc

In [5]:
module_dir = "../../unused_reads/"

In [6]:
# Import .py file from a different path:
# first add the path to that dir to my path. 
sys.path.append(module_dir)
import analysis
import unused_reads as ur


/Users/janet/.virtualenvs/meta4/lib/python3.5/site-packages/matplotlib/__init__.py:1350: UserWarning:  This call to matplotlib.use() has no effect
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  warnings.warn(_use_error_msg)
import glob import re import matplotlib.pyplot as plt import matplotlib as mpl import pandas as pd import seaborn as sns

In [7]:
# pd.set_option('display.width', 1000)

In [8]:
pd.options.display.max_colwidth = 1000

In [9]:
PLOT_DIR = '../../unused_reads/plots/'
ur.create_dir(PLOT_DIR)

In [10]:
# Get the loaded dataframes. 
df_dict = analysis.run_analysis(make_plots=False)


['../../unused_reads/unmapped-final/blast_results/74_LOW10_10000-blasted.tsv', '../../unused_reads/unmapped-final/blast_results/70_HOW9_1000-blasted.tsv', '../../unused_reads/unmapped-final/blast_results/112_LOW13_10000-blasted.tsv', '../../unused_reads/unmapped-final/blast_results/19_HOW5_10000-blasted.tsv', '../../unused_reads/unmapped-final/blast_results/112_LOW13_1000-blasted.tsv', '../../unused_reads/unmapped-final/blast_results/70_HOW9_10000-blasted.tsv', '../../unused_reads/unmapped-final/blast_results/82_HOW10_10000-blasted.tsv', '../../unused_reads/unmapped-final/blast_results/57_HOW8_10000-blasted.tsv', '../../unused_reads/unmapped-final/blast_results/82_HOW10_1000-blasted.tsv', '../../unused_reads/unmapped-final/blast_results/32_HOW6_10000-blasted.tsv']
['../../unused_reads/multiply_mapped-final/blast_results/57_HOW8_100-blasted.tsv', '../../unused_reads/multiply_mapped-final/blast_results/19_HOW5_100-blasted.tsv', '../../unused_reads/multiply_mapped-final/blast_results/32_HOW6_100-blasted.tsv', '../../unused_reads/multiply_mapped-final/blast_results/74_LOW10_100-blasted.tsv', '../../unused_reads/multiply_mapped-final/blast_results/82_HOW10_100-blasted.tsv', '../../unused_reads/multiply_mapped-final/blast_results/70_HOW9_100-blasted.tsv', '../../unused_reads/multiply_mapped-final/blast_results/112_LOW13_100-blasted.tsv']

In [11]:
df_dict.keys()


Out[11]:
dict_keys(['unspecific', 'unmapped'])

In [12]:
unspecific = df_dict['unspecific']
unmapped = df_dict['unmapped']

Unmapped reads:


In [13]:
unmapped.head(2)


Out[13]:
stitle qseqid sseqid pident length evalue bitscore mismatch gapopen qstart qend sstart send sample downsample granularity
0 Bacillus subtilis subsp. subtilis strain CU1050, complete genome HISEQ15:132:C6FRCANXX:6:2115:13750:47625 gi|983204966|gb|CP014166.1| 100 151 7.000000e-75 291 0 0 1 151 3019253 3019103 74_LOW10 10000
1 Bacillus subtilis subsp. subtilis strain CU1050, complete genome HISEQ15:132:C6FRCANXX:6:2115:13750:47625 gi|983204966|gb|CP014166.1| 100 150 3.000000e-74 289 0 0 2 151 9885 10034 74_LOW10 10000

In [14]:
unmapped[['sample', 'downsample granularity']].drop_duplicates()


Out[14]:
sample downsample granularity
0 74_LOW10 10000
0 112_LOW13 10000
0 19_HOW5 10000
0 112_LOW13 1000
0 70_HOW9 10000
0 82_HOW10 10000
0 57_HOW8 10000
0 82_HOW10 1000
0 32_HOW6 10000

analysis.plot_length_dist(analysis.unmapped, 'unmapped', analysis.PLOT_DIR)

analysis.plot_pident_dist(analysis.unmapped, 'unmapped', analysis.PLOT_DIR)

Multiply Mapped Reads

Look at summary data


In [15]:
unmapped_summary = analysis.summarise(df=unmapped, min_pident=90, min_length=140, 
                       downsample_granularity=10000)


available downsampling granularities: [10000  1000]
fraction of rows removed for min percent_identity = 90: 96.70849516980381
fraction of rows removed for min length = 140: 53.39065959528345

In [16]:
unmapped_summary.head()


Out[16]:
stitle qseqid sseqid pident length evalue bitscore mismatch gapopen qstart qend sstart send sample downsample granularity
0 Bacillus subtilis subsp. subtilis strain CU1050, complete genome HISEQ15:132:C6FRCANXX:6:2115:13750:47625 gi|983204966|gb|CP014166.1| 100 151 7.000000e-75 291 0 0 1 151 3019253 3019103 74_LOW10 10000
1 Bacillus subtilis subsp. subtilis strain CU1050, complete genome HISEQ15:132:C6FRCANXX:6:2115:13750:47625 gi|983204966|gb|CP014166.1| 100 150 3.000000e-74 289 0 0 2 151 9885 10034 74_LOW10 10000
2 Bacillus subtilis subsp. subtilis strain CU1050, complete genome HISEQ15:132:C6FRCANXX:6:2115:13750:47625 gi|983204966|gb|CP014166.1| 100 150 3.000000e-74 289 0 0 2 151 30353 30502 74_LOW10 10000
3 Bacillus subtilis subsp. subtilis strain CU1050, complete genome HISEQ15:132:C6FRCANXX:6:2115:13750:47625 gi|983204966|gb|CP014166.1| 100 150 3.000000e-74 289 0 0 2 151 90610 90759 74_LOW10 10000
4 Bacillus subtilis subsp. subtilis strain CU1050, complete genome HISEQ15:132:C6FRCANXX:6:2115:13750:47625 gi|983204966|gb|CP014166.1| 100 150 3.000000e-74 289 0 0 2 151 96466 96615 74_LOW10 10000

In [17]:
unmapped_summary.to_csv("160324_explore_unmapped.csv")

In [18]:
unmapped_simplified = unmapped_summary[['stitle', 'qseqid', 'sample']].drop_duplicates()

In [19]:
unmapped_simplified.head()


Out[19]:
stitle qseqid sample
0 Bacillus subtilis subsp. subtilis strain CU1050, complete genome HISEQ15:132:C6FRCANXX:6:2115:13750:47625 74_LOW10
10 Uncultured bacterium clone KGV25 16S ribosomal RNA gene, partial sequence HISEQ15:132:C6FRCANXX:6:1207:11048:24151 74_LOW10
18 Salmonella enterica subsp. enterica serovar Enteritidis str. EC20110222 genome HISEQ15:132:C6FRCANXX:6:2314:2015:25645 74_LOW10
25 Uncultured bacterium clone F5K2Q4C04JXAH4 23S ribosomal RNA gene, partial sequence HISEQ15:132:C6FRCANXX:6:2204:9685:45387 74_LOW10
26 Bacillus subtilis subsp. natto strain CGMCC 2108, complete genome HISEQ15:132:C6FRCANXX:6:2215:1122:52758 74_LOW10

In [20]:
### Look at just one sample

In [21]:
unmapped_simp_74 = unmapped_simplified[unmapped_simplified['sample']=='74_LOW10']

In [22]:
test_df = unmapped[unmapped['sample']=='74_LOW10']
print(test_df.shape[0])
too_many = analysis.reads_appearing_more_than_once(test_df)


881
num unique query sequences: 232
number of sequences appearing more than 1 times: 92

In [23]:
too_many


Out[23]:
['HISEQ15:132:C6FRCANXX:6:2113:6832:2973',
 'HISEQ15:132:C6FRCANXX:6:1313:8157:65324',
 'HISEQ15:132:C6FRCANXX:6:2212:2298:10127',
 'HISEQ15:132:C6FRCANXX:6:1208:8018:45488',
 'HISEQ15:132:C6FRCANXX:6:1102:4303:85746',
 'HISEQ15:132:C6FRCANXX:6:2102:2404:56713',
 'HISEQ15:132:C6FRCANXX:6:2101:16533:46672',
 'HISEQ15:132:C6FRCANXX:6:1306:3764:14315',
 'HISEQ15:132:C6FRCANXX:6:1211:5050:78027',
 'HISEQ15:132:C6FRCANXX:6:2215:1122:52758',
 'HISEQ15:132:C6FRCANXX:6:2207:17475:60273',
 'HISEQ15:132:C6FRCANXX:6:1211:3942:20355',
 'HISEQ15:132:C6FRCANXX:6:2115:13750:47625',
 'HISEQ15:132:C6FRCANXX:6:1202:17885:46902',
 'HISEQ15:132:C6FRCANXX:6:2206:9478:97721',
 'HISEQ15:132:C6FRCANXX:6:1315:6899:17776',
 'HISEQ15:132:C6FRCANXX:6:1310:10372:15847',
 'HISEQ15:132:C6FRCANXX:6:1203:4528:52323',
 'HISEQ15:132:C6FRCANXX:6:2202:17495:67657',
 'HISEQ15:132:C6FRCANXX:6:1101:17669:99566',
 'HISEQ15:132:C6FRCANXX:6:1210:1583:59425',
 'HISEQ15:132:C6FRCANXX:6:1207:4615:24842',
 'HISEQ15:132:C6FRCANXX:6:1210:5090:42370',
 'HISEQ15:132:C6FRCANXX:6:1201:19938:21591',
 'HISEQ15:132:C6FRCANXX:6:2211:18799:9979',
 'HISEQ15:132:C6FRCANXX:6:1203:16007:93808',
 'HISEQ15:132:C6FRCANXX:6:2308:13378:54241',
 'HISEQ15:132:C6FRCANXX:6:1216:5031:84357',
 'HISEQ15:132:C6FRCANXX:6:1113:14854:67179',
 'HISEQ15:132:C6FRCANXX:6:2215:18151:22479',
 'HISEQ15:132:C6FRCANXX:6:1201:20089:84071',
 'HISEQ15:132:C6FRCANXX:6:2311:16259:95927',
 'HISEQ15:132:C6FRCANXX:6:2314:2015:25645',
 'HISEQ15:132:C6FRCANXX:6:2212:13457:16246',
 'HISEQ15:132:C6FRCANXX:6:2208:7063:44113',
 'HISEQ15:132:C6FRCANXX:6:2315:6234:65724',
 'HISEQ15:132:C6FRCANXX:6:1215:18126:73739',
 'HISEQ15:132:C6FRCANXX:6:1305:5311:88721',
 'HISEQ15:132:C6FRCANXX:6:2213:21072:47381',
 'HISEQ15:132:C6FRCANXX:6:1202:3083:5695',
 'HISEQ15:132:C6FRCANXX:6:2310:20051:96808',
 'HISEQ15:132:C6FRCANXX:6:2111:14750:42771',
 'HISEQ15:132:C6FRCANXX:6:1206:11050:42012',
 'HISEQ15:132:C6FRCANXX:6:1313:12832:84397',
 'HISEQ15:132:C6FRCANXX:6:2204:3666:22526',
 'HISEQ15:132:C6FRCANXX:6:2110:15232:60855',
 'HISEQ15:132:C6FRCANXX:6:1206:2085:62779',
 'HISEQ15:132:C6FRCANXX:6:1201:18389:63402',
 'HISEQ15:132:C6FRCANXX:6:1303:5539:99913',
 'HISEQ15:132:C6FRCANXX:6:2313:20815:91615',
 'HISEQ15:132:C6FRCANXX:6:1109:16587:80695',
 'HISEQ15:132:C6FRCANXX:6:1209:8394:82584',
 'HISEQ15:132:C6FRCANXX:6:2216:10862:55008',
 'HISEQ15:132:C6FRCANXX:6:1312:10235:50541',
 'HISEQ15:132:C6FRCANXX:6:1113:4329:11043',
 'HISEQ15:132:C6FRCANXX:6:1207:17025:45395',
 'HISEQ15:132:C6FRCANXX:6:1309:6297:75037',
 'HISEQ15:132:C6FRCANXX:6:1207:6024:4939',
 'HISEQ15:132:C6FRCANXX:6:1205:16024:79442',
 'HISEQ15:132:C6FRCANXX:6:1309:5471:34745',
 'HISEQ15:132:C6FRCANXX:6:2112:14688:23155',
 'HISEQ15:132:C6FRCANXX:6:2116:15801:47261',
 'HISEQ15:132:C6FRCANXX:6:1107:3442:24751',
 'HISEQ15:132:C6FRCANXX:6:1210:16730:100293',
 'HISEQ15:132:C6FRCANXX:6:2211:11757:70030',
 'HISEQ15:132:C6FRCANXX:6:2201:13137:25091',
 'HISEQ15:132:C6FRCANXX:6:2108:6961:75310',
 'HISEQ15:132:C6FRCANXX:6:1302:13224:29216',
 'HISEQ15:132:C6FRCANXX:6:1313:15476:46434',
 'HISEQ15:132:C6FRCANXX:6:2312:15664:35396',
 'HISEQ15:132:C6FRCANXX:6:1113:4247:85868',
 'HISEQ15:132:C6FRCANXX:6:2202:12774:90034',
 'HISEQ15:132:C6FRCANXX:6:2112:10373:70650',
 'HISEQ15:132:C6FRCANXX:6:2211:11399:89838',
 'HISEQ15:132:C6FRCANXX:6:2313:5002:34452',
 'HISEQ15:132:C6FRCANXX:6:2108:10791:96131',
 'HISEQ15:132:C6FRCANXX:6:1302:14421:71099',
 'HISEQ15:132:C6FRCANXX:6:2214:9523:83932',
 'HISEQ15:132:C6FRCANXX:6:2308:3357:12768',
 'HISEQ15:132:C6FRCANXX:6:2216:14101:74064',
 'HISEQ15:132:C6FRCANXX:6:1215:4757:55477',
 'HISEQ15:132:C6FRCANXX:6:2312:8972:94302',
 'HISEQ15:132:C6FRCANXX:6:1116:13991:9296',
 'HISEQ15:132:C6FRCANXX:6:2306:11276:24123',
 'HISEQ15:132:C6FRCANXX:6:2113:9911:41723',
 'HISEQ15:132:C6FRCANXX:6:2305:18989:60301',
 'HISEQ15:132:C6FRCANXX:6:1116:14465:81998',
 'HISEQ15:132:C6FRCANXX:6:2212:9281:88097',
 'HISEQ15:132:C6FRCANXX:6:1114:11159:98958',
 'HISEQ15:132:C6FRCANXX:6:1213:11593:85396',
 'HISEQ15:132:C6FRCANXX:6:2202:21073:12526',
 'HISEQ15:132:C6FRCANXX:6:2105:13519:39410']
analysis.save_summary(unmapped[unmapped['sample']=='74_LOW10'], '74_LOW10', module_dir + 'results/')

Apply save_summary for each sample.

for desc, df in unmapped.groupby(['sample', 'downsample granularity']): print(desc) analysis.save_summary(df, '{}_{}'.format(desc[0], desc[1]), module_dir + 'unmapped-final' + '/results_summary/' + '/downsample_{}/'.format(desc[1]))

In [25]:
analysis.summarise_results_across_samples(unmapped, module_dir + 'unmapped-final' )


('112_LOW13', 1000)
number of reads: 22568
Was more than one result per read returned?  22568 vs 22568 rows
save file at filepath ../../unused_reads/unmapped-final/results_summary//downsample_1000/
('112_LOW13', 10000)
number of reads: 2244
Was more than one result per read returned?  2244 vs 2244 rows
save file at filepath ../../unused_reads/unmapped-final/results_summary//downsample_10000/
('19_HOW5', 10000)
number of reads: 3202
Was more than one result per read returned?  3202 vs 3202 rows
save file at filepath ../../unused_reads/unmapped-final/results_summary//downsample_10000/
('32_HOW6', 10000)
number of reads: 1163
Was more than one result per read returned?  1163 vs 1163 rows
save file at filepath ../../unused_reads/unmapped-final/results_summary//downsample_10000/
('57_HOW8', 10000)
number of reads: 1662
Was more than one result per read returned?  1662 vs 1662 rows
save file at filepath ../../unused_reads/unmapped-final/results_summary//downsample_10000/
('70_HOW9', 10000)
number of reads: 3957
Was more than one result per read returned?  3957 vs 3957 rows
save file at filepath ../../unused_reads/unmapped-final/results_summary//downsample_10000/
('74_LOW10', 10000)
number of reads: 232
Was more than one result per read returned?  232 vs 232 rows
save file at filepath ../../unused_reads/unmapped-final/results_summary//downsample_10000/
('82_HOW10', 1000)
number of reads: 20499
Was more than one result per read returned?  20499 vs 20499 rows
save file at filepath ../../unused_reads/unmapped-final/results_summary//downsample_1000/
('82_HOW10', 10000)
number of reads: 2063
Was more than one result per read returned?  2063 vs 2063 rows
save file at filepath ../../unused_reads/unmapped-final/results_summary//downsample_10000/

In [26]:
analysis.summarise_results_across_samples(unspecific, module_dir + 'multiply_mapped-final' )


('112_LOW13', 100)
number of reads: 1413
Was more than one result per read returned?  1413 vs 1413 rows
save file at filepath ../../unused_reads/unspecific-final/results_summary//downsample_100/
('19_HOW5', 100)
number of reads: 458
Was more than one result per read returned?  458 vs 458 rows
save file at filepath ../../unused_reads/unspecific-final/results_summary//downsample_100/
('32_HOW6', 100)
number of reads: 2197
Was more than one result per read returned?  2197 vs 2197 rows
save file at filepath ../../unused_reads/unspecific-final/results_summary//downsample_100/
('57_HOW8', 100)
number of reads: 1895
Was more than one result per read returned?  1895 vs 1895 rows
save file at filepath ../../unused_reads/unspecific-final/results_summary//downsample_100/
('70_HOW9', 100)
number of reads: 844
Was more than one result per read returned?  844 vs 844 rows
save file at filepath ../../unused_reads/unspecific-final/results_summary//downsample_100/
('74_LOW10', 100)
number of reads: 1936
Was more than one result per read returned?  1936 vs 1936 rows
save file at filepath ../../unused_reads/unspecific-final/results_summary//downsample_100/
('82_HOW10', 100)
number of reads: 1562
Was more than one result per read returned?  1562 vs 1562 rows
save file at filepath ../../unused_reads/unspecific-final/results_summary//downsample_100/

In [ ]: