In [1]:
import glob
import matplotlib as mpl
import re
import pandas as pd
In [2]:
import sys
# Print the Python interpreter version string for the kernel running this notebook.
print(sys.version)
In [3]:
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
In [4]:
# Shell escape: list the contents of the unused_reads directory two levels up.
!ls ../../unused_reads/
In [5]:
# Relative path to the unused_reads directory. It is appended to sys.path so the
# modules in it can be imported, and it is used as a prefix for paths passed to
# the cross-sample summaries near the end of the notebook.
module_dir = "../../unused_reads/"
In [6]:
# Import .py files that live outside this notebook's directory:
# first make that directory importable, then import from it.
# The append is guarded so that re-running this cell does not keep adding
# duplicate copies of the same entry to sys.path.
if module_dir not in sys.path:
    sys.path.append(module_dir)
import analysis
import unused_reads as ur
In [7]:
# Optional (currently disabled): uncomment to set pandas' display width to 1000.
# pd.set_option('display.width', 1000)
In [8]:
# Allow up to 1000 characters per column value when pandas renders a frame.
pd.set_option('display.max_colwidth', 1000)
In [9]:
# Directory for plots. It is built from module_dir (the same way the paths for the
# cross-sample summaries are) so the base location is spelled out only once; the
# resulting string is still '../../unused_reads/plots/'.
PLOT_DIR = module_dir + 'plots/'
# NOTE(review): ur.create_dir presumably creates the directory if it is missing;
# confirm in unused_reads.
ur.create_dir(PLOT_DIR)
In [10]:
# Get the loaded dataframes.
# run_analysis is called with plotting switched off; its return value is indexed by
# name below ('unspecific', 'unmapped').
df_dict = analysis.run_analysis(make_plots=False)
In [11]:
# Show the names under which results are stored.
df_dict.keys()
Out[11]:
In [12]:
# Pull the two result tables used in the rest of the notebook out of df_dict.
unspecific, unmapped = df_dict['unspecific'], df_dict['unmapped']
In [13]:
# Peek at the first two rows of the unmapped table.
unmapped.head(2)
Out[13]:
In [14]:
# List every distinct (sample, downsample granularity) combination present.
(
    unmapped
    .loc[:, ['sample', 'downsample granularity']]
    .drop_duplicates()
)
Out[14]:
analysis.plot_length_dist(analysis.unmapped, 'unmapped', analysis.PLOT_DIR)
analysis.plot_pident_dist(analysis.unmapped, 'unmapped', analysis.PLOT_DIR)
In [15]:
# Summarise the unmapped table with the settings below.
# NOTE(review): min_pident / min_length look like percent-identity and length
# cutoffs, and downsample_granularity like a selector for one downsampling level;
# confirm against analysis.summarise.
unmapped_summary = analysis.summarise(
    df=unmapped,
    min_pident=90,
    min_length=140,
    downsample_granularity=10000,
)
In [16]:
# Preview the first rows of the summary.
unmapped_summary.head()
Out[16]:
In [17]:
# Save the summary as CSV; the bare filename is resolved against the current
# working directory.
unmapped_summary.to_csv(path_or_buf="160324_explore_unmapped.csv")
In [18]:
# Reduce the summary to three identifying columns and drop repeated rows.
simplified_cols = ['stitle', 'qseqid', 'sample']
unmapped_simplified = unmapped_summary.loc[:, simplified_cols].drop_duplicates()
In [19]:
# Preview the first rows of the simplified table.
unmapped_simplified.head()
Out[19]:
In [20]:
### Look at just one sample
In [21]:
# Keep only the rows whose 'sample' value is 74_LOW10.
is_sample_74 = unmapped_simplified['sample'] == '74_LOW10'
unmapped_simp_74 = unmapped_simplified.loc[is_sample_74]
In [22]:
# Restrict the full unmapped table to sample 74_LOW10.
in_sample_74 = unmapped['sample'] == '74_LOW10'
test_df = unmapped.loc[in_sample_74]
# Report how many rows that sample has.
print(len(test_df))
# Hand the single-sample rows to the helper that looks for reads appearing more
# than once.
too_many = analysis.reads_appearing_more_than_once(test_df)
In [23]:
# Display what reads_appearing_more_than_once returned for sample 74_LOW10.
too_many
Out[23]:
In [25]:
# Cross-sample summary for the unmapped table. The second argument is module_dir
# followed by 'unmapped-final'.
# NOTE(review): presumably an output path or prefix; confirm in analysis.
unmapped_final_path = module_dir + 'unmapped-final'
analysis.summarise_results_across_samples(unmapped, unmapped_final_path)
In [26]:
# Cross-sample summary for the unspecific table. The second argument is module_dir
# followed by 'multiply_mapped-final'.
# NOTE(review): presumably an output path or prefix; confirm in analysis.
multiply_mapped_final_path = module_dir + 'multiply_mapped-final'
analysis.summarise_results_across_samples(unspecific, multiply_mapped_final_path)
In [ ]: