import sys


In [1]:
import pandas as pd

In [2]:
import seaborn as sns

In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt

In [4]:
%matplotlib inline

In [5]:
import aggregate_mummer_results

In [6]:
# Load the tab-separated percent-identity table that the cells below work from.
full_data = pd.read_csv("percent_identities.tsv", sep='\t')

In [7]:
full_data.head(3)


Out[7]:
Unnamed: 0 mummer file ref bin query bin query name query contig count query bin name ref name ref contig count ref bin name ... query contigs query file name ref bp ref contigs ref file name % identity query alignment length total number alignments aggregated frac of query aligned estimated % identity
0 0 Rhodocyclaceae-49_Ga0081631_to_Methylophilacea... Ga0081624 Ga0081631 Rhodocyclaceae-49 190 Rhodocyclaceae-49_Ga0081631 Methylophilaceae-40 240 Methylophilaceae-40_Ga0081624 ... 189 Rhodocyclaceae-49_Ga0081631.fasta 1297884 239 Methylophilaceae-40_Ga0081624.fasta 86.967273 286 2 0.000077 0.006723
1 0 Methylosarcina_lacus-69_Ga0081643_to_Acidovora... Ga0081609 Ga0081643 Methylosarcina lacus-69 217 Methylosarcina_lacus-69_Ga0081643 Acidovorax-123 18 Acidovorax-123_Ga0081609 ... 216 Methylosarcina_lacus-69_Ga0081643.fasta 2720811 17 Acidovorax-123_Ga0081609.fasta 82.767176 1388 3 0.000339 0.028097
2 0 Methylosarcina_lacus-55_Ga0081634_to_Acidovora... Ga0081651 Ga0081634 Methylosarcina lacus-55 443 Methylosarcina_lacus-55_Ga0081634 Acidovorax-79 98 Acidovorax-79_Ga0081651 ... 442 Methylosarcina_lacus-55_Ga0081634.fasta 2768114 97 Acidovorax-79_Ga0081651.fasta 79.679943 1048 2 0.000563 0.044885

3 rows × 21 columns


In [8]:
# Distinct query organism names, in order of first appearance in the table.
organism_names = full_data['query name'].unique()

In [9]:
organism_names


Out[9]:
array(['Rhodocyclaceae-49', 'Methylosarcina lacus-69',
       'Methylosarcina lacus-55', 'Methylotenera mobilis-76-1',
       'Methylobacter-69', 'Methylophilus methylotrophus-127-1',
       'Opititae-40', 'Methylobacte-98r', 'Acidovorax-123',
       'Methylophilaceae-49', 'Acidovorax-75', 'Bacteria-21',
       'Methylotenera mobilis-76-2', 'Acidovora-69x',
       'Methylophilus methylotrophus-129-2', 'Bacteriovora-8x',
       'Flavobacteriaceae-79-2', 'Methylobacter-123',
       'Methylophilus methylotrophus-98', 'Methylophilaceae-40',
       'Methylophilus methylotrophus-55', 'Burkholderiales-55',
       'Methylotenera mobilis-123', 'Burkholderiales-76',
       'Methylophilaceae-8', 'Acidovorax-21',
       'Methylophilus methylotrophus-129-1', 'Acidovorax-98',
       'Methylophilaceae-55', 'Bacteriovorax-49',
       'Methylotenera mobilis-63', 'Methylovulum miyakonense-55',
       'Burkholderiales-129', 'Flavobacteriaceae-49',
       'Methylophilus methylotrophus-79',
       'Methylobacter tundripaludum-129', 'Acidovorax-127',
       'Rhodocyclaceae-127', 'Methylophilus methylotrophus-69',
       'Methylophilus methylotrophus-127-2', 'Acidovorax-79',
       'Opitutae-129', 'Methylosarcina-21', 'Methylococcaceae-55',
       'Bacteroidetes-76', 'Bacteroidetes-49', 'Methylococcaceae-21',
       'Bacteroidetes-8', 'Flavobacteriaceae-79-1', 'Bacteriovorax-63',
       'Bacteriovorax-21'], dtype=object)

In [10]:
# Collect every Methylotenera name first, then every Acidovorax name, each
# group keeping the order in which the names occur in organism_names.
plot_names = [
    name
    for genus in ("Methylotenera", "Acidovorax")
    for name in organism_names
    if genus in name
]

In [11]:
plot_names


Out[11]:
['Methylotenera mobilis-76-1',
 'Methylotenera mobilis-76-2',
 'Methylotenera mobilis-123',
 'Methylotenera mobilis-63',
 'Acidovorax-123',
 'Acidovorax-75',
 'Acidovorax-21',
 'Acidovorax-98',
 'Acidovorax-127',
 'Acidovorax-79']

In [12]:
def only_selected_query_and_ref(name_list, data=None):
    """Keep the rows whose query AND reference organisms are both in name_list.

    Parameters
    ----------
    name_list : list-like of str
        Exact organism names, matched against the 'query name' and
        'ref name' columns.
    data : pandas.DataFrame, optional
        Table to filter.  When omitted, "percent_identities.tsv" is read from
        the working directory, which is what this function always did before
        the parameter existed.

    Returns
    -------
    pandas.DataFrame
        The matching rows.  The number of rows kept is also printed.
    """
    if data is None:
        data = pd.read_csv("percent_identities.tsv", sep='\t')

    # A single combined mask selects the same rows as filtering on the query
    # column and then on the reference column.
    keep = data['query name'].isin(name_list) & data['ref name'].isin(name_list)
    selected = data[keep]
    print("num rows selected: {}".format(selected.shape[0]))
    return selected

In [13]:
# Restrict the table to comparisons among the names in plot_names: a row is
# kept only when both its query and its reference organism are listed.
plot_data = only_selected_query_and_ref(plot_names)


num rows selected: 86

In [14]:
# Create the figure at its final size before laying it out.  tight_layout()
# fits the axes to the figure size in effect when it is called, so the earlier
# order (tight_layout first, set_size_inches afterwards) saved a PDF whose
# margins had been computed for a different size.
fig, ax = plt.subplots(figsize=(4, 6))
sns.heatmap(aggregate_mummer_results.pivot_identity_table(plot_data), ax=ax)
ax.set_title('% identity \n (length-weighted)')
fig.tight_layout()
fig.savefig('160601_original_percent_identity_measure.pdf')



In [15]:
plot_data.head()


Out[15]:
Unnamed: 0 mummer file ref bin query bin query name query contig count query bin name ref name ref contig count ref bin name ... query contigs query file name ref bp ref contigs ref file name % identity query alignment length total number alignments aggregated frac of query aligned estimated % identity
3 0 Methylotenera_mobilis-76-1_Ga0081647_to_Methyl... Ga0081640 Ga0081647 Methylotenera mobilis-76-1 208 Methylotenera_mobilis-76-1_Ga0081647 Methylotenera mobilis-63 291 Methylotenera_mobilis-63_Ga0081640 ... 207 Methylotenera_mobilis-76-1_Ga0081647.fasta 1642972 290 Methylotenera_mobilis-63_Ga0081640.fasta 84.924765 305230 149 0.125403 10.649837
12 0 Methylotenera_mobilis-76-2_Ga0081649_to_Acidov... Ga0081613 Ga0081649 Methylotenera mobilis-76-2 409 Methylotenera_mobilis-76-2_Ga0081649 Acidovorax-127 60 Acidovorax-127_Ga0081613 ... 408 Methylotenera_mobilis-76-2_Ga0081649.fasta 3161893 59 Acidovorax-127_Ga0081613.fasta 100.000000 159 2 0.000053 0.005304
32 0 Acidovorax-21_Ga0081621_to_Methylotenera_mobil... Ga0081649 Ga0081621 Acidovorax-21 446 Acidovorax-21_Ga0081621 Methylotenera mobilis-76-2 409 Methylotenera_mobilis-76-2_Ga0081649 ... 445 Acidovorax-21_Ga0081621.fasta 2997616 408 Methylotenera_mobilis-76-2_Ga0081649.fasta 97.670000 86 1 0.000047 0.004586
36 0 Acidovorax-98_Ga0081659_to_Acidovorax-123_Ga00... Ga0081609 Ga0081659 Acidovorax-98 359 Acidovorax-98_Ga0081659 Acidovorax-123 18 Acidovorax-123_Ga0081609 ... 358 Acidovorax-98_Ga0081659.fasta 2720811 17 Acidovorax-123_Ga0081609.fasta 83.451002 170025 156 0.050519 4.215872
40 0 Methylotenera_mobilis-123_Ga0081608_to_Methylo... Ga0081649 Ga0081608 Methylotenera mobilis-123 300 Methylotenera_mobilis-123_Ga0081608 Methylotenera mobilis-76-2 409 Methylotenera_mobilis-76-2_Ga0081649 ... 299 Methylotenera_mobilis-123_Ga0081608.fasta 2997616 408 Methylotenera_mobilis-76-2_Ga0081649.fasta 98.991738 1187351 260 0.539381 53.394227

5 rows × 21 columns


In [16]:
# Create the figure at its final size before laying it out.  tight_layout()
# fits the axes to the figure size in effect when it is called, so resizing
# afterwards left the saved PDF with margins computed for a different size.
fig, ax = plt.subplots(figsize=(4, 6))
sns.heatmap(
    aggregate_mummer_results.pivot_identity_table(
        plot_data, value_var='estimated % identity'),
    ax=ax)
# The title previously ended with an unbalanced ')'.
ax.set_title('(% identity)*(fraction aligned)')
fig.tight_layout()
# The output file name is kept exactly as before so existing artefacts still match.
fig.savefig('160601_original_percent_identity_tims_frac_aligned.pdf')



In [17]:
def subset_given_colnames(name_list, data=None):
    """Select the rows whose query and reference organisms match name_list.

    Despite the name, this subsets rows, not columns.  An organism is wanted
    when any entry of name_list occurs as a substring of its 'query name'.
    A row is kept when both its 'query name' and its 'ref name' are wanted
    organisms.

    Parameters
    ----------
    name_list : iterable of str
        Name fragments, e.g. ['Acidovorax', 'Methylotenera mobilis'].
    data : pandas.DataFrame, optional
        Table to subset.  When omitted, "percent_identities.tsv" is read from
        the working directory, as this function always did before the
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, so callers may add columns
        to it.  The number of rows kept is also printed.
    """
    if data is None:
        data = pd.read_csv("percent_identities.tsv", sep='\t')

    fragments = list(name_list)

    # Candidate names come from the table being filtered.  The previous code
    # computed this list but then matched against the notebook-global
    # `organism_names`, so the function failed on a fresh kernel unless that
    # earlier cell had already been run.
    all_names = data['query name'].unique()
    plot_names = [n for n in all_names
                  if any(fragment in n for fragment in fragments)]

    # reduce to the desired organisms.
    keep = data['query name'].isin(plot_names) & data['ref name'].isin(plot_names)
    selected_data = data[keep].copy()

    print("num rows selected: {}".format(selected_data.shape[0]))
    return selected_data
# Try the helper on two name fragments.  As the last expression in the cell,
# the returned frame is what the notebook would display.
subset_given_colnames(['Acidovorax', 'Methylotenera mobilis'])

In [18]:
def plot_metrics_as_heatmaps(metric_list, organism_list, figsize=(10, 6),
                            filename = None):
    """Draw one heatmap per metric, side by side, sharing a single colorbar.

    Parameters
    ----------
    metric_list : sequence of str
        Column names to plot, one subplot each.  '% of query aligned' is
        derived here as 100 * 'frac of query aligned' and may be requested.
    organism_list : iterable of str
        Name fragments passed to subset_given_colnames to choose the rows.
    figsize : tuple, optional
        Figure size in inches, forwarded to plt.subplots.
    filename : str, optional
        When given, the figure is saved to this path and also to the same
        path with its extension replaced by '.svg'.

    Returns
    -------
    None
        The figure is only drawn and, optionally, saved.

    Notes
    -----
    Every panel uses a fixed 0-100 color scale (vmin=0, vmax=100) so the
    panels are comparable through the one colorbar drawn next to them.
    """
    import os  # local import: only needed to derive the SVG file name

    print(len(metric_list))
    # squeeze=False always returns a 2-D array of axes, so indexing works even
    # for a single metric (plt.subplots would otherwise return a bare Axes and
    # axn[i] would fail).
    fig, axn = plt.subplots(1, len(metric_list),
                            sharex=True, sharey=True,
                            figsize=figsize, squeeze=False)
    axn = axn[0]
    # Dedicated axes for the shared colorbar, to the right of the panels.
    cbar_ax = fig.add_axes([.91, .3, .03, .4])

    data = subset_given_colnames(name_list = organism_list)
    # assign() builds a new frame rather than writing a column into the
    # filtered frame returned by the helper.
    data = data.assign(**{'% of query aligned': data['frac of query aligned']*100})

    for i, metric in enumerate(metric_list):
        # prepare pivoted data
        print("i: {}, metric: {}".format(i, metric))
        subplot_ax = axn[i]
        print('axis: {}'.format(subplot_ax))
        subplot_data = aggregate_mummer_results.pivot_identity_table(data,
                                                                     value_var=metric)
        # Only the first panel draws the colorbar, into the dedicated axes.
        draw_cbar = (i == 0)
        sns.heatmap(subplot_data, ax=subplot_ax,
                    cbar=draw_cbar,
                    vmin=0, vmax=100,
                    cbar_ax=cbar_ax if draw_cbar else None
                   )
        subplot_ax.set_title(metric)

    # Leave the right-hand 10% of the figure free for the colorbar axes.
    fig.tight_layout(rect=[0, 0, .9, 1])
    print(type(fig))
    print(type(axn))
    if filename is not None:
        fig.savefig(filename)
        # splitext swaps the real extension.  The earlier rstrip('pdf')
        # stripped trailing 'p'/'d'/'f' characters instead, which produced
        # names such as 'plot.pngsvg' for any non-.pdf input.
        svg_name = os.path.splitext(filename)[0] + '.svg'
        if svg_name != filename:
            fig.savefig(svg_name)
# NOTE(review): on the original single line the '#' commented out everything
# after it, including the closing '})', so the statement could not be parsed.
# The entries that followed the '#' are kept disabled here; re-enable
# individual ones if they were meant to be active.
mpl.rcParams.update({
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    # 'font.family': 'Lato',
    # 'font.weight': 600,
    # 'axes.labelweight': 300,
    # 'axes.titleweight': 100,
    # 'figure.autolayout': True,
})

In [19]:
# Set the axes title weight to 600 for the figures drawn in the cells below.
mpl.rcParams.update({'axes.titleweight': 600})

In [20]:
# Three panels ('% identity', '% of query aligned', 'estimated % identity') for
# the organisms matching 'Methylotenera mobilis' or 'Acidovorax'.
# plot_metrics_as_heatmaps saves the PDF and a matching SVG itself and has no
# return statement, so p is None and no further savefig call is needed.
p = plot_metrics_as_heatmaps(['% identity', '% of query aligned', 'estimated % identity'], 
                             ['Methylotenera mobilis', 'Acidovorax'],
                             figsize=(11, 4),
                             filename = '160601_ANI_metric_development.pdf')


3
num rows selected: 86
i: 0, metric: % identity
axis: Axes(0.125,0.125;0.227941x0.775)
i: 1, metric: % of query aligned
axis: Axes(0.398529,0.125;0.227941x0.775)
i: 2, metric: estimated % identity
axis: Axes(0.672059,0.125;0.227941x0.775)
<class 'matplotlib.figure.Figure'>
<class 'numpy.ndarray'>
/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "

In [24]:
# Two panels covering every organism: organism_names holds the full query
# names, and each one matches itself under the helper's substring test.
# plot_metrics_as_heatmaps saves the PDF and a matching SVG itself and has no
# return statement, so p is None.
p = plot_metrics_as_heatmaps(['% of query aligned', 'estimated % identity'], 
                             organism_names,
                             figsize=(20, 15),
                             filename = '160603_all_Fauzi--percent_aligned_an_percent_identity.pdf')


2
num rows selected: 1264
i: 0, metric: % of query aligned
axis: Axes(0.125,0.125;0.352273x0.775)
i: 1, metric: estimated % identity
axis: Axes(0.547727,0.125;0.352273x0.775)
/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "
<class 'matplotlib.figure.Figure'>
<class 'numpy.ndarray'>

In [30]:
# Same two metrics, restricted to organisms whose name contains
# 'Methylophilus methylotrophus'.
# plot_metrics_as_heatmaps saves the PDF and a matching SVG itself and has no
# return statement, so p is None.
p = plot_metrics_as_heatmaps(['% of query aligned', 'estimated % identity'], 
                             ['Methylophilus methylotrophus'],
                             figsize=(7, 4.5),
                             filename = '160603_Fauzi_Methylophilus_methylotrophus--percent_aligned_and_percent_identity.pdf')


2
num rows selected: 64
i: 0, metric: % of query aligned
axis: Axes(0.125,0.125;0.352273x0.775)
i: 1, metric: estimated % identity
axis: Axes(0.547727,0.125;0.352273x0.775)
/Users/janet/miniconda3/envs/m4_janalysis/lib/python3.5/site-packages/matplotlib/figure.py:1744: UserWarning: This figure includes Axes that are not compatible with tight_layout, so its results might be incorrect.
  warnings.warn("This figure includes Axes that are not "
<class 'matplotlib.figure.Figure'>
<class 'numpy.ndarray'>

In [ ]: