In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from scipy.spatial.distance import pdist, squareform
import scipy.spatial as sp
import scipy.cluster.hierarchy as hc

from sklearn.manifold import MDS, TSNE

import matplotlib.pyplot as plt
import seaborn as sns

from bokeh.plotting import figure, output_file, show
from bokeh.transform import transform, factor_cmap, factor_mark
from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label
from bokeh.io import output_notebook
from bokeh.palettes import Greys, RdGy, YlGn, Spectral, Category10

rp = 'res/2020-02-03'
meta_meta_df = pd.read_csv(f'{rp}/in/meta_meta.csv', index_col=0)

output_notebook()
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Loading BokehJS ...
In [2]:
meta_meta_df.sink.unique()
Out[2]:
array(['A8', 'A9', 'A10', 'A25', 'B9', 'B12', 'B16', 'B19', 'C1', 'C2',
       'C5', 'C10'], dtype=object)
In [3]:
# !rsync rescomp:~/sinkhole/anal/mash/mg/mg-mash-dists.phylip {rp}
In [4]:
def lower_triangle_to_full_matrix(filename):
    num_lines_in_file = sum(1 for line in open(filename))
    distances = []
    sample_names = []

    with open(filename) as f:
        next(f) # skip sample count line
        for line in f:
            elements = line.strip().split('\t')
            sample_names.append(elements[0])
            row = [float(e) for e in elements[1:]]
            row.extend([0.0] * (num_lines_in_file-1-len(row)))
            distances.append(row)
        np_array = np.asarray(distances)
        index_upper = np.triu_indices(num_lines_in_file-1)
        np_array[index_upper] = np_array.T[index_upper]
        return pd.DataFrame(np_array, columns=sample_names, index=sample_names)
In [5]:
guids_samples = meta_meta_df['sample'].to_dict()
mg_sink_lut = dict(zip(meta_meta_df['sink'].unique(), sns.color_palette("Paired")))
mg_sink_couleurs = meta_meta_df.set_index('sample')['sink'].rename('sink').map(mg_sink_lut)

mg_ward_lut = dict(zip(meta_meta_df['ward'].unique(), Category10[3]))
mg_ward_couleurs = meta_meta_df.set_index('sample')['ward'].rename('ward').map(mg_ward_lut)
In [6]:
mash_k31_guids_dm_df = lower_triangle_to_full_matrix(f'{rp}/mg-mash-dists.phylip')
mash_k31_guids_dm_df.to_csv(f'{rp}/mash_k31_guids_dm.tsv', sep='\t')
mash_k31_samples_dm_df = mash_k31_guids_dm_df.copy().rename(index=guids_samples).rename(columns=guids_samples)

Proper linkage

In [7]:
# cm_df = mash_k31_samples_dm_df.T.corr()
# dm_df = 1 - cm_df
dm_df = mash_k31_samples_dm_df.copy()
linkage = hc.linkage(sp.distance.squareform(dm_df), method='average')
sns.set_style('white')
fig = sns.clustermap(dm_df,
                     row_linkage=linkage,
                     col_linkage=linkage,
                     col_colors=mg_ward_couleurs,
#                      col_colors=[mg_ward_couleurs, mg_sink_couleurs],
                     cmap='Greys',
                    robust=True)
# fig.savefig(f'{rp}/mash_k31_mg_cm.svg')

fig.cax.set_visible(False)
In [18]:
pw = squareform(pdist(dm_df, 'euclidean'))
pw_df = pd.DataFrame(pw, index=dm_df.index, columns=dm_df.index)
embedding = MDS(n_components=2, dissimilarity='precomputed', metric=True).fit_transform(pw_df)

embedding_df = pd.DataFrame(embedding, index=dm_df.index, columns=['x', 'y'])
embedding_meta_df = pd.merge(embedding_df, meta_meta_df[['ward', 'sink', 'sample', 'timepoint']], left_index=True, right_on='sample').query("timepoint != 'T5'")
embedding_meta_df = embedding_meta_df.replace({'7B': 'GM (A)', 'AICU': 'ACC (B)', 'EAU': 'AA (C)'})
In [19]:
# markers = ['triangle', 'circle', 'square']
# title = "Metagenomic pairwise MASH distances"

# p = figure(title=title, plot_width=400, plot_height=400)

# p.xaxis.axis_label = 'MDS 1'
# p.yaxis.axis_label = 'MDS 2'

# p.scatter('x', 'y',
#           source=ColumnDataSource(embedding_meta_df.fillna('')),
#           legend_field='ward',
#           fill_alpha=0,
#           line_width=2,
#           marker='x',
# #           line_alpha=0.66,
#           size=8,
#           color=factor_cmap('ward', 'Set1_3', tuple(embedding_meta_df.ward.unique())))
# #           marker=factor_mark('ward', markers, tuple(embedding_meta_df.ward.unique())))

# labels = LabelSet(x='x', y='y',
#                   text='sample',
#                   level='glyph',
# #                   text_color=factor_cmap('ward', Spectral[3], tuple(embedding_meta_df.ward.unique())),
#                   text_font_size="8pt",
# #                   x_offset=3, y_offset=1,
#                   x_offset=2, y_offset=1,
#                   source=ColumnDataSource(embedding_meta_df.fillna('')))
# p.add_layout(labels)

# p.legend.padding = 4
# p.legend.label_text_font_size = '8pt'
# p.legend.label_height = 5
# p.xgrid.grid_line_color = None
# p.ygrid.grid_line_color = None
# p.toolbar_location=None
# p.legend.location = 'top_left'
# p.outline_line_color = 'black'
# # p.legend.visible = False


# p.xaxis.major_tick_line_color = None  # turn off x-axis major ticks
# p.xaxis.minor_tick_line_color = None  # turn off x-axis minor ticks
# p.yaxis.major_tick_line_color = None  # turn off y-axis major ticks
# p.yaxis.minor_tick_line_color = None  # turn off y-axis minor ticks

# # output_file(f'{root_path}/kraken2_minikraken_species_t1234_labels.html', title=title)

# show(p)
In [20]:
sns.set_style('white')

fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.scatterplot('x', 'y',
                      data=embedding_meta_df,
                      hue='ward',
                      hue_order=('GM (A)', 'ACC (B)', 'AA (C)'),
                      s=75,
                      palette=Category10[3])
for line in range(0, embedding_meta_df.shape[0]):
     ax.text(embedding_meta_df['x'][line]+0.003, embedding_meta_df['y'][line]+0.001, 
     embedding_meta_df['sample'][line], horizontalalignment='left', 
     size='10', color='black')
        
# ax.set_xlim(-0.09, 0.21)

ax.set_xlabel('MDS 1')
ax.set_ylabel('MDS 2')

ax.set_xticklabels([])
ax.set_yticklabels([])

# fig.savefig(f'{rp}/mash_k31_mg_mds.svg')
In [19]:
boxplot_df = dm_df.unstack().to_frame().reset_index().copy()

boxplot_df['warda'] = boxplot_df['level_0'].str[0]
boxplot_df['wardb'] = boxplot_df['level_1'].str[0]
boxplot_df['sameward'] = boxplot_df['warda'] == boxplot_df['wardb']
boxplot_df['wardcomparison'] = boxplot_df['sameward'].map({False: 'different ward', True: 'same ward'})

boxplot_df['sinka'] = boxplot_df['level_0'].str.partition('T')[0]
boxplot_df['sinkb'] = boxplot_df['level_1'].str.partition('T')[0]
boxplot_df['samesink'] = boxplot_df['sinka'] == boxplot_df['sinkb']
boxplot_df['sinkcomparison'] = boxplot_df['samesink'].map({False: 'different sink', True: 'same sink'})

boxplot_df['comparison'] = boxplot_df['sinkcomparison'].astype(str) + ', ' + boxplot_df['wardcomparison'].astype(str)

boxplot_df.rename({0: 'dist'}, axis=1, inplace=True)

boxplot_df['comparison'].replace({'same sink, same ward': 'within sink',
                                 'different sink, same ward': 'within ward',
                                 'different sink, different ward' : 'between wards'},
                                 inplace=True)
boxplot_df = boxplot_df.query("level_0 != level_1")  # Remove self comparisons
In [43]:
sns.set_style('white')
fig, ax = plt.subplots(figsize=(4, 6))
# Remove compar
fig = sns.boxplot(y='dist', x='comparison', data=boxplot_df,
                  order=['within sink', 'within ward', 'between wards'],
                  palette=['white', 'white', 'white'])
ax.set_xlabel(None)
# plt.savefig(f'{rp}/mash_k31_box.svg')