In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from Bio import SeqIO
from natsort import natsorted

from bokeh.palettes import Category10

root_path = 'res/2020-02-02'
meta_df = pd.read_csv(f'{root_path}/in/guids_meta.tsv', sep='\t', index_col=0)
meta_meta_df = pd.read_csv(f'{root_path}/in/meta_meta.csv', index_col=0).query("seq_batch != 3")  # We don't want third batches for now
guids_samples = meta_meta_df['sample'].to_dict()
samples_guids = {v: k for k, v in guids_samples.items()}

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Meta bams to deinterleaved fastq

In [2]:
deint_root = '~/sinkhole/data/meta/first/fq_deint'
bam_root = '~/raw_input'
linked_deint_root = '~/sinkhole/code/respipe/meta-func/fq_deint'

cmds = {g: f'samtools fastq -N --threads 4 -1 {deint_root}/{g}_1.fastq.gz -2 {deint_root}/{g}_2.fastq.gz -0 /dev/null -s /dev/null {bam_root}/{g}/in.bam'
        for g in meta_meta_df.index}
with open(f'{root_path}/meta_bams_to_fqs.sh', 'w') as cmds_fh:
    cmds_fh.write('\n'.join(cmds.values()))
In [3]:
# !rsync {root_path}/meta_bams_to_fqs.sh anal1:~/sinkhole/code/misc
In [4]:
cmds = {g: f'ln -s {deint_root}/{g}_1.fastq.gz {linked_deint_root}/{g}_1.fastq.gz; ln -s {deint_root}/{g}_2.fastq.gz {linked_deint_root}/{g}_2.fastq.gz'
        for g in meta_meta_df.index}
with open(f'{root_path}/symlink_fqs_respipe.sh', 'w') as cmds_fh:
    cmds_fh.write('\n'.join(cmds.values()))
# for cmd in cmds.values():
#     print(cmd)
In [5]:
# !rsync {root_path}/symlink_fqs_respipe.sh anal1:~/sinkhole/code/misc

Genomic and metagenomic resistance figure

Parse PHMs from data directly

In [6]:
phm_aros = set((r.id.partition('ARO:')[2].partition('|')[0] for r in SeqIO.parse(f'{root_path}/in/card/data/nucleotide_fasta_protein_homolog_model.fasta', format='fasta')))
phm_df = pd.DataFrame(phm_aros).set_index(0)

ontology_df = pd.read_csv(f'{root_path}/in/card/ontology/aro.tsv', sep='\t').drop('ID', axis=1)
ontology_df['Accession'] = ontology_df['Accession'].str.replace('ARO:','')
ontology_df = ontology_df.set_index('Accession')

phm_meta_df = pd.merge(phm_df, ontology_df, left_index=True, right_index=True, how='left')
phm_meta_df.head()
Out[6]:
Name Description
3000005 vanD VanD is a D-Ala-D-Ala ligase homolog similar t...
3000010 vanA VanA is a D-Ala-D-Ala ligase homolog that synt...
3000013 vanB VanB is a D-Ala-D-Ala ligase homolog similar t...
3000024 patA PatA is an ABC transporter of Streptococcus pn...
3000025 patB PatB is an ABC transporter of Streptococcus pn...

Genomic

In [7]:
# !rsync anal1:~/sinkhole/code/respipe/sipi-func/ResPipe_results/data/g/{root_path}
In [84]:
card_lc_df = pd.read_csv(f'{root_path}/data/g/func/CARD_lateral_coverage.tsv', sep='\t')
card_lc_df['Unnamed: 0'] = card_lc_df['Unnamed: 0'].str.partition('_')[0]
card_lc_df.set_index('Unnamed: 0', inplace=True)
card_lc_df.index.rename('aro', inplace=True)
oldcols_newcols = {c: c[5:] for c in card_lc_df.columns}
card_lc_df.rename(columns=oldcols_newcols, inplace=True)

card_lcs_df = pd.read_csv(f'{root_path}/data/g/func/CARD_lateral_coverage_specific.tsv', sep='\t')
card_lcs_df['Unnamed: 0'] = card_lcs_df['Unnamed: 0'].str.partition('_')[0]
card_lcs_df.set_index('Unnamed: 0', inplace=True)
card_lcs_df.index.rename('aro', inplace=True)
oldcols_newcols = {c: c[5:] for c in card_lcs_df.columns}
card_lcs_df.rename(columns=oldcols_newcols, inplace=True)

g_phm_lc_df = pd.merge(card_lc_df, phm_meta_df, left_index=True, right_index=True, how='inner')
g_phm_lc_df.index.rename('aro', inplace=True)
g_phm_lc_df.set_index('Name', inplace=True)
g_phm_lc_df.drop(['Description'], axis=1, inplace=True)
g_phm_lc_df['rowsum'] = g_phm_lc_df.sum(axis=1)
g_phm_lc_df.sort_values('rowsum', ascending=False, inplace=True)
g_phm_lc_df.drop('rowsum', axis=1, inplace=True)

g_phm_lcs_df = pd.merge(card_lcs_df, phm_meta_df, left_index=True, right_index=True, how='inner')
g_phm_lcs_df.index.rename('aro', inplace=True)
g_phm_lcs_df.set_index('Name', inplace=True)
g_phm_lcs_df.drop(['Description'], axis=1, inplace=True)
g_phm_lcs_df['rowsum'] = g_phm_lcs_df.sum(axis=1)
g_phm_lcs_df.sort_values('rowsum', ascending=False, inplace=True)
g_phm_lcs_df.drop('rowsum', axis=1, inplace=True)

g_phm_lc_df.to_csv(f'{root_path}/respipe_g_phm_lc_df.tsv', sep='\t')
g_phm_lc_thresh_df = g_phm_lc_df[(g_phm_lc_df >= 0.75).any(axis=1)]

Cluster abundant families with nonspecific matches (from lc)

571 genes with one or more sample >= 0.75% cov, want <100

In [30]:
g_phm_lc_thresh_df.index.shape
Out[30]:
(571,)
In [10]:
families_to_cluster = ['TEM',
                       'SHV',
                       'CTX-M',
                       'OXA',
                       'OXY',
                       'Qnr',
                       'aadA',
                       "AAC(6')",
                       'mdt',
                       'dfr',
                       'OKP',
                       'emr',
                       'cml']

i = 0
for f in families_to_cluster:
    print(f, len([g for g in g_phm_lc_thresh_df.index if f in g]))
    i += len([g for g in g_phm_lc_thresh_df.index if f in g])
TEM 160
SHV 151
CTX-M 81
OXA 30
OXY 18
Qnr 17
aadA 13
AAC(6') 13
mdt 10
dfr 8
OKP 6
emr 6
cml 4
In [11]:
names_map = {'Escherichia coli acrA': 'acrA',
             'Escherichia coli ampH beta-lactamase': 'ampH',
             'Klebsiella pneumoniae OmpK37': 'OmpK37',
             'Escherichia coli ampC1 beta-lactamase': 'ampC1',
             'plasmid-encoded cat (pp-cat)': 'pp-cat',
             'Salmonella enterica cmlA': 'cmlA',  # To be clustered with cmlA in table already
             'Escherichia coli emrE': 'emrE',  # To be clustered with emr in table already
             'Klebsiella pneumoniae KpnE': 'KpnE',
             'Klebsiella pneumoniae KpnF': 'KpnF',
             'Klebsiella pneumoniae KpnG': 'KpnG',
             'Klebsiella pneumoniae KpnH': 'KpnH'}
In [12]:
g_phm_lc_thresh_clust_df = g_phm_lc_thresh_df.copy().rename(index=names_map)  # Rename first, then cluster

for g in families_to_cluster:
    starts_with_df = g_phm_lc_thresh_clust_df[g_phm_lc_thresh_clust_df.index.str.startswith(g)]
    starts_with_labels = starts_with_df.index

    clustered_row_df = starts_with_df.max().rename(f'{g} family').to_frame().T

    g_phm_lc_thresh_clust_df.drop(starts_with_labels, inplace=True)
    g_phm_lc_thresh_clust_df = g_phm_lc_thresh_clust_df.append(clustered_row_df)

# g_phm_lc_thresh_clust_df.rename(index=names_map, inplace=True)
In [15]:
g_phm_lc_thresh_clust_df.index.shape
Out[15]:
(67,)

Metagenomic (my respipe run)

In [85]:
card_lc_df = pd.read_csv(f'{root_path}/data/mg/func/CARD_lateral_coverage.tsv', sep='\t')
card_lc_df['Unnamed: 0'] = card_lc_df['Unnamed: 0'].str.partition('_')[0]
card_lc_df.set_index('Unnamed: 0', inplace=True)
card_lc_df.index.rename('aro', inplace=True)
oldcols_newcols = {c: c[5:] for c in card_lc_df.columns}
card_lc_df.rename(columns=oldcols_newcols, inplace=True)
card_lc_df.rename(columns=guids_samples, inplace=True)
card_lc_df = card_lc_df.reindex(natsorted(card_lc_df.columns), axis=1)

card_lcs_df = pd.read_csv(f'{root_path}/data/mg/func/CARD_lateral_coverage_specific.tsv', sep='\t')
card_lcs_df['Unnamed: 0'] = card_lcs_df['Unnamed: 0'].str.partition('_')[0]
card_lcs_df.set_index('Unnamed: 0', inplace=True)
card_lcs_df.index.rename('aro', inplace=True)
oldcols_newcols = {c: c[5:] for c in card_lcs_df.columns}
card_lcs_df.rename(columns=oldcols_newcols, inplace=True)
card_lcs_df.rename(columns=guids_samples, inplace=True)
card_lcs_df = card_lcs_df.reindex(natsorted(card_lcs_df.columns), axis=1)

mg_phm_lc_df = pd.merge(card_lc_df, phm_meta_df, left_index=True, right_index=True, how='inner')
mg_phm_lc_df.index.rename('aro', inplace=True)
mg_phm_lc_df.set_index('Name', inplace=True)
mg_phm_lc_df.drop(['Description'], axis=1, inplace=True)
mg_phm_lc_df['rowsum'] = mg_phm_lc_df.sum(axis=1)
mg_phm_lc_df.sort_values('rowsum', ascending=False, inplace=True)
mg_phm_lc_df.drop('rowsum', axis=1, inplace=True)
mg_phm_lc_df = mg_phm_lc_df.reset_index().drop_duplicates(keep='first').set_index('Name')

mg_phm_lcs_df = pd.merge(card_lcs_df, phm_meta_df, left_index=True, right_index=True, how='inner')
mg_phm_lcs_df.index.rename('aro', inplace=True)
mg_phm_lcs_df.set_index('Name', inplace=True)
mg_phm_lcs_df.drop(['Description'], axis=1, inplace=True)
mg_phm_lcs_df['rowsum'] = mg_phm_lcs_df.sum(axis=1)
mg_phm_lcs_df.sort_values('rowsum', ascending=False, inplace=True)
mg_phm_lcs_df.drop('rowsum', axis=1, inplace=True)
mg_phm_lcs_df = mg_phm_lcs_df.reset_index().drop_duplicates(keep='first').set_index('Name')

mg_phm_lc_df.to_csv(f'{root_path}/respipe_mg_phm_lc_df.tsv', sep='\t')
mg_phm_lc_thresh_df = mg_phm_lc_df[(mg_phm_lc_df >= 0.75).any(axis=1)]
In [89]:
[g for g in mg_phm_lcs_df.index if 'SHV-27' in g]
Out[89]:
['SHV-27']
In [92]:
mg_phm_lc_df.loc['SHV-27']
Out[92]:
A8T1     0.736
A8T4     0.883
A9T1     0.555
A9T4     0.557
A10T1    0.544
A10T4    0.768
A25T1    0.173
A25T4    0.484
B9T1     0.000
B9T4     0.000
B16T1    0.000
B16T4    0.000
B19T1    0.000
B19T2    0.000
B19T3    0.174
B19T4    0.000
C5T1     0.251
C5T4     0.705
C10T1    0.717
C10T4    0.382
Name: SHV-27, dtype: float64
In [93]:
mg_phm_lcs_df.loc['SHV-27']
Out[93]:
A8T1     0.0
A8T4     0.0
A9T1     0.0
A9T4     0.0
A10T1    0.0
A10T4    0.0
A25T1    0.0
A25T4    0.0
B9T1     0.0
B9T4     0.0
B16T1    0.0
B16T4    0.0
B19T1    0.0
B19T2    0.0
B19T3    0.0
B19T4    0.0
C5T1     0.0
C5T4     0.0
C10T1    0.0
C10T4    0.0
Name: SHV-27, dtype: float64

As above, cluster abundant families with nonspecific matches (from lc)

673 genes with one or more sample >= 0.75% cov, want <100

In [31]:
mg_phm_lc_thresh_clust_df = mg_phm_lc_thresh_df.copy().rename(index=names_map)  # Rename first, then cluster

for g in families_to_cluster:
    starts_with_df = mg_phm_lc_thresh_clust_df[mg_phm_lc_thresh_clust_df.index.str.startswith(g)]
    starts_with_labels = starts_with_df.index

    clustered_row_df = starts_with_df.max().rename(f'{g} family').to_frame().T

    mg_phm_lc_thresh_clust_df.drop(starts_with_labels, inplace=True)
    mg_phm_lc_thresh_clust_df = mg_phm_lc_thresh_clust_df.append(clustered_row_df)
In [32]:
# mg_phm_lc_df[(mg_phm_lc_df >= 0.75).any(axis=1)].shape
Out[32]:
(673, 20)
In [18]:
# mg_phm_lc_clust_df = mg_phm_lc_df.copy()

# for g in families_to_cluster:
#     starts_with_df = mg_phm_lc_clust_df[mg_phm_lc_clust_df.index.str.startswith(g)]
#     starts_with_labels = starts_with_df.index

#     clustered_row_df = starts_with_df.max().rename(f'{g} family').to_frame().T

#     mg_phm_lc_clust_df.drop(starts_with_labels, inplace=True)
#     mg_phm_lc_clust_df = mg_phm_lc_clust_df.append(clustered_row_df)

# mg_phm_lc_clust_df.rename(index=names_map, inplace=True) 

Before and after clustering

Before

In [64]:
g_phm_lc_thresh_df[(g_phm_lc_thresh_df >= 0.75).any(axis=1)].shape
Out[64]:
(571, 485)
In [65]:
mg_phm_lc_thresh_df[(mg_phm_lc_thresh_df >= 0.75).any(axis=1)].shape
Out[65]:
(673, 20)
In [66]:
g_genes = set(g_phm_lc_thresh_df[(g_phm_lc_thresh_df >= 0.75).any(axis=1)].index)
mg_genes = set(mg_phm_lc_thresh_df[(mg_phm_lc_thresh_df >= 0.75).any(axis=1)].index)

len(g_genes & mg_genes)
Out[66]:
460

After

In [67]:
g_phm_lc_thresh_clust_df[(g_phm_lc_thresh_clust_df >= 0.75).any(axis=1)].shape
Out[67]:
(67, 485)
In [68]:
mg_phm_lc_thresh_clust_df[(mg_phm_lc_thresh_clust_df >= 0.75).any(axis=1)].shape
Out[68]:
(237, 20)
In [69]:
g_genes = set(g_phm_lc_thresh_clust_df[(g_phm_lc_thresh_clust_df >= 0.75).any(axis=1)].index)
mg_genes = set(mg_phm_lc_thresh_clust_df[(mg_phm_lc_thresh_clust_df >= 0.75).any(axis=1)].index)

len(g_genes & mg_genes)
Out[69]:
58
In [48]:
g_genes - mg_genes
Out[48]:
{'AcrE',
 'LEN-26',
 'SAT-2',
 'gadW',
 'mgrA',
 'pp-cat',
 'tet(B)',
 'tet(D)',
 'ugd'}

Figures

  • Use isolate clustering to display metagenomes
In [70]:
type_lut = dict(zip(meta_df['type'].unique(), sns.color_palette("Set3", n_colors=8)[1:]))
type_couleurs = meta_df['type'].rename('sink / clinical').map(type_lut)

species_lut = dict(zip(meta_df['species'].unique(), sns.color_palette("Set2")[3:]))
species_couleurs = meta_df['species'].rename('species').map(species_lut)

ward_lut = {'7B': Category10[4][0],
            'AICU': Category10[4][1],
            'EAU': Category10[4][2],
            'HAEM': Category10[4][3],
            'ITU': Category10[4][0],
            'AE': (1, 1, 1),
            '5B': (1, 1, 1),
            'SEU': (1, 1, 1)}
ward_couleurs = meta_df['ward'].rename('ward').map(ward_lut)

metagenome_t4_sinks = set(meta_meta_df['sink'])
sinks_is_in_metagenome_t4 = {s: True if s in metagenome_t4_sinks else False for s in set(meta_df['sink'])}

pal = [(1,1,1), (0.8590542099192618, 0.42422145328719724, 0.6597462514417531)] # '#fff' followed by sns.color_palette("OrRd", 2)
sink_is_in_mg_lut = dict(zip([False, True], pal))
sink_is_in_mg_couleurs = meta_df['sink'].map(sinks_is_in_metagenome_t4).map(sink_is_in_mg_lut)

g_couleurs_df = pd.DataFrame([type_couleurs, species_couleurs, ward_couleurs, sink_is_in_mg_couleurs]).T
In [71]:
mg_ward_lut = {'7B': Category10[4][0],
               'AICU': Category10[4][1],
               'EAU': Category10[4][2],
               'HAEM': Category10[4][3],
               'ITU': Category10[4][0],
               'AE': (1, 1, 1),
               '5B': (1, 1, 1),
               'SEU': (1, 1, 1)}
ward_couleurs = meta_df['ward'].rename('ward').map(ward_lut)

mg_ward_couleurs = meta_meta_df.set_index('sample')['ward'].rename('ward').map(mg_ward_lut)

mg_sink_lut = dict(zip(meta_meta_df['sink'].unique(), sns.color_palette("muted")))
mg_sink_couleurs = meta_meta_df.set_index('sample')['sink'].rename('sink').map(mg_sink_lut)

mg_couleurs_df = pd.DataFrame([mg_ward_couleurs, mg_sink_couleurs]).T
In [72]:
sns.set(font_scale=1)
g_cm = sns.clustermap(g_phm_lc_thresh_clust_df,
                figsize=(14,19),
#                 col_colors=[type_couleurs, species_couleurs, ward_couleurs],
                col_colors=g_couleurs_df,
                cmap='Greys',
                row_cluster=True,
                xticklabels=0)
# g_cm.savefig(f'{root_path}/g_phm_lc_clust_df.svg')
In [75]:
sns.set(font_scale=1)
mg_cm = sns.clustermap(mg_phm_lc_clust_df.reindex(g_cm.data2d.index).fillna(0),
               figsize=(6,19),
               col_colors=mg_couleurs_df,
               col_cluster=False,
               row_cluster=False,
               cmap='Greys')
mg_cm.savefig(f'{root_path}/mg_phm_lc_clust_df_2.svg')