import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from natsort import natsorted
from bokeh.palettes import Category10
root_path = 'res/2020-02-02'
meta_df = pd.read_csv(f'{root_path}/in/guids_meta.tsv', sep='\t', index_col=0)
meta_meta_df = pd.read_csv(f'{root_path}/in/meta_meta.csv', index_col=0).query("seq_batch != 3") # We don't want third batches for now
guids_samples = meta_meta_df['sample'].to_dict()
samples_guids = {v: k for k, v in guids_samples.items()}
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
deint_root = '~/sinkhole/data/meta/first/fq_deint'
bam_root = '~/raw_input'
linked_deint_root = '~/sinkhole/code/respipe/meta-func/fq_deint'
cmds = {g: f'samtools fastq -N --threads 4 -1 {deint_root}/{g}_1.fastq.gz -2 {deint_root}/{g}_2.fastq.gz -0 /dev/null -s /dev/null {bam_root}/{g}/in.bam'
for g in meta_meta_df.index}
with open(f'{root_path}/meta_bams_to_fqs.sh', 'w') as cmds_fh:
cmds_fh.write('\n'.join(cmds.values()))
# !rsync {root_path}/meta_bams_to_fqs.sh anal1:~/sinkhole/code/misc
cmds = {g: f'ln -s {deint_root}/{g}_1.fastq.gz {linked_deint_root}/{g}_1.fastq.gz; ln -s {deint_root}/{g}_2.fastq.gz {linked_deint_root}/{g}_2.fastq.gz'
for g in meta_meta_df.index}
with open(f'{root_path}/symlink_fqs_respipe.sh', 'w') as cmds_fh:
cmds_fh.write('\n'.join(cmds.values()))
# for cmd in cmds.values():
# print(cmd)
# !rsync {root_path}/symlink_fqs_respipe.sh anal1:~/sinkhole/code/misc
Parse PHMs from data directly
phm_aros = set((r.id.partition('ARO:')[2].partition('|')[0] for r in SeqIO.parse(f'{root_path}/in/card/data/nucleotide_fasta_protein_homolog_model.fasta', format='fasta')))
phm_df = pd.DataFrame(phm_aros).set_index(0)
ontology_df = pd.read_csv(f'{root_path}/in/card/ontology/aro.tsv', sep='\t').drop('ID', axis=1)
ontology_df['Accession'] = ontology_df['Accession'].str.replace('ARO:','')
ontology_df = ontology_df.set_index('Accession')
phm_meta_df = pd.merge(phm_df, ontology_df, left_index=True, right_index=True, how='left')
phm_meta_df.head()
# !rsync anal1:~/sinkhole/code/respipe/sipi-func/ResPipe_results/data/g/{root_path}
card_lc_df = pd.read_csv(f'{root_path}/data/g/func/CARD_lateral_coverage.tsv', sep='\t')
card_lc_df['Unnamed: 0'] = card_lc_df['Unnamed: 0'].str.partition('_')[0]
card_lc_df.set_index('Unnamed: 0', inplace=True)
card_lc_df.index.rename('aro', inplace=True)
oldcols_newcols = {c: c[5:] for c in card_lc_df.columns}
card_lc_df.rename(columns=oldcols_newcols, inplace=True)
card_lcs_df = pd.read_csv(f'{root_path}/data/g/func/CARD_lateral_coverage_specific.tsv', sep='\t')
card_lcs_df['Unnamed: 0'] = card_lcs_df['Unnamed: 0'].str.partition('_')[0]
card_lcs_df.set_index('Unnamed: 0', inplace=True)
card_lcs_df.index.rename('aro', inplace=True)
oldcols_newcols = {c: c[5:] for c in card_lcs_df.columns}
card_lcs_df.rename(columns=oldcols_newcols, inplace=True)
g_phm_lc_df = pd.merge(card_lc_df, phm_meta_df, left_index=True, right_index=True, how='inner')
g_phm_lc_df.index.rename('aro', inplace=True)
g_phm_lc_df.set_index('Name', inplace=True)
g_phm_lc_df.drop(['Description'], axis=1, inplace=True)
g_phm_lc_df['rowsum'] = g_phm_lc_df.sum(axis=1)
g_phm_lc_df.sort_values('rowsum', ascending=False, inplace=True)
g_phm_lc_df.drop('rowsum', axis=1, inplace=True)
g_phm_lcs_df = pd.merge(card_lcs_df, phm_meta_df, left_index=True, right_index=True, how='inner')
g_phm_lcs_df.index.rename('aro', inplace=True)
g_phm_lcs_df.set_index('Name', inplace=True)
g_phm_lcs_df.drop(['Description'], axis=1, inplace=True)
g_phm_lcs_df['rowsum'] = g_phm_lcs_df.sum(axis=1)
g_phm_lcs_df.sort_values('rowsum', ascending=False, inplace=True)
g_phm_lcs_df.drop('rowsum', axis=1, inplace=True)
g_phm_lc_df.to_csv(f'{root_path}/respipe_g_phm_lc_df.tsv', sep='\t')
g_phm_lc_thresh_df = g_phm_lc_df[(g_phm_lc_df >= 0.75).any(axis=1)]
lc
)¶571 genes with one or more sample >= 0.75% cov, want <100
g_phm_lc_thresh_df.index.shape
families_to_cluster = ['TEM',
'SHV',
'CTX-M',
'OXA',
'OXY',
'Qnr',
'aadA',
"AAC(6')",
'mdt',
'dfr',
'OKP',
'emr',
'cml']
i = 0
for f in families_to_cluster:
print(f, len([g for g in g_phm_lc_thresh_df.index if f in g]))
i += len([g for g in g_phm_lc_thresh_df.index if f in g])
names_map = {'Escherichia coli acrA': 'acrA',
'Escherichia coli ampH beta-lactamase': 'ampH',
'Klebsiella pneumoniae OmpK37': 'OmpK37',
'Escherichia coli ampC1 beta-lactamase': 'ampC1',
'plasmid-encoded cat (pp-cat)': 'pp-cat',
'Salmonella enterica cmlA': 'cmlA', # To be clustered with cmlA in table already
'Escherichia coli emrE': 'emrE', # To be clustered with emr in table already
'Klebsiella pneumoniae KpnE': 'KpnE',
'Klebsiella pneumoniae KpnF': 'KpnF',
'Klebsiella pneumoniae KpnG': 'KpnG',
'Klebsiella pneumoniae KpnH': 'KpnH'}
g_phm_lc_thresh_clust_df = g_phm_lc_thresh_df.copy().rename(index=names_map) # Rename first, then cluster
for g in families_to_cluster:
starts_with_df = g_phm_lc_thresh_clust_df[g_phm_lc_thresh_clust_df.index.str.startswith(g)]
starts_with_labels = starts_with_df.index
clustered_row_df = starts_with_df.max().rename(f'{g} family').to_frame().T
g_phm_lc_thresh_clust_df.drop(starts_with_labels, inplace=True)
g_phm_lc_thresh_clust_df = g_phm_lc_thresh_clust_df.append(clustered_row_df)
# g_phm_lc_thresh_clust_df.rename(index=names_map, inplace=True)
g_phm_lc_thresh_clust_df.index.shape
card_lc_df = pd.read_csv(f'{root_path}/data/mg/func/CARD_lateral_coverage.tsv', sep='\t')
card_lc_df['Unnamed: 0'] = card_lc_df['Unnamed: 0'].str.partition('_')[0]
card_lc_df.set_index('Unnamed: 0', inplace=True)
card_lc_df.index.rename('aro', inplace=True)
oldcols_newcols = {c: c[5:] for c in card_lc_df.columns}
card_lc_df.rename(columns=oldcols_newcols, inplace=True)
card_lc_df.rename(columns=guids_samples, inplace=True)
card_lc_df = card_lc_df.reindex(natsorted(card_lc_df.columns), axis=1)
card_lcs_df = pd.read_csv(f'{root_path}/data/mg/func/CARD_lateral_coverage_specific.tsv', sep='\t')
card_lcs_df['Unnamed: 0'] = card_lcs_df['Unnamed: 0'].str.partition('_')[0]
card_lcs_df.set_index('Unnamed: 0', inplace=True)
card_lcs_df.index.rename('aro', inplace=True)
oldcols_newcols = {c: c[5:] for c in card_lcs_df.columns}
card_lcs_df.rename(columns=oldcols_newcols, inplace=True)
card_lcs_df.rename(columns=guids_samples, inplace=True)
card_lcs_df = card_lcs_df.reindex(natsorted(card_lcs_df.columns), axis=1)
mg_phm_lc_df = pd.merge(card_lc_df, phm_meta_df, left_index=True, right_index=True, how='inner')
mg_phm_lc_df.index.rename('aro', inplace=True)
mg_phm_lc_df.set_index('Name', inplace=True)
mg_phm_lc_df.drop(['Description'], axis=1, inplace=True)
mg_phm_lc_df['rowsum'] = mg_phm_lc_df.sum(axis=1)
mg_phm_lc_df.sort_values('rowsum', ascending=False, inplace=True)
mg_phm_lc_df.drop('rowsum', axis=1, inplace=True)
mg_phm_lc_df = mg_phm_lc_df.reset_index().drop_duplicates(keep='first').set_index('Name')
mg_phm_lcs_df = pd.merge(card_lcs_df, phm_meta_df, left_index=True, right_index=True, how='inner')
mg_phm_lcs_df.index.rename('aro', inplace=True)
mg_phm_lcs_df.set_index('Name', inplace=True)
mg_phm_lcs_df.drop(['Description'], axis=1, inplace=True)
mg_phm_lcs_df['rowsum'] = mg_phm_lcs_df.sum(axis=1)
mg_phm_lcs_df.sort_values('rowsum', ascending=False, inplace=True)
mg_phm_lcs_df.drop('rowsum', axis=1, inplace=True)
mg_phm_lcs_df = mg_phm_lcs_df.reset_index().drop_duplicates(keep='first').set_index('Name')
mg_phm_lc_df.to_csv(f'{root_path}/respipe_mg_phm_lc_df.tsv', sep='\t')
mg_phm_lc_thresh_df = mg_phm_lc_df[(mg_phm_lc_df >= 0.75).any(axis=1)]
[g for g in mg_phm_lcs_df.index if 'SHV-27' in g]
mg_phm_lc_df.loc['SHV-27']
mg_phm_lcs_df.loc['SHV-27']
lc
)¶673 genes with one or more sample >= 0.75% cov, want <100
mg_phm_lc_thresh_clust_df = mg_phm_lc_thresh_df.copy().rename(index=names_map) # Rename first, then cluster
for g in families_to_cluster:
starts_with_df = mg_phm_lc_thresh_clust_df[mg_phm_lc_thresh_clust_df.index.str.startswith(g)]
starts_with_labels = starts_with_df.index
clustered_row_df = starts_with_df.max().rename(f'{g} family').to_frame().T
mg_phm_lc_thresh_clust_df.drop(starts_with_labels, inplace=True)
mg_phm_lc_thresh_clust_df = mg_phm_lc_thresh_clust_df.append(clustered_row_df)
# mg_phm_lc_df[(mg_phm_lc_df >= 0.75).any(axis=1)].shape
# mg_phm_lc_clust_df = mg_phm_lc_df.copy()
# for g in families_to_cluster:
# starts_with_df = mg_phm_lc_clust_df[mg_phm_lc_clust_df.index.str.startswith(g)]
# starts_with_labels = starts_with_df.index
# clustered_row_df = starts_with_df.max().rename(f'{g} family').to_frame().T
# mg_phm_lc_clust_df.drop(starts_with_labels, inplace=True)
# mg_phm_lc_clust_df = mg_phm_lc_clust_df.append(clustered_row_df)
# mg_phm_lc_clust_df.rename(index=names_map, inplace=True)
g_phm_lc_thresh_df[(g_phm_lc_thresh_df >= 0.75).any(axis=1)].shape
mg_phm_lc_thresh_df[(mg_phm_lc_thresh_df >= 0.75).any(axis=1)].shape
g_genes = set(g_phm_lc_thresh_df[(g_phm_lc_thresh_df >= 0.75).any(axis=1)].index)
mg_genes = set(mg_phm_lc_thresh_df[(mg_phm_lc_thresh_df >= 0.75).any(axis=1)].index)
len(g_genes & mg_genes)
g_phm_lc_thresh_clust_df[(g_phm_lc_thresh_clust_df >= 0.75).any(axis=1)].shape
mg_phm_lc_thresh_clust_df[(mg_phm_lc_thresh_clust_df >= 0.75).any(axis=1)].shape
g_genes = set(g_phm_lc_thresh_clust_df[(g_phm_lc_thresh_clust_df >= 0.75).any(axis=1)].index)
mg_genes = set(mg_phm_lc_thresh_clust_df[(mg_phm_lc_thresh_clust_df >= 0.75).any(axis=1)].index)
len(g_genes & mg_genes)
g_genes - mg_genes
type_lut = dict(zip(meta_df['type'].unique(), sns.color_palette("Set3", n_colors=8)[1:]))
type_couleurs = meta_df['type'].rename('sink / clinical').map(type_lut)
species_lut = dict(zip(meta_df['species'].unique(), sns.color_palette("Set2")[3:]))
species_couleurs = meta_df['species'].rename('species').map(species_lut)
ward_lut = {'7B': Category10[4][0],
'AICU': Category10[4][1],
'EAU': Category10[4][2],
'HAEM': Category10[4][3],
'ITU': Category10[4][0],
'AE': (1, 1, 1),
'5B': (1, 1, 1),
'SEU': (1, 1, 1)}
ward_couleurs = meta_df['ward'].rename('ward').map(ward_lut)
metagenome_t4_sinks = set(meta_meta_df['sink'])
sinks_is_in_metagenome_t4 = {s: True if s in metagenome_t4_sinks else False for s in set(meta_df['sink'])}
pal = [(1,1,1), (0.8590542099192618, 0.42422145328719724, 0.6597462514417531)] # '#fff' followed by sns.color_palette("OrRd", 2)
sink_is_in_mg_lut = dict(zip([False, True], pal))
sink_is_in_mg_couleurs = meta_df['sink'].map(sinks_is_in_metagenome_t4).map(sink_is_in_mg_lut)
g_couleurs_df = pd.DataFrame([type_couleurs, species_couleurs, ward_couleurs, sink_is_in_mg_couleurs]).T
mg_ward_lut = {'7B': Category10[4][0],
'AICU': Category10[4][1],
'EAU': Category10[4][2],
'HAEM': Category10[4][3],
'ITU': Category10[4][0],
'AE': (1, 1, 1),
'5B': (1, 1, 1),
'SEU': (1, 1, 1)}
ward_couleurs = meta_df['ward'].rename('ward').map(ward_lut)
mg_ward_couleurs = meta_meta_df.set_index('sample')['ward'].rename('ward').map(mg_ward_lut)
mg_sink_lut = dict(zip(meta_meta_df['sink'].unique(), sns.color_palette("muted")))
mg_sink_couleurs = meta_meta_df.set_index('sample')['sink'].rename('sink').map(mg_sink_lut)
mg_couleurs_df = pd.DataFrame([mg_ward_couleurs, mg_sink_couleurs]).T
sns.set(font_scale=1)
g_cm = sns.clustermap(g_phm_lc_thresh_clust_df,
figsize=(14,19),
# col_colors=[type_couleurs, species_couleurs, ward_couleurs],
col_colors=g_couleurs_df,
cmap='Greys',
row_cluster=True,
xticklabels=0)
# g_cm.savefig(f'{root_path}/g_phm_lc_clust_df.svg')
sns.set(font_scale=1)
mg_cm = sns.clustermap(mg_phm_lc_clust_df.reindex(g_cm.data2d.index).fillna(0),
figsize=(6,19),
col_colors=mg_couleurs_df,
col_cluster=False,
row_cluster=False,
cmap='Greys')
mg_cm.savefig(f'{root_path}/mg_phm_lc_clust_df_2.svg')