Source code for SpaceTravLR.tools.network

# import celloracle as co
import numpy as np
import pandas as pd
import pickle
import os
import json 
import torch
import networkx as nx 

def get_human_housekeeping_genes():
    return pd.read_csv('https://housekeeping.unicamp.br/Housekeeping_GenesHuman.csv', sep=';')

def get_mouse_housekeeping_genes():
    return pd.read_csv('https://housekeeping.unicamp.br/Housekeeping_GenesMouse.csv', sep=';')


[docs] def get_cellchat_db(species): # import commot as ct # df_ligrec = ct.pp.ligand_receptor_database( # database='CellChat', # species=species, # signaling_type=None # ) # df_ligrec.columns = ['ligand', 'receptor', 'pathway', 'signaling'] # return df_ligrec # data_path = os.path.join( # os.path.dirname(__file__), '..', '..', '..', 'data', f'cellchat_{species}.csv') data_path = os.path.join( os.path.dirname(__file__), '..', '..', 'SpaceTravLR_data', f'cellchat_{species}.csv') return pd.read_csv(data_path)
[docs] def expand_paired_interactions(df): expanded_rows = [] for _, row in df.iterrows(): ligands = row['ligand'].split('_') receptors = row['receptor'].split('_') for ligand in ligands: for receptor in receptors: new_row = row.copy() new_row['ligand'] = ligand new_row['receptor'] = receptor expanded_rows.append(new_row) df = pd.DataFrame(expanded_rows) return df
def encode_labels(labels, reverse_dict=False): unique_labels = sorted(list(set(labels))) if reverse_dict: return {label: i for i, label in enumerate(unique_labels)} return {i: label for i, label in enumerate(unique_labels)} class GeneRegulatoryNetwork: ''' A completely unused class, but used for testing ''' def __init__(self, organism='mouse'): if organism == 'mouse': # self.data = co.data.load_mouse_scATAC_atlas_base_GRN() # data_path = os.path.join( # os.path.dirname(__file__), '..', '..', '..', 'data', 'mm9_mouse_atac_atlas_data_TSS.parquet') data_path = os.path.join( os.path.dirname(__file__), '..', '..', 'SpaceTravLR_data', 'mouse_base_grn.parquet') self.data = pd.read_parquet(data_path) def get_regulators(self, adata, target_gene): base_GRN = self.data df = base_GRN[base_GRN.gene_short_name==target_gene][ np.intersect1d(adata.var_names, base_GRN[base_GRN.gene_short_name==target_gene].columns)].sum() df = df[df!=0] return df.index.tolist() class CellOracleLinks: def __init__(self): pass def get_regulators(self, adata, target_gene, alpha=0.05): regulators_with_pvalues = self.get_regulators_with_pvalues(adata, target_gene, alpha) grouped_regulators = regulators_with_pvalues.groupby('source').mean() filtered_regulators = grouped_regulators[grouped_regulators.index.isin(adata.var_names)] return filtered_regulators.index.tolist() def get_targets(self, adata, tf, alpha=0.05): targets_with_pvalues = self.get_targets_with_pvalues(adata, tf, alpha) grouped_targets = targets_with_pvalues.groupby('target').mean() filtered_targets = grouped_targets[grouped_targets.index.isin(adata.var_names)] return filtered_targets.index.tolist() def get_regulators_with_pvalues(self, adata, target_gene, alpha=0.05): assert target_gene in adata.var_names, f'{target_gene} not in adata.var_names' co_links = pd.concat( [link_data.query(f'target == "{target_gene}" and p < {alpha}')[['source', 'coef_mean']] for link_data in self.links.values()], axis=0).reset_index(drop=True) return co_links.query(f'source.isin({str(list(adata.var_names))})').reset_index(drop=True) def get_targets_with_pvalues(self, adata, tf, alpha=0.05): assert tf in adata.var_names, f'{tf} not in adata.var_names' co_links = pd.concat( [link_data.query(f'source == "{tf}" and p < {alpha}')[['target', 'coef_mean']] for link_data in self.links.values()], axis=0).reset_index(drop=True) return co_links.query(f'target.isin({str(list(adata.var_names))})').reset_index(drop=True) @staticmethod def get_training_genes(co_links, gene_kos, n_propagation=3): grn = nx.DiGraph() edges = [] for cluster, df in co_links.items(): cluster_edges = [(u, v) for u, v in zip(df['source'], df['target'])] edges.extend(cluster_edges) grn.add_edges_from(edges) train_genes = [] for ko in gene_kos: genes = [node for node, distance in nx.single_source_shortest_path_length( grn, ko, cutoff=n_propagation).items()] train_genes.extend(genes) return np.unique(train_genes) class RegulatoryFactory(CellOracleLinks): def __init__(self, colinks_path=None, links=None, annot='cell_type_int'): self.colinks_path = colinks_path self.annot = annot if colinks_path is not None: with open(self.colinks_path, 'rb') as f: self.links = pickle.load(f) elif links is not None: self.links = links self.cluster_labels = encode_labels(self.links.keys()) def get_cluster_regulators(self, adata, target_gene, alpha=0.05): adata_clusters = np.unique(adata.obs[self.annot]) regulator_dict = {} all_regulators = set() for label in adata_clusters: grn_df = self.links[label] grn_df = grn_df[(grn_df.target == target_gene) & (grn_df.p <= alpha)] tfs = list(grn_df.source) regulator_dict[label] = tfs all_regulators.update(tfs) all_regulators = all_regulators & set(adata.to_df().columns) # only use genes also in adata all_regulators = sorted(list(all_regulators)) regulator_masks = {} for label, tfs in regulator_dict.items(): indices = [all_regulators.index(tf)+1 for tf in tfs if tf in all_regulators] mask = torch.zeros(len(all_regulators) + 1) # prepend 1 for beta0 mask[[0] + indices] = 1 regulator_masks[label] = mask self.regulator_dict = regulator_masks return all_regulators class GeneralRegulatoryNetwork(CellOracleLinks): def __init__(self, colinks_path, annot): ''' A general class for loading CellOracle GRNs Assumes lexicographical order for cluster labels ''' assert colinks_path.endswith('.pkl'), 'colinks_path should be a pickle file' with open(colinks_path, 'rb') as f: self.links = pickle.load(f) self.annot = annot def get_cluster_regulators(self, adata, target_gene, alpha=0.05): adata_clusters = np.unique(adata.obs[self.annot]) regulator_dict = {} all_regulators = set() for cluster, label in enumerate(adata_clusters): grn_df = self.links[cluster] grn_df = grn_df[(grn_df.target == target_gene) & (grn_df.p <= alpha)] tfs = list(grn_df.source) regulator_dict[label] = tfs all_regulators.update(tfs) all_regulators = all_regulators & set(adata.to_df().columns) # only use genes also in adata all_regulators = sorted(list(all_regulators)) regulator_masks = {} for label, tfs in regulator_dict.items(): indices = [all_regulators.index(tf)+1 for tf in tfs if tf in all_regulators] mask = torch.zeros(len(all_regulators) + 1) # prepend 1 for beta0 mask[[0] + indices] = 1 regulator_masks[label] = mask self.regulator_dict = regulator_masks return all_regulators class SurveyRegulatoryNetwork(CellOracleLinks): def __init__(self): self.base_pth = os.path.join( os.path.dirname(__file__), '..', '..', '..', 'data') with open(self.base_pth+'/survey/celloracle_links_spleen.pkl', 'rb') as f: self.links = pickle.load(f) self.cluster_labels = { '8': 'T', '4': 'Neutrophil', '5': 'Plasma_Cell', '0': 'B', '2': 'Macrophage', '3': 'NK', '6': 'Platelet', '7': 'RBC', '1': 'DC' } self.annot = 'cluster' def get_cluster_regulators(self, adata, target_gene, alpha=0.05): adata_clusters = np.unique(adata.obs[self.annot]) regulator_dict = {} all_regulators = set() for label in adata_clusters: cluster = self.cluster_labels[str(label)] grn_df = self.links[cluster] grn_df = grn_df[(grn_df.target == target_gene) & (grn_df.p <= alpha)] tfs = list(grn_df.source) regulator_dict[label] = tfs all_regulators.update(tfs) all_regulators = all_regulators & set(adata.to_df().columns) # only use genes also in adata all_regulators = sorted(list(all_regulators)) regulator_masks = {} for label, tfs in regulator_dict.items(): indices = [all_regulators.index(tf)+1 for tf in tfs if tf in all_regulators] mask = torch.zeros(len(all_regulators) + 1) # prepend 1 for beta0 mask[[0] + indices] = 1 regulator_masks[label] = mask self.regulator_dict = regulator_masks return all_regulators class DayThreeRegulatoryNetwork(CellOracleLinks): """ CellOracle infered GRN These are dataset specific and come with estimated betas and p-values """ def __init__(self): self.base_pth = os.path.join( os.path.dirname(__file__), '..', '..', '..', 'data') with open(self.base_pth+'/slideseq/celloracle_links_day3_1.pkl', 'rb') as f: self.links = pickle.load(f) self.annot = 'rctd_cluster' with open(os.path.join(self.base_pth, 'celltype_assign.json'), 'r') as f: self.cluster_labels = json.load(f) def get_cluster_regulators(self, adata, target_gene, alpha=0.05): adata_clusters = np.unique(adata.obs[self.annot]) regulator_dict = {} all_regulators = set() for label in adata_clusters: # cluster = self.cluster_labels[str(label)] cluster = label grn_df = self.links[cluster] grn_df = grn_df[(grn_df.target == target_gene) & (grn_df.p <= alpha)] tfs = list(grn_df.source) regulator_dict[label] = tfs all_regulators.update(tfs) all_regulators = all_regulators & set(adata.to_df().columns) # only use genes also in adata all_regulators = sorted(list(all_regulators)) regulator_masks = {} for label, tfs in regulator_dict.items(): indices = [all_regulators.index(tf)+1 for tf in tfs if tf in all_regulators] mask = torch.zeros(len(all_regulators) + 1) # prepend 1 for beta0 mask[[0] + indices] = 1 regulator_masks[label] = mask self.regulator_dict = regulator_masks return all_regulators class MouseKidneyRegulatoryNetwork(CellOracleLinks): def __init__(self): self.base_pth = os.path.join( os.path.dirname(__file__), '..', '..', '..', 'data') with open(self.base_pth+'/kidney/celloracle_links.pkl', 'rb') as f: self.links = pickle.load(f) self.annot = 'cluster_cat' with open(os.path.join(self.base_pth, 'kidney/celltype_assign.json'), 'r') as f: self.cluster_labels = json.load(f) def get_cluster_regulators(self, adata, target_gene, alpha=0.05): adata_clusters = np.unique(adata.obs[self.annot]) regulator_dict = {} all_regulators = set() for label in adata_clusters: cluster = self.cluster_labels[str(label)] grn_df = self.links[cluster] grn_df = grn_df[(grn_df.target == target_gene) & (grn_df.p <= alpha)] tfs = list(grn_df.source) regulator_dict[label] = tfs all_regulators.update(tfs) all_regulators = all_regulators & set(adata.to_df().columns) # only use genes also in adata all_regulators = sorted(list(all_regulators)) regulator_masks = {} for label, tfs in regulator_dict.items(): indices = [all_regulators.index(tf)+1 for tf in tfs if tf in all_regulators] mask = torch.zeros(len(all_regulators) + 1) # prepend 1 for beta0 mask[[0] + indices] = 1 regulator_masks[label] = mask self.regulator_dict = regulator_masks return all_regulators class MouseSpleenRegulatoryNetwork(CellOracleLinks): def __init__(self): self.base_pth = '/ix/djishnu/alw399/SpaceOracle/data' with open(self.base_pth+'/spleen/celloracle_links.pkl', 'rb') as f: self.links = pickle.load(f) self.annot = 'clusters' self.cluster_labels = { "0": "B cell", "1": "CD4 T cell", "2": "CD8 T cell", "3": "Memory T cell", "4": "Myeloid" } def get_cluster_regulators(self, adata, target_gene, alpha=0.05): adata_clusters = np.unique(adata.obs[self.annot]) regulator_dict = {} all_regulators = set() for label in adata_clusters: cluster = self.cluster_labels[str(label)] # cluster = str(label) grn_df = self.links[cluster] grn_df = grn_df[(grn_df.target == target_gene) & (grn_df.p <= alpha)] tfs = list(grn_df.source) regulator_dict[label] = tfs all_regulators.update(tfs) all_regulators = all_regulators & set(adata.to_df().columns) # only use genes also in adata all_regulators = sorted(list(all_regulators)) regulator_masks = {} for label, tfs in regulator_dict.items(): indices = [all_regulators.index(tf)+1 for tf in tfs if tf in all_regulators] mask = torch.zeros(len(all_regulators) + 1) # prepend 1 for beta0 mask[[0] + indices] = 1 regulator_masks[label] = mask self.regulator_dict = regulator_masks return all_regulators class HumanMelanomaRegulatoryNetwork(CellOracleLinks): def __init__(self): self.base_pth = '/ix/djishnu/alw399/SpaceOracle/data' with open(self.base_pth+'/melanoma/celloracle_links.pkl', 'rb') as f: self.links = pickle.load(f) self.annot = 'cluster_cat' self.cluster_labels = { '2': 'CD8_T', '10': 'tumour_1', '3': 'T_reg', '1': 'CD4_T', '7': 'mono-mac', '9': 'plasma', '8': 'pDC', '6': 'mDC', '5': 'fibroblast', '11': 'tumour_2', '0': 'B_cell', '4': 'endothelial' } def get_cluster_regulators(self, adata, target_gene, alpha=0.05): adata_clusters = np.unique(adata.obs[self.annot]) regulator_dict = {} all_regulators = set() for label in adata_clusters: cluster = self.cluster_labels[str(label)] # cluster = str(label) grn_df = self.links[cluster] grn_df = grn_df[(grn_df.target == target_gene) & (grn_df.p <= alpha)] tfs = list(grn_df.source) regulator_dict[label] = tfs all_regulators.update(tfs) all_regulators = all_regulators & set(adata.to_df().columns) # only use genes also in adata all_regulators = sorted(list(all_regulators)) regulator_masks = {} for label, tfs in regulator_dict.items(): indices = [all_regulators.index(tf)+1 for tf in tfs if tf in all_regulators] mask = torch.zeros(len(all_regulators) + 1) # prepend 1 for beta0 mask[[0] + indices] = 1 regulator_masks[label] = mask self.regulator_dict = regulator_masks return all_regulators class HumanTonsilNetwork(CellOracleLinks): def __init__(self): self.base_pth = os.path.join( os.path.dirname(__file__), '..', '..', '..', 'data') with open(self.base_pth+'/BaseGRNs/tonsil_celloracle.pkl', 'rb') as f: self.links = pickle.load(f) self.annot = 'cluster' self.cluster_labels = { 0: 'Plasma Cells', 1: 'Cycling B Cells', 2: 'Follicular Dendritic Cells ', 3: 'Dark Zone B Cells', 4: 'IFN B Cells', 5: 'T Cells', 6: 'Light Zone B Cells', 7: 'Memory B Cells', 8: 'Naive B Cells', 9: 'GC-Tfh' } def get_cluster_regulators(self, adata, target_gene, alpha=0.05): adata_clusters = np.unique(adata.obs[self.annot]) regulator_dict = {} all_regulators = set() for label in adata_clusters: cluster = self.cluster_labels[int(label)] grn_df = self.links[cluster] grn_df = grn_df[(grn_df.target == target_gene) & (grn_df.p <= alpha)] tfs = list(grn_df.source) regulator_dict[label] = tfs all_regulators.update(tfs) all_regulators = all_regulators & set(adata.to_df().columns) # only use genes also in adata all_regulators = sorted(list(all_regulators)) regulator_masks = {} for label, tfs in regulator_dict.items(): indices = [all_regulators.index(tf)+1 for tf in tfs if tf in all_regulators] mask = torch.zeros(len(all_regulators) + 1) # prepend 1 for beta0 mask[[0] + indices] = 1 regulator_masks[label] = mask self.regulator_dict = regulator_masks return all_regulators class HumanTonsilRegulatoryNetwork(CellOracleLinks): def __init__(self): self.base_pth = os.path.join( os.path.dirname(__file__), '..', '..', '..', 'data') with open(self.base_pth+'/slidetags/tonsil_colinks.pkl', 'rb') as f: self.links = pickle.load(f) self.annot = 'cell_type_int' self.cluster_labels = {0: 'B_germinal_center', 1: 'B_memory', 2: 'B_naive', 3: 'FDC', 4: 'NK', 5: 'T_CD4', 6: 'T_CD8', 7: 'T_double_neg', 8: 'T_follicular_helper', 9: 'mDC', 10: 'myeloid', 11: 'pDC', 12: 'plasma' } def get_cluster_regulators(self, adata, target_gene, alpha=0.05): adata_clusters = np.unique(adata.obs[self.annot]) regulator_dict = {} all_regulators = set() for label in adata_clusters: cluster = self.cluster_labels[label] # cluster = str(label) grn_df = self.links[cluster] grn_df = grn_df[(grn_df.target == target_gene) & (grn_df.p <= alpha)] tfs = list(grn_df.source) regulator_dict[label] = tfs all_regulators.update(tfs) all_regulators = all_regulators & set(adata.to_df().columns) # only use genes also in adata all_regulators = sorted(list(all_regulators)) regulator_masks = {} for label, tfs in regulator_dict.items(): indices = [all_regulators.index(tf)+1 for tf in tfs if tf in all_regulators] mask = torch.zeros(len(all_regulators) + 1) # prepend 1 for beta0 mask[[0] + indices] = 1 regulator_masks[label] = mask self.regulator_dict = regulator_masks return all_regulators