Source code for CoRe.fnGO

"""
Functions for Gene Ontology (GO) overrepresentation analysis using the communicated set of proteins.
"""

import numpy as np
import pandas as pd
from glob import glob
import os
from scipy.stats import hypergeom as hg
#from multipy.fdr import qvalue
from statsmodels.stats.multitest import fdrcorrection
from itertools import compress

# def qvalue(pvals, threshold=0.05, verbose=True):
#     """Function for estimating q-values from p-values using the Storey-
#     Tibshirani q-value method (2003).
#     Input arguments:
#     ================
#     pvals       - P-values corresponding to a family of hypotheses.
#     threshold   - Threshold for deciding which q-values are significant.
#     Output arguments:
#     =================
#     significant - An array of flags indicating which p-values are significant.
#     qvals       - Q-values corresponding to the p-values.
#     """
#
#     """Count the p-values. Find indices for sorting the p-values into
#     ascending order and for reversing the order back to original."""
#     m, pvals = len(pvals), np.asarray(pvals)
#     ind = np.argsort(pvals)
#     rev_ind = np.argsort(ind)
#     pvals = pvals[ind]
#
#     # Estimate proportion of features that are truly null.
#     kappa = np.arange(0, 0.96, 0.01)
#     pik = [sum(pvals > k) / (m*(1-k)) for k in kappa]
#     cs = UnivariateSpline(kappa, pik, k=3, s=None, ext=0)
#     pi0 = float(cs(1.))
#     if (verbose):
#         print('The estimated proportion of truly null features is %.3f' % pi0)
#
#     """The smoothing step can sometimes converge outside the interval [0, 1].
#     This was noted in the published literature at least by Reiss and
#     colleagues [4]. There are at least two approaches one could use to
#     attempt to fix the issue:
#     (1) Set the estimate to 1 if it is outside the interval, which is the
#         assumption in the classic FDR method.
#     (2) Assume that if pi0 > 1, it was overestimated, and if pi0 < 0, it
#         was underestimated. Set to 0 or 1 depending on which case occurs.
#     Here we have chosen the first option, since it is the more conservative
#     one of the two.
#     """
#     if (pi0 < 0 or pi0 > 1):
#         pi0 = 1
#         print('Smoothing estimator did not converge in [0, 1]')
#
#     # Compute the q-values.
#     qvals = np.zeros(np.shape(pvals))
#     qvals[-1] = pi0*pvals[-1]
#     for i in np.arange(m-2, -1, -1):
#         qvals[i] = min(pi0*m*pvals[i]/float(i+1), qvals[i+1])
#
#     # Test which p-values are significant.
#     significant = np.zeros(np.shape(pvals), dtype='bool')
#     significant[ind] = qvals<threshold
#
#     """Order the q-values according to the original order of the p-values."""
#     qvals = qvals[rev_ind]
#     return significant, qvals

[docs]def readGOsets(GO_file,GO_category): """ Reads the gene ontology data set as a python dictionary. Parameters ---------- GO_file: string Name of the file containing Gene Ontology gene sets. GO_category: string Name of the gene ontology category, one among the three, 'GOBP', 'GOCC', or 'GOMF'. Returns ------- GO_BPs: dict Dict with gene ontology gene set as keys and the list of associated genes as dictionary entries. total_unique_genes: list Names of unique genes in the total gene ontology data set. """ rf = open(GO_file,'r') all_lines = rf.readlines() rf.close() GO_BPs = {} total_unique_genes = [] for l in all_lines: this_set = l.rstrip('\r\n').split('\t') if GO_category in this_set[0]: #GO_BPs[this_set[0]] = this_set[2:] GO_BPs[this_set[0]] = pd.DataFrame(this_set[2:]) GO_BPs_dict[this_set[0]] = this_set[2:] for g in this_set[2:]: if g not in total_unique_genes: total_unique_genes.append(g) total_gene_size = len(total_unique_genes) return GO_BPs, total_unique_genes
[docs]def findGOembedding(GO_sets,outputfile): """ Identifies that gene sets that are embedded within other gene sets. The gene sets that are not embedded in any other gene set are returned as dictionary keys with an empty list as the entry. Parameters ---------- GO_sets: dict Dictionary with gene set name as keys and the list of associated genes as entries. outputfile: string Name of the file to store the output. Returns ------- GO_embedding: dict Dict with the index of gene ontology set name as keys and the list of indices of the gene sets that contains the key gene set. """ GO_embedding = {} GO_BP_names = list(GO_sets.keys()) lf = open('GOBP_list.csv','w') lf.close() for i in range(0,len(GO_BP_names)): bp = GO_BP_names[i] GO_embedding[i] = [] for i_1 in range(0,len(GO_BP_names)): bp_1 = GO_BP_names[i_1] this_gene_set_size = len(GO_sets[bp_1]) outline = str(i_1) for i_2 in range(0,len(GO_BP_names)): bp_2 = GO_BP_names[i_2] if len(GO_BPs[bp_2])>this_gene_set_size: check = 0 for gene in GO_sets[bp_1]: if gene in GO_sets[bp_2]: check += 1 if check==this_gene_set_size: GO_embedding[i_1].append(i_2) outline += ','+str(i_2) print(bp_1,file=open('GOBP_list.csv','a')) wf = open(outputfile,'w') for bp in GO_embedding.keys(): outline = bp for bp_in in GO_embedding[bp]: outline += ','+bp_in print(outline,file=wf) wf.close() return GO_embedding
[docs]def findGOcontainer(GO_embedding,outputfile): """ Identifies that gene sets that contains other gene sets. The gene sets that do not contain other gene sets are returned as dictionary keys with an empty list as the entry. Parameters ---------- GO_sets: dict Dictionary with gene set index as keys and the list of gene set that contains it as entries. outputfile: string Name of the file to store the output. Returns ------- GO_container: dict Dict with the index of gene ontology set name as keys and the list of indices of the gene sets that are contained in the key gene set. """ GO_container = {} for bp in GO_embedding.keys(): GO_container[bp] = [] for bp_1 in GO_container.keys(): for bp_2 in GO_embedding.keys(): if bp_1 in GO_embedding[bp_2]: GO_container[bp_1].append(bp_2) wf = open(outputfile,'w') for bp in GO_container.keys(): outline = str(bp) for bp_in in GO_container[bp]: outline += ','+str(bp_in) print(outline,file=wf) wf.close() return GO_container
[docs]def MinMaxGOsets(GO_embedding,GO_container,GO_BP_names): """ Identifies the minimum gene sets that do not contain any other gene sets and the maximum gene sets that are not contained in any other gene sets. Parameters ---------- GO_embedding: dict Dict with the index of gene ontology set name as keys and the list of indices of the gene sets that contains the key gene set. GO_container: dict Dict with the index of gene ontology set name as keys and the list of indices of the gene sets that are contained in the key gene set. GO_BP_names: list Names of gene ontology gene sets. """ wf = open('minimum_BP_set.csv','w') for bp in GO_container.keys(): if len(GO_container[bp])==0: print(GO_BP_names[bp],file=wf) wf.close() wf = open('maximum_BP_set.csv','w') for bp in GO_embedding.keys(): if len(GO_embedding[bp])==0: print(GO_BP_names[bp],file=wf) wf.close()
[docs]def readGOBPs(GO_directory): """ Reads the gene sets associated with Gene Ontology Biological Processes. Parameters ---------- GO_directory: Directory containing the GO data files from `Moleculary Signatures Database <https://www.gsea-msigdb.org/gsea/msigdb/>`_. This directory contains a set of .csv files with GOBP names as filenames containing the list of associated genes. Returns ------- GO_BPs: dict Dict with GOBP names as keys and the list of associated genes as the dictionary entry. """ os.chdir(GO_directory) all_GO_files = glob('*.csv') GO_BPs = {} GO_BPs_dict = {} for f in all_GO_files: k = f.strip('.csv') GO_BPs[k] = pd.read_csv(f,header=None) GO_BPs_dict[k] = pd.read_csv(f,header=None)[0].to_list() return GO_BPs, GO_BPs_dict
[docs]def total_genes(GOBPs): """ Determines the list of unique genes across all the GO gene sets. Parameters ---------- GO_BPs: dict Dict with GOBP names as keys and the list of associated genes as the dictionary entry. Returns ------- unique_genes: list Name of genes present in the gene set database. """ gsets = list(GOBPs.keys()) union = pd.DataFrame(GOBPs[gsets[0]]) for g in gsets[1:]: union = union.merge(GOBPs[g],how='outer').drop_duplicates([0]) return union[0].to_list()
[docs]def p_adjust_bh(p): """ Benjamini-Hochberg p-value correction for multiple hypothesis testing. Parameters ---------- p: array_like A list or array of p-values, from Fisher's exact text, for multiple hypothesis. Returns ------- q: array_like Adjusted positive False Discovery Rate (pFDR). """ p = np.asfarray(p) by_descend = p.argsort()[::-1] by_orig = by_descend.argsort() steps = float(len(p)) / np.arange(float(len(p)), 0, -1) q = np.minimum(1, np.minimum.accumulate(steps * p[by_descend])) return q[by_orig]
[docs]def compute_q_values(p_values,go_names,go_tags,alpha=0.01,return_all=False): """ Benjamini-Hochberg p-value correction for multiple hypothesis testing. Parameters ---------- p_values: array_like A list or array of p-values, from Fisher's exact text, for multiple hypothesis. Returns ------- q_values: array_like Sorted positive False Discovery Rate corrected for multiple hypothesis testing. p_values_go: array_like Names of the associated gene ontology biological processes. """ tmp = sorted(zip(go_tags, go_names, p_values),key=lambda x: x[2]) p_values_go_tags = [x[0] for x in tmp] p_values_go = [x[1] for x in tmp] p_values = [x[2] for x in tmp] p_values_arr = np.array(p_values) q_values = p_adjust_bh(p_values) rej, qs = fdrcorrection(p_values,alpha) q_tags = list(compress(p_values_go_tags,rej)) q_gos = list(compress(p_values_go,rej)) q_vs = qs[rej] #_, q_values = qvalue(p_values,threshold=0.01) if return_all==False: return q_tags, q_gos, q_vs else: return p_values_go_tags, p_values_go, q_values
[docs]def compute_p_values(sources,GO_BPs,interaction_set,total_genes,minimum_GOBP=False,size_threshold=np.inf,full=False): """ Benjamini-Hochberg p-value correction for multiple hypothesis testing. Parameters ---------- sources: list Names of factors that are causing the information transfer in the network. minimum_GOBP: list Names of gene sets for biological processes at the lowest level, i.e. these sets do not contain other gene sets. GO_BPs: array_like Gene sets for Gene Ontology Biological Processes. interaction_set: dict Set of genes receiving information from the sources. total_genes: int Total number of unique genes across all gene sets. Returns ------- go_names: dict Gene Ontology Biological Processes that are over-represented by the sources. p_values: dict Fisher's exact test p-value for Gene Ontology over-represenation analysis. """ go_tags, go_names, p_values = {}, {}, {} if minimum_GOBP==False: all_gene_sets = list(GO_BPs.keys()) else: all_gene_sets = minimum_GOBP for s_g in sources: len_interaction = int(interaction_set[s_g][0].count()) go_tags[s_g] = [] go_names[s_g] = [] p_values[s_g] = [] #for go in minimum_GOBP: for go in all_gene_sets: if int(GO_BPs[go].count())<=size_threshold: intersection = pd.merge(GO_BPs[go], interaction_set[s_g], how='inner').drop_duplicates([0]) len_intersection = int(intersection[0].count()) if len_intersection>0: len_gene_BP = int(GO_BPs[go].count()) pvalue = hg.sf(len_intersection-1, total_genes, len_gene_BP, len_interaction) p_values[s_g].append(pvalue) go_names[s_g].append(go.replace('_',' ').replace('GOBP ','')) go_tags[s_g].append(go) #if s_g=='SARS-CoV2 nsp7': # print(s_g,go,len_intersection,pvalue,intersection[0]) return go_tags, go_names, p_values def read_embedding(filename): wf = open(filename) all_lines = wf.readlines() wf.close() embed_idx = {} for l in all_lines: all_values = l.rstrip('\r\n').split(',') embed_idx[int(all_values[0])] = [] for k in all_values[1:]: embed_idx[int(all_values[0])].append(int(k)) return embed_idx