PrismEXP


An appyter interface to compute gene function predictions
This Appyter calculates predictions using the PrismEXP algorithm. It utilizes precomputed correlation scores and a pretrained model. The correlation scores are computed for 51 gene expression clusters in ARCHS4.

'GRIN3B'

'GO_Biological_Process_2018'
In [1]
import h5py as h5
import s3fs
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import List, Dict
import urllib.request
import json
import hashlib
import shutil
import ssl
import os
import sys
import re
import itertools
import pickle
import warnings
import scipy.stats as st
from lightgbm import LGBMRegressor
from sklearn.metrics import roc_auc_score, roc_curve, auc
from matplotlib import pyplot as plt
from IPython.display import display, FileLink, Markdown, HTML
In [2]
LIBRARY_LIST_URL="https://maayanlab.cloud/speedrichr/api/listlibs"
LIBRARY_DOWNLOAD_URL="https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName="
S3_URL="s3://mssm-prismx-100/"
GMT_EXAMPLE="https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName=KEGG_2019_Mouse"
In [3]
# Use the selected user input
GENE = 'GRIN3B'
LIBRARY = 'GO_Biological_Process_2018'
ENRICHR_MODE = True

S3 Data Query

The following functions are used to load gene correlation directly from precomputed correlation matrices stored on S3.

In [4]

def loadGenesS3():
    genes = 0
    s3 = s3fs.S3FileSystem(anon=True)
    with h5.File(s3.open(S3_URL+"correlation_0.h5", 'rb'), 'r', lib_version='latest') as f:
        genes = np.array([s.decode("UTF-8") for s in f["meta/genes"]])
    return [x.upper() for x in list(genes)]

def loadCorrelationS3(gene, genes, cormat):
    cor = 0
    s3 = s3fs.S3FileSystem(anon=True)
    with h5.File(s3.open(S3_URL+"correlation_"+str(cormat)+".h5", 'rb'), 'r', lib_version='latest') as f:
        idx = genes.index(gene.upper())
        cor = np.array(f["data/correlation"][idx,:]).astype(np.float64)
    return(cor)

def geneCorrelation(gene):
    cormats = list(range(100))
    cormats.append("global")
    genes = loadGenesS3()
    results = []
    pbar = tqdm(cormats)
    for i in pbar:
        pbar.set_description("Retrieve %s" % i)
        results.append(loadCorrelationS3(gene, genes, i))
    results = pd.DataFrame(np.array(results).T, index=genes)
    return(results)

Load GMT

These functions are used to load Enrichr gene set libraries.

In [5]
def getDataPath() -> str:
    path = os.path.join(
        os.path.dirname(__file__),
        'data/'
    )
    return(path)

def listLibraries():
    return(loadJSON(LIBRARY_LIST_URL)["library"])

def loadLibrary(library: str, overwrite: bool = False) -> str:
    ssl._create_default_https_context = ssl._create_unverified_context
    if not os.path.exists("gmts/"+library or overwrite):
        os.makedirs("gmts", exist_ok=True)
        print("Download Enrichr geneset library")
        urllib.request.urlretrieve(LIBRARY_DOWNLOAD_URL+library, "gmts/"+library+".gmt")
    else:
        print("File cached. To reload use loadLibrary(\""+library+"\", overwrite=True) instead.")
    lib, rlib, ugenes = readGMT("gmts/"+library+".gmt")
    print("# genesets: "+str(len(lib))+"\n# unique genes: "+str(len(ugenes)))
    return("gmts/"+library)

def printLibraries():
    libs = listLibraries()
    for i in range(0, len(libs)):
        print(str(i)+" - "+libs[i])

def loadJSON(url):
    context = ssl._create_unverified_context()
    req = urllib.request.Request(url)
    r = urllib.request.urlopen(req, context=context).read()
    return(json.loads(r.decode('utf-8')))

def readGMT(gmtFile: str, backgroundGenes: List[str] = [""], verbose=False) -> List:
    file = open(gmtFile, 'r')
    lines = file.readlines()
    library = {}
    for line in lines:
        sp = line.strip().upper().split("\t")
        sp2 = [re.sub(",.*", "",value) for value in sp[2:]]
        if len(backgroundGenes) > 1:
            backgroundGenes = [x.upper() for x in backgroundGenes]
            library[sp[0]] = [value for value in sp2 if value in backgroundGenes]
        else:
            library[sp[0]] = sp2
    ugenes = list(set(list(itertools.chain.from_iterable(library.values()))))
    ugenes.sort()
    rev_library = {}
    for ug in ugenes:
        rev_library[ug] = []
    for se in library.keys():
        for ge in library[se]:
            rev_library[ge].append(se)
    if verbose:
        print("Library loaded. Library contains "+str(len(library))+" gene sets. "+str(len(ugenes))+" unique genes found.")
    return [library, rev_library, ugenes]

Initialize Gene Set Library

Load library from Enrichr or use custom GMT file.

In [6]
if ENRICHR_MODE:
    libraryPath = loadLibrary(LIBRARY)
else:
    os.makedirs("gmts", exist_ok=True)
    shutil.copy2(LIBRARY, "gmts/"+LIBRARY)
    LIBRARY = LIBRARY.replace(".gmt", "")

library, rev_library, ugenes = readGMT("gmts/"+LIBRARY+".gmt", loadGenesS3())
Download Enrichr geneset library
# genesets: 5103

# unique genes: 14433

Load Gene Correlation

Gene correlation matrices are precalculated for 101 gene expression matrices from ARCHS4. The matrices are stored in AWS S3 and can be queried for target genes of interest. The extraction can take a moment to finish.

In [7]
correlation = geneCorrelation(GENE)
correlation
0 1 2 3 4 5 6 7 8 9 ... 91 92 93 94 95 96 97 98 99 100
A1BG 0.124634 0.059723 0.035187 0.084595 -0.005825 0.261230 0.057159 0.018311 0.013390 0.016464 ... 0.226318 0.048584 0.021378 0.212280 -0.030914 0.024704 0.034088 0.160645 0.170654 0.054962
A1BG-AS1 0.097534 0.114258 -0.017853 -0.076172 0.123474 0.013176 0.028412 -0.014328 -0.084900 0.006016 ... 0.219604 0.091919 -0.024307 0.004639 -0.016678 -0.028015 0.184326 0.325684 0.181641 0.009071
A1CF -0.095093 0.055298 0.056580 0.041290 -0.134766 0.010666 -0.052643 -0.009621 -0.063232 0.016556 ... -0.090210 0.010689 0.011787 0.032349 -0.034088 0.049225 -0.304443 -0.111694 -0.077942 -0.029175
A2M -0.047333 -0.413086 -0.022827 0.038910 -0.021500 0.056702 -0.040710 -0.023483 0.052979 0.021957 ... 0.015022 0.038818 -0.000196 0.041931 -0.012108 -0.001429 -0.049438 -0.291260 -0.037384 -0.084900
A2M-AS1 -0.173462 0.076660 0.037659 0.029358 0.008408 0.151123 0.021484 -0.011169 0.087219 0.043793 ... -0.019928 0.104187 0.007328 0.121399 0.007523 0.106079 0.034821 -0.013359 0.107666 0.107849
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ZYG11A -0.002436 0.058929 0.035675 0.169800 -0.034088 -0.087708 0.028122 0.010765 0.152710 -0.050995 ... -0.085571 -0.035095 -0.058777 -0.040436 -0.038635 0.036072 -0.049316 0.217896 -0.132080 0.007957
ZYG11B -0.041809 0.104309 -0.010658 0.058655 -0.124329 -0.169800 0.028915 -0.001225 -0.012733 0.032928 ... -0.185547 0.160400 -0.017365 -0.149414 -0.201050 -0.009193 -0.323730 -0.198242 -0.351074 -0.041046
ZYX 0.270508 0.404053 -0.029312 0.029114 0.267334 -0.013214 -0.064819 -0.026306 0.206665 0.004322 ... 0.347900 0.017120 -0.000523 0.187744 0.191528 -0.060181 0.351807 0.010773 0.261475 -0.038391
ZZEF1 0.098938 0.079407 0.013603 0.034882 0.061768 -0.028641 0.015022 -0.000592 0.014771 0.018219 ... 0.099487 -0.121277 -0.033234 0.431641 0.056427 -0.013329 -0.090637 -0.051025 0.035034 -0.033264
ZZZ3 -0.263184 0.096985 -0.009697 -0.074768 -0.305420 -0.195190 -0.009422 -0.004642 -0.033020 0.022781 ... -0.323730 -0.157104 -0.037292 -0.271973 -0.018311 -0.007324 -0.364990 -0.103943 -0.391602 -0.087769

38721 rows × 101 columns

Retrieve global: 100%|██████████| 101/101 [01:56<00:00,  1.15s/it]
In [8]
def getAverageCorrelation(correlation: pd.DataFrame, library: Dict):
    avgCor = []
    for ll in list(library.keys()):
        avgCor.append(correlation.loc[[x.upper() for x in library[ll]],:].mean(axis=0))
    avgCor = pd.DataFrame(np.array(avgCor).T, columns=list(library.keys()))
    avgCor = avgCor.transpose()
    return(avgCor)

Avg Correlation Scores

The average correlation scores are equivalent to the predictions performed by Geneshot. The values are used as prediction features of the PrismEXP model.

In [9]
avgCor = getAverageCorrelation(correlation, library)
avgCor
0 1 2 3 4 5 6 7 8 9 ... 91 92 93 94 95 96 97 98 99 100
POSITIVE REGULATION OF POSTTRANSCRIPTIONAL GENE SILENCING (GO:0060148) -0.025031 0.101779 -0.007288 0.009927 -0.000630 -0.033964 0.022896 0.000483 -0.012960 0.007964 ... -0.024098 -0.031808 -0.010409 -0.041418 -0.017316 0.001498 -0.061390 -0.046536 0.020630 -0.020820
REGULATION OF CELL CYCLE PROCESS (GO:0010564) -0.038569 0.028895 0.006442 -0.018803 0.004937 -0.012455 0.005042 0.002263 -0.000199 -0.001435 ... -0.006962 0.005159 -0.018907 -0.019413 0.021038 -0.002165 0.039597 0.050724 0.015465 -0.005417
ANGIOTENSIN-ACTIVATED SIGNALING PATHWAY (GO:0038166) 0.013555 0.106603 0.011132 -0.095154 0.069523 -0.044076 0.004615 0.014100 -0.010445 0.012744 ... 0.018912 0.002646 -0.001457 -0.128520 0.019846 0.008258 0.020867 -0.054834 0.070108 -0.001398
DNA-TEMPLATED TRANSCRIPTION, TERMINATION (GO:0006353) -0.124435 0.037998 -0.010352 -0.044983 0.000128 -0.028949 -0.013406 -0.002190 -0.018788 -0.004797 ... -0.062800 0.003059 -0.030954 -0.053735 0.005400 -0.020534 0.038047 0.074211 -0.013088 -0.022533
REGULATION OF PROTEIN SUMOYLATION (GO:0033233) 0.027558 -0.174541 -0.000098 -0.035935 0.036845 -0.007831 -0.022358 -0.003080 -0.003986 -0.000853 ... 0.024268 0.007050 -0.022980 0.011847 0.003837 -0.013287 0.063664 0.029026 0.037003 -0.026151
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
POSITIVE REGULATION OF MITOCHONDRIAL MEMBRANE POTENTIAL (GO:0010918) 0.078115 0.085164 0.018471 -0.004787 0.137957 0.061089 -0.006334 0.002159 0.009700 0.004761 ... 0.113007 0.033915 0.008636 0.073891 0.111577 0.005778 0.218099 0.130605 0.196879 0.031094
TRNA AMINOACYLATION FOR PROTEIN TRANSLATION (GO:0006418) -0.133492 0.083616 -0.013298 -0.062744 -0.058185 -0.071101 -0.029976 -0.002103 -0.046537 -0.008225 ... -0.094450 0.017410 -0.049929 -0.106415 0.036257 -0.027143 0.002111 0.091010 -0.082627 -0.051684
TRYPTOPHAN METABOLIC PROCESS (GO:0006568) -0.008502 0.118024 -0.008794 -0.042967 -0.014853 -0.019007 0.005509 0.003089 -0.080204 -0.001657 ... -0.018553 -0.000363 0.005948 -0.052277 0.026215 0.003939 -0.107077 0.003705 -0.060313 -0.001970
GUANOSINE-CONTAINING COMPOUND METABOLIC PROCESS (GO:1901068) -0.041287 -0.027287 -0.012290 -0.041519 -0.013178 -0.057527 -0.006889 -0.005707 -0.038205 -0.003398 ... -0.100397 0.006525 -0.034964 -0.142550 -0.003650 -0.006915 -0.080788 -0.057118 -0.125534 -0.029501
TRNA MODIFICATION (GO:0006400) -0.103520 0.058738 -0.000257 -0.042976 -0.036245 0.000854 -0.008092 0.001237 -0.036950 -0.003028 ... -0.070902 0.008500 -0.041694 -0.033462 0.027737 -0.006563 0.018815 0.113084 -0.025518 0.003017

5103 rows × 101 columns

In [10]
def loadPrismXModel():
    os.makedirs("model", exist_ok=True)
    urllib.request.urlretrieve("https://"+S3_URL.replace("s3:", "").replace("/", "")+".s3.amazonaws.com/prismxmodel.pkl", "model/prismxmodel.pkl")
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        model = pickle.load(open("model/prismxmodel.pkl", 'rb'))
    return(model)

def prismxPredictions(avgcor, model, verbose: bool=False) -> pd.DataFrame:
    avgcor = avgcor.fillna(0)
    predictions = pd.DataFrame(model.predict(avgcor))
    predictions.index = avgcor.index
    predictions.columns = ["predictions"]
    mean = np.mean(predictions[:], axis=0)
    std = np.std(predictions[:], axis=0)
    zscore = (predictions - mean)/std
    predictions["z-score"] = zscore
    pvalue = 1-st.norm.cdf(zscore)
    predictions["p-value"] = pvalue
    predictions["bonferroni"] = np.where(pvalue*len(pvalue) >= 1, 1, pvalue*len(pvalue))
    return predictions

Apply PrismEXP

The pretrained model is loaded and applied on the correlation features of 51 correlation matrices.

In [11]
model = loadPrismXModel()
predictions = prismxPredictions(avgCor, model)
predictions.to_csv(GENE+"_"+LIBRARY+"_predictions.tsv", sep="\t")
top_predictions = pd.DataFrame(predictions.sort_values(by=["predictions"], ascending=False).iloc[0:20:])
In [12]
def plotROC(fpr, tpr, auc):
    plt.figure(figsize=(12, 7), dpi= 300)
    plt.plot([0, 1], [0, 1], linestyle='--', label='baseline')
    plt.plot(fpr, tpr, linestyle='--', label='PrismEXP')
    plt.text(0.65, 0.1, "AUC: "+str(np.round(auc, decimals=3)), fontsize=18)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.savefig(GENE+"_"+LIBRARY+"_ROC.pdf")  
    plt.savefig(GENE+"_"+LIBRARY+"_ROC.png", dpi=300)
    plt.show()

def calculateGeneAUC(prediction: pd.DataFrame, gene: str, rev_library: Dict, minLibSize: int=1) -> List[float]:
    if gene in list(rev_library.keys()):
        gold = [i in rev_library[gene] for i in prediction.index]
        fpr, tpr, _ = roc_curve(list(gold), list(prediction.iloc[:,0]))
        roc_auc = auc(fpr, tpr)
        plotROC(fpr, tpr, roc_auc)
        return(auc)
    else:
        print("Not enough information to calcualte AUC")
        return(0)

Prediction Validation

The ROC curve shows how well previously known annotations for the gene have been recovered by PrismEXP. For this, all gene sets in the library are ranked by the prediction score in descending order. Previously known associations should rank high. The AUC can vary by gene and gene set library.

In [13]
calculateGeneAUC(predictions, GENE, rev_library)
Output figure
<function sklearn.metrics._ranking.auc(x, y)>

Top Predictions

The top 20 predictions are shown below. The p-values are calculated from the z-scores of the gene set library.

In [14]
top_predictions
predictions z-score p-value bonferroni
CENTRAL NERVOUS SYSTEM DEVELOPMENT (GO:0007417) 0.565020 4.084667 0.000022 0.112623
NEURON DIFFERENTIATION (GO:0030182) 0.558662 4.026773 0.000028 0.144281
NEGATIVE REGULATION OF PHOSPHORYLATION (GO:0042326) 0.549498 3.943324 0.000040 0.205039
RESPONSE TO ALKALOID (GO:0043279) 0.549013 3.938911 0.000041 0.208846
SYNAPTIC TRANSMISSION, CHOLINERGIC (GO:0007271) 0.528007 3.747628 0.000089 0.455481
POSITIVE REGULATION OF TRANSCRIPTION FROM RNA POLYMERASE II PROMOTER (GO:0045944) 0.527575 3.743693 0.000091 0.462677
RESPONSE TO ORGANIC CYCLIC COMPOUND (GO:0014070) 0.525820 3.727714 0.000097 0.493012
RESPONSE TO NUTRIENT LEVELS (GO:0031667) 0.506219 3.549226 0.000193 0.985812
POSITIVE REGULATION OF CELLULAR PROCESS (GO:0048522) 0.505198 3.539936 0.000200 1.000000
GENERATION OF NEURONS (GO:0048699) 0.504013 3.529147 0.000208 1.000000
ION TRANSMEMBRANE TRANSPORT (GO:0034220) 0.501233 3.503824 0.000229 1.000000
GLUCOSAMINE-CONTAINING COMPOUND METABOLIC PROCESS (GO:1901071) 0.499183 3.485162 0.000246 1.000000
REGULATION OF TRANSCRIPTION FROM RNA POLYMERASE II PROMOTER (GO:0006357) 0.497550 3.470290 0.000260 1.000000
POSITIVE REGULATION OF TRANSCRIPTION, DNA-TEMPLATED (GO:0045893) 0.497145 3.466599 0.000264 1.000000
POSITIVE REGULATION OF SIGNALING (GO:0023056) 0.495948 3.455707 0.000274 1.000000
POSITIVE REGULATION OF CELL GROWTH (GO:0030307) 0.495879 3.455077 0.000275 1.000000
NEGATIVE REGULATION OF LYASE ACTIVITY (GO:0051350) 0.495393 3.450651 0.000280 1.000000
REGULATION OF TRANSCRIPTION, DNA-TEMPLATED (GO:0006355) 0.494857 3.445770 0.000285 1.000000
RESPONSE TO ORGANONITROGEN COMPOUND (GO:0010243) 0.494510 3.442611 0.000288 1.000000
REGULATION OF CELL PROLIFERATION (GO:0042127) 0.490614 3.407130 0.000328 1.000000

Download Files

Download full prediction table and high resolution ROC plots. (If plots are opened in the browser the images might require a refresh of the browser page to display)

In [15]
display(FileLink(GENE+"_"+LIBRARY+"_predictions.tsv", result_html_prefix=str('Download prediction table: ')))
display(FileLink(GENE+"_"+LIBRARY+"_ROC.pdf", result_html_prefix=str('Download PDF: ')))
display(FileLink(GENE+"_"+LIBRARY+"_ROC.png", result_html_prefix=str('Download PNG: ')))

References

[1] Kuleshov, Maxim V., Matthew R. Jones, Andrew D. Rouillard, Nicolas F. Fernandez, Qiaonan Duan, Zichen Wang, Simon Koplev et al. "Enrichr: a comprehensive gene set enrichment analysis web server 2016 update." Nucleic acids research 44, no. W1 (2016): W90-W97.

[2] Lachmann, Alexander, Brian M. Schilder, Megan L. Wojciechowicz, Denis Torre, Maxim V. Kuleshov, Alexandra B. Keenan, and Avi Ma’ayan. "Geneshot: search engine for ranking genes from arbitrary text queries." Nucleic acids research 47, no. W1 (2019): W571-W577.

[3] Lachmann, Alexander, Denis Torre, Alexandra B. Keenan, Kathleen M. Jagodnik, Hoyjin J. Lee, Lily Wang, Moshe C. Silverstein, and Avi Ma’ayan. "Massive mining of publicly available RNA-seq data from human and mouse." Nature communications 9, no. 1 (2018): 1-10.

[4] Wang, Yuhang, Fillia S. Makedon, James C. Ford, and Justin Pearlman. "HykGene: a hybrid approach for selecting marker genes for phenotype classification using microarray gene expression data." Bioinformatics 21, no. 8 (2005): 1530-1537.