In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors
from IPython.display import HTML, Markdown
import requests
import time

# bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource

# display graphics
output_notebook()
Loading BokehJS ...
In [2]:
gene = 'DKK2'
cell = ''

RNA-seq-like Gene Centric Signature Reverse Search (RGCSRS)¶

In [3]:
root_path = 'https://appyters.maayanlab.cloud/storage/L1000_RNAseq_Gene_Search'
In [4]:
# load gene info
gene_info = pd.read_csv(f"{root_path}/L1000_to_RNAseq_gene_list.tsv", sep="\t", index_col=0)
landmark = "not"
inferred_l1000 = "not"
if gene_info.loc[gene, "landmark"] == True:
    landmark = ""
if gene_info.loc[gene, "inferred l1000"] == True:
    inferred_l1000 = ""
In [5]:
display(Markdown(f"**Input gene: {gene}**"))
display(Markdown(f"{gene} is {landmark} a landmark gene."))
display(Markdown(f"{gene} is {inferred_l1000} an originally inferred L1000 gene."))
display(Markdown(f"{gene} is a newly inferred (our model) gene."))
display(HTML(f"""More information about {gene} can be found at the <a href="https://cfde-gene-pages.cloud/gene/{gene}?CF=false&PS=true&Ag=true" target="_blank"> Gene and Drug Landing Page Aggregator</a>"""))

Input gene: DKK2

DKK2 is not a landmark gene.

DKK2 is an originally inferred L1000 gene.

DKK2 is a newly inferred (our model) gene.

More information about DKK2 can be found at the Gene and Drug Landing Page Aggregator
In [6]:
display(Markdown(f"**Input cell line: {'All' if cell == '' else cell}**"))

Input cell line: All

This Appyter provides visualizations of the top 5% of RNA-seq-like signatures induced by CRISPR knockouts and chemical perturbagens. Signatures are computed from transformed data profiles from the LINCS L1000 data. The transformation was performed using a two-step model:

  1. A cycleGAN model was used to first predict the RNA-seq expression of the 978 L1000 landmark genes
  2. A fully connected neural network was used to extrapolate the predicted RNA-seq expression of the 978 landmark genes to a full set of 23,164 genes

Signatures were computed using the characteristic direction method (Clark et al., 2014), as implemented here.

Each gene was pre-queried across all available RNA-seq-like signatures, and the top signatures where a gene is up or down-regulated are returned for each gene.

In [7]:
xpr_gene_data = pd.read_feather(f"{root_path}/gene_files/{gene}.f").set_index('index')
if cell != '': 
    xpr_gene_data = xpr_gene_data[xpr_gene_data.index.map(lambda x: x.split('_')[1].split('.')[0] == cell)]

cp_gene_data = pd.read_feather(f"{root_path}/cp_gene_files/{gene}.f").set_index('index')
if cell != '': 
    cp_gene_data = cp_gene_data[cp_gene_data.index.map(lambda x: x.split('_')[1].split('.')[0] == cell)]
In [8]:
def map_color(cd, logfc, red_norm, blue_norm):
    v = cd*logfc

    if v < 0: 
        return '#D3D3D3'
    elif cd < 0:
        return colors.to_hex(cm.get_cmap('Reds')(red_norm(cd*logfc)))
    else:
        return colors.to_hex(cm.get_cmap('Blues')(blue_norm(cd*logfc)))
In [9]:
def make_plot(comb_df, gene, pert_type):
    # check if there are any results
    if comb_df.shape[0] == 0: 
        display(Markdown("### **There are no signatures in the pre-processed dataset for the chosen gene, cell line, and perturbation type inputs.**"))
        return

    # set color and size for each point on plot
    v = (comb_df['logFC']*comb_df['CD']).to_numpy()
    red_norm = colors.Normalize(vmin=min(v)-0.005, vmax=max(v)+0.005)
    blue_norm = colors.Normalize(vmin=min(v)-0.005, vmax=max(v)+0.005)

    plot_colors = [map_color(row.CD, row.logFC, red_norm, blue_norm) for row in comb_df.itertuples()]

    # generate data source
    data_source = ColumnDataSource(
        data=dict(
            x = comb_df['logFC'],
            y = comb_df['CD'].apply(abs),
            cd = comb_df['CD'],
            sig = pd.Series(comb_df.index),
            fc = comb_df['FC'], 
            logfc = comb_df['logFC'],
            colors = plot_colors, 
            sizes = [8] * comb_df.shape[0],
        )
    )

    # create hover tooltip
    tools = [
        ("Signature", "@sig"),
        ("CD Coeff", "@cd"),
        ("Fold Change", "@fc"),
        ("Log2 Fold Change", "@logfc")
    ]
    # generate plot and relevant plot labels
    plot = figure(
        plot_width=700,
        plot_height=500,
        tooltips=tools
    )
    
    plot.circle(
        'x', 'y', 
        size='sizes',
        alpha=0.7, 
        line_alpha=1,
        line_width=1, 
        line_color='colors',
        source=data_source,
        fill_color='colors', 
        name=f"{gene}_expression_in_L1000_to_RNAseq_{pert_type.replace(' ','')}_volcano_plot"
    )

    plot.yaxis.axis_label = 'Abs(CD-Coefficient)'
    plot.xaxis.axis_label = 'log2(Fold Change)'
    plot.title.text = f"Differential Expression of {gene} in RNA-seq-like {pert_type} Signatures"
    plot.title.align = 'center'
    plot.title.text_font_size = '14px'
    show(plot)
In [10]:
def make_tables(comb_df, pert, is_upreg):
    dir_df = comb_df[comb_df['FC'] > 1] if is_upreg else comb_df[comb_df['FC'] < 1]
    if dir_df.shape[0] == 0: 
        display(Markdown(f"### There are no {'up-regulated' if is_upreg else 'down-regulated'} signatures for the chosen gene and cell line inputs."))
        return dir_df
    dir_df = dir_df.sort_values(by='FC', ascending=not is_upreg)
    dir_df['FC'] = dir_df['FC'].apply(lambda x: f'{x:.4f}')
    dir_df['CD'] = dir_df['CD'].apply(lambda x: f'{x:.4f}')
    if pert == 'xpr':
        dir_df['KO Gene'] = dir_df.index.map(lambda x: x.split('_')[4])
    else:
        dir_df['Perturbagen'] = dir_df.index.map(lambda x: x.split('_')[4])
        dir_df['Dose'] = dir_df.index.map(lambda x: x.split('_')[5] if len(x.split('_')) == 6 else '')
    dir_df['Cell Line'] = dir_df.index.map(lambda x: x.split('_')[1])
    dir_df['Timepoint'] = dir_df.index.map(lambda x: x.split('_')[2].lower())
    dir_df = dir_df.rename(columns={
            'FC': 'Fold Change', 
            'logFC': 'Log2(Fold Change)', 
            'CD': 'CD Coefficient',
            'Rank': 'Rank in Signature'})
    dir_df.index.names = ['Signature']
    return dir_df

# create download link for table results
def download_link(df, fname):
    if df.shape[0] == 0: return ''
    csv = df.to_csv(fname, sep='\t', index=True)
    link = f'<div>Download full results: <a href="{fname}" target=_blank>{fname}</a></div>'
    return link
In [11]:
def enrichr(pert, top_perts, direction):
    if pert == 'CRISPR':
        desc = f"Top CRISPR targets from RNA-seq-like signatures that {direction}-regulate {gene}"
        list_url = 'https://maayanlab.cloud/Enrichr/addList'
        enrich_url = 'https://maayanlab.cloud/Enrichr/enrich'
    else:
        desc = f"Top compounds from RNA-seq-like signatures that {direction}-regulate {gene}"
        list_url = 'https://maayanlab.cloud/DrugEnrichr/addList'
        enrich_url = 'https://maayanlab.cloud/DrugEnrichr/enrich'
    payload = {
        'list': (None, '\n'.join(top_perts)),
        'description': (None, desc)
    }
    response = requests.post(list_url, files=payload)
    if not response.ok:
        raise Exception('Error analyzing gene list')
    time.sleep(0.5)
    return f"{enrich_url}?dataset={response.json()['shortId']}"

def enrichr_link(pert, df, direction, gene): 
    # check if there are any results
    if df.shape[0] < 5: 
        return(f"There are not enough {direction}-regulated signatures to submit to {'Enrichr' if pert == 'CRISPR' else 'DrugEnrichr'}.")
    comb_df = df.copy()
    comb_df['pert'] = comb_df.index.map(lambda x: x.split('_')[4])
    if direction == 'up':
        top_perts = comb_df.sort_values(by='FC', ascending=False) \
            .drop_duplicates(subset=['pert'],keep='first')['pert'][:20]
    else:
        top_perts = comb_df.sort_values(by='FC', ascending=True) \
            .drop_duplicates(subset=['pert'],keep='first')['pert'][:20]
    pert_type = 'CRISPR target genes' if pert == 'CRISPR' else 'chemical compounds'
    results_url = enrichr(pert, top_perts, direction)
    return f'<a href={results_url} target="_blank">Enrichr analysis of top 20 {pert_type} that {direction}-regulate {gene}</a>'

CRISPR KO signatures¶

Volcano Plots¶

In the following volcano plot, each point represents a single CRISPR knockout signature. The x-position indicates the log2(fold change) of the expression of the chosen gene in the signature, while the y-position indicates the absolute value of the characteristic direction coefficient of the chosen gene.

Note that the fold change and characteristic direction coefficients of the gene are not necessarily in the same direction for each signature; this is because in cases where a gene is both up- and down-regulated between replicate samples, the characteristic direction method prioritizes the more consistent direction of movement, which may not be consistent with the fold change. To read more about the characteristic direction method, please refer to Clark et al., 2014.

Points with same-direction fold change and CD coefficient values are highlighted by coloring them blue (up-regulated) or red (down-regulated). Darker colored points indicate higher differential expression of the gene in the corresponding signature.

Drag the plot to pan around. Use the toolbar to the right of the plot to zoom, reset the plot view, or download the plot.

In [12]:
# CRISPR
make_plot(xpr_gene_data, gene, 'CRISPR KO')

Tables¶

The tables below display the characteristic direction (CD) coefficients, fold change values, and log2(fold change) values correponding to the expression of the chosen gene in each CRIPSR KO signature.

The rank of the gene in the signature is determined by its fold change relative to the fold change of the other genes that are regulated in the same direction; if a gene is ranked 1 in a signature where the gene is up-regulated, that means that out of all genes up-regulated in the signature, the input gene had the highest fold change and was the most up-regulated.

While only the top 10 signatures for each direction are displayed, below each table is a link to download the top 50 signatures for each direction.

A link to the Enrichr analysis results of the top 20 unique perturbations from the top signatures that up or down-regulate the input gene can be found below each table as well.

In [13]:
display(HTML(f'<div style="font-size:1rem;padding=1rem;"><b>Top {cell} CRISPR KO signatures where {gene} is up-regulated (based on fold change)</b></div>'))
up_xpr = make_tables(xpr_gene_data, pert='xpr', is_upreg=True)
display(HTML(up_xpr[:10].to_html(escape=False, col_space=70)))
display(HTML(download_link(up_xpr[:100], f"{gene}_UpReg_L1000_CRISPR_signatures.tsv")))
display(HTML(enrichr_link('CRISPR', xpr_gene_data, 'up', gene)))
Top CRISPR KO signatures where DKK2 is up-regulated (based on fold change)
CD Coefficient Fold Change Log2(Fold Change) Rank in Signature KO Gene Cell Line Timepoint
Signature
XPR027_PC3.311B_96H_H11_ZEB2 0.0275 18.6944 4.224537 225.0 ZEB2 PC3.311B 96h
XPR025_U251MG.311_96H_D23_TACSTD2 0.0272 3.8173 1.932553 156.0 TACSTD2 U251MG.311 96h
XPR041_KELLY.311_96H_E24_AKR1B1 0.0362 2.5970 1.376872 340.0 AKR1B1 KELLY.311 96h
XPR040_KELLY.311_96H_C23_DNMT3A 0.0290 2.1628 1.112877 515.0 DNMT3A KELLY.311 96h
XPR032_HT29.311_96H_N23_MAP2 0.0327 1.8446 0.883330 110.0 MAP2 HT29.311 96h
XPR041_KELLY.311_96H_J23_HIF1A 0.0346 1.8435 0.882446 293.0 HIF1A KELLY.311 96h
XPR012_MCF7.311_96H_O06_ATG5 0.0278 1.7853 0.836127 168.0 ATG5 MCF7.311 96h
XPR043_KELLY.311_96H_B03_CCNA2 0.0271 1.7236 0.785387 807.0 CCNA2 KELLY.311 96h
XPR036_PC3.311B_96H_N24_SH3BP4 0.0279 1.6776 0.746417 513.0 SH3BP4 PC3.311B 96h
XPR041_KELLY.311_96H_P19_NDUFS1 0.0320 1.5572 0.638954 734.0 NDUFS1 KELLY.311 96h
Download full results: DKK2_UpReg_L1000_CRISPR_signatures.tsv
Enrichr analysis of top 20 CRISPR target genes that up-regulate DKK2
In [14]:
display(HTML(f'<div style="font-size:1rem;padding=1rem;"><b>Top {cell} CRISPR KO signatures where {gene} is down-regulated (based on fold change)</b></div>'))
down_xpr = make_tables(xpr_gene_data, pert='xpr', is_upreg=False)
display(HTML(down_xpr[:10].to_html(escape=False, col_space=70)))
display(HTML(download_link(down_xpr[:100], f"{gene}_DnReg_L1000_CRISPR_signatures.tsv")))
display(HTML(enrichr_link('CRISPR', xpr_gene_data, 'down', gene)))
Top CRISPR KO signatures where DKK2 is down-regulated (based on fold change)
CD Coefficient Fold Change Log2(Fold Change) Rank in Signature KO Gene Cell Line Timepoint
Signature
XPR031_U251MG.311_96H_E03_ORAI3 -0.0337 0.2437 -2.037008 195.0 ORAI3 U251MG.311 96h
XPR041_KELLY.311_96H_I18_FDPS -0.0360 0.3222 -1.634040 170.0 FDPS KELLY.311 96h
XPR031_U251MG.311_96H_H02_POU2AF1 -0.0289 0.3375 -1.567209 231.0 POU2AF1 U251MG.311 96h
XPR043_KELLY.311_96H_I06_HMGCS1 -0.0279 0.4272 -1.226987 435.0 HMGCS1 KELLY.311 96h
XPR035_MCF7.311_96H_J04_P4HA2 -0.0356 0.4516 -1.146922 250.0 P4HA2 MCF7.311 96h
XPR033_MCF7.311_96H_F18_PSMD1 -0.0273 0.4878 -1.035751 382.0 PSMD1 MCF7.311 96h
XPR025_U251MG.311_96H_E12_STX11 -0.0273 0.5111 -0.968387 229.0 STX11 U251MG.311 96h
XPR038_KELLY.311_96H_P10_EBNA1BP2 -0.0288 0.5255 -0.928106 401.0 EBNA1BP2 KELLY.311 96h
XPR042_KELLY.311_96H_D02_COPA -0.0285 0.5758 -0.796269 533.0 COPA KELLY.311 96h
XPR009_PC3.311B_96H_F20_CDK2 -0.0264 0.5806 -0.784444 331.0 CDK2 PC3.311B 96h
Download full results: DKK2_DnReg_L1000_CRISPR_signatures.tsv
---------------------------------------------------------------------------
Exception                                 Traceback (most recent call last)
Input In [14], in <cell line: 5>()
      3 display(HTML(down_xpr[:10].to_html(escape=False, col_space=70)))
      4 display(HTML(download_link(down_xpr[:100], f"{gene}_DnReg_L1000_CRISPR_signatures.tsv")))
----> 5 display(HTML(enrichr_link('CRISPR', xpr_gene_data, 'down', gene)))

Input In [11], in enrichr_link(pert, df, direction, gene)
     30     top_perts = comb_df.sort_values(by='FC', ascending=True) \
     31         .drop_duplicates(subset=['pert'],keep='first')['pert'][:20]
     32 pert_type = 'CRISPR target genes' if pert == 'CRISPR' else 'chemical compounds'
---> 33 results_url = enrichr(pert, top_perts, direction)
     34 return f'<a href={results_url} target="_blank">Enrichr analysis of top 20 {pert_type} that {direction}-regulate {gene}</a>'

Input In [11], in enrichr(pert, top_perts, direction)
     14 response = requests.post(list_url, files=payload)
     15 if not response.ok:
---> 16     raise Exception('Error analyzing gene list')
     17 time.sleep(0.5)
     18 return f"{enrich_url}?dataset={response.json()['shortId']}"

Exception: Error analyzing gene list

Chemical perturbation signatures¶

Volcano Plots¶

In the following volcano plot, each point represents a single chemical perturbation signature. The x-position indicates the log2(fold change) of the expression of the chosen gene in the signature, while the y-position indicates the absolute value of the characteristic direction coefficient of the chosen gene.

Note that the fold change and characteristic direction coefficients of the gene are not necessarily in the same direction for each signature; this is because in cases where a gene is both up- and down-regulated between replicate samples, the characteristic direction method prioritizes the more consistent direction of movement, which may not be consistent with the fold change. To read more about the characteristic direction method, please refer to Clark et al., 2014.

Points with same-direction fold change and CD coefficient values are highlighted by coloring them blue (up-regulated) or red (down-regulated). Darker colored points indicate higher differential expression of the gene in the corresponding signature.

Drag the plot to pan around. Use the toolbar to the right of the plot to zoom, reset the plot view, or download the plot.

In [ ]:
# CP
make_plot(cp_gene_data, gene, 'Chemical')

Tables¶

The tables below display the characteristic direction (CD) coefficients, fold change values, and log2(fold change) values correponding to the expression of the chosen gene in each chemical perturbation signature.

The rank of the gene in the signature is determined by its fold change relative to the fold change of the other genes that are regulated in the same direction; if a gene is ranked 1 in a signature where the gene is up-regulated, that means that out of all genes up-regulated in the signature, the input gene had the highest fold change and was the most up-regulated.

While only the top 10 signatures for each direction are displayed, below each table is a link to download the top 50 signatures for each direction.

A link to the Enrichr analysis results of the top 20 unique perturbations from the top signatures that up or down-regulate the input gene can be found below each table as well.

In [ ]:
display(HTML(f'<div style="font-size:1rem;padding=1rem;"><b>Top {cell} chemical perturbation signatures where {gene} is up-regulated (based on fold change)</b></div>'))
up_cp = make_tables(cp_gene_data, pert='cp', is_upreg=True)
display(HTML(up_cp[:10].to_html(escape=False, col_space=70)))
display(HTML(download_link(up_cp[:100], f"{gene}_UpReg_L1000_chemical_signatures.tsv")))
display(HTML(enrichr_link('chemical', cp_gene_data, 'up', gene)))
In [ ]:
display(HTML(f'<div style="font-size:1rem;padding=1rem;"><b>Top {cell} chemical perturbation signatures where {gene} is down-regulated (based on fold change)</b></div>'))
down_cp = make_tables(cp_gene_data, pert='cp', is_upreg=False)
display(HTML(down_cp[:10].to_html(escape=False, col_space=70)))
display(HTML(download_link(down_cp[:100], f"{gene}_DnReg_L1000_chemical_signatures.tsv")))
display(HTML(enrichr_link('chemical', cp_gene_data, 'down', gene)))
In [ ]: