# imports
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors
from IPython.display import HTML, Markdown
import requests
import time
# bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
# display graphics
output_notebook()
gene = 'ETS1'
cell = ''
root_path = 'https://appyters.maayanlab.cloud/storage/L1000_RNAseq_Gene_Search'
# load gene info
gene_info = pd.read_csv(f"{root_path}/L1000_to_RNAseq_gene_list.tsv", sep="\t", index_col=0)
landmark = "not"
inferred_l1000 = "not"
if gene_info.loc[gene, "landmark"] == True:
landmark = ""
if gene_info.loc[gene, "inferred l1000"] == True:
inferred_l1000 = ""
display(Markdown(f"**Input gene: {gene}**"))
display(Markdown(f"{gene} is {landmark} a landmark gene."))
display(Markdown(f"{gene} is {inferred_l1000} an originally inferred L1000 gene."))
display(Markdown(f"{gene} is a newly inferred (our model) gene."))
display(HTML(f"""More information about {gene} can be found at the <a href="https://cfde-gene-pages.cloud/gene/{gene}?CF=false&PS=true&Ag=true" target="_blank"> Gene and Drug Landing Page Aggregator</a>"""))
display(Markdown(f"**Input cell line: {'All' if cell == '' else cell}**"))
This Appyter provides visualizations of the top 5% of RNA-seq-like signatures induced by CRISPR knockouts and chemical perturbagens. Signatures are computed from transformed data profiles from the LINCS L1000 data. The transformation was performed using a two-step model:
Signatures were computed using the characteristic direction method (Clark et al., 2014), as implemented here.
Each gene was pre-queried across all available RNA-seq-like signatures, and the top signatures where a gene is up or down-regulated are returned for each gene.
xpr_gene_data = pd.read_feather(f"{root_path}/gene_files/{gene}.f").set_index('index')
if cell != '':
xpr_gene_data = xpr_gene_data[xpr_gene_data.index.map(lambda x: x.split('_')[1].split('.')[0] == cell)]
cp_gene_data = pd.read_feather(f"{root_path}/cp_gene_files/{gene}.f").set_index('index')
if cell != '':
cp_gene_data = cp_gene_data[cp_gene_data.index.map(lambda x: x.split('_')[1].split('.')[0] == cell)]
def map_color(cd, logfc, red_norm, blue_norm):
v = cd*logfc
if v < 0:
return '#D3D3D3'
elif cd < 0:
return colors.to_hex(cm.get_cmap('Reds')(red_norm(cd*logfc)))
else:
return colors.to_hex(cm.get_cmap('Blues')(blue_norm(cd*logfc)))
def make_plot(comb_df, gene, pert_type):
# check if there are any results
if comb_df.shape[0] == 0:
display(Markdown("### **There are no signatures in the pre-processed dataset for the chosen gene, cell line, and perturbation type inputs.**"))
return
# set color and size for each point on plot
v = (comb_df['logFC']*comb_df['CD']).to_numpy()
red_norm = colors.Normalize(vmin=min(v)-0.005, vmax=max(v)+0.005)
blue_norm = colors.Normalize(vmin=min(v)-0.005, vmax=max(v)+0.005)
plot_colors = [map_color(row.CD, row.logFC, red_norm, blue_norm) for row in comb_df.itertuples()]
# generate data source
data_source = ColumnDataSource(
data=dict(
x = comb_df['logFC'],
y = comb_df['CD'].apply(abs),
cd = comb_df['CD'],
sig = pd.Series(comb_df.index),
fc = comb_df['FC'],
logfc = comb_df['logFC'],
colors = plot_colors,
sizes = [8] * comb_df.shape[0],
)
)
# create hover tooltip
tools = [
("Signature", "@sig"),
("CD Coeff", "@cd"),
("Fold Change", "@fc"),
("Log2 Fold Change", "@logfc")
]
# generate plot and relevant plot labels
plot = figure(
plot_width=700,
plot_height=500,
tooltips=tools
)
plot.circle(
'x', 'y',
size='sizes',
alpha=0.7,
line_alpha=1,
line_width=1,
line_color='colors',
source=data_source,
fill_color='colors',
name=f"{gene}_expression_in_L1000_to_RNAseq_{pert_type.replace(' ','')}_volcano_plot"
)
plot.yaxis.axis_label = 'Abs(CD-Coefficient)'
plot.xaxis.axis_label = 'log2(Fold Change)'
plot.title.text = f"Differential Expression of {gene} in RNA-seq-like {pert_type} Signatures"
plot.title.align = 'center'
plot.title.text_font_size = '14px'
show(plot)
def make_tables(comb_df, pert, is_upreg):
dir_df = comb_df[comb_df['FC'] > 1] if is_upreg else comb_df[comb_df['FC'] < 1]
if dir_df.shape[0] == 0:
display(Markdown(f"### There are no {'up-regulated' if is_upreg else 'down-regulated'} signatures for the chosen gene and cell line inputs."))
return dir_df
dir_df = dir_df.sort_values(by='FC', ascending=not is_upreg)
dir_df['FC'] = dir_df['FC'].apply(lambda x: f'{x:.4f}')
dir_df['CD'] = dir_df['CD'].apply(lambda x: f'{x:.4f}')
if pert == 'xpr':
dir_df['KO Gene'] = dir_df.index.map(lambda x: x.split('_')[4])
else:
dir_df['Perturbagen'] = dir_df.index.map(lambda x: x.split('_')[4])
dir_df['Dose'] = dir_df.index.map(lambda x: x.split('_')[5] if len(x.split('_')) == 6 else '')
dir_df['Cell Line'] = dir_df.index.map(lambda x: x.split('_')[1])
dir_df['Timepoint'] = dir_df.index.map(lambda x: x.split('_')[2].lower())
dir_df = dir_df.rename(columns={
'FC': 'Fold Change',
'logFC': 'Log2(Fold Change)',
'CD': 'CD Coefficient',
'Rank': 'Rank in Signature'})
dir_df.index.names = ['Signature']
return dir_df
# create download link for table results
def download_link(df, fname):
if df.shape[0] == 0: return ''
csv = df.to_csv(fname, sep='\t', index=True)
link = f'<div>Download full results: <a href="{fname}" target=_blank>{fname}</a></div>'
return link
def enrichr(pert, top_perts, direction):
if pert == 'CRISPR':
desc = f"Top CRISPR targets from RNA-seq-like signatures that {direction}-regulate {gene}"
list_url = 'https://maayanlab.cloud/Enrichr/addList'
enrich_url = 'https://maayanlab.cloud/Enrichr/enrich'
else:
desc = f"Top compounds from RNA-seq-like signatures that {direction}-regulate {gene}"
list_url = 'https://maayanlab.cloud/DrugEnrichr/addList'
enrich_url = 'https://maayanlab.cloud/DrugEnrichr/enrich'
payload = {
'list': (None, '\n'.join(top_perts)),
'description': (None, desc)
}
response = requests.post(list_url, files=payload)
if not response.ok:
raise Exception('Error analyzing gene list')
time.sleep(0.5)
return f"{enrich_url}?dataset={response.json()['shortId']}"
def enrichr_link(pert, df, direction, gene):
# check if there are any results
if df.shape[0] < 5:
return(f"There are not enough {direction}-regulated signatures to submit to {'Enrichr' if pert == 'CRISPR' else 'DrugEnrichr'}.")
comb_df = df.copy()
comb_df['pert'] = comb_df.index.map(lambda x: x.split('_')[4])
if direction == 'up':
top_perts = comb_df.sort_values(by='FC', ascending=False) \
.drop_duplicates(subset=['pert'],keep='first')['pert'][:20]
else:
top_perts = comb_df.sort_values(by='FC', ascending=True) \
.drop_duplicates(subset=['pert'],keep='first')['pert'][:20]
pert_type = 'CRISPR target genes' if pert == 'CRISPR' else 'chemical compounds'
results_url = enrichr(pert, top_perts, direction)
return f'<a href={results_url} target="_blank">Enrichr analysis of top 20 {pert_type} that {direction}-regulate {gene}</a>'
In the following volcano plot, each point represents a single CRISPR knockout signature. The x-position indicates the log2(fold change) of the expression of the chosen gene in the signature, while the y-position indicates the absolute value of the characteristic direction coefficient of the chosen gene.
Note that the fold change and characteristic direction coefficients of the gene are not necessarily in the same direction for each signature; this is because in cases where a gene is both up- and down-regulated between replicate samples, the characteristic direction method prioritizes the more consistent direction of movement, which may not be consistent with the fold change. To read more about the characteristic direction method, please refer to Clark et al., 2014.
Points with same-direction fold change and CD coefficient values are highlighted by coloring them blue (up-regulated) or red (down-regulated). Darker colored points indicate higher differential expression of the gene in the corresponding signature.
Drag the plot to pan around. Use the toolbar to the right of the plot to zoom, reset the plot view, or download the plot.
# CRISPR
make_plot(xpr_gene_data, gene, 'CRISPR KO')
The tables below display the characteristic direction (CD) coefficients, fold change values, and log2(fold change) values correponding to the expression of the chosen gene in each CRIPSR KO signature.
The rank of the gene in the signature is determined by its fold change relative to the fold change of the other genes that are regulated in the same direction; if a gene is ranked 1 in a signature where the gene is up-regulated, that means that out of all genes up-regulated in the signature, the input gene had the highest fold change and was the most up-regulated.
While only the top 10 signatures for each direction are displayed, below each table is a link to download the top 50 signatures for each direction.
A link to the Enrichr analysis results of the top 20 unique perturbations from the top signatures that up or down-regulate the input gene can be found below each table as well.
display(HTML(f'<div style="font-size:1rem;padding=1rem;"><b>Top {cell} CRISPR KO signatures where {gene} is up-regulated (based on fold change)</b></div>'))
up_xpr = make_tables(xpr_gene_data, pert='xpr', is_upreg=True)
display(HTML(up_xpr[:10].to_html(escape=False, col_space=70)))
display(HTML(download_link(up_xpr[:100], f"{gene}_UpReg_L1000_CRISPR_signatures.tsv")))
display(HTML(enrichr_link('CRISPR', xpr_gene_data, 'up', gene)))
display(HTML(f'<div style="font-size:1rem;padding=1rem;"><b>Top {cell} CRISPR KO signatures where {gene} is down-regulated (based on fold change)</b></div>'))
down_xpr = make_tables(xpr_gene_data, pert='xpr', is_upreg=False)
display(HTML(down_xpr[:10].to_html(escape=False, col_space=70)))
display(HTML(download_link(down_xpr[:100], f"{gene}_DnReg_L1000_CRISPR_signatures.tsv")))
display(HTML(enrichr_link('CRISPR', xpr_gene_data, 'down', gene)))
In the following volcano plot, each point represents a single chemical perturbation signature. The x-position indicates the log2(fold change) of the expression of the chosen gene in the signature, while the y-position indicates the absolute value of the characteristic direction coefficient of the chosen gene.
Note that the fold change and characteristic direction coefficients of the gene are not necessarily in the same direction for each signature; this is because in cases where a gene is both up- and down-regulated between replicate samples, the characteristic direction method prioritizes the more consistent direction of movement, which may not be consistent with the fold change. To read more about the characteristic direction method, please refer to Clark et al., 2014.
Points with same-direction fold change and CD coefficient values are highlighted by coloring them blue (up-regulated) or red (down-regulated). Darker colored points indicate higher differential expression of the gene in the corresponding signature.
Drag the plot to pan around. Use the toolbar to the right of the plot to zoom, reset the plot view, or download the plot.
# CP
make_plot(cp_gene_data, gene, 'Chemical')
The tables below display the characteristic direction (CD) coefficients, fold change values, and log2(fold change) values correponding to the expression of the chosen gene in each chemical perturbation signature.
The rank of the gene in the signature is determined by its fold change relative to the fold change of the other genes that are regulated in the same direction; if a gene is ranked 1 in a signature where the gene is up-regulated, that means that out of all genes up-regulated in the signature, the input gene had the highest fold change and was the most up-regulated.
While only the top 10 signatures for each direction are displayed, below each table is a link to download the top 50 signatures for each direction.
A link to the Enrichr analysis results of the top 20 unique perturbations from the top signatures that up or down-regulate the input gene can be found below each table as well.
display(HTML(f'<div style="font-size:1rem;padding=1rem;"><b>Top {cell} chemical perturbation signatures where {gene} is up-regulated (based on fold change)</b></div>'))
up_cp = make_tables(cp_gene_data, pert='cp', is_upreg=True)
display(HTML(up_cp[:10].to_html(escape=False, col_space=70)))
display(HTML(download_link(up_cp[:100], f"{gene}_UpReg_L1000_chemical_signatures.tsv")))
display(HTML(enrichr_link('chemical', cp_gene_data, 'up', gene)))
display(HTML(f'<div style="font-size:1rem;padding=1rem;"><b>Top {cell} chemical perturbation signatures where {gene} is down-regulated (based on fold change)</b></div>'))
down_cp = make_tables(cp_gene_data, pert='cp', is_upreg=False)
display(HTML(down_cp[:10].to_html(escape=False, col_space=70)))
display(HTML(download_link(down_cp[:100], f"{gene}_DnReg_L1000_chemical_signatures.tsv")))
display(HTML(enrichr_link('chemical', cp_gene_data, 'down', gene)))