Given the gene TPP1, we request information about it from several different DCCs in hopes of creating a comprehensive knowledge report for it.
import io
import json
import uuid
import time
import logging
import requests
import numpy as np
import pandas as pd
import urllib.parse
import plotly.express as px
import scipy.stats as st
import traceback
from contextlib import contextmanager
from textwrap import dedent
from functools import partial, lru_cache
from IPython.display import HTML, Markdown, display
logger = logging.getLogger('')
formatter = logging.Formatter('%(message)s')
console = logging.StreamHandler()
console.setFormatter(formatter)
logger.addHandler(console)
from itables_javascript import show
import itables.options as opt
opt.maxBytes = 0
opt.showIndex = False
class RenderJSON(object):
''' https://gist.github.com/t27/48b3ac73a1479914f9fe9383e5d45325 '''
def __init__(self, json_data):
if isinstance(json_data, dict) or isinstance(json_data, list):
self.json_str = json.dumps(json_data)
else:
self.json_str = json_data
self.uuid = str(uuid.uuid4())
def _ipython_display_(self):
display(HTML('<div id="{}" style="height: auto; width:100%;"></div>'.format(self.uuid)))
display(HTML("""<script>
require(["https://rawgit.com/caldwell/renderjson/master/renderjson.js"], function() {
renderjson.set_show_to_level(1)
document.getElementById('%s').appendChild(renderjson(%s))
});</script>
""" % (self.uuid, self.json_str)))
def combine_pvalues(pvalues, method='fisher', select='pvalue'):
''' A helper for accessing this method via pd.agg which expects a scaler result
'''
statistic, pvalue = st.combine_pvalues(pvalues, method=method)
return dict(statistic=statistic, pvalue=pvalue)[select]
def combine_zscores(zscores):
''' A helper method for combining zscores via Stouffer method
'''
return zscores.sum() / np.sqrt(zscores.shape[0])
def one(it):
try:
return next(iter(it))
except:
return None
@contextmanager
def ignore_exceptions(error_message=None):
''' Show but ignore exceptions that occur within the context
Usage:
with ignore_exceptions():
my_fallible_operation()
'''
try:
yield
except Exception as e:
if error_message:
logger.error(error_message)
else:
traceback.print_exc()
gene_input = 'TPP1'
To interoperate with different APIs which support different gene identifier schemes. We'll first use mygene.info to resolve gene identifiers.
@lru_cache()
def mygene_info_query(geneId):
res = requests.get('https://mygene.info/v3/query', params=dict(q=geneId))
results = res.json()
return results
gene_results = mygene_info_query(gene_input)
display(RenderJSON(gene_results))
if len(gene_results['hits']) == 0:
raise Exception(f"Gene with identifier {gene_input} could not be found in MyGeneInfo!")
# find exact match
geneId = one(hit['_id'] for hit in gene_results['hits'] if hit['symbol'] == gene_input)
if not geneId:
# Select the most likely gene (mygeneinfo sorts by likelyhood)
_, geneId = max(
(hit['_score'], hit['_id'])
for hit in gene_results['hits']
if hit.keys() & {'_score', 'symbol'}
)
display(Markdown(f"**GeneID: {geneId}**"))
With the Entrez Gene ID, we can resolve lots of different identifiers and identifiability information from mygene.info.
@lru_cache()
def mygene_info(geneId):
req = requests.get(
f"https://mygene.info/v3/gene/{geneId}"
)
return req.json()
gene_info = mygene_info(geneId)
display(RenderJSON(gene_info))
display(Markdown(f"### Gene Symbol: {gene_info['symbol']}"))
@lru_cache()
def gtex_singleTissueEqtl(geneSymbol, datasetId='gtex_v8'):
res = requests.get(
'https://gtexportal.org/rest/v1/association/singleTissueEqtl',
params=dict(
format='json',
geneSymbol=geneSymbol,
datasetId=datasetId,
)
)
results = res.json()['singleTissueEqtl']
if len(results) == 0: raise Exception(f"No information for gene with identifier {geneSymbol} found in GTEx")
return pd.DataFrame(results)
with ignore_exceptions(f"Gene with identifier {gene_info['symbol']} currently not available in GTEx"):
gtex_results = gtex_singleTissueEqtl(gene_info['symbol'])
columns = list(gtex_results.columns)
columns.insert(0, columns.pop(columns.index('nes')))
columns.insert(0, columns.pop(columns.index('pValue')))
columns.insert(0, columns.pop(columns.index('tissueSiteDetailId')))
gtex_results = gtex_results[columns]
show(gtex_results, order=[[gtex_results.columns.get_loc('pValue'), 'asc']])
with ignore_exceptions('Could not process GTEx output'):
gtex_combined_stouffer_statistic = gtex_results.groupby('tissueSiteDetailId')['pValue'] \
.agg(partial(combine_pvalues, method='stouffer', select='statistic')) \
.to_frame('combined_stouffer_statistic') \
.reset_index() \
.sort_values('combined_stouffer_statistic', ascending=False)
gtex_combined_stouffer_statistic['group'] = gtex_combined_stouffer_statistic['tissueSiteDetailId'].apply(lambda name: name.split('_', maxsplit=1)[0])
fig = px.bar(
gtex_combined_stouffer_statistic,
y='combined_stouffer_statistic',
x='tissueSiteDetailId',
color='group',
orientation='v',
title=f"Tissues with significant expression of {gene_info['symbol']} in GTEx",
height=1000,
)
fig.show()
An appyter was built for performing Gene Centric signature reverse searches against the LINCS data. Its functionality is repeated here.
import matplotlib.cm as cm
import matplotlib.colors as colors
from bokeh.io import output_notebook
from bokeh.plotting import figure, show as bokeh_show
from bokeh.models import ColumnDataSource
# display graphics
output_notebook()
root_path = 'https://appyters.maayanlab.cloud/storage/L1000_RNAseq_Gene_Search'
def map_color(cd, logfc, red_norm, blue_norm):
v = cd*logfc
if v < 0:
return '#D3D3D3'
elif cd < 0:
return colors.to_hex(cm.get_cmap('Reds')(red_norm(cd*logfc)))
else:
return colors.to_hex(cm.get_cmap('Blues')(blue_norm(cd*logfc)))
def make_plot(comb_df, gene, pert_type):
# check if there are any results
if comb_df.shape[0] == 0:
display(Markdown("### **There are no signatures in the pre-processed dataset for the chosen gene, cell line, and perturbation type inputs.**"))
return
# set color and size for each point on plot
v = (comb_df['logFC']*comb_df['CD']).to_numpy()
red_norm = colors.Normalize(vmin=min(v)-0.005, vmax=max(v)+0.005)
blue_norm = colors.Normalize(vmin=min(v)-0.005, vmax=max(v)+0.005)
plot_colors = [map_color(row.CD, row.logFC, red_norm, blue_norm) for row in comb_df.itertuples()]
# generate data source
data_source = ColumnDataSource(
data=dict(
x = comb_df['logFC'],
y = comb_df['CD'].apply(abs),
cd = comb_df['CD'],
sig = pd.Series(comb_df.index),
fc = comb_df['FC'],
logfc = comb_df['logFC'],
colors = plot_colors,
sizes = [8] * comb_df.shape[0],
)
)
# create hover tooltip
tools = [
("Signature", "@sig"),
("CD Coeff", "@cd"),
("Fold Change", "@fc"),
("Log2 Fold Change", "@logfc")
]
# generate plot and relevant plot labels
plot = figure(
plot_width=700,
plot_height=500,
tooltips=tools
)
plot.circle(
'x', 'y',
size='sizes',
alpha=0.7,
line_alpha=1,
line_width=1,
line_color='colors',
source=data_source,
fill_color='colors',
name=f"{gene}_expression_in_L1000_to_RNAseq_{pert_type.replace(' ','')}_volcano_plot"
)
plot.yaxis.axis_label = 'Abs(CD-Coefficient)'
plot.xaxis.axis_label = 'log2(Fold Change)'
plot.title.text = f"Differential Expression of {gene} in RNA-seq-like {pert_type} Signatures"
plot.title.align = 'center'
plot.title.text_font_size = '14px'
bokeh_show(plot)
def make_tables(comb_df, pert, is_upreg):
dir_df = comb_df[comb_df['FC'] > 1] if is_upreg else comb_df[comb_df['FC'] < 1]
if dir_df.shape[0] == 0:
display(Markdown(f"### There are no {'up-regulated' if is_upreg else 'down-regulated'} signatures for the chosen gene and cell line inputs."))
return dir_df
dir_df = dir_df.sort_values(by='FC', ascending=not is_upreg)
dir_df['FC'] = dir_df['FC'].apply(lambda x: f'{x:.4f}')
dir_df['CD'] = dir_df['CD'].apply(lambda x: f'{x:.4f}')
if pert == 'xpr':
dir_df['KO Gene'] = dir_df.index.map(lambda x: x.split('_')[4])
else:
dir_df['Perturbagen'] = dir_df.index.map(lambda x: x.split('_')[4])
dir_df['Dose'] = dir_df.index.map(lambda x: x.split('_')[5] if len(x.split('_')) == 6 else '')
dir_df['Cell Line'] = dir_df.index.map(lambda x: x.split('_')[1])
dir_df['Timepoint'] = dir_df.index.map(lambda x: x.split('_')[2].lower())
dir_df = dir_df.rename(columns={
'FC': 'Fold Change',
'logFC': 'Log2(Fold Change)',
'CD': 'CD Coefficient',
'Rank': 'Rank in Signature'})
dir_df.index.names = ['Signature']
return dir_df
def download_link(df, fname):
if df.shape[0] == 0: return ''
csv = df.to_csv(fname, sep='\t', index=True)
link = f'<div>Download full results: <a href="{fname}" target=_blank>{fname}</a></div>'
return link
def enrichr(pert, top_perts, direction, gene):
if pert == 'CRISPR':
desc = f"Top CRISPR targets from RNA-seq-like signatures that {direction}-regulate {gene}"
list_url = 'https://maayanlab.cloud/Enrichr/addList'
enrich_url = 'https://maayanlab.cloud/Enrichr/enrich'
else:
desc = f"Top compounds from RNA-seq-like signatures that {direction}-regulate {gene}"
list_url = 'https://maayanlab.cloud/DrugEnrichr/addList'
enrich_url = 'https://maayanlab.cloud/DrugEnrichr/enrich'
payload = {
'list': (None, '\n'.join(top_perts)),
'description': (None, desc)
}
response = requests.post(list_url, files=payload)
if not response.ok:
raise Exception('Error analyzing gene list')
time.sleep(0.5)
return f"{enrich_url}?dataset={response.json()['shortId']}"
def enrichr_link(pert, df, direction, gene):
# check if there are any results
if df.shape[0] < 5:
return(f"There are not enough {direction}-regulated signatures to submit to {'Enrichr' if pert == 'CRISPR' else 'DrugEnrichr'}.")
comb_df = df.copy()
comb_df['pert'] = comb_df.index.map(lambda x: x.split('_')[4])
if direction == 'up':
top_perts = comb_df.sort_values(by='FC', ascending=False) \
.drop_duplicates(subset=['pert'],keep='first')['pert'][:20]
else:
top_perts = comb_df.sort_values(by='FC', ascending=True) \
.drop_duplicates(subset=['pert'],keep='first')['pert'][:20]
pert_type = 'CRISPR target genes' if pert == 'CRISPR' else 'chemical compounds'
results_url = enrichr(pert, top_perts, direction, gene)
return f'<a href={results_url} target="_blank">Enrichr analysis of top 20 {pert_type} that {direction}-regulate {gene}</a>'
def l1000_rnaseq_directional_gene_search(gene=None, gene_data=None, pert=None, pert_full_name=None, pert_name=None, is_upreg=None):
direction = 'up' if is_upreg else 'down'
display(HTML(f'<div style="font-size:1rem;padding=1rem;"><b>Top {pert_full_name} signatures where {gene} is {direction}-regulated (based on fold change)</b></div>'))
tbl = make_tables(gene_data, pert='cp', is_upreg=is_upreg)
display(HTML(tbl[:10].to_html(escape=False, col_space=70)))
display(HTML(download_link(tbl[:100], f"{gene}_{direction.capitalize()}Reg_L1000_{pert_name}_signatures.tsv")))
display(HTML(enrichr_link(pert_name, gene_data, direction, gene)))
def l1000_rnaseq_gene_search(gene=None, gene_data=None, pert=None, pert_full_name=None, pert_name=None):
make_plot(gene_data, gene_info['symbol'], pert_name)
l1000_rnaseq_directional_gene_search(
gene=gene, gene_data=gene_data, pert=pert, pert_full_name=pert_full_name, pert_name=pert_name, is_upreg=True,
)
l1000_rnaseq_directional_gene_search(
gene=gene, gene_data=gene_data, pert=pert, pert_full_name=pert_full_name, pert_name=pert_name, is_upreg=False,
)
with ignore_exceptions(f"No information for gene with identifier {gene_info['symbol']} found in L1000"):
l1000_rnaseq_gene_search(
gene=gene_info['symbol'],
gene_data=pd.read_feather(f"{root_path}/gene_files/{gene_info['symbol']}.f").set_index('index'),
pert='xpr',
pert_name='CRISPR',
pert_full_name='CRISPR KO',
)
l1000_rnaseq_gene_search(
gene=gene_info['symbol'],
gene_data=pd.read_feather(f"{root_path}/cp_gene_files/{gene_info['symbol']}.f").set_index('index'),
pert='cp',
pert_name='chemical',
pert_full_name='Chemical Perturbation',
)
https://www.mousephenotype.org/
IMPC contains serves mouse phenotype information associated with gene markers. Its API is described here and allows us to identify phenotypes significantly associated with a gene.
@lru_cache()
def impc_phenotype(marker_gene, rows=20):
res = requests.get(
f"https://www.ebi.ac.uk/mi/impc/solr/genotype-phenotype/select",
params=dict(
q=f"marker_symbol:{marker_gene}",
rows=rows,
),
)
response = res.json()['response']
if response['numFound'] == 0: raise Exception('No phenotypes found')
return pd.DataFrame(response['docs']).sort_values('p_value', ascending=True)
with ignore_exceptions(f"No information for gene with identifier {gene_info['symbol']} found in IPMC"):
impc_results = impc_phenotype(gene_info['symbol'].capitalize())
show(impc_results[[
'marker_accession_id',
'mp_term_id',
'mp_term_name',
'assertion_type',
'p_value',
'phenotyping_center',
'percentage_change',
'statistical_method',
]])
with ignore_exceptions('IPMC Results could not be processed'):
impc_combined_stouffer_statistic = impc_results.groupby('mp_term_name')['p_value'] \
.agg([
('combined_stouffer_statistic', partial(combine_pvalues, method='stouffer', select='pvalue')),
]) \
.reset_index() \
.sort_values('combined_stouffer_statistic', ascending=False)
impc_combined_stouffer_statistic['-logp(combined_stouffer_statistic)'] = -np.log10(impc_combined_stouffer_statistic['combined_stouffer_statistic'])
fig = px.bar(
impc_combined_stouffer_statistic,
x='-logp(combined_stouffer_statistic)',
y='mp_term_name',
text='mp_term_name',
orientation='h',
title=f"Phenotype known to be associated with {gene_info['symbol']} from IMPC",
)
fig.update_yaxes(showticklabels=False)
fig.update_traces(texttemplate='%{text}', textposition='auto', insidetextanchor='start')
fig.show()
@lru_cache()
def glygen_geneNameSearch(recommended_gene_name, organism_taxon_id=9606):
res = requests.get(
'https://api.glygen.org/directsearch/gene/',
params=dict(
query=json.dumps(dict(
recommended_gene_name=recommended_gene_name,
organism=dict(
id=organism_taxon_id
),
)),
),
verify=False, # not sure why on my system I get SSL errors
)
return res.json()
with ignore_exceptions(f"No information for gene with identifier {gene_info['symbol']} found in GlyGen"):
glygen_geneInfo = glygen_geneNameSearch(gene_info['symbol'])
display(RenderJSON(glygen_geneInfo))
d = pd.DataFrame(glygen_geneInfo['results'][0]['glycosylation'])
d['evidence'] = d['evidence'].apply(
lambda evidence: ' '.join(f"<a href='{e['url']}'>{e['url']}></a>" for e in evidence if 'url' in e)
)
show(d, order=[[d.columns.get_loc('residue'), 'asc']])
display(d)
https://ldh.clinicalgenome.org/ldh/ui/
The exRNA Linked Data Hub (LDH) facilitates efficient access to collated information such as links and select data from different data sources, which are made available using RESTful APIs. Currently, LDH focuses on linking information about human genes and variants to support exRNA curation efforts.
We provide the gene symbol to exRNA and obtain the reported linked data. The query will produce a document with all associated regulatory element in the +/- 10kb range or overlapping the gene.
@lru_cache()
def ldh_gene_lookup(id):
req = requests.get(
f"https://genboree.org/cfde-gene/Gene/id/{urllib.parse.quote(id)}",
headers={
'Accept': 'application/json',
},
)
assert req.status_code == 200, f"Gene with identifier {id} currently not available in exRNA LDH"
results = req.json()
return results
with ignore_exceptions(f"Gene with identifier {gene_info['symbol']} not available in exRNA LDH"):
ldh_results = ldh_gene_lookup(gene_info['symbol'])
display(Markdown(f"[View On LDH](https://genboree.org/cfde-gene/ui/id/{urllib.parse.quote(gene_info['symbol'])})"))
display(RenderJSON(ldh_results))
with ignore_exceptions(f"No information for gene with identifier {gene_info['symbol']} found in HuBMAP ASCT+B"):
req = requests.get('https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName=HuBMAP_ASCT_plus_B_augmented_w_RNAseq_Coexpression')
HuBMAP_ASCT_plus_B_gmt = pd.DataFrame({
term: {
gene: True
for gene in geneset
if gene
}
for line in req.text.splitlines()
for (term, _, *geneset) in (line.strip().split("\t"),)
})
show(HuBMAP_ASCT_plus_B_gmt.loc[gene_info['symbol']].dropna().reset_index().rename({ 'index': 'Term' }, axis=1))
display(Markdown(f"""
<div style="display: flex; align-items: flex-start">
<img class="img-fluid" style="width: 250px" src="/CFDE-Gene-Partnership/static/logos/Metabolomics.png" />
<div>
<p><a href="https://metabolomicsworkbench.org/">https://metabolomicsworkbench.org/</a></p>
<p>The National Institutes of Health (NIH) Common Fund Metabolomics Program was developed with the goal of increasing national capacity in metabolomics by supporting the development of next generation technologies, providing training and mentoring opportunities, increasing the inventory and availability of high quality reference standards, and promoting data sharing and collaboration.</p>
<p><a href="https://sc-cfdewebdev.sdsc.edu/MetGENE/index.php?GeneID={geneId}&species=hsa&GeneIDType=ENTREZID">MetGENE</a> identifies the pathways and reactions catalyzed by the given gene {gene_info['symbol']}, its related metabolites and the studies in Metabolomics Workbench with data on such metabolites.</a></p>
</div>
</div>
"""))
with ignore_exceptions(f"No information for gene with identifier {gene_info['symbol']} found in MetGene"):
metgene_table = requests.get(f"https://sc-cfdewebdev.sdsc.edu/MetGENE/mgSummary.php?species=hsa&GeneSym={gene_info['symbol']}&GeneID={geneId}&viewType=json").json()
display(HTML(f"""
<div style="display: flex; align-items: flex-start">
<table>
<thead>
<tr>
<th> </th>
<th><a href="https://sc-cfdewebdev.sdsc.edu/MetGENE/pathways.php?species=hsa&GeneIDType=ENTREZID&anatomy=NA&disease=NA&phenotype=NA&GeneInfoStr={geneId}">Pathways</a></th>
<th><a href="https://sc-cfdewebdev.sdsc.edu/MetGENE/reactions.php?species=hsa&GeneIDType=ENTREZID&anatomy=NA&disease=NA&phenotype=NA&GeneInfoStr={geneId}">Reactions</a></th>
<th><a href="https://sc-cfdewebdev.sdsc.edu/MetGENE/metabolites.php?species=hsa&GeneIDType=ENTREZID&anatomy=NA&disease=NA&phenotype=NA&GeneInfoStr={geneId}">Metabolites</a></th>
<th><a href="https://sc-cfdewebdev.sdsc.edu/MetGENE/studies.php?species=hsa&GeneIDType=ENTREZID&anatomy=NA&disease=NA&phenotype=NA&GeneInfoStr={geneId}">Studies</a></th>
</tr>
</thead>
<tbody>{''.join(f"<tr><td>{r['Genes']}</td><td>{r['Pathways']}</td><td>{r['Reactions']}</td><td>{r['Metabolites']}</td><td>{r['Studies']}</td></tr>" for r in metgene_table)}</tbody>
</table>
<div style="display: flex; flex-wrap: wrap;">
<img
style="width: 100%;"
src="https://sc-cfdewebdev.sdsc.edu/MetGENE/mgSummary.php?species=hsa&GeneSym={gene_info['symbol']}&GeneID={geneId}&viewType=png"
alt="MetGene Summary"
/>
</div>
</div>
"""))
Each DCC has assembled a large repository of knowledge besides the data directly collected by the data generation centers they coordinate. We can access this expanded knowledge as well.
We query IDG's knowledge base of targets and their Disease associations through the Pharos API.
@lru_cache()
def idg_pharos_geneDiseaseAssociations(associatedTarget):
req = requests.post(
'https://pharos-api.ncats.io/graphql',
headers={
'Content-Type': 'application/json',
'Accept': 'application/json',
},
json={
'query': dedent(f'''
query associatedDiseases {{
diseases(filter: {{associatedTarget: {json.dumps(associatedTarget)} }}) {{
diseases {{
associations {{
name
evidence
pvalue
zscore
}}
}}
}}
}}
''')
},
)
results = req.json()['data']
return results
with ignore_exceptions(f"No information for gene with identifier {gene_info['symbol']} found in Pharos"):
idg_pharos_api_results = idg_pharos_geneDiseaseAssociations(gene_info['symbol'])
idg_pharos_results = pd.DataFrame(
association
for disease in idg_pharos_api_results['diseases']['diseases']
for association in disease['associations']
)
def compute_score(r):
if pd.notnull(r['pvalue']):
return r['pvalue']
elif pd.notnull(r['zscore']):
return 1-st.distributions.norm.cdf(r['zscore'])
else:
# we give na's 0.5 pvalue to show up in the plot albiet, insignificantly
return 0.5
# convert zscores to pvalues if present instead of pvalue
idg_pharos_results['pvalue'] = idg_pharos_results.apply(compute_score, axis=1)
show(idg_pharos_results, order=[[idg_pharos_results.columns.get_loc('pvalue'), 'asc']])
with ignore_exceptions('Pharos results could not be processed'):
idg_pharos_combined_stouffer_statistic = idg_pharos_results.groupby('name')['pvalue'] \
.agg([
('combined_stouffer_statistic', partial(combine_pvalues, method='stouffer', select='pvalue')),
]) \
.reset_index() \
.sort_values('combined_stouffer_statistic', ascending=False)
idg_pharos_combined_stouffer_statistic['-logp(combined_stouffer_statistic)'] = -np.log10(idg_pharos_combined_stouffer_statistic['combined_stouffer_statistic'])
fig = px.bar(
idg_pharos_combined_stouffer_statistic,
x='-logp(combined_stouffer_statistic)',
y='name',
text='name',
orientation='h',
title=f"Disease known to be associated with {gene_info['symbol']} from IDG's Pharos",
)
fig.update_yaxes(showticklabels=False)
fig.update_traces(texttemplate='%{text}', textposition='auto', insidetextanchor='start')
fig.show()
We query the Harmonizome API for associations with various biological entities in a standardized set of numerous omics datasets, as detailed here.
@lru_cache()
def idg_harmonizome_geneInfo(gene, showAssociations=True, version='1.0'):
res = requests.get(
f"https://maayanlab.cloud/Harmonizome/api/{urllib.parse.quote(version)}/gene/{urllib.parse.quote(gene_info['symbol'])}",
params=dict(
showAssociations=json.dumps(showAssociations),
),
)
return res.json()
with ignore_exceptions(f"No information for gene with identifier {gene_info['symbol']} found in Harmonizome"):
idg_harmonizome_api_geneInfo = idg_harmonizome_geneInfo(gene_info['symbol'])
display(RenderJSON(idg_harmonizome_api_geneInfo))
idg_harmonizome_geneAssociations = pd.DataFrame([
dict(
**geneAssociation['geneSet'],
**geneAssociation,
)
for geneAssociation in idg_harmonizome_api_geneInfo['associations']
]).drop(['geneSet', 'thresholdValue', 'href'], axis=1).dropna()
show(
idg_harmonizome_geneAssociations,
order=[[idg_harmonizome_geneAssociations.columns.get_loc('standardizedValue'), 'desc']]
)
with ignore_exceptions('Harmonizome results could not be procesed'):
idg_harmonizome_geneAssociations['direction'] = idg_harmonizome_geneAssociations['standardizedValue'].apply(
lambda v: 'up' if v > 0 else 'down'
)
idg_harmonizome_geneAssociations['absoluteZscore'] = np.abs(idg_harmonizome_geneAssociations['standardizedValue'])
idg_harmonizome_geneAssociations = idg_harmonizome_geneAssociations.sort_values('absoluteZscore')
idg_harmonizome_geneAssociations_top_10_up = idg_harmonizome_geneAssociations[idg_harmonizome_geneAssociations['direction'] == 'up'].iloc[-10:]
idg_harmonizome_geneAssociations_top_10_down = idg_harmonizome_geneAssociations[idg_harmonizome_geneAssociations['direction'] == 'down'].iloc[-10:]
fig = px.bar(
pd.concat([
idg_harmonizome_geneAssociations_top_10_up,
idg_harmonizome_geneAssociations_top_10_down,
], axis=0),
x='absoluteZscore',
y='name',
orientation='h',
barmode='group',
color='direction',
facet_row='direction',
text='name',
title=f"Significant associations with {gene_info['symbol']} in IDG's Harmonizome",
height=1000,
width=1000,
)
fig.update_yaxes(matches=None, showticklabels=False)
fig.update_traces(texttemplate='%{text}', textposition='inside', insidetextanchor='start')
fig.show()
https://maayanlab.cloud/archs4/
ARCHS4 has processed numerous GEO studies and also has Tissue expression data.
@lru_cache()
def archs4_tissue_expression(search, species='human'):
res = requests.get(
f"https://maayanlab.cloud/archs4/search/loadExpressionTissue.php",
params=dict(
search=search,
species='human',
type='tissue',
),
)
df = pd.read_csv(io.StringIO(res.text)).dropna()
df.index = pd.MultiIndex.from_tuples(
df['id'].apply(lambda id: id.split('.')),
names=['', 'system', 'organ', 'tissue'],
)
return df.reset_index().drop(['id', ''], axis=1)
with ignore_exceptions(f"No information for gene with identifier {gene_info['symbol']} found in ARCHS4"):
archs4_tissue_results = archs4_tissue_expression(gene_info['symbol'])
show(archs4_tissue_results)
UniProt is a comprehensive database on protein function information. Their Proteins REST API, documented here, can be used for gene-centric queries.
https://www.ebi.ac.uk/proteins/api/genecentric?offset=0&size=100&gene=STAT3
@lru_cache()
def uniprot_genecentric(gene, offset=0, size=100):
req = requests.get(
f"https://www.ebi.ac.uk/proteins/api/genecentric",
params=dict(
gene=gene,
offset=offset,
size=size,
)
)
return req.json()
with ignore_exceptions(f"No information for gene with identifier {gene_info['symbol']} found in UniProt"):
uniprot_geneinfo = uniprot_genecentric(gene_info['symbol'])
show(pd.DataFrame([
dict(related_record, subject=record['gene']['accession'])
for record in uniprot_geneinfo
for related_record in [record['gene'], *record.get('relatedGene',[])]
]))