DrugShot searches PubMed for articles that co-mention any search term and terms that describe drugs.
It then prioritizes the top literature associated drugs and predicts additional drugs based on shared properties in drug-drug similarity matrices.
from collections import Counter
import pandas as pd
import numpy as np
# Display / graphing
from IPython.display import display, HTML
import plotly.express as px
import plotly.graph_objects as go
import sklearn.metrics
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
# API access
import requests
import io
import time
import json
# Notebook display util functions
def make_clickable(link):
return f'<a target="_blank" href="{link}">{link}</a>'
table_number = 0
figure_number = 0
def figure_header(label,title):
global table_number
global figure_number
if label == 'Table':
table_number += 1
label = f'Table {table_number}'
elif label == 'Figure':
figure_number += 1
label = f'Figure {figure_number}'
display(HTML(f"<div style='font-size:1.25rem; padding:1rem 0;'><b>{label}</b>: {title}</div>"))
def figure_legend(label,title,content=''):
global table_number
global figure_number
if label == 'Table':
label = f'Table {table_number}'
elif label == 'Figure':
label = f'Figure {figure_number}'
display(HTML(f'<style>div.caption {{text-align: center;}}</style><div class=caption><b>{label}</b>: <i>{title}</i>. {content} </div>'))
def drugenrichr_link(short_id):
display(HTML(f'<span><a href="https://maayanlab.cloud/DrugEnrichr/enrich?dataset={short_id}">Access the complete enrichment analysis on the DrugEnrichr website. </a></span>'))
def DrugEnrichr_API(drug_list, description):
# takes a drug list and provides a shortID to view enrichment analysis results on DrugEnrichr website
global short_id
DRUG_ENRICHR_URL = 'http://amp.pharm.mssm.edu/DrugEnrichr/addList'
drugs_str = '\n'.join(drug_list)
payload = {
'list': (None, drugs_str),
'description': (None, description)
}
response = requests.post(DRUG_ENRICHR_URL, files=payload)
if not response.ok:
raise Exception('Error analyzing drug list')
data = json.loads(response.text)
short_id = data['shortId']
user_list_id = data['userListId']
#sends generated userListId to DrugEnrichr enrichment API endpoint
results = {}
DRUG_ENRICHR_URL = 'http://amp.pharm.mssm.edu/DrugEnrichr/enrich'
query_string = '?userListId=%s&backgroundType=%s'
for drug_set_library in ['L1000FWD_GO_Biological_Processes_Up', 'L1000FWD_GO_Biological_Processes_Down',
'SIDER_Side_Effects']:
response = requests.get(
DRUG_ENRICHR_URL + query_string % (user_list_id, drug_set_library)
)
if not response.ok:
raise Exception('Error fetching enrichment results')
data = response.json()[drug_set_library]
try:
short_results_df = pd.DataFrame(data).sort_values(by = [2])
results[drug_set_library] = {
'terms': short_results_df[1].head(10).tolist(),
'p-values': short_results_df[2].head(10).tolist(),
'adjusted p-values': short_results_df[6].head(10).tolist()
}
except KeyError:
pass
return results
# Create bar plot of top 10 enriched terms in L1000FWD & SIDER libraries
def bar_plot(results):
# Bar colors
bar_color = 'pink'
bar_color_not_sig = 'lightgrey'
edgecolor=None
linewidth=0
for library, values in results.items():
plt.figure(figsize=(24, 12))
bar_colors = [bar_color if (x < 0.05) else bar_color_not_sig for x in values['p-values']]
fig = sns.barplot(x=np.log10(values['p-values'])*-1,
y= values['terms'],
palette=bar_colors,
edgecolor=edgecolor,
linewidth=linewidth)
fig.axes.get_yaxis().set_visible(False)
fig.set_title(library.replace('_', ' '), fontsize=26)
fig.set_xlabel('−log₁₀(p‐value)', fontsize=25)
fig.xaxis.set_major_locator(MaxNLocator(integer=True))
fig.tick_params(axis='x', which='major', labelsize=20)
if max(np.log10(results[library]['p-values'])*-1)<1:
fig.xaxis.set_ticks(np.arange(0, max(np.log10(results[library]['p-values'])*-1), 0.1))
for index,term in enumerate(results[library]['terms']):
if results[library]['adjusted p-values'][index] < 0.05:
term = ' *'.join([term,
str(str(np.format_float_scientific(results[library]['p-values'][index],
precision=2)))])
else:
term = ' '.join([term,
str(str(np.format_float_scientific(results[library]['p-values'][index],
precision=2)))])
title_start= max(fig.axes.get_xlim())/200
fig.text(title_start, index, term, ha='left', wrap = True, fontsize = 26)
fig.spines['right'].set_visible(False)
fig.spines['top'].set_visible(False)
plt.show()
matrix_files = {'L1000 Signature Similarity': 'https://appyters.maayanlab.cloud/storage/DrugShot/L1000_similarity_matrix.npz',
'AutoRIF Literature Co-mentions': 'https://appyters.maayanlab.cloud/storage/DrugShot/AutoRIF_cooccurence_matrix.tsv.gz',
'DrugRIF Literature Co-mentions' : 'https://appyters.maayanlab.cloud/storage/DrugShot/DrugRIF_cooccurence_matrix.tsv.gz'
}
RIF = pd.read_csv("https://appyters.maayanlab.cloud/storage/DrugShot/AutoRIF.tsv.gz",
sep = '\t',
usecols = ['name','PMID']).set_index('name')
i = 0
results = []
pubmed_url = ("https://eutils.ncbi.nlm.nih.gov/entrez"+
"/eutils/esearch.fcgi"+
"?db=pubmed&term={}"+
"&retmax=100000"+
"&retstart={}"+
"&retmode=json")
res = requests.get(pubmed_url.format('positive regulation of lymphocyte proliferation', i)).json()
if int(res['esearchresult']['count']) > 0:
while i <= int(res['esearchresult']['count']):
results.extend(requests.get(pubmed_url.format('positive regulation of lymphocyte proliferation',i)).json()['esearchresult']['idlist'])
i += len(res['esearchresult']['idlist'])
time.sleep(0.2)
results = list(map(int, results))
# Retrieve associated drugs based on drug:search term co-mentions in the literature
df_associated = pd.DataFrame(RIF[RIF['PMID'].isin(results) == True].index.value_counts())\
.rename(columns = {'name':'Publications with Search Term'})
# Get counts of each drug in the association file
drug_counts = Counter(RIF[RIF.index.isin(df_associated.index)].index)
# Calculate fraction of publications associated with drug+term / drug only
normalized_fraction = [(df_associated.loc[index][0]/drug_counts[index]) for index in df_associated.index]
df_associated['Publications with Search Term / Total Publications'] = normalized_fraction
# Calculate (Publications with Search Term * Normalized Fraction) used as metric to rank unweighted drug set",
df_associated['Publications with Search Term * Normalized Fraction'] = df_associated['Publications with Search Term'] * df_associated['Publications with Search Term / Total Publications']
del(RIF)
associated_table = df_associated.sort_values(by = ['Publications with Search Term'], ascending = False)
associated_table.to_csv('positive regulation of lymphocyte proliferation'.replace(' ','_')+'_associated_drug_table.csv')
figure_header('Table', 'Top Associated Compounds<br>({})</br>'.format(make_clickable('positive regulation of lymphocyte proliferation'.replace(' ','_')+'_associated_drug_table.csv')))
display(associated_table[associated_table.columns[0:2]].head(20))
figure_legend('Table', 'Top 20 Drugs associated with '+'positive regulation of lymphocyte proliferation')
del(df_associated)
| Publications with Search Term | Publications with Search Term / Total Publications | |
|---|---|---|
| ionomycin | 26 | 0.005950 |
| cyclosporin-a | 26 | 0.000872 |
| dinoprostone | 26 | 0.000908 |
| sirolimus | 25 | 0.001174 |
| dexamethasone | 21 | 0.000395 |
| retinol | 15 | 0.000330 |
| tretinoin | 14 | 0.000616 |
| cyclophosphamide | 11 | 0.000200 |
| resveratrol | 10 | 0.001055 |
| imatinib | 9 | 0.000841 |
| tacrolimus | 9 | 0.000540 |
| cholecalciferol | 9 | 0.000327 |
| ethanolamine | 7 | 0.000033 |
| calcitriol | 7 | 0.000484 |
| broxuridine | 7 | 0.000539 |
| azacitidine | 7 | 0.000956 |
| succinimide | 6 | 0.001067 |
| dronabinol | 6 | 0.000801 |
| mycophenolic acid | 6 | 0.000716 |
| daunorubicin | 6 | 0.000088 |
fig = px.scatter(associated_table.reset_index().rename(columns = {'index':'chemical'}),
x = 'Publications with Search Term',
y = 'Publications with Search Term / Total Publications',
hover_name = 'chemical',
title='positive regulation of lymphocyte proliferation')
fig.show()
Create list of top associated compounds ranked by Publications with Search Term * Normalized Fraction to treat as an unweighted drug set for further predictions
associated_compounds = associated_table.sort_values(by = 'Publications with Search Term * Normalized Fraction',
kind = 'stable',
ascending = False)[0:50].index.str.upper().tolist()
Predicted compounds are computed based on average similarity between the unweighted drug set and other drugs & small molecules within the L1000 Signature Similarity matrix
# Load correlation matrix into pandas DataFrame
response = requests.get(matrix_files["L1000 Signature Similarity"])
coexpression_matrix = np.load(io.BytesIO(response.content), allow_pickle = True)
mat = pd.DataFrame(data = coexpression_matrix['correlations'], columns = coexpression_matrix['index'],
index = coexpression_matrix['index'])
del(coexpression_matrix)
# Calculate average similarity for each drug in the prediction matrix with the associated drug set
mat = mat.loc[mat.index.str.upper().isin(associated_compounds)]
print(f'{len(mat.index)} small molecules from the associated drug set matched in the similarity matrix')
mat.loc['Similarity_Score'] = mat[mat.columns].mean()
mat.sort_values(by = ['Similarity_Score'], axis = 1, ascending = False, inplace = True)
28 small molecules from the associated drug set matched in the similarity matrix
# Calculating FPR and TPR for literature co-mentions predictions
y_score = mat.loc['Similarity_Score'].sort_values(ascending = False)
true_indices = sorted([y_score.\
index.str.upper().\
get_loc(x) for x in associated_compounds \
if x in mat.index.str.upper()])
y_true = np.zeros(len(mat.columns))
np.put(y_true, true_indices, 1)
fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true, y_score)
auc_score = sklearn.metrics.roc_auc_score(y_true, y_score)
# Plot dataframe
data = pd.DataFrame(data=[y_score.index,y_true,fpr,tpr],
index = ['name','label','fpr','tpr']).T
# ROC Curve
fig = go.Figure()
name = f"(AUC={auc_score:.4f})"
fig.add_trace(go.Scatter(x=data['fpr'],
y=data['tpr'],
text = data['name'],
showlegend=True,
name=name,
mode='lines'))
fig.add_shape(
type='line',
line=dict(dash='dash'),
x0=0,
x1=1,
y0=0,
y1=1)
fig.layout.update(
title = f'ROC Curve for Unweighted Drug Set Rankings in L1000 Signature Similarity Matrix',
xaxis_title='False Positive Rate',
yaxis_title='True Positive Rate',
yaxis=dict(scaleanchor="x", scaleratio=0.65),
xaxis=dict(constrain='domain'),
width=800, height=500
)
fig.show()
predicted_table = pd.DataFrame(mat.loc['Similarity_Score'])
predicted_table = predicted_table[~predicted_table.index.str.upper().isin(associated_compounds)]
filename = 'positive regulation of lymphocyte proliferation'.replace(' ','_')+"_L1000 Signature Similarity".replace(' ','_')+'_predicted_drug_table.csv'
predicted_table.to_csv(filename)
predicted_table.to_csv(filename)
figure_header('Table', 'Top Predicted Compounds From L1000 Signature Similarity<br>({})</br>'.format(make_clickable(filename)))
display(predicted_table.head(20))
figure_legend('Table', 'Top 20 drugs predicted to be associated with {0} based on {1}'.format('positive regulation of lymphocyte proliferation','L1000 Signature Similarity'))
| Similarity_Score | |
|---|---|
| BIX-01294 | 0.191447 |
| purvalanol-a | 0.186698 |
| TL-HRAS-24 | 0.175511 |
| SA-25547 | 0.175222 |
| avicin-d | 0.174788 |
| KIN001-055 | 0.174067 |
| MG-132 | 0.173868 |
| neratinib | 0.172500 |
| afatinib | 0.171939 |
| crizotinib | 0.171688 |
| BRD-K40329609 | 0.169068 |
| BRD-K01896723 | 0.168310 |
| VU-0418939-2 | 0.168204 |
| SA-1478088 | 0.168009 |
| BRD-K14749055 | 0.167835 |
| BVT-948 | 0.167308 |
| WYE-125132 | 0.166824 |
| BRD-K65603160 | 0.166754 |
| ispinesib | 0.166449 |
| IKK-16 | 0.166317 |
The top 50 predicted small molecules are submitted for drug set enrichment analysis using DrugEnrichr
The output bar charts show the top 10 enriched terms in each library, along with their corresponding p-values. Colored bars correspond to terms with significant p-values (<0.05). An asterisk (*) next to a p-value indicates the term also has a significant adjusted p-value (<0.05).
drug_list = predicted_table.index[0:50].tolist()
results = DrugEnrichr_API(drug_list,'%s (DrugShot L1000 Signature Similarity)'% 'positive regulation of lymphocyte proliferation')
bar_plot(results)
Link to the complete enrichment analysis results is output below
drugenrichr_link(short_id)