In [1]:
# Standard Data Science Boilerplate
import numpy as np
import pandas as pd
import scipy

# Plotly graph objects plus the offline / in-notebook rendering helpers.
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook, so rendering needs network access - confirm.
init_notebook_mode(connected=True)

# Cufflinks is what the plotting cells rely on for DataFrame.iplot().
import cufflinks as cf
cf.go_offline(connected=True)
# NOTE(review): world_readable looks like a sharing setting for hosted plots;
# it is probably irrelevant in offline mode - confirm before relying on it.
cf.set_config_file(colorscale='plotly', world_readable=True)

# Extra options
#pd.options.display.max_rows = 30
#pd.options.display.max_columns = 25

# Show all code cells outputs
# (every expression statement in a cell is displayed, not only the last one,
# so avoid stray bare expressions in later cells).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
In [2]:
import os
from IPython.display import Image, display, HTML
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
In [3]:
#Boilerplate RDKit
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
import rdkit.rdBase
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit import DataStructs
from rdkit.DataStructs import BitVectToText
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit import DataStructs
import pandas as pd 
from rdkit.Chem import PandasTools as PandasTools
from rdkit.Chem import Descriptors as Descriptors
RDKit WARNING: [07:23:42] Enabling RDKit 2019.09.1 jupyter extensions
In [ ]:
 
In [4]:
# Path of the SD file, relative to the notebook's working directory.
sdfFilePath = 'FDA2019smallmols.sdf'

# SD-file properties that must be numeric for the filter and plot cells:
# those cells pick their column choices via select_dtypes('number').
NUMERIC_COLUMNS = [
    'logP', 'logD', 'mass', 'HBA', 'HBD', 'atomCount',
    'TPSA', 'RBC', 'MR', 'HAC', 'FractionAromatic',
]

# Load the records (molecule objects go into the column named 'SMILES') and
# cast all numeric properties in one DataFrame.astype call instead of one
# copy-pasted Series.astype line per column. A missing column or a value that
# cannot be parsed as a float still raises, exactly as the per-column casts did.
moldf = (
    PandasTools.LoadSDF(sdfFilePath, molColName='SMILES')
    .astype({column: float for column in NUMERIC_COLUMNS})
)
In [5]:
moldf.head(2)
Out[5]:
Name Drug Name Active Ingredient Approval Date FDA-approved use on approval date* CSID Done abzn logP logD ... Brenda PDBligand Kegg Ligand Human Metab Lipidmaps rhea swisslipids NIH Clinical Mcule PharmGKB
0 atoms 43 bonds 48 Ubrelvy ubrogepant 12/23/2019 Acute treatment of migraine with or without au... 28536135 Done Neutral 3.07 3.07 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 atoms 31 bonds 34 Dayvigo lemborexant 12/20/2019 Insomnia 34500836 Done Neutral 3.34 3.34 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

2 rows × 61 columns

In [6]:
moldf.dtypes
Out[6]:
Name                                  object
Drug Name                             object
Active Ingredient                     object
Approval Date                         object
FDA-approved use on approval date*    object
                                       ...  
rhea                                  object
swisslipids                           object
NIH Clinical                          object
Mcule                                 object
PharmGKB                              object
Length: 61, dtype: object
In [7]:
# TODO: derive the slider range from the selected column; the fixed -4..10
# range below is not appropriate for every property.
@interact
def show_molecules_more_than(column=list(moldf.select_dtypes('number').columns), x=(-4, 10, 1),):
    """Show the molecules whose value in ``column`` is strictly greater than ``x``.

    Driven by ``@interact``: the list default offers the numeric columns of
    ``moldf`` as choices and the ``(min, max, step)`` tuple supplies the
    threshold control.

    Parameters
    ----------
    column : str
        Name of a numeric column of ``moldf`` to filter on.
    x : int
        Threshold; only rows with ``moldf[column] > x`` are shown.
    """
    # Close the heading with </h2>; the previous second opening <h2> tag
    # produced malformed HTML.
    display(HTML(f'<h2>Showing molecules with {column} > {x}</h2>'))
    display(moldf.loc[moldf[column] > x, ['Active Ingredient', 'logP','logD', 'atomCount', 'mass', 'HBA', 'SMILES', 'CSID']])
Screenshot 2020-01-15 at 07.36.40.png
In [8]:
@interact
def scatter(x=list(moldf.select_dtypes('number').columns), 
                 y=list(moldf.select_dtypes('number').columns)[1:]):
    """Plot one numeric column of ``moldf`` against another as a 2D scatter chart.

    Driven by ``@interact``: both parameters offer the numeric columns of
    ``moldf``; ``y`` skips the first one so the two defaults differ.

    Parameters
    ----------
    x : str
        Column plotted on the horizontal axis.
    y : str
        Column plotted on the vertical axis.
    """
    # Use the column names verbatim for the labels: str.title() lower-cases
    # everything after the first letter, turning 'logP' into 'Logp' and
    # 'TPSA' into 'Tpsa'.
    moldf.iplot(kind='scatter', x=x, y=y, mode='markers',
                xTitle=x, yTitle=y, title=f'{y} vs {x}')
Screenshot 2020-01-15 at 07.40.45.png
In [ ]:
 
In [10]:
@interact
def scatter_plot(x=list(moldf.select_dtypes('number').columns), 
                 y=list(moldf.select_dtypes('number').columns)[1:]):
    """Plot two numeric columns of ``moldf`` in a 3D scatter chart.

    The z axis is fixed to the 'abzn' column and the points are grouped by
    'Active Ingredient'. Driven by ``@interact``: both parameters offer the
    numeric columns of ``moldf``; ``y`` skips the first one so the two
    defaults differ.

    Parameters
    ----------
    x : str
        Column plotted on the x axis.
    y : str
        Column plotted on the y axis.
    """
    # Use the column names verbatim for the labels: str.title() lower-cases
    # everything after the first letter, turning 'logP' into 'Logp' and
    # 'TPSA' into 'Tpsa'.
    moldf.iplot(kind='scatter3d', x=x, y=y, z='abzn', mode='markers', categories='Active Ingredient',
                xTitle=x, yTitle=y, title=f'{y} vs {x}')
Screenshot 2020-01-15 at 07.43.46.png
In [ ]:
 
In [ ]:

In [12]:
#possible options for structures
In [13]:
# Fetch the entry in the 'SMILES' column for the row with index label 3.
# LoadSDF was called with molColName='SMILES', so despite the column name this
# is presumably the molecule object rather than a SMILES string - TODO confirm.
mol=moldf.loc[3,'SMILES']
# Bare expression so the notebook shows the value as the cell output.
mol
Out[13]:
In [17]:
 
Out[17]:
'71602803'
In [19]:
import requests
import IPython.display as Disp

# PubChem compound ID recorded for the row with index label 3.
Mycid = moldf.loc[3, 'Pubchem']
url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{Mycid}/PNG'

# Bound the wait so a stalled connection cannot hang the kernel, and stop on
# an HTTP error (for example a missing or unknown CID) instead of passing the
# error body to Image as if it were PNG data.
response = requests.get(url, timeout=30)
response.raise_for_status()
Disp.Image(response.content)
Out[19]:
In [ ]:
 
In [ ]:
 

The Jupyter notebook and data file can be downloaded from here