In [38]:
# Standard Data Science Boilerplate
import numpy as np
import pandas as pd
import scipy

import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Switch plotly to offline notebook rendering so figures are drawn inside the notebook.
init_notebook_mode(connected=True)

import cufflinks as cf
# Put cufflinks in offline mode as well; later cells call DataFrame.iplot, which cufflinks provides.
cf.go_offline(connected=True)
# Set cufflinks' default colour scale for every plot in this notebook.
cf.set_config_file(colorscale='plotly', world_readable=True)

# Extra options
#pd.options.display.max_rows = 30
#pd.options.display.max_columns = 25

# Show the value of every top-level expression in a cell, not only the last one.
# This applies to the whole session, so bare expressions in later cells each produce an Out[...] entry.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
In [39]:
import os
from IPython.display import Image, display, HTML
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
In [40]:
#Boilerplate RDKit
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
import rdkit.rdBase
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit import DataStructs
from rdkit.DataStructs import BitVectToText
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit import DataStructs
import pandas as pd 
from rdkit.Chem import PandasTools as PandasTools
from rdkit.Chem import Descriptors as Descriptors
In [ ]:
 
In [41]:
# Read the SD file into a DataFrame; the RDKit molecule objects are stored in
# the column named 'SMILES' (see molColName).
sdfFilePath = 'FDA2019smallmols.sdf'
moldf = PandasTools.LoadSDF(sdfFilePath, molColName='SMILES')
#moldf["HBA"].fillna(0, inplace=True)

# Target dtype for each property column that later cells filter or plot on.
# NOTE(review): presumably needed because LoadSDF returns the SD properties as
# text -- confirm against the moldf.dtypes output.
COLUMN_DTYPES = {
    "logP": float,
    "logD": float,
    "mass": float,
    "HBA": float,
    "HBD": float,
    "atomCount": float,
    "TPSA": float,
    "RBC": float,
    "MR": float,
    "HAC": float,
    "FractionAromatic": float,
    "Active Ingredient": str,
    "abzn": "category",
}

# Cast column by column, in the order listed above, writing each result back
# into the same DataFrame.
for column_name, target_dtype in COLUMN_DTYPES.items():
    moldf[column_name] = moldf[column_name].astype(target_dtype)
In [42]:
# Preview the first two rows of the loaded table.
moldf.head(2)
Out[42]:
Name Drug Name Active Ingredient Approval Date FDA-approved use on approval date* CSID Done abzn logP logD ... Brenda PDBligand Kegg Ligand Human Metab Lipidmaps rhea swisslipids NIH Clinical Mcule PharmGKB
0 atoms 43 bonds 48 Ubrelvy ubrogepant 12/23/2019 Acute treatment of migraine with or without au... 28536135 Done Neutral 3.07 3.07 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 atoms 31 bonds 34 Dayvigo lemborexant 12/20/2019 Insomnia 34500836 Done Neutral 3.34 3.34 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

2 rows × 61 columns

In [ ]:
 
In [43]:
# List the dtype of every column in the table.
moldf.dtypes
Out[43]:
Name                                  object
Drug Name                             object
Active Ingredient                     object
Approval Date                         object
FDA-approved use on approval date*    object
                                       ...  
rhea                                  object
swisslipids                           object
NIH Clinical                          object
Mcule                                 object
PharmGKB                              object
Length: 61, dtype: object
In [44]:
# TODO: work out how to compute a slider range appropriate for the selected
# property instead of the fixed -4..10 range used below.
@interact
def show_molecules_more_than(column=list(moldf.select_dtypes('number').columns), x=(-4, 10, 1),):
    """Display the molecules whose value in ``column`` is strictly greater than ``x``.

    ``interact`` turns the defaults into widgets: the list of numeric column
    names becomes a dropdown, and the ``(min, max, step)`` tuple becomes an
    integer slider running from -4 to 10 in steps of 1.

    Parameters
    ----------
    column : str
        Name of a numeric column of ``moldf`` to filter on.
    x : int
        Threshold; only rows with ``moldf[column] > x`` are shown.
    """
    # The heading must be closed with </h2>; the original emitted a second
    # opening <h2> tag, which left the heading element unterminated.
    display(HTML(f'<h2>Showing molecules with {column} > {x}</h2>'))
    shown_columns = ['Active Ingredient', 'logP', 'logD', 'atomCount', 'mass', 'HBA', 'SMILES', 'CSID']
    display(moldf.loc[moldf[column] > x, shown_columns])
Screenshot 2020-01-15 at 07.36.40.png
In [45]:
@interact
def scatter(x=list(moldf.select_dtypes('number').columns), 
                 y=list(moldf.select_dtypes('number').columns)[1:]):
    """Draw an interactive 2D scatter plot of one numeric column against another.

    ``interact`` renders both defaults as dropdowns of numeric column names.
    The ``y`` list skips the first numeric column, so the two dropdowns start
    on different columns.

    Parameters
    ----------
    x : str
        Column of ``moldf`` plotted on the horizontal axis.
    y : str
        Column of ``moldf`` plotted on the vertical axis.
    """
    # Label with the column names exactly as they appear in the table.
    # str.title() lower-cases every letter after the first in each word, which
    # turned 'logP' into 'Logp' and 'TPSA' into 'Tpsa' in the labels and title.
    moldf.iplot(kind='scatter', x=x, y=y, mode='markers',
                xTitle=x, yTitle=y, title=f'{y} vs {x}')
Screenshot 2020-01-15 at 07.40.45.png
In [ ]:
 
In [46]:
@interact
def scatter_plot(x=list(moldf.select_dtypes('number').columns), 
                 y=list(moldf.select_dtypes('number').columns)[1:]):
    """Draw an interactive 3D scatter plot with the 'abzn' column on the z axis.

    ``interact`` renders both defaults as dropdowns of numeric column names.
    The ``y`` list skips the first numeric column, so the two dropdowns start
    on different columns. The 'Active Ingredient' column is passed to cufflinks
    as ``categories``.

    Parameters
    ----------
    x : str
        Column of ``moldf`` plotted on the x axis.
    y : str
        Column of ``moldf`` plotted on the y axis.
    """
    # Label with the column names exactly as they appear in the table.
    # str.title() lower-cases every letter after the first in each word, which
    # turned 'logP' into 'Logp' and 'TPSA' into 'Tpsa' in the labels and title.
    moldf.iplot(kind='scatter3d', x=x, y=y, z='abzn', mode='markers', categories='Active Ingredient',
                xTitle=x, yTitle=y, title=f'{y} vs {x}')
3DplotinJupyter.png
In [ ]:
 
In [ ]:

In [47]:
# Possible options for displaying structures
In [48]:
# Take the molecule object stored in the 'SMILES' column of the row labelled 3.
mol=moldf.loc[3,'SMILES']
# A bare expression, so the notebook displays the molecule as the cell output.
mol
Out[48]:
In [ ]:
 
In [49]:
import requests
import IPython.display as Disp  

# Fetch the PNG depiction of one compound from the PubChem PUG REST service,
# using the identifier stored in the 'Pubchem' column of the row labelled 3.
Mycid = moldf.loc[3, 'Pubchem']
url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/%s/PNG' % Mycid

# A timeout keeps the kernel from hanging forever on a stalled connection.
# raise_for_status() makes an HTTP error (e.g. an unknown or missing CID) fail
# loudly here, rather than handing an error body to Image as if it were PNG data.
pubchem_response = requests.get(url, timeout=30)
pubchem_response.raise_for_status()
Disp.Image(pubchem_response.content)
Out[49]:

Options for displaying structures: using mpld3. The mpld3 project brings together Matplotlib, the popular Python-based graphing library, and D3.js, a JavaScript library for interactive data visualisation in the browser. See https://mpld3.github.io/index.html

In [50]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mpld3
from rdkit import Chem
from rdkit.Chem import RDConfig
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import DataStructs
from sklearn.decomposition import PCA
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from mpld3 import plugins
mpld3.enable_notebook()

# NOTE: mpld3/_display.py needs to be edited as described here:
# https://stackoverflow.com/questions/48015030/mpld3-with-python-error
# mpld3 is no longer actively maintained, so feature requests and bug reports are likely to go unanswered.
In [51]:
def moltosvg(mol,molSize=(225,75),kekulize=True):
    """Render a molecule as an SVG string.

    The input molecule is never modified: all work is done on a copy rebuilt
    from its binary serialisation.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule to draw.
    molSize : tuple of int
        ``(width, height)`` of the drawing canvas.
    kekulize : bool
        When true, try to kekulize the copy before drawing. If kekulization
        fails, the drawing falls back to a fresh, un-kekulized copy.

    Returns
    -------
    str
        The SVG text with every ``svg:`` namespace prefix removed.
    """
    mc = Chem.Mol(mol.ToBinary())
    if kekulize:
        try:
            Chem.Kekulize(mc)
        except Exception:
            # Best-effort: a failed kekulization may leave the copy partly
            # modified, so start again from a clean copy. Catching Exception
            # rather than using a bare ``except`` lets KeyboardInterrupt and
            # SystemExit propagate instead of being silently swallowed.
            mc = Chem.Mol(mol.ToBinary())
    if not mc.GetNumConformers():
        # No coordinates yet: generate a 2D layout for the depiction.
        rdDepictor.Compute2DCoords(mc)
    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    return svg.replace('svg:','')
In [52]:
def fp2arr(fp):
    """Convert a fingerprint into a NumPy array.

    A one-element float array is allocated as the destination and handed to
    ``DataStructs.ConvertToNumpyArray``, which fills it from ``fp``; the filled
    array is returned.
    """
    destination = np.zeros(1)
    DataStructs.ConvertToNumpyArray(fp, destination)
    return destination
In [53]:
# Earlier toy input, kept for reference:
#smiles_list = ['CC','CCC','CCCC','O=C(C)Oc1ccccc1C(=O)O','c1ccccc1c1ccccc1']
#mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]

# Molecule objects (held in the 'SMILES' column) and their display names.
mols = moldf['SMILES']
labels = moldf['Active Ingredient']
#data1 = [0,1,2,3,4]

# Property columns pulled out as individual Series for plotting.
mylogP = moldf['logP']
mylogD = moldf['logD']
myMass = moldf['mass']
myHBA = moldf['HBA']
myHBD = moldf['HBD']
myFraAro = moldf['FractionAromatic']
myABZN = moldf['abzn']  # this is a categorical property

# One SVG depiction per molecule, in the same order as the rows of moldf.
svgs = list(map(moltosvg, mols))
In [54]:
#myABZN
In [55]:
# Scatter plot of logP against aromatic fraction. mpld3 was enabled for the
# notebook in an earlier cell, so the figure is rendered through it.
fig, ax = plt.subplots()
# Each setter below returns a Text object; because the notebook is configured
# to display every expression, each one is echoed as an Out[...] entry.
ax.set_xlabel('LogP')
ax.set_ylabel('FractionAromatic')
ax.set_title('Viz chemical space!')
points = ax.scatter(mylogP, myFraAro)
# This is the key point for making the tooltip!
# The tooltip is built from the scatter points and the list of SVG strings, so
# `svgs` must be in the same row order as mylogP / myFraAro.
tooltip = plugins.PointHTMLTooltip(points, svgs)
plugins.connect(fig, tooltip)
Out[55]:
Text(0.5, 0, 'LogP')
Out[55]:
Text(0, 0.5, 'FractionAromatic')
Out[55]:
Text(0.5, 1.0, 'Viz chemical space!')