# Standard data-science boilerplate: numerics, data frames and Plotly/cufflinks plotting
import numpy as np
import pandas as pd
import scipy
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Initialise Plotly's notebook mode before cufflinks is imported and configured below
init_notebook_mode(connected=True)
import cufflinks as cf
# Put cufflinks into offline mode and set its default colour scale / sharing flag
cf.go_offline(connected=True)
cf.set_config_file(colorscale='plotly', world_readable=True)
# Extra options (pandas display limits, currently disabled)
#pd.options.display.max_rows = 30
#pd.options.display.max_columns = 25
# Show the output of every expression in a code cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import os
from IPython.display import Image, display, HTML
# Widgets used by the @interact-decorated functions further down
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
# RDKit boilerplate: molecule handling, fingerprints, drawing and pandas integration
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
import rdkit.rdBase
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit import DataStructs
from rdkit.DataStructs import BitVectToText
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
# NOTE(review): the next two imports repeat earlier ones (DataStructs, pandas); harmless but redundant
from rdkit import DataStructs
import pandas as pd
from rdkit.Chem import PandasTools as PandasTools
from rdkit.Chem import Descriptors as Descriptors
# Load the SD file into a DataFrame; the molecule objects go into the 'SMILES' column.
sdfFilePath = 'FDA2019smallmols.sdf'
moldf = PandasTools.LoadSDF(sdfFilePath, molColName='SMILES')
#moldf["HBA"].fillna(0, inplace=True)

# Property columns are cast explicitly (presumably they are loaded as text -- TODO confirm).
# One mapping drives a single DataFrame.astype call instead of one call per column.
float_columns = [
    "logP", "logD", "mass", "HBA", "HBD", "atomCount",
    "TPSA", "RBC", "MR", "HAC", "FractionAromatic",
]
column_dtypes = dict.fromkeys(float_columns, float)
column_dtypes["Active Ingredient"] = str
column_dtypes["abzn"] = 'category'
moldf = moldf.astype(column_dtypes)

# Quick sanity check of the first rows and the resulting dtypes.
moldf.head(2)
moldf.dtypes
#Need to work out how to compute range appropriate for property
@interact
def show_molecules_more_than(column=list(moldf.select_dtypes('number').columns), x=(-4, 10, 1),):
    """Display the molecules whose value in ``column`` is greater than ``x``.

    The function is wrapped by ``interact``, which builds the input widgets
    from the parameter defaults.

    Args:
        column: Name of a numeric column of ``moldf``; the default lists every
            numeric column as a choice.
        x: Threshold value; the default ``(-4, 10, 1)`` is the widget's
            (min, max, step) specification.
    """
    # Heading is closed with </h2>; the original emitted a second opening <h2>.
    display(HTML(f'<h2>Showing molecules with {column} > {x}</h2>'))
    shown_columns = ['Active Ingredient', 'logP', 'logD', 'atomCount',
                     'mass', 'HBA', 'SMILES', 'CSID']
    display(moldf.loc[moldf[column] > x, shown_columns])
@interact
def scatter(x=list(moldf.select_dtypes('number').columns),
            y=list(moldf.select_dtypes('number').columns)[1:]):
    """Draw an interactive 2D marker plot of one numeric column against another.

    Args:
        x: Column of ``moldf`` plotted on the x axis; the default offers every
            numeric column.
        y: Column of ``moldf`` plotted on the y axis; the default offers every
            numeric column except the first.
    """
    # Label axes with the column names exactly as they appear in the frame:
    # str.title() would turn 'logP' into 'Logp' and 'TPSA' into 'Tpsa'.
    moldf.iplot(
        kind='scatter',
        x=x,
        y=y,
        mode='markers',
        xTitle=x,
        yTitle=y,
        title=f'{y} vs {x}',
    )
@interact
def scatter_plot(x=list(moldf.select_dtypes('number').columns),
                 y=list(moldf.select_dtypes('number').columns)[1:]):
    """Draw an interactive 3D marker plot with the ``abzn`` column on the z axis.

    Args:
        x: Column of ``moldf`` plotted on the x axis; the default offers every
            numeric column.
        y: Column of ``moldf`` plotted on the y axis; the default offers every
            numeric column except the first.
    """
    # Label axes with the column names exactly as they appear in the frame:
    # str.title() would turn 'logP' into 'Logp' and 'TPSA' into 'Tpsa'.
    moldf.iplot(
        kind='scatter3d',
        x=x,
        y=y,
        z='abzn',
        mode='markers',
        categories='Active Ingredient',  # passed to cufflinks to group points by drug name
        xTitle=x,
        yTitle=y,
        title=f'{y} vs {x}',
    )
#possible options for structures
# Option 1: show the molecule object stored in the frame for row 3.
mol=moldf.loc[3,'SMILES']
mol
# Option 2: fetch a PNG depiction of the same row from the PubChem PUG REST service.
import requests
import IPython.display as Disp
Mycid = moldf.loc[3, 'Pubchem']
url = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{Mycid}/PNG'
# Without a timeout requests can block indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of handing an error body to Image as if it were a PNG.
response.raise_for_status()
Disp.Image(response.content)
Options for displaying structures: using mpld3. The mpld3 project brings together Matplotlib, the popular Python-based graphing library, and D3.js (https://mpld3.github.io/index.html).
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mpld3
from rdkit import Chem
from rdkit.Chem import RDConfig
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import DataStructs
from sklearn.decomposition import PCA
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from mpld3 import plugins
mpld3.enable_notebook()
# NOTE: mpld3/_display.py needs to be edited as described here:
# https://stackoverflow.com/questions/48015030/mpld3-with-python-error
# mpld3 is no longer actively maintained; feature requests and bug reports are likely to go unanswered.
def moltosvg(mol,molSize=(225,75),kekulize=True):
    """Render a molecule as an SVG string.

    Args:
        mol: Molecule to draw; it is copied first, so the caller's object is
            not modified.
        molSize: (width, height) passed to the SVG drawer.
        kekulize: When true, try to kekulize the copy before drawing.

    Returns:
        The SVG text with every ``'svg:'`` substring removed.
    """
    # Work on a copy so kekulization and coordinate generation leave `mol` untouched.
    mc = Chem.Mol(mol.ToBinary())
    if kekulize:
        try:
            Chem.Kekulize(mc)
        except Exception:
            # Best effort: if kekulization fails, draw a fresh, unkekulized copy.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.)
            mc = Chem.Mol(mol.ToBinary())
    if not mc.GetNumConformers():
        # No coordinates yet: generate a 2D layout for the depiction.
        rdDepictor.Compute2DCoords(mc)
    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    # Strip the 'svg:' prefix (presumably so the markup can be embedded directly in HTML).
    return svg.replace('svg:','')
def fp2arr(fp):
    """Return the fingerprint ``fp`` converted to a NumPy array.

    A one-element zero array is allocated and handed to
    ``DataStructs.ConvertToNumpyArray``, which fills it in place.
    """
    out = np.zeros(1)
    DataStructs.ConvertToNumpyArray(fp, out)
    return out
# Alternative hand-built input, kept for reference:
#smiles_list = ['CC','CCC','CCCC','O=C(C)Oc1ccccc1C(=O)O','c1ccccc1c1ccccc1']
#mols = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]
#data1 = [0,1,2,3,4]

# Pull the molecule column and the property columns out of the frame.
mols = moldf['SMILES']
mylogP, mylogD, myMass = moldf['logP'], moldf['logD'], moldf['mass']
myHBA, myHBD = moldf['HBA'], moldf['HBD']
myFraAro = moldf['FractionAromatic']
labels = moldf['Active Ingredient']
myABZN = moldf['abzn']  # this is a categorical property
#myABZN

# One SVG depiction per molecule, in row order, used as tooltip content.
svgs = list(map(moltosvg, mols))

fig, ax = plt.subplots()
ax.set_xlabel('LogP')
ax.set_ylabel('FractionAromatic')
ax.set_title('Viz chemical space!')
points = ax.scatter(mylogP, myFraAro)

# Key step: attach the SVGs as HTML tooltips to the scatter points.
tooltip = plugins.PointHTMLTooltip(points, svgs)
plugins.connect(fig, tooltip)