Skip to content

Instantly share code, notes, and snippets.

@pstjohn
Last active September 5, 2019 22:53
Show Gist options
  • Save pstjohn/b7e2f05136bf8bfd22f68f375437452d to your computer and use it in GitHub Desktop.
Save pstjohn/b7e2f05136bf8bfd22f68f375437452d to your computer and use it in GitHub Desktop.

YSI Prediction and Analysis code

Flattened Jupyter notebooks containing the analysis workflow from the manuscript "A quantitative model for the prediction of sooting tendency from molecular structure".

Full repository (with data files) is available at pstjohn/ysi_qsar_energy_fuels

name: ysi
channels:
  - https://conda.anaconda.org/rdkit
dependencies:
  - python=3.5
  - jupyter
  - pandas
  - seaborn
  - rdkit
  - jinja2
  - pip:
    - https://github.com/pstjohn/scikit-learn/archive/pstjohn-changes.zip
    - pubchempy
    - keras==0.3.3
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"np.random.seed(0)\n",
"import pandas as pd\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"sns.set_style('darkgrid')\n",
"sns.set_context('talk', font_scale=1.5)\n",
"sns.set(color_codes=True)\n",
"\n",
"import matplotlib.gridspec as gridspec\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using Theano backend.\n"
]
}
],
"source": [
"from ysi_utils.models.setup_model import model\n",
"\n",
"from ysi_utils.data import low\n",
"from ysi_utils.descriptors import dragon\n",
"from ysi_utils.validation import y_train as y\n",
"from ysi_utils.validation import y_test\n",
"\n",
"X = dragon.loc[y.index]\n",
"X_test = dragon.loc[y_test.index]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.ensemble import BaggingRegressor\n",
"from sklearn.metrics import median_absolute_error"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"*This next step takes a considerable amount of computing time, since the model is refit 25 times*"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"BaggingRegressor(base_estimator=Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('filter', VarianceThreshold(threshold=0.0)), ('scaler', MaxAbsScaler(copy=True)), ('feature_extraction', RFE(estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',\n",
" kerne...step=10, verbose=0)), ('ffann', <keras.wrappers.scikit_learn.KerasRegressor object at 0x111467588>)]),\n",
" bootstrap=False, bootstrap_features=False, max_features=1.0,\n",
" max_samples=0.9, n_estimators=25, n_jobs=1, oob_score=False,\n",
" random_state=0, verbose=False, warm_start=False)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bagging_model = BaggingRegressor(base_estimator=model, random_state=0, verbose=False,\n",
" max_samples=.9, bootstrap=False, n_estimators=25)\n",
"bagging_model.fit(X.values, y.YSI.values)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import keras"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'0.3.3'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"keras.__version__"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"['../ysi_utils/models/bagging_model.pkl.large']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import sys\n",
"# This is required to prevent errors on pickling.\n",
"sys.setrecursionlimit(10000)\n",
"\n",
"from sklearn.externals import joblib\n",
"joblib.dump(bagging_model, '../ysi_utils/models/bagging_model.pkl.large', compress=9)"
]
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [conda env:ysi]",
"language": "python",
"name": "conda-env-ysi-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import numpy as np
np.random.seed(0)
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (StandardScaler, Normalizer, RobustScaler,
MaxAbsScaler, Imputer)
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, f_regression
from sklearn import svm
from ysi_utils.descriptors import dragon
from ysi_utils.validation import y_train as y
from ysi_utils.validation import y_test
# Rows of the `dragon` descriptor table selected by the index of the training
# targets (`y`) and of the held-out test targets (`y_test`).
X, X_test = dragon.loc[y.index], dragon.loc[y_test.index]
# Optimal hyperparameters, as selected by the hyperopt fitting routine.
params = {
    # --- preprocessing -----------------------------------------------------
    # Key into `preprocessor_dict`, selecting the scaler class.
    'preprocessor': 'MaxAbsScaler',
    # 'type' picks the feature-extraction estimator; the remaining entries
    # are forwarded to that estimator as keyword arguments.
    'feature_extractor': {
        'n_features_to_select': 390,
        'step': 10,
        'type': 'RFE',
    },

    # --- network architecture ----------------------------------------------
    'activation': 'relu',
    'units1': 140,
    'dropout1': 0.36632971139053216,
    'units2': 140,
    'dropout2': 0.030115015871571206,
    # A third hidden layer is added only when 'layers' == 'three'.
    'num_layers': {
        'dropout3': 0.31435729897990794,
        'layers': 'three',
        'units3': 161,
    },

    # --- training ----------------------------------------------------------
    'optimizer': 'adam',
    'batch_size': 39,
    'nb_epochs': 100,
}
def get_input_size(params):
    """Return the number of features produced by the feature extractor.

    The value is read from ``params['feature_extractor']`` using the first
    of the following keys that is present, in this order:
    ``'n_components'``, ``'k'``, ``'n_features_to_select'``.

    Args:
        params: Hyperparameter dictionary containing a
            ``'feature_extractor'`` mapping.

    Returns:
        The value stored under the first matching size key.

    Raises:
        KeyError: If ``params`` has no ``'feature_extractor'`` entry.
        ValueError: If none of the size keys is present. Earlier versions
            fell through and returned ``None`` here, which only surfaced
            later when the value was used as a layer's ``input_dim``.
    """
    extractor_opts = params['feature_extractor']
    size_keys = ('n_components', 'k', 'n_features_to_select')
    for size_key in size_keys:
        if size_key in extractor_opts:
            return extractor_opts[size_key]
    raise ValueError(
        "params['feature_extractor'] must define one of %s" % (size_keys,))
def build_model(params=params):
    """Construct and compile the feed-forward network described by *params*.

    The network is a ``Sequential`` stack of Dense -> Activation -> Dropout
    groups: two always, plus a third when
    ``params['num_layers']['layers'] == 'three'``. It ends in a single-unit
    Dense output layer and is compiled with the ``'mae'`` loss and the
    optimizer named by ``params['optimizer']``.

    Args:
        params: Hyperparameter dictionary; defaults to the module-level
            ``params``.

    Returns:
        The compiled ``Sequential`` model.
    """
    activation_name = params['activation']
    net = Sequential()

    # First hidden layer: the only one that is told the input width, which
    # is the number of features kept by the feature extractor.
    net.add(Dense(output_dim=params['units1'],
                  input_dim=get_input_size(params)))
    net.add(Activation(activation_name))
    net.add(Dropout(params['dropout1']))

    # Remaining hidden layers as (width, dropout rate) pairs.
    later_layers = [(params['units2'], params['dropout2'])]
    extra = params['num_layers']
    if extra['layers'] == 'three':
        later_layers.append((extra['units3'], extra['dropout3']))

    for width, drop_rate in later_layers:
        net.add(Dense(output_dim=width, init="glorot_uniform"))
        net.add(Activation(activation_name))
        net.add(Dropout(drop_rate))

    # Single-unit output layer for the regression target.
    net.add(Dense(output_dim=1))
    net.compile(loss='mae', optimizer=params['optimizer'])
    return net
# Wrap the network builder in Keras' scikit-learn regressor interface so it
# can be used as a pipeline step; epoch count and batch size come from `params`.
ffann = KerasRegressor(
    build_fn=build_model,
    batch_size=params['batch_size'],
    nb_epoch=params['nb_epochs'],
    verbose=0,
)
# Set up preprocessing pipeline
# Default-constructed imputation and variance-filter steps.
imputer = Imputer()
var_filter = VarianceThreshold()

# Scaler classes that params['preprocessor'] may name, keyed by class name.
preprocessor_dict = {
    scaler_cls.__name__: scaler_cls
    for scaler_cls in (StandardScaler, MaxAbsScaler, Normalizer, RobustScaler)
}

# Instantiate the selected scaler with its default arguments.
scaler = preprocessor_dict[params['preprocessor']]()
# Build the feature-extraction step named by
# params['feature_extractor']['type']. Every other entry of that mapping is
# forwarded to the chosen estimator as a keyword argument. A copy is taken
# so that `params` itself is left unmodified.
opts = dict(params['feature_extractor'])
_extractor_type = opts.pop('type')

if _extractor_type == 'pca':
    feature_extraction = PCA(**opts)
elif _extractor_type == 'RFE':
    # Recursive feature elimination driven by a linear-kernel SVR.
    svr = svm.SVR(kernel='linear')
    feature_extraction = RFE(estimator=svr, **opts)
elif _extractor_type == 'SelectKBest':
    feature_extraction = SelectKBest(score_func=f_regression, **opts)
else:
    # Fail here with a clear message rather than with a NameError on
    # `feature_extraction` when the pipeline is assembled.
    raise ValueError(
        "Unknown feature_extractor type %r; expected 'pca', 'RFE' or "
        "'SelectKBest'" % (_extractor_type,))
# Complete estimator: the preprocessing steps run in the order listed and
# feed the wrapped Keras network, which is the final step.
model = Pipeline(
    steps=[
        ('imputer', imputer),                        # Imputer()
        ('filter', var_filter),                      # VarianceThreshold()
        ('scaler', scaler),                          # class chosen by params['preprocessor']
        ('feature_extraction', feature_extraction),  # PCA / RFE / SelectKBest
        ('ffann', ffann),                            # KerasRegressor wrapper
    ],
)
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment