Skip to content

Instantly share code, notes, and snippets.

@danemacaulay
Created November 7, 2017 15:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save danemacaulay/148d46d1b228e8decba9c6a100a78e10 to your computer and use it in GitHub Desktop.
Save danemacaulay/148d46d1b228e8decba9c6a100a78e10 to your computer and use it in GitHub Desktop.
Build, evaluate, and save a scikit-learn pipeline
import os
import time
import string
import pickle
import pandas as pd
from operator import itemgetter
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report as clsr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split as tts
from sklearn.linear_model import Perceptron
from nltk_preprocessor import NLTKPreprocessor
from sklearn import metrics
def timeit(func):
    """Decorate *func* so that every call also reports how long it took.

    The decorated callable returns a ``(result, delta)`` tuple instead of the
    bare result: ``result`` is whatever *func* returned and ``delta`` is the
    elapsed time in seconds, as a float.
    """
    # Imported locally so the decorator carries its own dependency.
    from functools import wraps

    @wraps(func)  # keep func's __name__ / __doc__ visible on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter() is a monotonic, high-resolution clock; time.time()
        # follows the wall clock and can jump if the system time is adjusted.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        delta = time.perf_counter() - start
        return result, delta

    return wrapper
def identity(arg):
    """Return *arg* unchanged.

    Used as the ``tokenizer`` of the ``TfidfVectorizer`` in this file, so the
    vectorizer takes each document exactly as it is handed over (presumably
    because the preceding preprocessing step already yields tokens -- confirm
    against ``NLTKPreprocessor``).
    """
    return arg
# TODO: implement grid search; see the scikit-learn example linked below.
# http://scikit-learn.org/stable/auto_examples/plot_compare_reduction.html#sphx-glr-auto-examples-plot-compare-reduction-py
@timeit
def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=None, verbose=True, cachedir=None):
    """Fit, evaluate, and optionally pickle a text-classification pipeline.

    The pipeline chains ``NLTKPreprocessor``, a ``TfidfVectorizer`` and the
    given classifier. It is first fit on a stratified 80% training split and
    scored on the remaining 20%; a second pipeline is then fit on all of the
    data, and that second model is the one saved and returned.

    Args:
        X: Input documents (assumed to be in whatever form
            ``NLTKPreprocessor`` accepts -- confirm against that class).
        y: Target labels, one per document; encoded with ``LabelEncoder``.
        classifier: An estimator instance, or an estimator class. A class is
            instantiated with its default arguments for each fit.
        outpath: If truthy, file path the final model is pickled to.
        verbose: If true, print progress and timing messages.
        cachedir: Forwarded to ``Pipeline`` as its ``memory`` argument.

    Returns:
        The pipeline fit on all data, with the fitted ``LabelEncoder``
        attached as ``labels_``. Because this function is itself wrapped by
        ``@timeit``, callers receive ``(model, seconds)``.
    """

    @timeit
    def build(classifier, X, y=None):
        # The default argument is the SGDClassifier *class*; a Pipeline step
        # must be an estimator instance, so instantiate classes here. Doing
        # it per build also gives each fit its own fresh estimator.
        if isinstance(classifier, type):
            classifier = classifier()

        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            # identity tokenizer + no preprocessor + no lowercasing: the
            # vectorizer consumes the previous step's output as-is.
            ('vectorizer', TfidfVectorizer(
                tokenizer=identity, preprocessor=None, lowercase=False)),
            ('classifier', classifier),
        ], memory=cachedir)
        model.fit(X, y)
        return model

    # Label encode the targets; keep the class names for the report.
    labels = LabelEncoder()
    y = labels.fit_transform(y)
    target_names = list(map(str, labels.classes_))

    # Begin evaluation on a held-out, stratified 20% split.
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(
        X, y, test_size=0.2, random_state=0, stratify=y)
    model, secs = build(classifier, X_train, y_train)

    if verbose:
        print("Evaluation model fit in {:0.3f} seconds".format(secs))
        print("Classification Report:\n")

    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred, target_names=target_names))
    print(metrics.confusion_matrix(y_test, y_pred))

    # Refit on the full data set; this is the model that is kept.
    if verbose:
        print("Building complete model and saving ...")
    model, secs = build(classifier, X, y)
    # Attach the encoder so predictions can be mapped back to label names.
    model.labels_ = labels

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)
        print("Model written out to {}".format(outpath))

    return model
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment