Created
November 7, 2017 15:38
-
-
Save danemacaulay/148d46d1b228e8decba9c6a100a78e10 to your computer and use it in GitHub Desktop.
Build, evaluate, and save a scikit-learn pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import time | |
import string | |
import pickle | |
import pandas as pd | |
from operator import itemgetter | |
from nltk.corpus import stopwords as sw | |
from nltk.corpus import wordnet as wn | |
from nltk import wordpunct_tokenize | |
from nltk import WordNetLemmatizer | |
from nltk import sent_tokenize | |
from nltk import pos_tag | |
from sklearn.pipeline import Pipeline | |
from sklearn.preprocessing import LabelEncoder | |
from sklearn.linear_model import SGDClassifier | |
from sklearn.base import BaseEstimator, TransformerMixin | |
from sklearn.metrics import classification_report as clsr | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.cross_validation import train_test_split as tts | |
from sklearn.linear_model import Perceptron | |
from nltk_preprocessor import NLTKPreprocessor | |
from sklearn import metrics | |
def timeit(func):
    """Decorate ``func`` so that every call also reports how long it took.

    The wrapped callable forwards all positional and keyword arguments to
    ``func`` unchanged and returns a ``(result, delta)`` tuple: ``result`` is
    whatever ``func`` returned and ``delta`` is the elapsed time in seconds
    as a float. Exceptions raised by ``func`` propagate untouched.
    """
    # Imported locally so this block does not rely on the module header.
    from functools import wraps

    @wraps(func)  # keep func's __name__ / __doc__ visible on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so delta cannot go negative if the
        # system clock is adjusted while func is running.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        delta = time.perf_counter() - start
        return result, delta

    return wrapper
def identity(arg):
    """Return ``arg`` unchanged.

    Passed as the ``tokenizer`` of the TF-IDF vectorizer in this file so the
    vectorizer performs no tokenization of its own.
    """
    return arg
# TODO: implement a grid search over the pipeline's hyperparameters; see
# http://scikit-learn.org/stable/auto_examples/plot_compare_reduction.html#sphx-glr-auto-examples-plot-compare-reduction-py
@timeit
def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=None, verbose=True, cachedir=None):
    """Fit, evaluate and optionally pickle a text-classification pipeline.

    The pipeline is ``NLTKPreprocessor -> TfidfVectorizer -> classifier``.
    It is first fit on an 80% stratified training split and scored on the
    held-out 20% (classification report and confusion matrix are printed),
    then re-fit on the complete data set to produce the returned model.

    Args:
        X: Inputs handed to the pipeline's first step (``NLTKPreprocessor``);
            presumably an iterable of raw documents -- confirm against the
            preprocessor.
        y: Target labels; they are integer-encoded with ``LabelEncoder``.
        classifier: Either an estimator instance or an estimator class.
            A class (such as the ``SGDClassifier`` default) is instantiated
            with no arguments for each fit.
        outpath: If truthy, the final model is pickled to this path.
        verbose: If true, progress messages are printed.
        cachedir: Forwarded to ``Pipeline(memory=...)``.

    Returns:
        Because this function is itself wrapped in ``@timeit``, callers
        receive ``(model, seconds)``. ``model`` is the pipeline fit on all
        of ``X``/``y``; the fitted ``LabelEncoder`` is attached to it as
        ``model.labels_``.
    """
    @timeit
    def build(classifier, X, y=None):
        """Assemble the pipeline around ``classifier`` and fit it on X, y."""
        # The default argument is a class, but Pipeline steps must be
        # estimator instances; instantiate here so each fit gets a fresh one.
        if isinstance(classifier, type):
            classifier = classifier()

        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            # tokenizer=identity / lowercase=False: the vectorizer does no
            # text processing of its own (presumably the preprocessor already
            # emits token lists -- confirm).
            ('vectorizer', TfidfVectorizer(
                tokenizer=identity, preprocessor=None, lowercase=False)),
            ('classifier', classifier),
        ], memory=cachedir)
        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)
    target_names = list(map(str, labels.classes_))

    # Begin evaluation
    if verbose:
        print("Building for evaluation")
    # Fixed seed and stratification keep the hold-out split reproducible and
    # class-balanced.
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0, stratify=y)
    model, secs = build(classifier, X_train, y_train)

    if verbose:
        print("Evaluation model fit in {:0.3f} seconds".format(secs))
    if verbose:
        print("Classification Report:\n")

    # NOTE(review): the file's indentation was lost; the report is assumed to
    # be printed regardless of `verbose` -- confirm.
    y_pred = model.predict(X_test)
    print(clsr(y_test, y_pred, target_names=target_names))
    print(metrics.confusion_matrix(y_test, y_pred))

    if verbose:
        print("Building complete model and saving ...")
    # Re-fit on every sample so the persisted model uses all available data.
    model, secs = build(classifier, X, y)
    model.labels_ = labels

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment