danemacaulay · November 7, 2017 15:38
diff --git a/build_model.py b/build_model.py
 import os
 import time
 import string
 import pickle
 import pandas as pd

 from operator import itemgetter

 from nltk.corpus import stopwords as sw
 from nltk.corpus import wordnet as wn
 from nltk import wordpunct_tokenize
 from nltk import WordNetLemmatizer
 from nltk import sent_tokenize
 from nltk import pos_tag

 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import LabelEncoder
 from sklearn.linear_model import SGDClassifier
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.metrics import classification_report as clsr
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.cross_validation import train_test_split as tts
 from sklearn.linear_model import Perceptron
 from nltk_preprocessor import NLTKPreprocessor
 from sklearn import metrics


 def timeit(func):
    """Decorate ``func`` so that every call also reports how long it took.

    The returned wrapper forwards all positional and keyword arguments to
    ``func`` and returns a ``(result, delta)`` tuple, where ``result`` is
    whatever ``func`` returned and ``delta`` is the elapsed time of the call
    in seconds (a float).
    """
    # Imported locally so this block does not rely on a file-level import.
    from functools import wraps

    @wraps(func)  # keep func's __name__ / __doc__ visible on the wrapper
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so delta cannot come out negative if the
        # system clock is adjusted while func is running.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        delta = time.perf_counter() - start
        return result, delta
    return wrapper


 def identity(arg):
    """Return ``arg`` unchanged (the very same object, not a copy).

    Used as a no-op callable wherever an API insists on a function but no
    transformation is wanted, e.g. as the ``tokenizer`` of a vectorizer.
    """
    return arg


 # implement gridsearch
 # http://scikit-learn.org/stable/auto_examples/plot_compare_reduction.html#sphx-glr-auto-examples-plot-compare-reduction-py

 @timeit
 def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=None, verbose=True, cachedir=None):
    """Fit a text-classification pipeline, report hold-out metrics, then refit on all data.

    Steps:
      1. Label-encode ``y`` with a ``LabelEncoder``.
      2. Split into an 80/20 stratified train/test partition (``random_state=0``).
      3. Fit the pipeline on the training part and print a classification
         report and confusion matrix for the test part.
      4. Refit the pipeline on the complete data set, attach the fitted
         ``LabelEncoder`` as ``model.labels_`` and, if ``outpath`` is given,
         pickle the model there.

    Parameters
    ----------
    X : documents to classify, passed as-is to ``NLTKPreprocessor``.
    y : target labels, one per document.
    classifier : estimator instance, or an estimator class which is then
        instantiated with its default arguments.
    outpath : optional path the final model is pickled to.
    verbose : when true, print progress messages.
    cachedir : forwarded to ``Pipeline(memory=...)``.

    Returns
    -------
    The fitted ``Pipeline``. Because this function is itself wrapped by
    ``@timeit``, callers actually receive ``(model, seconds)``.
    """
    # The default argument is the SGDClassifier *class*; a Pipeline step must
    # be an estimator *instance*, otherwise ``fit`` is called unbound and fails.
    if isinstance(classifier, type):
        classifier = classifier()

    @timeit
    def build(classifier, X, y=None):
        """Assemble the preprocessing/vectorizing/classifying pipeline and fit it."""
        model = Pipeline([
            ('preprocessor', NLTKPreprocessor()),
            # identity tokenizer, no preprocessor, no lowercasing: the vectorizer
            # consumes the previous step's output untouched (presumably
            # NLTKPreprocessor already yields token lists -- confirm).
            ('vectorizer', TfidfVectorizer(
                tokenizer=identity, preprocessor=None, lowercase=False)),
            ('classifier', classifier),
        ], memory=cachedir)

        model.fit(X, y)
        return model

    # Label encode the targets
    labels = LabelEncoder()
    y = labels.fit_transform(y)
    target_names = list(map(str, labels.classes_))

    # Begin evaluation
    if verbose:
        print("Building for evaluation")
    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0, stratify=y)
    model, secs = build(classifier, X_train, y_train)

    if verbose:
        print("Evaluation model fit in {:0.3f} seconds".format(secs))
        print("Classification Report:\n")

    y_pred = model.predict(X_test)

    # The report and confusion matrix are printed regardless of ``verbose``.
    print(clsr(y_test, y_pred, target_names=target_names))
    print(metrics.confusion_matrix(y_test, y_pred))

    if verbose:
        print("Building complete model and saving ...")
    model, secs = build(classifier, X, y)
    # Keep the encoder with the model so predictions can be mapped back to
    # the original label values.
    model.labels_ = labels

    if verbose:
        print("Complete model fit in {:0.3f} seconds".format(secs))

    if outpath:
        with open(outpath, 'wb') as f:
            pickle.dump(model, f)

        print("Model written out to {}".format(outpath))

    return model
	import os
	import time
	import string
	import pickle
	import pandas as pd

	from operator import itemgetter

	from nltk.corpus import stopwords as sw
	from nltk.corpus import wordnet as wn
	from nltk import wordpunct_tokenize
	from nltk import WordNetLemmatizer
	from nltk import sent_tokenize
	from nltk import pos_tag

	from sklearn.pipeline import Pipeline
	from sklearn.preprocessing import LabelEncoder
	from sklearn.linear_model import SGDClassifier
	from sklearn.base import BaseEstimator, TransformerMixin
	from sklearn.metrics import classification_report as clsr
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.cross_validation import train_test_split as tts
	from sklearn.linear_model import Perceptron
	from nltk_preprocessor import NLTKPreprocessor
	from sklearn import metrics


	def timeit(func):
	    """Decorate ``func`` so that every call also reports how long it took.

	    The returned wrapper forwards all positional and keyword arguments to
	    ``func`` and returns a ``(result, delta)`` tuple, where ``result`` is
	    whatever ``func`` returned and ``delta`` is the elapsed time of the call
	    in seconds (a float).
	    """
	    # Imported locally so this block does not rely on a file-level import.
	    from functools import wraps

	    @wraps(func)  # keep func's __name__ / __doc__ visible on the wrapper
	    def wrapper(*args, **kwargs):
	        # Accept and forward arbitrary positional and keyword arguments.
	        # perf_counter is monotonic, so delta cannot come out negative.
	        start = time.perf_counter()
	        result = func(*args, **kwargs)
	        delta = time.perf_counter() - start
	        return result, delta
	    return wrapper


	def identity(arg):
	    """Return ``arg`` unchanged (the very same object, not a copy).

	    Used as a no-op callable wherever an API insists on a function but no
	    transformation is wanted, e.g. as the ``tokenizer`` of a vectorizer.
	    """
	    return arg


	# implement gridsearch
	# http://scikit-learn.org/stable/auto_examples/plot_compare_reduction.html#sphx-glr-auto-examples-plot-compare-reduction-py

	@timeit
	def build_and_evaluate(X, y, classifier=SGDClassifier, outpath=None, verbose=True, cachedir=None):
	    """Fit a text-classification pipeline, report hold-out metrics, then refit on all data.

	    Steps:
	      1. Label-encode ``y`` with a ``LabelEncoder``.
	      2. Split into an 80/20 stratified train/test partition (``random_state=0``).
	      3. Fit the pipeline on the training part and print a classification
	         report and confusion matrix for the test part.
	      4. Refit the pipeline on the complete data set, attach the fitted
	         ``LabelEncoder`` as ``model.labels_`` and, if ``outpath`` is given,
	         pickle the model there.

	    Parameters
	    ----------
	    X : documents to classify, passed as-is to ``NLTKPreprocessor``.
	    y : target labels, one per document.
	    classifier : estimator instance, or an estimator class which is then
	        instantiated with its default arguments.
	    outpath : optional path the final model is pickled to.
	    verbose : when true, print progress messages.
	    cachedir : forwarded to ``Pipeline(memory=...)``.

	    Returns
	    -------
	    The fitted ``Pipeline``. Because this function is itself wrapped by
	    ``@timeit``, callers actually receive ``(model, seconds)``.
	    """
	    # The default argument is the SGDClassifier *class*; a Pipeline step must
	    # be an estimator *instance*, otherwise ``fit`` is called unbound and fails.
	    if isinstance(classifier, type):
	        classifier = classifier()

	    @timeit
	    def build(classifier, X, y=None):
	        """Assemble the preprocessing/vectorizing/classifying pipeline and fit it."""
	        model = Pipeline([
	            ('preprocessor', NLTKPreprocessor()),
	            # identity tokenizer, no preprocessor, no lowercasing: the vectorizer
	            # consumes the previous step's output untouched (presumably
	            # NLTKPreprocessor already yields token lists -- confirm).
	            ('vectorizer', TfidfVectorizer(
	                tokenizer=identity, preprocessor=None, lowercase=False)),
	            ('classifier', classifier),
	        ], memory=cachedir)

	        model.fit(X, y)
	        return model

	    # Label encode the targets
	    labels = LabelEncoder()
	    y = labels.fit_transform(y)
	    target_names = list(map(str, labels.classes_))

	    # Begin evaluation
	    if verbose:
	        print("Building for evaluation")
	    X_train, X_test, y_train, y_test = tts(X, y, test_size=0.2, random_state=0, stratify=y)
	    model, secs = build(classifier, X_train, y_train)

	    if verbose:
	        print("Evaluation model fit in {:0.3f} seconds".format(secs))
	        print("Classification Report:\n")

	    y_pred = model.predict(X_test)

	    # The report and confusion matrix are printed regardless of ``verbose``.
	    print(clsr(y_test, y_pred, target_names=target_names))
	    print(metrics.confusion_matrix(y_test, y_pred))

	    if verbose:
	        print("Building complete model and saving ...")
	    model, secs = build(classifier, X, y)
	    # Keep the encoder with the model so predictions can be mapped back to
	    # the original label values.
	    model.labels_ = labels

	    if verbose:
	        print("Complete model fit in {:0.3f} seconds".format(secs))

	    if outpath:
	        with open(outpath, 'wb') as f:
	            pickle.dump(model, f)

	        print("Model written out to {}".format(outpath))

	    return model