@aronwc
Last active December 31, 2015 20:58
L1 feature selection example
import numpy as np
from sklearn import linear_model
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer


def print_features(coef, names):
    """Print a sorted list of the non-zero features/weights."""
    print("\n".join('%s/%.2f' % (names[j], coef[j])
                    for j in np.argsort(coef)[::-1] if coef[j] != 0))


if __name__ == '__main__':
    rand = np.random.mtrand.RandomState(8675309)
    cats = ['rec.sport.baseball', 'sci.crypt']
    data = fetch_20newsgroups(subset='train',
                              categories=cats,
                              shuffle=True,
                              random_state=rand)
    # Smaller C means a stronger L1 penalty, hence fewer features selected.
    clf = linear_model.LogisticRegression(penalty='l1', C=.1)
    # Bag-of-words term counts as features.
    vec = CountVectorizer()
    X = vec.fit_transform(data.data)
    clf.fit(X, data.target)
    print_features(clf.coef_[0], vec.get_feature_names())
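
As a quick check on the "smaller C means fewer features" comment, here is a minimal sketch (not part of the original gist) that refits the same model over a few values of C and counts the surviving non-zero weights. It reuses X and data from the script above; note that on recent scikit-learn versions the L1 penalty also requires solver='liblinear' (or 'saga').

    # Hedged sketch: sweep the regularization strength and count how many
    # coefficients the L1 penalty leaves non-zero.
    for c in [0.01, 0.1, 1.0, 10.0]:
        clf_c = linear_model.LogisticRegression(penalty='l1', C=c)
        clf_c.fit(X, data.target)
        print('C=%g -> %d non-zero features' %
              (c, np.count_nonzero(clf_c.coef_[0])))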

aronwc commented Dec 19, 2013

Output is:

clipper/1.43
key/0.84
encryption/0.81
code/0.58
your/0.32
nsa/0.29
uk/0.27
chip/0.25
com/0.19
of/0.18
pgp/0.16
government/0.15
it/0.15
to/0.11
org/0.11
gtoal/0.08
writes/0.08
can/0.05
clinton/0.02
by/0.02
for/0.02
would/0.02
us/0.02
that/0.01
is/0.01
sports/-0.00
in/-0.01
philadelphia/-0.02
go/-0.02
the/-0.03
think/-0.04
fan/-0.06
braves/-0.08
have/-0.08
cubs/-0.09
than/-0.09
games/-0.22
phillies/-0.22
out/-0.24
lines/-0.26
edu/-0.31
subject/-0.42
players/-0.42
team/-0.51
game/-0.53
year/-0.54
he/-0.58
baseball/-1.40
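
For reading the signs: fetch_20newsgroups assigns labels in sorted category order, so rec.sport.baseball is class 0 and sci.crypt is class 1; positive weights (clipper, encryption, nsa, pgp) pull toward sci.crypt and negative weights (baseball, phillies, team) toward rec.sport.baseball. A small sketch (again not from the original gist, assuming the same clf and vec are in scope) that prints only the k strongest words on each side:

def print_top_features(coef, names, k=5):
    # Hedged sketch: show the k most positive (sci.crypt) and k most
    # negative (rec.sport.baseball) weights instead of the full list.
    order = np.argsort(coef)
    print('strongest sci.crypt features:')
    for j in order[::-1][:k]:
        print('  %s/%.2f' % (names[j], coef[j]))
    print('strongest rec.sport.baseball features:')
    for j in order[:k]:
        print('  %s/%.2f' % (names[j], coef[j]))

print_top_features(clf.coef_[0], vec.get_feature_names(), k=5)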
