Skip to content

Instantly share code, notes, and snippets.

@anhpt379
Forked from marcelcaraciolo/tf_idf_final.py
Created January 13, 2012 07:51
Show Gist options
  • Save anhpt379/1605037 to your computer and use it in GitHub Desktop.
Save anhpt379/1605037 to your computer and use it in GitHub Desktop.
tf-idf example
#-*- coding: utf-8 -*-
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk import bigrams, trigrams
import math
# Stop-word list taken from NLTK's 'portuguese' stopwords corpus.
stopwords = nltk.corpus.stopwords.words('portuguese')
# A token is a run of word characters and/or the typographic apostrophe.
# The pattern is a raw string so that ``\w`` reaches the regex engine
# untouched instead of being an invalid string escape sequence.
tokenizer = RegexpTokenizer(r"[\w’]+", flags=re.UNICODE)
def freq(word, doc):
    """Return the number of occurrences of ``word`` in ``doc``.

    The count is delegated to ``doc.count``, so ``doc`` may be any object
    exposing that method (a token list in this script).
    """
    occurrences = doc.count(word)
    return occurrences
def word_count(doc):
    """Return the size of ``doc``, i.e. ``len(doc)``.

    For a token list this is the total number of tokens, duplicates
    included.
    """
    total = len(doc)
    return total
def tf(word, doc):
    """Return the normalised term frequency of ``word`` in ``doc``.

    The result is ``freq(word, doc)`` divided by ``word_count(doc)`` as a
    float. An empty document yields ``0.0`` instead of raising
    ``ZeroDivisionError``.
    """
    total = word_count(doc)
    if total == 0:
        # Nothing can occur in an empty document.
        return 0.0
    # float() forces true division on Python 2 as well.
    return freq(word, doc) / float(total)
def num_docs_containing(word, list_of_docs):
    """Return one plus the number of documents that contain ``word``.

    The extra 1 is additive smoothing: the result is never zero, so it is
    always safe to divide by it (as ``idf`` does).

    A membership test is used rather than counting every occurrence,
    because only presence matters and ``in`` stops at the first match.
    """
    matches = sum(1 for document in list_of_docs if word in document)
    return 1 + matches
def idf(word, list_of_docs):
    """Return the inverse document frequency of ``word``.

    Computed as ``log(N / df)`` (natural logarithm), where ``N`` is
    ``len(list_of_docs)`` and ``df`` is the smoothed document count
    returned by ``num_docs_containing``. With an empty ``list_of_docs``
    the ratio is 0 and ``math.log`` raises ``ValueError``.
    """
    total_docs = len(list_of_docs)
    # float() forces true division on Python 2 as well.
    smoothed_df = float(num_docs_containing(word, list_of_docs))
    return math.log(total_docs / smoothed_df)
def tf_idf(word, doc, list_of_docs):
    """Return the tf-idf weight of ``word`` for ``doc``.

    This is the product of ``tf(word, doc)`` and
    ``idf(word, list_of_docs)``.
    """
    term_frequency = tf(word, doc)
    inverse_doc_frequency = idf(word, list_of_docs)
    return term_frequency * inverse_doc_frequency
#Compute the frequency for each term.
vocabulary = []  # token list of every document, in processing order
docs = {}  # document text -> {'freq', 'tf', 'idf', 'tf-idf', 'tokens'}
all_tips = []  # never read by this script; kept so the module namespace is unchanged
# Set copy of the stop words so each membership test is a hash lookup
# rather than a linear scan of the list.
stopword_set = set(stopwords)
for tip in (['documment 1', 'documment 2']):
    # The documents are plain strings, so the string itself is tokenized
    # (a str has no ``.text`` attribute).
    tokens = tokenizer.tokenize(tip)

    # N-grams are built from the raw tokens, i.e. before lower-casing and
    # before short tokens and stop words are dropped from the unigrams.
    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)

    # Unigrams: longer than two characters, lower-cased, not a stop word.
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in stopword_set]

    # Each n-gram tuple becomes one space-joined, lower-cased string.
    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in stopword_set]

    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in stopword_set]

    # The document is represented by its unigrams, bigrams and trigrams.
    final_tokens = tokens + bi_tokens + tri_tokens
    docs[tip] = {'freq': {}, 'tf': {}, 'idf': {},
                 'tf-idf': {}, 'tokens': final_tokens}

    for token in final_tokens:
        #The frequency computed for each tip
        docs[tip]['freq'][token] = freq(token, final_tokens)
        #The term-frequency (Normalized Frequency)
        docs[tip]['tf'][token] = tf(token, final_tokens)

    vocabulary.append(final_tokens)
# Second pass: idf is computed against ``vocabulary``, which is only
# complete once every document has been tokenized above.
for doc in docs:
    stats = docs[doc]
    for token in stats['tf']:
        #The Inverse-Document-Frequency
        stats['idf'][token] = idf(token, vocabulary)
        #The tf-idf
        stats['tf-idf'][token] = tf_idf(token, stats['tokens'], vocabulary)
#Now let's find out the most relevant words by tf-idf.
words = {}  # token -> highest tf-idf score seen in any document
for doc in docs:
    scores = docs[doc]['tf-idf']
    for token in scores:
        # Keep only the best score of each token across all documents.
        if token not in words or scores[token] > words[token]:
            words[token] = scores[token]

    # Per-document report: the document, then one "token score" line per
    # token. Single-argument print(...) calls behave the same on
    # Python 2 and Python 3.
    print(doc)
    for token in scores:
        print("%s %s" % (token, scores[token]))

# Global ranking, highest tf-idf first.
for item in sorted(words.items(), key=lambda x: x[1], reverse=True):
    print("%f <= %s" % (item[1], item[0]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment