Skip to content

Instantly share code, notes, and snippets.

@kvtoraman
Forked from tmylk/pydataberlin.ipynb
Created August 17, 2017 02:21
Show Gist options
  • Save kvtoraman/c2069802358c307dd632b704b169d3cc to your computer and use it in GitHub Desktop.
Save kvtoraman/c2069802358c307dd632b704b169d3cc to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Video is at https://youtu.be/tAxrlAVw-Tk\n",
"# Recommend running in this gensim docker container as it already has FastText\n",
"# https://github.com/RaRe-Technologies/gensim/tree/develop/docker\n",
"\n",
"# credit: Derived from great tutorial by Andraz Hribernik at Cytora. http://blog.cytora.com/insights/2016/11/30/natural-language-processing-in-10-lines-of-code-part-1",
"import os\n",
"import logging\n",
"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
" #run these commands to setup spacy in the docker container\n",
" #!pip3 install spacy\n",
" #!python3 -m spacy download en"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# get the text of PnP\n",
"!wget http://www.gutenberg.org/files/1342/1342-0.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def read_file(file_name):\n",
" with open(file_name, 'r') as file:\n",
" return file.read()#.decode('utf-8')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Process `text` with Spacy NLP Parser\n",
"#filename = 'data/pride_and_prejudice.txt'\n",
"filename = '1342-0.txt'\n",
"text = read_file(filename)\n"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"# pre-processing with spacy\n",
"import spacy\n",
"nlp = spacy.load('en')\n",
"processed_text = nlp(text)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5990\n",
"[My dear Mr. Bennet,” replied his wife, “how can you be so tiresome!]\n"
]
}
],
"source": [
"# How many sentences are in the book (Pride & Prejudice)?\n",
"sentences = [s for s in processed_text.sents]\n",
"print(len(sentences))\n",
"\n",
"# Print sentences from index 10 to index 15, to make sure that we have parsed the correct book\n",
"print(sentences[20:21])"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"124592"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(processed_text.text.split())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n",
"2017-07-02 11:34:58,351 : INFO : 'pattern' package not found; tag filters are not available for English\n"
]
}
],
"source": [
"import gensim\n",
"from gensim.models import Word2Vec"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"processed_sentences = [sent.lemma_.split() for sent in processed_text.sents]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['\\ufeffthe',\n",
" 'project',\n",
" 'gutenberg',\n",
" 'ebook',\n",
" 'of',\n",
" 'pride',\n",
" 'and',\n",
" 'prejudice',\n",
" ',',\n",
" 'by',\n",
" 'jane',\n",
" 'austen',\n",
" 'this',\n",
" 'ebook',\n",
" 'be',\n",
" 'for',\n",
" 'the',\n",
" 'use',\n",
" 'of',\n",
" 'anyone',\n",
" 'anywhere',\n",
" 'at',\n",
" 'no',\n",
" 'cost',\n",
" 'and',\n",
" 'with',\n",
" 'almost',\n",
" 'no',\n",
" 'restriction',\n",
" 'whatsoever',\n",
" '.']"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"processed_sentences[0]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2.2.0'"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import gensim, multiprocessing\n",
"gensim.__version__\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-02 11:34:58,635 : INFO : collecting all words and their counts\n",
"2017-07-02 11:34:58,638 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n",
"2017-07-02 11:34:58,705 : INFO : collected 5129 word types from a corpus of 148171 raw words and 5990 sentences\n",
"2017-07-02 11:34:58,710 : INFO : Loading a fresh vocabulary\n",
"2017-07-02 11:34:58,729 : INFO : min_count=5 retains 1800 unique words (35% of original 5129, drops 3329)\n",
"2017-07-02 11:34:58,732 : INFO : min_count=5 leaves 142433 word corpus (96% of original 148171, drops 5738)\n",
"2017-07-02 11:34:58,773 : INFO : deleting the raw counts dictionary of 5129 items\n",
"2017-07-02 11:34:58,783 : INFO : sample=0.001 downsamples 49 most-common words\n",
"2017-07-02 11:34:58,785 : INFO : downsampling leaves estimated 82416 word corpus (57.9% of prior 142433)\n",
"2017-07-02 11:34:58,790 : INFO : estimated required memory for 1800 words and 100 dimensions: 2340000 bytes\n",
"2017-07-02 11:34:58,808 : INFO : resetting layer weights\n",
"2017-07-02 11:34:58,870 : INFO : training model with 3 workers on 1800 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=2\n",
"2017-07-02 11:34:59,915 : INFO : PROGRESS: at 65.90% examples, 269152 words/s, in_qsize 5, out_qsize 0\n",
"2017-07-02 11:35:00,505 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
"2017-07-02 11:35:00,518 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2017-07-02 11:35:00,541 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2017-07-02 11:35:00,544 : INFO : training on 740855 raw words (412154 effective words) took 1.6s, 251459 effective words/s\n"
]
},
{
"data": {
"text/plain": [
"1800"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"interchangeable_words_model = Word2Vec(\n",
" sentences=processed_sentences,\n",
" workers=multiprocessing.cpu_count() - 1, # use your cores\n",
" window=2, sg=1)\n",
"\n",
"len(interchangeable_words_model.wv.vocab)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-02 11:35:00,557 : INFO : precomputing L2-norms of word weight vectors\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"('wickham', 0.9769910573959351)\n",
"('bingley', 0.9764766693115234)\n",
"('collins', 0.9623229503631592)\n",
"('gardiner', 0.9002351760864258)\n",
"('bennet', 0.8968710899353027)\n",
"('forster', 0.8846153616905212)\n",
"('hurst', 0.8742482662200928)\n",
"('lydia', 0.8677858114242554)\n",
"('phillips', 0.8670917749404907)\n",
"('lucas', 0.8582533597946167)\n"
]
}
],
"source": [
"for w,sim in interchangeable_words_model.most_similar(u'darcy'):\n",
" print((w, sim))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('bingley', 0.9838923215866089)\n",
"('collins', 0.9821895360946655)\n",
"('darcy', 0.9769909977912903)\n",
"('gardiner', 0.9459742903709412)\n",
"('hurst', 0.9369016885757446)\n",
"('phillips', 0.9289063215255737)\n",
"('forster', 0.9270267486572266)\n",
"('lydia', 0.9262116551399231)\n",
"('bennet', 0.9245997071266174)\n",
"('lucas', 0.9152452945709229)\n"
]
}
],
"source": [
"for w,sim in interchangeable_words_model.most_similar(u'wickham'):\n",
" print((w, sim))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-02 11:35:00,678 : INFO : collecting all words and their counts\n",
"2017-07-02 11:35:00,681 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n",
"2017-07-02 11:35:00,813 : INFO : collected 5129 word types from a corpus of 148171 raw words and 5990 sentences\n",
"2017-07-02 11:35:00,815 : INFO : Loading a fresh vocabulary\n",
"2017-07-02 11:35:00,832 : INFO : min_count=5 retains 1800 unique words (35% of original 5129, drops 3329)\n",
"2017-07-02 11:35:00,836 : INFO : min_count=5 leaves 142433 word corpus (96% of original 148171, drops 5738)\n",
"2017-07-02 11:35:00,853 : INFO : deleting the raw counts dictionary of 5129 items\n",
"2017-07-02 11:35:00,855 : INFO : sample=0.001 downsamples 49 most-common words\n",
"2017-07-02 11:35:00,857 : INFO : downsampling leaves estimated 82416 word corpus (57.9% of prior 142433)\n",
"2017-07-02 11:35:00,859 : INFO : estimated required memory for 1800 words and 100 dimensions: 2340000 bytes\n",
"2017-07-02 11:35:00,877 : INFO : resetting layer weights\n",
"2017-07-02 11:35:00,947 : INFO : training model with 3 workers on 1800 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=50\n",
"2017-07-02 11:35:02,009 : INFO : PROGRESS: at 12.85% examples, 53190 words/s, in_qsize 5, out_qsize 0\n",
"2017-07-02 11:35:03,031 : INFO : PROGRESS: at 29.36% examples, 59053 words/s, in_qsize 5, out_qsize 0\n",
"2017-07-02 11:35:04,058 : INFO : PROGRESS: at 56.23% examples, 75235 words/s, in_qsize 5, out_qsize 0\n",
"2017-07-02 11:35:05,069 : INFO : PROGRESS: at 83.68% examples, 83716 words/s, in_qsize 5, out_qsize 0\n",
"2017-07-02 11:35:05,642 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
"2017-07-02 11:35:05,656 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
"2017-07-02 11:35:05,680 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
"2017-07-02 11:35:05,681 : INFO : training on 740855 raw words (411902 effective words) took 4.7s, 87291 effective words/s\n"
]
},
{
"data": {
"text/plain": [
"1800"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"attributes_of_model = Word2Vec(\n",
" sentences=processed_sentences,\n",
" workers=multiprocessing.cpu_count() - 1, # use your cores\n",
" window=50, sg=1)\n",
"\n",
"len(attributes_of_model.wv.vocab)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-02 11:35:05,704 : INFO : precomputing L2-norms of word weight vectors\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"('astonishment', 0.7948324084281921)\n",
"('bingley', 0.7947502732276917)\n",
"('moment', 0.7796972990036011)\n",
"('account', 0.7642396688461304)\n",
"('hear', 0.762365996837616)\n",
"('surprised', 0.757840633392334)\n",
"('acquaint', 0.747281551361084)\n",
"('smile', 0.7398712635040283)\n",
"('thought', 0.7388651371002197)\n",
"('silent', 0.7383260726928711)\n"
]
}
],
"source": [
"for w, sim in attributes_of_model.most_similar(u'darcy'):\n",
" print((w, sim))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"('george', 0.7982236742973328)\n",
"('everything', 0.7934660315513611)\n",
"('history', 0.786710798740387)\n",
"('friend', 0.7766832113265991)\n",
"('perfectly', 0.7764248847961426)\n",
"('friendship', 0.7637182474136353)\n",
"('acquaint', 0.7593115568161011)\n",
"('exactly', 0.7584044933319092)\n",
"('relation', 0.7576615810394287)\n",
"('conceal', 0.7575635313987732)\n"
]
}
],
"source": [
"for w, sim in attributes_of_model.most_similar(u'wickham'):\n",
" print((w, sim))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-02 11:57:59,650 : INFO : loading 1800 words for fastText model from /tmp/ft_model.bin\n",
"2017-07-02 11:58:04,928 : INFO : loading weights for 1800 words for fastText model from /tmp/ft_model.bin\n",
"2017-07-02 11:58:05,115 : INFO : loaded (1800, 100) weight matrix for fastText model from /tmp/ft_model.bin\n",
"2017-07-02 11:59:12,084 : INFO : loading 1800 words for fastText model from /tmp/ft_model.bin\n",
"2017-07-02 11:59:15,940 : INFO : loading weights for 1800 words for fastText model from /tmp/ft_model.bin\n",
"2017-07-02 11:59:16,162 : INFO : loaded (1800, 100) weight matrix for fastText model from /tmp/ft_model.bin\n"
]
}
],
"source": [
"# fasttext\n",
"from gensim.models.wrappers import FastText\n",
"ft_home = '/gensim/gensim_dependencies/fastText'\n",
"ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None\n",
"\n",
"processed_file = 'processed.txt'\n",
"with open(processed_file, 'w') as f:\n",
" f.write(\" \".join(\" \".join(s) for s in processed_sentences))\n",
" \n",
"\n",
"ft_interchangeable = FastText.train(ft_path, processed_file, window=2)\n",
"ft_attributes = FastText.train(ft_path, processed_file, window=50)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-02 11:59:16,258 : INFO : precomputing L2-norms of word weight vectors\n",
"2017-07-02 11:59:16,265 : INFO : precomputing L2-norms of ngram weight vectors\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"('bennet', 0.9956306219100952)\n",
"('bennets', 0.9956260919570923)\n",
"('bingley', 0.9909453988075256)\n",
"('collins', 0.9804142713546753)\n",
"('gardiners', 0.9606761336326599)\n",
"('collinses', 0.9604480266571045)\n",
"('gardiner', 0.9554274082183838)\n",
"('wickham', 0.9443435072898865)\n",
"('phillips', 0.9129102826118469)\n",
"('hurst', 0.909385085105896)\n"
]
}
],
"source": [
"for w,sim in ft_interchangeable.most_similar(u'darcy'):\n",
" print((w, sim))"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2017-07-02 11:59:16,562 : INFO : precomputing L2-norms of word weight vectors\n",
"2017-07-02 11:59:16,578 : INFO : precomputing L2-norms of ngram weight vectors\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"('have', 0.9981114268302917)\n",
"('every', 0.9965208768844604)\n",
"('dare', 0.9959427118301392)\n",
"('never', 0.9949673414230347)\n",
"('save', 0.9941790103912354)\n",
"('hear', 0.9937617778778076)\n",
"('everything', 0.992755651473999)\n",
"('something', 0.9925262928009033)\n",
"('here', 0.9919098615646362)\n",
"('sure', 0.9904596209526062)\n"
]
}
],
"source": [
"for w,sim in ft_attributes.most_similar(u'darcy'):\n",
" print((w, sim))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# singular and plural similarity"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"night\" in ft_interchangeable.wv.vocab"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\"nights\" in ft_interchangeable.wv.vocab"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.99996042559761056"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ft_interchangeable.similarity(\"nights\", \"night\")"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# side effect is that knight and night are similar. \n",
"# Because it has never seen knight in a real text, \n",
"# so assumes they are related just because they share word chunks\n",
"\"knight\" in ft_interchangeable.wv.vocab"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.99972795156314342"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ft_interchangeable.similarity(\"knight\", \"night\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# wordrank\n",
"# TODO: install MPI package in docker to enable wordrank\n",
"from gensim.models.wrappers import Wordrank\n",
"wr_home = os.environ.get('WR_HOME', None)\n",
"out_dir = 'wr_model'\n",
"wordrank_interchangeable = Wordrank.train(wr_home, filename, out_dir, window=2)\n",
"wordrank_attributes = Wordrank.train(wr_home, filename, out_dir, window=50)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment