kvtoraman · August 17, 2017 02:21
diff --git a/pydataberlin.ipynb b/pydataberlin.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Video is at https://youtu.be/tAxrlAVw-Tk\n",
    "# Recommend running in this gensim docker container as it already has FastText\n",
    "# https://github.com/RaRe-Technologies/gensim/tree/develop/docker\n",
    "\n",
    
    "# credit: Derived from great tutorial by Andraz Hribernik at Cytora. http://blog.cytora.com/insights/2016/11/30/natural-language-processing-in-10-lines-of-code-part-1",

    "import os\n",
    "import logging\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "    #run these commands to setup spacy in the docker container\n",
    "    #!pip3 install spacy\n",
    "    #!python3 -m spacy download en"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# get the text of PnP\n",
    "!wget http://www.gutenberg.org/files/1342/1342-0.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def read_file(file_name):\n",
    "    with open(file_name, 'r') as file:\n",
    "        return file.read()#.decode('utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Process `text` with Spacy NLP Parser\n",
    "#filename = 'data/pride_and_prejudice.txt'\n",
    "filename = '1342-0.txt'\n",
    "text = read_file(filename)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# pre-processing with spacy\n",
    "import spacy\n",
    "nlp = spacy.load('en')\n",
    "processed_text = nlp(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5990\n",
      "[My dear Mr. Bennet,” replied his wife, “how can you be so tiresome!]\n"
     ]
    }
   ],
   "source": [
    "# How many sentences are in the book (Pride & Prejudice)?\n",
    "sentences = [s for s in processed_text.sents]\n",
    "print(len(sentences))\n",
    "\n",
    "# Print sentences from index 10 to index 15, to make sure that we have parsed the correct book\n",
    "print(sentences[20:21])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "124592"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(processed_text.text.split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n",
      "2017-07-02 11:34:58,351 : INFO : 'pattern' package not found; tag filters are not available for English\n"
     ]
    }
   ],
   "source": [
    "import gensim\n",
    "from gensim.models import Word2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "processed_sentences = [sent.lemma_.split() for sent in processed_text.sents]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['\\ufeffthe',\n",
       " 'project',\n",
       " 'gutenberg',\n",
       " 'ebook',\n",
       " 'of',\n",
       " 'pride',\n",
       " 'and',\n",
       " 'prejudice',\n",
       " ',',\n",
       " 'by',\n",
       " 'jane',\n",
       " 'austen',\n",
       " 'this',\n",
       " 'ebook',\n",
       " 'be',\n",
       " 'for',\n",
       " 'the',\n",
       " 'use',\n",
       " 'of',\n",
       " 'anyone',\n",
       " 'anywhere',\n",
       " 'at',\n",
       " 'no',\n",
       " 'cost',\n",
       " 'and',\n",
       " 'with',\n",
       " 'almost',\n",
       " 'no',\n",
       " 'restriction',\n",
       " 'whatsoever',\n",
       " '.']"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "processed_sentences[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'2.2.0'"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import gensim, multiprocessing\n",
    "gensim.__version__\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2017-07-02 11:34:58,635 : INFO : collecting all words and their counts\n",
      "2017-07-02 11:34:58,638 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n",
      "2017-07-02 11:34:58,705 : INFO : collected 5129 word types from a corpus of 148171 raw words and 5990 sentences\n",
      "2017-07-02 11:34:58,710 : INFO : Loading a fresh vocabulary\n",
      "2017-07-02 11:34:58,729 : INFO : min_count=5 retains 1800 unique words (35% of original 5129, drops 3329)\n",
      "2017-07-02 11:34:58,732 : INFO : min_count=5 leaves 142433 word corpus (96% of original 148171, drops 5738)\n",
      "2017-07-02 11:34:58,773 : INFO : deleting the raw counts dictionary of 5129 items\n",
      "2017-07-02 11:34:58,783 : INFO : sample=0.001 downsamples 49 most-common words\n",
      "2017-07-02 11:34:58,785 : INFO : downsampling leaves estimated 82416 word corpus (57.9% of prior 142433)\n",
      "2017-07-02 11:34:58,790 : INFO : estimated required memory for 1800 words and 100 dimensions: 2340000 bytes\n",
      "2017-07-02 11:34:58,808 : INFO : resetting layer weights\n",
      "2017-07-02 11:34:58,870 : INFO : training model with 3 workers on 1800 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=2\n",
      "2017-07-02 11:34:59,915 : INFO : PROGRESS: at 65.90% examples, 269152 words/s, in_qsize 5, out_qsize 0\n",
      "2017-07-02 11:35:00,505 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
      "2017-07-02 11:35:00,518 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
      "2017-07-02 11:35:00,541 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
      "2017-07-02 11:35:00,544 : INFO : training on 740855 raw words (412154 effective words) took 1.6s, 251459 effective words/s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1800"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "interchangeable_words_model = Word2Vec(\n",
    "    sentences=processed_sentences,\n",
    "    workers=multiprocessing.cpu_count() - 1, # use your cores\n",
    "    window=2, sg=1)\n",
    "\n",
    "len(interchangeable_words_model.wv.vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2017-07-02 11:35:00,557 : INFO : precomputing L2-norms of word weight vectors\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('wickham', 0.9769910573959351)\n",
      "('bingley', 0.9764766693115234)\n",
      "('collins', 0.9623229503631592)\n",
      "('gardiner', 0.9002351760864258)\n",
      "('bennet', 0.8968710899353027)\n",
      "('forster', 0.8846153616905212)\n",
      "('hurst', 0.8742482662200928)\n",
      "('lydia', 0.8677858114242554)\n",
      "('phillips', 0.8670917749404907)\n",
      "('lucas', 0.8582533597946167)\n"
     ]
    }
   ],
   "source": [
    "for w,sim in interchangeable_words_model.most_similar(u'darcy'):\n",
    "    print((w, sim))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('bingley', 0.9838923215866089)\n",
      "('collins', 0.9821895360946655)\n",
      "('darcy', 0.9769909977912903)\n",
      "('gardiner', 0.9459742903709412)\n",
      "('hurst', 0.9369016885757446)\n",
      "('phillips', 0.9289063215255737)\n",
      "('forster', 0.9270267486572266)\n",
      "('lydia', 0.9262116551399231)\n",
      "('bennet', 0.9245997071266174)\n",
      "('lucas', 0.9152452945709229)\n"
     ]
    }
   ],
   "source": [
    "for w,sim in interchangeable_words_model.most_similar(u'wickham'):\n",
    "    print((w, sim))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2017-07-02 11:35:00,678 : INFO : collecting all words and their counts\n",
      "2017-07-02 11:35:00,681 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n",
      "2017-07-02 11:35:00,813 : INFO : collected 5129 word types from a corpus of 148171 raw words and 5990 sentences\n",
      "2017-07-02 11:35:00,815 : INFO : Loading a fresh vocabulary\n",
      "2017-07-02 11:35:00,832 : INFO : min_count=5 retains 1800 unique words (35% of original 5129, drops 3329)\n",
      "2017-07-02 11:35:00,836 : INFO : min_count=5 leaves 142433 word corpus (96% of original 148171, drops 5738)\n",
      "2017-07-02 11:35:00,853 : INFO : deleting the raw counts dictionary of 5129 items\n",
      "2017-07-02 11:35:00,855 : INFO : sample=0.001 downsamples 49 most-common words\n",
      "2017-07-02 11:35:00,857 : INFO : downsampling leaves estimated 82416 word corpus (57.9% of prior 142433)\n",
      "2017-07-02 11:35:00,859 : INFO : estimated required memory for 1800 words and 100 dimensions: 2340000 bytes\n",
      "2017-07-02 11:35:00,877 : INFO : resetting layer weights\n",
      "2017-07-02 11:35:00,947 : INFO : training model with 3 workers on 1800 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=50\n",
      "2017-07-02 11:35:02,009 : INFO : PROGRESS: at 12.85% examples, 53190 words/s, in_qsize 5, out_qsize 0\n",
      "2017-07-02 11:35:03,031 : INFO : PROGRESS: at 29.36% examples, 59053 words/s, in_qsize 5, out_qsize 0\n",
      "2017-07-02 11:35:04,058 : INFO : PROGRESS: at 56.23% examples, 75235 words/s, in_qsize 5, out_qsize 0\n",
      "2017-07-02 11:35:05,069 : INFO : PROGRESS: at 83.68% examples, 83716 words/s, in_qsize 5, out_qsize 0\n",
      "2017-07-02 11:35:05,642 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
      "2017-07-02 11:35:05,656 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
      "2017-07-02 11:35:05,680 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
      "2017-07-02 11:35:05,681 : INFO : training on 740855 raw words (411902 effective words) took 4.7s, 87291 effective words/s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "1800"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "attributes_of_model = Word2Vec(\n",
    "    sentences=processed_sentences,\n",
    "    workers=multiprocessing.cpu_count() - 1, # use your cores\n",
    "    window=50, sg=1)\n",
    "\n",
    "len(attributes_of_model.wv.vocab)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2017-07-02 11:35:05,704 : INFO : precomputing L2-norms of word weight vectors\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('astonishment', 0.7948324084281921)\n",
      "('bingley', 0.7947502732276917)\n",
      "('moment', 0.7796972990036011)\n",
      "('account', 0.7642396688461304)\n",
      "('hear', 0.762365996837616)\n",
      "('surprised', 0.757840633392334)\n",
      "('acquaint', 0.747281551361084)\n",
      "('smile', 0.7398712635040283)\n",
      "('thought', 0.7388651371002197)\n",
      "('silent', 0.7383260726928711)\n"
     ]
    }
   ],
   "source": [
    "for w, sim in attributes_of_model.most_similar(u'darcy'):\n",
    "    print((w, sim))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('george', 0.7982236742973328)\n",
      "('everything', 0.7934660315513611)\n",
      "('history', 0.786710798740387)\n",
      "('friend', 0.7766832113265991)\n",
      "('perfectly', 0.7764248847961426)\n",
      "('friendship', 0.7637182474136353)\n",
      "('acquaint', 0.7593115568161011)\n",
      "('exactly', 0.7584044933319092)\n",
      "('relation', 0.7576615810394287)\n",
      "('conceal', 0.7575635313987732)\n"
     ]
    }
   ],
   "source": [
    "for w, sim in attributes_of_model.most_similar(u'wickham'):\n",
    "    print((w, sim))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2017-07-02 11:57:59,650 : INFO : loading 1800 words for fastText model from /tmp/ft_model.bin\n",
      "2017-07-02 11:58:04,928 : INFO : loading weights for 1800 words for fastText model from /tmp/ft_model.bin\n",
      "2017-07-02 11:58:05,115 : INFO : loaded (1800, 100) weight matrix for fastText model from /tmp/ft_model.bin\n",
      "2017-07-02 11:59:12,084 : INFO : loading 1800 words for fastText model from /tmp/ft_model.bin\n",
      "2017-07-02 11:59:15,940 : INFO : loading weights for 1800 words for fastText model from /tmp/ft_model.bin\n",
      "2017-07-02 11:59:16,162 : INFO : loaded (1800, 100) weight matrix for fastText model from /tmp/ft_model.bin\n"
     ]
    }
   ],
   "source": [
    "# fasttext\n",
    "from gensim.models.wrappers import FastText\n",
    "ft_home = '/gensim/gensim_dependencies/fastText'\n",
    "ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None\n",
    "\n",
    "processed_file = 'processed.txt'\n",
    "with open(processed_file, 'w') as f:\n",
    "    f.write(\" \".join(\" \".join(s) for s in processed_sentences))\n",
    "    \n",
    "\n",
    "ft_interchangeable = FastText.train(ft_path, processed_file, window=2)\n",
    "ft_attributes = FastText.train(ft_path, processed_file, window=50)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2017-07-02 11:59:16,258 : INFO : precomputing L2-norms of word weight vectors\n",
      "2017-07-02 11:59:16,265 : INFO : precomputing L2-norms of ngram weight vectors\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('bennet', 0.9956306219100952)\n",
      "('bennets', 0.9956260919570923)\n",
      "('bingley', 0.9909453988075256)\n",
      "('collins', 0.9804142713546753)\n",
      "('gardiners', 0.9606761336326599)\n",
      "('collinses', 0.9604480266571045)\n",
      "('gardiner', 0.9554274082183838)\n",
      "('wickham', 0.9443435072898865)\n",
      "('phillips', 0.9129102826118469)\n",
      "('hurst', 0.909385085105896)\n"
     ]
    }
   ],
   "source": [
    "for w,sim in ft_interchangeable.most_similar(u'darcy'):\n",
    "    print((w, sim))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2017-07-02 11:59:16,562 : INFO : precomputing L2-norms of word weight vectors\n",
      "2017-07-02 11:59:16,578 : INFO : precomputing L2-norms of ngram weight vectors\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('have', 0.9981114268302917)\n",
      "('every', 0.9965208768844604)\n",
      "('dare', 0.9959427118301392)\n",
      "('never', 0.9949673414230347)\n",
      "('save', 0.9941790103912354)\n",
      "('hear', 0.9937617778778076)\n",
      "('everything', 0.992755651473999)\n",
      "('something', 0.9925262928009033)\n",
      "('here', 0.9919098615646362)\n",
      "('sure', 0.9904596209526062)\n"
     ]
    }
   ],
   "source": [
    "for w,sim in ft_attributes.most_similar(u'darcy'):\n",
    "    print((w, sim))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# singular and plural similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"night\" in ft_interchangeable.wv.vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"nights\" in ft_interchangeable.wv.vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.99996042559761056"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ft_interchangeable.similarity(\"nights\", \"night\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# side effect is that knight and night are similar. \n",
    "# Because it has never seen knight in a real text, \n",
    "# so assumes they are related just because they share word chunks\n",
    "\"knight\" in ft_interchangeable.wv.vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.99972795156314342"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ft_interchangeable.similarity(\"knight\", \"night\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# wordrank\n",
    "# TODO: install MPI package in docker to enable wordrank\n",
    "from gensim.models.wrappers import Wordrank\n",
    "wr_home = os.environ.get('WR_HOME', None)\n",
    "out_dir = 'wr_model'\n",
    "wordrank_interchangeable = Wordrank.train(wr_home, filename, out_dir, window=2)\n",
    "wordrank_attributes = Wordrank.train(wr_home, filename, out_dir, window=50)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Video is at https://youtu.be/tAxrlAVw-Tk\n",
	"# Recommend running in this gensim docker container as it already has FastText\n",
	"# https://github.com/RaRe-Technologies/gensim/tree/develop/docker\n",
	"\n",

	"# credit: Derived from great tutorial by Andraz Hribernik at Cytora. http://blog.cytora.com/insights/2016/11/30/natural-language-processing-in-10-lines-of-code-part-1",

	"import os\n",
	"import logging\n",
	"logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 58,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	" #run these commands to setup spacy in the docker container\n",
	" #!pip3 install spacy\n",
	" #!python3 -m spacy download en"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# get the text of PnP\n",
	"!wget http://www.gutenberg.org/files/1342/1342-0.txt"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def read_file(file_name):\n",
	" with open(file_name, 'r') as file:\n",
	" return file.read()#.decode('utf-8')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Process `text` with Spacy NLP Parser\n",
	"#filename = 'data/pride_and_prejudice.txt'\n",
	"filename = '1342-0.txt'\n",
	"text = read_file(filename)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 59,
	"metadata": {},
	"outputs": [],
	"source": [
	"# pre-processing with spacy\n",
	"import spacy\n",
	"nlp = spacy.load('en')\n",
	"processed_text = nlp(text)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"5990\n",
	"[My dear Mr. Bennet,” replied his wife, “how can you be so tiresome!]\n"
	]
	}
	],
	"source": [
	"# How many sentences are in the book (Pride & Prejudice)?\n",
	"sentences = [s for s in processed_text.sents]\n",
	"print(len(sentences))\n",
	"\n",
	"# Print sentences from index 10 to index 15, to make sure that we have parsed the correct book\n",
	"print(sentences[20:21])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 10,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"124592"
	]
	},
	"execution_count": 10,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(processed_text.text.split())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"Using TensorFlow backend.\n",
	"2017-07-02 11:34:58,351 : INFO : 'pattern' package not found; tag filters are not available for English\n"
	]
	}
	],
	"source": [
	"import gensim\n",
	"from gensim.models import Word2Vec"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 12,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"processed_sentences = [sent.lemma_.split() for sent in processed_text.sents]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['\\ufeffthe',\n",
	" 'project',\n",
	" 'gutenberg',\n",
	" 'ebook',\n",
	" 'of',\n",
	" 'pride',\n",
	" 'and',\n",
	" 'prejudice',\n",
	" ',',\n",
	" 'by',\n",
	" 'jane',\n",
	" 'austen',\n",
	" 'this',\n",
	" 'ebook',\n",
	" 'be',\n",
	" 'for',\n",
	" 'the',\n",
	" 'use',\n",
	" 'of',\n",
	" 'anyone',\n",
	" 'anywhere',\n",
	" 'at',\n",
	" 'no',\n",
	" 'cost',\n",
	" 'and',\n",
	" 'with',\n",
	" 'almost',\n",
	" 'no',\n",
	" 'restriction',\n",
	" 'whatsoever',\n",
	" '.']"
	]
	},
	"execution_count": 30,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"processed_sentences[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 13,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"'2.2.0'"
	]
	},
	"execution_count": 13,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"import gensim, multiprocessing\n",
	"gensim.__version__\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 14,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"2017-07-02 11:34:58,635 : INFO : collecting all words and their counts\n",
	"2017-07-02 11:34:58,638 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n",
	"2017-07-02 11:34:58,705 : INFO : collected 5129 word types from a corpus of 148171 raw words and 5990 sentences\n",
	"2017-07-02 11:34:58,710 : INFO : Loading a fresh vocabulary\n",
	"2017-07-02 11:34:58,729 : INFO : min_count=5 retains 1800 unique words (35% of original 5129, drops 3329)\n",
	"2017-07-02 11:34:58,732 : INFO : min_count=5 leaves 142433 word corpus (96% of original 148171, drops 5738)\n",
	"2017-07-02 11:34:58,773 : INFO : deleting the raw counts dictionary of 5129 items\n",
	"2017-07-02 11:34:58,783 : INFO : sample=0.001 downsamples 49 most-common words\n",
	"2017-07-02 11:34:58,785 : INFO : downsampling leaves estimated 82416 word corpus (57.9% of prior 142433)\n",
	"2017-07-02 11:34:58,790 : INFO : estimated required memory for 1800 words and 100 dimensions: 2340000 bytes\n",
	"2017-07-02 11:34:58,808 : INFO : resetting layer weights\n",
	"2017-07-02 11:34:58,870 : INFO : training model with 3 workers on 1800 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=2\n",
	"2017-07-02 11:34:59,915 : INFO : PROGRESS: at 65.90% examples, 269152 words/s, in_qsize 5, out_qsize 0\n",
	"2017-07-02 11:35:00,505 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
	"2017-07-02 11:35:00,518 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
	"2017-07-02 11:35:00,541 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
	"2017-07-02 11:35:00,544 : INFO : training on 740855 raw words (412154 effective words) took 1.6s, 251459 effective words/s\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"1800"
	]
	},
	"execution_count": 14,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"interchangeable_words_model = Word2Vec(\n",
	" sentences=processed_sentences,\n",
	" workers=multiprocessing.cpu_count() - 1, # use your cores\n",
	" window=2, sg=1)\n",
	"\n",
	"len(interchangeable_words_model.wv.vocab)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 15,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"2017-07-02 11:35:00,557 : INFO : precomputing L2-norms of word weight vectors\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"('wickham', 0.9769910573959351)\n",
	"('bingley', 0.9764766693115234)\n",
	"('collins', 0.9623229503631592)\n",
	"('gardiner', 0.9002351760864258)\n",
	"('bennet', 0.8968710899353027)\n",
	"('forster', 0.8846153616905212)\n",
	"('hurst', 0.8742482662200928)\n",
	"('lydia', 0.8677858114242554)\n",
	"('phillips', 0.8670917749404907)\n",
	"('lucas', 0.8582533597946167)\n"
	]
	}
	],
	"source": [
	"for w,sim in interchangeable_words_model.most_similar(u'darcy'):\n",
	" print((w, sim))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 16,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"('bingley', 0.9838923215866089)\n",
	"('collins', 0.9821895360946655)\n",
	"('darcy', 0.9769909977912903)\n",
	"('gardiner', 0.9459742903709412)\n",
	"('hurst', 0.9369016885757446)\n",
	"('phillips', 0.9289063215255737)\n",
	"('forster', 0.9270267486572266)\n",
	"('lydia', 0.9262116551399231)\n",
	"('bennet', 0.9245997071266174)\n",
	"('lucas', 0.9152452945709229)\n"
	]
	}
	],
	"source": [
	"for w,sim in interchangeable_words_model.most_similar(u'wickham'):\n",
	" print((w, sim))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 17,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"2017-07-02 11:35:00,678 : INFO : collecting all words and their counts\n",
	"2017-07-02 11:35:00,681 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n",
	"2017-07-02 11:35:00,813 : INFO : collected 5129 word types from a corpus of 148171 raw words and 5990 sentences\n",
	"2017-07-02 11:35:00,815 : INFO : Loading a fresh vocabulary\n",
	"2017-07-02 11:35:00,832 : INFO : min_count=5 retains 1800 unique words (35% of original 5129, drops 3329)\n",
	"2017-07-02 11:35:00,836 : INFO : min_count=5 leaves 142433 word corpus (96% of original 148171, drops 5738)\n",
	"2017-07-02 11:35:00,853 : INFO : deleting the raw counts dictionary of 5129 items\n",
	"2017-07-02 11:35:00,855 : INFO : sample=0.001 downsamples 49 most-common words\n",
	"2017-07-02 11:35:00,857 : INFO : downsampling leaves estimated 82416 word corpus (57.9% of prior 142433)\n",
	"2017-07-02 11:35:00,859 : INFO : estimated required memory for 1800 words and 100 dimensions: 2340000 bytes\n",
	"2017-07-02 11:35:00,877 : INFO : resetting layer weights\n",
	"2017-07-02 11:35:00,947 : INFO : training model with 3 workers on 1800 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=50\n",
	"2017-07-02 11:35:02,009 : INFO : PROGRESS: at 12.85% examples, 53190 words/s, in_qsize 5, out_qsize 0\n",
	"2017-07-02 11:35:03,031 : INFO : PROGRESS: at 29.36% examples, 59053 words/s, in_qsize 5, out_qsize 0\n",
	"2017-07-02 11:35:04,058 : INFO : PROGRESS: at 56.23% examples, 75235 words/s, in_qsize 5, out_qsize 0\n",
	"2017-07-02 11:35:05,069 : INFO : PROGRESS: at 83.68% examples, 83716 words/s, in_qsize 5, out_qsize 0\n",
	"2017-07-02 11:35:05,642 : INFO : worker thread finished; awaiting finish of 2 more threads\n",
	"2017-07-02 11:35:05,656 : INFO : worker thread finished; awaiting finish of 1 more threads\n",
	"2017-07-02 11:35:05,680 : INFO : worker thread finished; awaiting finish of 0 more threads\n",
	"2017-07-02 11:35:05,681 : INFO : training on 740855 raw words (411902 effective words) took 4.7s, 87291 effective words/s\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"1800"
	]
	},
	"execution_count": 17,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"attributes_of_model = Word2Vec(\n",
	" sentences=processed_sentences,\n",
	" workers=multiprocessing.cpu_count() - 1, # use your cores\n",
	" window=50, sg=1)\n",
	"\n",
	"len(attributes_of_model.wv.vocab)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 18,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"2017-07-02 11:35:05,704 : INFO : precomputing L2-norms of word weight vectors\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"('astonishment', 0.7948324084281921)\n",
	"('bingley', 0.7947502732276917)\n",
	"('moment', 0.7796972990036011)\n",
	"('account', 0.7642396688461304)\n",
	"('hear', 0.762365996837616)\n",
	"('surprised', 0.757840633392334)\n",
	"('acquaint', 0.747281551361084)\n",
	"('smile', 0.7398712635040283)\n",
	"('thought', 0.7388651371002197)\n",
	"('silent', 0.7383260726928711)\n"
	]
	}
	],
	"source": [
	"for w, sim in attributes_of_model.most_similar(u'darcy'):\n",
	" print((w, sim))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 19,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"('george', 0.7982236742973328)\n",
	"('everything', 0.7934660315513611)\n",
	"('history', 0.786710798740387)\n",
	"('friend', 0.7766832113265991)\n",
	"('perfectly', 0.7764248847961426)\n",
	"('friendship', 0.7637182474136353)\n",
	"('acquaint', 0.7593115568161011)\n",
	"('exactly', 0.7584044933319092)\n",
	"('relation', 0.7576615810394287)\n",
	"('conceal', 0.7575635313987732)\n"
	]
	}
	],
	"source": [
	"for w, sim in attributes_of_model.most_similar(u'wickham'):\n",
	" print((w, sim))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 38,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"2017-07-02 11:57:59,650 : INFO : loading 1800 words for fastText model from /tmp/ft_model.bin\n",
	"2017-07-02 11:58:04,928 : INFO : loading weights for 1800 words for fastText model from /tmp/ft_model.bin\n",
	"2017-07-02 11:58:05,115 : INFO : loaded (1800, 100) weight matrix for fastText model from /tmp/ft_model.bin\n",
	"2017-07-02 11:59:12,084 : INFO : loading 1800 words for fastText model from /tmp/ft_model.bin\n",
	"2017-07-02 11:59:15,940 : INFO : loading weights for 1800 words for fastText model from /tmp/ft_model.bin\n",
	"2017-07-02 11:59:16,162 : INFO : loaded (1800, 100) weight matrix for fastText model from /tmp/ft_model.bin\n"
	]
	}
	],
	"source": [
	"# fasttext\n",
	"from gensim.models.wrappers import FastText\n",
	"ft_home = '/gensim/gensim_dependencies/fastText'\n",
	"ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None\n",
	"\n",
	"processed_file = 'processed.txt'\n",
	"with open(processed_file, 'w') as f:\n",
	" f.write(\" \".join(\" \".join(s) for s in processed_sentences))\n",
	" \n",
	"\n",
	"ft_interchangeable = FastText.train(ft_path, processed_file, window=2)\n",
	"ft_attributes = FastText.train(ft_path, processed_file, window=50)\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 39,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"2017-07-02 11:59:16,258 : INFO : precomputing L2-norms of word weight vectors\n",
	"2017-07-02 11:59:16,265 : INFO : precomputing L2-norms of ngram weight vectors\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"('bennet', 0.9956306219100952)\n",
	"('bennets', 0.9956260919570923)\n",
	"('bingley', 0.9909453988075256)\n",
	"('collins', 0.9804142713546753)\n",
	"('gardiners', 0.9606761336326599)\n",
	"('collinses', 0.9604480266571045)\n",
	"('gardiner', 0.9554274082183838)\n",
	"('wickham', 0.9443435072898865)\n",
	"('phillips', 0.9129102826118469)\n",
	"('hurst', 0.909385085105896)\n"
	]
	}
	],
	"source": [
	"for w,sim in ft_interchangeable.most_similar(u'darcy'):\n",
	" print((w, sim))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 40,
	"metadata": {},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"2017-07-02 11:59:16,562 : INFO : precomputing L2-norms of word weight vectors\n",
	"2017-07-02 11:59:16,578 : INFO : precomputing L2-norms of ngram weight vectors\n"
	]
	},
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"('have', 0.9981114268302917)\n",
	"('every', 0.9965208768844604)\n",
	"('dare', 0.9959427118301392)\n",
	"('never', 0.9949673414230347)\n",
	"('save', 0.9941790103912354)\n",
	"('hear', 0.9937617778778076)\n",
	"('everything', 0.992755651473999)\n",
	"('something', 0.9925262928009033)\n",
	"('here', 0.9919098615646362)\n",
	"('sure', 0.9904596209526062)\n"
	]
	}
	],
	"source": [
	"for w,sim in ft_attributes.most_similar(u'darcy'):\n",
	" print((w, sim))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 48,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# singular and plural similarity"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 45,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"True"
	]
	},
	"execution_count": 45,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"\"night\" in ft_interchangeable.wv.vocab"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 46,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"False"
	]
	},
	"execution_count": 46,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"\"nights\" in ft_interchangeable.wv.vocab"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 47,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.99996042559761056"
	]
	},
	"execution_count": 47,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"ft_interchangeable.similarity(\"nights\", \"night\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 51,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"False"
	]
	},
	"execution_count": 51,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# side effect is that knight and night are similar. \n",
	"# Because it has never seen knight in a real text, \n",
	"# so assumes they are related just because they share word chunks\n",
	"\"knight\" in ft_interchangeable.wv.vocab"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 52,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.99972795156314342"
	]
	},
	"execution_count": 52,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"ft_interchangeable.similarity(\"knight\", \"night\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# wordrank\n",
	"# TODO: install MPI package in docker to enable wordrank\n",
	"from gensim.models.wrappers import Wordrank\n",
	"wr_home = os.environ.get('WR_HOME', None)\n",
	"out_dir = 'wr_model'\n",
	"wordrank_interchangeable = Wordrank.train(wr_home, filename, out_dir, window=2)\n",
	"wordrank_attributes = Wordrank.train(wr_home, filename, out_dir, window=50)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}