Created
May 1, 2015 22:42
-
-
Save spencermountain/696fdc5e7f48035b83e4 to your computer and use it in GitHub Desktop.
sentence segmentation v0.3.9 from nlp_compromise
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Rule-based sentence boundary segmentation: chop the given text into its proper sentences.
// Ignores periods/question marks/exclamation marks used in acronyms, abbreviations, numbers, etc.
// @spencermountain 2015 MIT
/**
 * Split a text into sentences using rule-based boundary detection.
 *
 * The text is first split greedily after every ".", "?" or "!" that is
 * followed by whitespace, the end of the text, or a double quote. Chunks that
 * end in a known abbreviation ("Dr."), a single-letter acronym/initial
 * ("F.B.I.", "John F.") or an ellipsis ("..") are then glued onto the chunk
 * that follows them, because their punctuation does not end a sentence.
 *
 * @param {string} text - The text to segment.
 * @returns {string[]} The sentences, whitespace-trimmed; runs of spaces inside
 *   re-joined chunks are collapsed to one space. If no sentence was produced
 *   (e.g. for an empty string) the result is `[text]`.
 */
var sentence_parser = function(text) {
  var sentences = [];
  //first do a greedy-split..
  var chunks = text.split(/(\S.+?[.\?!])(?=\s+|$|")/g);
  //honourifics
  var abbrevs = ["jr", "mr", "mrs", "ms", "dr", "prof", "sr", "sen", "corp", "rep", "gov", "atty", "supt", "det", "rev", "col", "gen", "lt", "cmdr", "adm", "capt", "sgt", "cpl", "maj", "miss", "misses", "mister", "sir", "esq", "mstr", "phd", "adj", "adv", "asst", "bldg", "brig", "comdr", "hon", "messrs", "mlle", "mme", "op", "ord", "pvt", "reps", "res", "sens", "sfc", "surg"];
  //common abbreviations
  abbrevs = abbrevs.concat(["arc", "al", "ave", "blvd", "cl", "ct", "cres", "exp", "rd", "st", "dist", "mt", "ft", "fy", "hwy", "la", "pd", "pl", "plz", "tce", "vs", "etc", "esp", "llb", "md", "bl", "ma", "ba", "lit", "fl", "ex", "eg", "ie"]);
  //place abbrevs
  abbrevs = abbrevs.concat(["ala", "ariz", "ark", "cal", "calif", "col", "colo", "conn", "del", "fed", "fla", "ga", "ida", "ind", "ia", "kan", "kans", "ken", "ky", "la", "md", "mich", "minn", "mont", "neb", "nebr", "nev", "okla", "penna", "penn", "pa", "dak", "tenn", "tex", "ut", "vt", "va", "wash", "wis", "wisc", "wy", "wyo", "usafa", "alta", "ont", "que", "sask", "yuk", "bc"]);
  //date abbrevs
  abbrevs = abbrevs.concat(["jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "oct", "nov", "dec", "sept", "sep"]);
  //org abbrevs
  abbrevs = abbrevs.concat(["dept", "univ", "assn", "bros", "inc", "ltd", "co", "corp"]);
  //proper nouns with exclamation marks
  abbrevs = abbrevs.concat(["yahoo", "joomla", "jeopardy"]);
  //detection of non-sentence chunks
  var abbrev_reg = new RegExp("(^| )(" + abbrevs.join("|") + ")[.!?] ?$", "i");
  //regex literals are used here so that "\." stays an escaped (literal) period;
  //inside a string literal the backslash is dropped and "." would match any character.
  var acronym_reg = /[ |.][A-Z]\.?$/i;
  var elipses_reg = /\.\.\.*$/;
  //loop through these chunks, and join the non-sentence chunks back together..
  var chunks_length = chunks.length;
  for (var i = 0; i < chunks_length; i++) {
    if (!chunks[i]) {
      continue;
    }
    //trim whitespace
    var chunk = chunks[i].replace(/^\s+|\s+$/g, "");
    chunks[i] = chunk;
    //only defer to the next chunk when there IS a next chunk inside the loop's range,
    //otherwise the trailing text would be merged into a slot that is never visited and lost.
    var has_next = i + 1 < chunks_length;
    var is_partial = abbrev_reg.test(chunk) || acronym_reg.test(chunk) || elipses_reg.test(chunk);
    if (has_next && is_partial) {
      //should be combined with the next one..
      chunks[i + 1] = (chunk + " " + (chunks[i + 1] || "")).replace(/ +/g, " ");
    } else if (chunk.length > 0) {
      //this chunk is a proper sentence..
      sentences.push(chunk);
    }
  }
  //if we never got a sentence, return the given text
  if (sentences.length === 0) {
    return [text];
  }
  return sentences;
};
// When a CommonJS-style `module` object with an `exports` property is present,
// publish the parser under the name `sentences`.
if (typeof module !== "undefined") {
  if (module.exports) {
    exports.sentences = sentence_parser;
  }
}
// console.log(sentence_parser('Tony is nice. He lives in Japan.').length === 2) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment