Skip to content

Instantly share code, notes, and snippets.

@djvanderlaan
Last active October 14, 2015 09:21
Show Gist options
  • Save djvanderlaan/9bd1ff9830c3a2326b8f to your computer and use it in GitHub Desktop.
Save djvanderlaan/9bd1ff9830c3a2326b8f to your computer and use it in GitHub Desktop.
Jaro similarity score C++ implementation with R-interface
#include <cstring>
#include <cstdlib>
#include <R.h>
#include <Rdefines.h>
double jaro(const char* a, const char* b, double range = 0.5, double match_weight = 1.0,
double trans_weight = 1.0) {
unsigned int la = strlen(a);
unsigned int lb = strlen(b);
// create a buffers; these store for each character in a and b which have
// been matched to characters in the other string
char* match_a = (char*)calloc(la, sizeof(char));
char* match_b = (char*)calloc(lb, sizeof(char));
// determine search range (only characters within search range can be
// matched)
unsigned int search_range = la > lb ? range*la - 1.0 : range*lb - 1.0;
if (search_range < 0) search_range = 0;
// determine the number of matching characters; a character can only be
// matched once, so aa and a has one match; the second a in the first string
// can not be match to the a in the second string as this a is already
// matched to the first a in the first string
unsigned int nmatch = 0;
for (unsigned int i = 0; i < la; ++i) {
unsigned int jmin = (i > search_range) ? i - search_range : 0;
unsigned int jmax = ((i+search_range+1) < lb) ? i + search_range + 1: lb;
for (unsigned int j = jmin; j < jmax; ++j) {
if ((match_b[j] != 1) && (a[i] == b[j])) {
nmatch++;
match_a[i] = 1;
match_b[j] = 1;
break;
}
}
}
if (nmatch == 0) {
free(match_a);
free(match_b);
return 0.0;
}
// determine the number of transpositions: count the number of matched
// characters (previous step; there are marked in match_a) that do not align;
// these are the number of half transpositions: AB BA has two half
// transpositions (A does not align with the A in the second string and the B
// does not align with the B in the second string) and one transposition.
unsigned int ntransp = 0;
unsigned int k = 0;
for (unsigned int i = 0; i < la; ++i) {
if (match_a[i] == 1) {
for (unsigned int j = k; j < lb; ++j) {
if (match_b[j] == 1) {
k = j + 1;
if (a[i] != b[j]) ++ntransp;
break;
}
}
}
}
ntransp /= 2;
// cleanup and calculate final score
free(match_a);
free(match_b);
double tot_weight = 2.0*match_weight + trans_weight;
return (match_weight*(double)nmatch/la + match_weight*(double)nmatch/lb +
trans_weight*(double)(nmatch - ntransp)/nmatch)/tot_weight;
}
extern "C" {
SEXP r_jaro(SEXP a, SEXP b) {
if (LENGTH(a) != LENGTH(b)) return R_NilValue;
SEXP res = PROTECT(allocVector(REALSXP, LENGTH(a)));
double* p = REAL(res);
for (int i = 0; i < LENGTH(a); ++i) {
SEXP str_a = STRING_ELT(a, i);
SEXP str_b = STRING_ELT(b, i);
(*p) = jaro(CHAR(str_a), CHAR(str_b));
}
UNPROTECT(1);
return res;
}
}
# Build jaro.so by running
# R CMD SHLIB jaro.cpp
dyn.load("jaro.so")
.Call("jw", 'shackleford', 'shackelford')
.Call("jw", 'jones', 'johnson')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment