Jaro similarity

func jaro(s, t) {

    return 1 if (s == t)

    var s_len = s.len
    var t_len = t.len

    var match_distance = ((s_len `max` t_len) // 2 - 1)

    var s_matches = []
    var t_matches = []

    var matches = 0
    var transpositions = 0

    for i (^s_len) {
        var start = (0 `max` i-match_distance)
        var end = (i+match_distance `min` t_len-1)

        for k (start..end) {
            t_matches[k] && next
            s[i] == t[k] || next
            s_matches[i] = true
            t_matches[k] = true
            matches++
            break
        }
    }

    return 0 if (matches == 0)

    var k = 0
    for i (^s_len) {
        s_matches[i] || next
        while (!t_matches[k]) { ++k }
        s[i] == t[k] || ++transpositions
        ++k
    }

    ((matches / s_len) +
      (matches / t_len) +
        ((matches - transpositions/2) / matches)) / 3
}

for pair in [
    [%c"MARTHA",    %c"MARHTA"],
    [%c"DIXON",     %c"DICKSONX"],
    [%c"JELLYFISH", %c"SMELLYFISH"],
] {
    say "jaro(#{pair.map{.join.dump}.join(', ')}) = #{'%.10f' % jaro(pair...)}"
}

Output:

jaro("MARTHA", "MARHTA") = 0.9444444444
jaro("DIXON", "DICKSONX") = 0.7666666667
jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963

Last updated