Burrows Wheeler transform

class BurrowsWheelerTransform (String L = "\002") {
 
    method encode(String s) {
        assert(!s.contains(L), "String cannot contain `#{L.dump}`")
        s = (L + s)
        s.len.of{|i| s.substr(i) + s.substr(0, i) }.sort.map{.last}.join
    }
 
    method decode(String s) {
        var t = s.len.of("")
        var c = s.chars
        { t = (c »+« t).sort } * s.len
        t.first { .begins_with(L) }.substr(L.len)
    }
}
 
var tests = [
    "banana", "appellee", "dogwood", "TOBEORNOTTOBEORTOBEORNOT"
    "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES",
]
 
var bwt = BurrowsWheelerTransform(L: '$')
 
tests.each { |str|
    var enc = bwt.encode(str)
    var dec = bwt.decode(enc)
    say "BWT(#{dec.dump}) = #{enc.dump}"
    assert_eq(str, dec)
}

Output:

BWT("banana") = "annb$aa"
BWT("appellee") = "e$elplepa"
BWT("dogwood") = "do$oodwg"
BWT("TOBEORNOTTOBEORTOBEORNOT") = "TOOOBBBRRTTTEEENNOOOOR$TO"
BWT("SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES") = "STEXYDST.E.IXXIIXXSSMPPS.B..EE.$.USFXDIIOIIIT"

More efficient implementation, running in O(n*log(n)) time, using O(n) space:

define LOOKAHEAD_LEN = 128

func bwt_sort (String s) {    # O(n * LOOKAHEAD_LEN) space (fast)

    ^s.len -> map {|i|
        var t = s.slice(i, LOOKAHEAD_LEN)

        if (t.len < LOOKAHEAD_LEN) {
            t += s.slice(0, min(i, LOOKAHEAD_LEN - t.len))
        }

        [t, i]
    }.sort {|a,b|
        (a[0] <=> b[0]) || (s.rotate(a[1]) <=> s.rotate(b[1]))
    }.map { .[1] }
}

func bwt_encode(String s) {

    var bwt    = bwt_sort(s)
    var ret    = bwt.map {|i| s.slice(i-1, 1) }.join
    var prefix = s.slice(0, LOOKAHEAD_LEN)
    var len    = prefix.len

    var idx = 0;
    for i in (bwt) {

        var lookahead = s.slice(i, len)

        if (lookahead.len < len) {
            lookahead += s.slice(0, len - lookahead.len)
        }

        if (lookahead == prefix) {
            var row = s.rotate(i)
            if (row == s) {
                break
            }
        }
        ++idx
    }

    return (ret, idx)
}

func bwt_decode(String bwt, Number idx) {    # fast inversion

    var tail = bwt.chars
    var head = tail.sort

    var indices = Hash()
    tail.each_kv {|i,v|
        indices{v} := [] << i
    }

    var table = []
    head.each_kv {|i,v|
        table[i] = indices{v}.shift
    }

    var dec = ''
    var i = idx

    head.len.times {
        dec += head[i]
        i = table[i]
    }

    return dec
}

var tests = [
    "banana", "appellee", "dogwood", "TOBEORNOTTOBEORTOBEORNOT"
    "SIX.MIXED.PIXIES.SIFT.SIXTY.PIXIE.DUST.BOXES",
]

tests.each { |str|
    var (enc, idx) = bwt_encode(str)
    var dec = bwt_decode(enc, idx)
    say "BWT(#{dec.dump}) = (#{enc.dump}, #{idx})"
    assert_eq(str, dec)
}

Last updated