// Recovered from git patch 95243c368ccc96d8845b4d5f38dce89cad2118d5
//   From:    Remko Tronçon
//   Date:    Sat, 20 Jan 2024 18:12:44 +0100
//   Subject: [PATCH] thurtle: Add lzw & ulz compression
//
// The patch creates two modules, reproduced below in order:
//   src/web/thurtle/lzw.ts
//   src/web/thurtle/ulz.ts
//
// NOTE(review): the generic type arguments (`Array<...>`, `Map<...>`) were
// missing from the flattened patch text, presumably stripped during
// extraction; they are restored here so the code type-checks.

// ---------------------------------------------------------------------------
// src/web/thurtle/lzw.ts
// ---------------------------------------------------------------------------

/** First code assigned to a dictionary phrase; codes 0-255 are literal bytes. */
const LZW_FIRST_CODE = 256;

/**
 * Exclusive upper bound for dictionary codes emitted by the encoder.
 *
 * Codes are serialized as UTF-16 code units and then UTF-8 encoded. A code in
 * the surrogate range (0xD800-0xDFFF) would become a lone surrogate, which
 * `TextEncoder` replaces with U+FFFD, and a code above 0xFFFF would be
 * truncated by `String.fromCharCode`. Both corrupt the stream, so the encoder
 * stops growing its dictionary once this limit is reached.
 */
const LZW_CODE_LIMIT = 0xd800;

/**
 * LZW-compresses `data`.
 *
 * The output is the sequence of LZW codes, each written as one Unicode
 * character and then UTF-8 encoded.
 *
 * @param data Bytes to compress.
 * @returns The UTF-8 encoded code stream; empty when `data` is empty.
 */
export function lzwEncode(data: Uint8Array) {
  if (data.length === 0) {
    // Without this guard `data[0]` is undefined and a spurious NUL is emitted.
    return new Uint8Array(0);
  }
  const dict = new Map<string, number>();
  const out: number[] = [];
  // Single-character phrases are never stored in `dict`; they are emitted as
  // the literal byte value.
  const codeOf = (p: string): number => dict.get(p) ?? p.charCodeAt(0);

  let phrase = String.fromCharCode(data[0]);
  let code = LZW_FIRST_CODE;
  for (let i = 1; i < data.length; i++) {
    const chr = String.fromCharCode(data[i]);
    const nphrase = phrase + chr;
    if (dict.has(nphrase)) {
      phrase = nphrase;
    } else {
      out.push(codeOf(phrase));
      // Freeze the dictionary at the limit. A decoder that keeps adding
      // entries stays in sync, because the extra entries are never referenced.
      if (code < LZW_CODE_LIMIT) {
        dict.set(nphrase, code);
        code++;
      }
      phrase = chr;
    }
  }
  out.push(codeOf(phrase));
  return new TextEncoder().encode(
    out.map((c) => String.fromCharCode(c)).join("")
  );
}

/**
 * Decompresses a stream produced by {@link lzwEncode}.
 *
 * Malformed input is handled leniently: invalid UTF-8 is replaced by the
 * decoder and unknown codes are treated as the "phrase + its first byte"
 * case, so corrupt streams yield garbage rather than an exception.
 *
 * @param rdata UTF-8 encoded LZW code stream.
 * @returns The decompressed bytes; empty when `rdata` is empty.
 */
export function lzwDecode(rdata: Uint8Array) {
  const data = [...new TextDecoder().decode(rdata)].map((c) => c.charCodeAt(0));
  if (data.length === 0) {
    // Without this guard the result would be a single 0 byte.
    return new Uint8Array(0);
  }
  const dict = new Map<number, number[]>();
  let curChar = data[0];
  let curPhrase = [curChar];
  const out = [curChar];
  let code = LZW_FIRST_CODE;
  for (let i = 1; i < data.length; i++) {
    const c = data[i];
    // A code not yet in the dictionary can only be the entry the encoder
    // created on its previous step: the previous phrase plus its first byte.
    const phrase =
      c < 256 ? [c] : (dict.get(c) ?? curPhrase.concat(curChar));
    // Plain loop instead of `push(...phrase)` so long phrases cannot exceed
    // the engine's argument-count limit.
    for (const b of phrase) {
      out.push(b);
    }
    curChar = phrase[0];
    dict.set(code, curPhrase.concat(curChar));
    code++;
    curPhrase = phrase;
  }
  return new Uint8Array(out);
}

// ---------------------------------------------------------------------------
// src/web/thurtle/ulz.ts
// ---------------------------------------------------------------------------

/**
 * Decompresses a ULZ stream.
 *
 * The stream is a sequence of commands, each introduced by a control byte:
 * - bit 7 clear (LIT): the next `control + 1` bytes are copied verbatim.
 * - bit 7 set (CPY): copies `length + 4` bytes from already-produced output.
 *   `length` is the low 6 bits of the control byte, or, when bit 6 is also
 *   set, those 6 bits followed by one more byte (14 bits). One further byte
 *   gives the distance back from the end of the output, minus one.
 *
 * @param src Compressed bytes.
 * @returns The decompressed bytes.
 * @throws Error when a command is truncated or a copy reaches before the
 *   start of the output.
 */
export function ulzDecode(src: Uint8Array) {
  const dst: number[] = [];
  let sp = 0;
  while (sp < src.length) {
    const c = src[sp++];
    if (c & 0x80) {
      // CPY
      let length;
      if (c & 0x40) {
        if (sp >= src.length) {
          throw new Error(`incomplete CPY2`);
        }
        length = ((c & 0x3f) << 8) | src[sp++];
      } else {
        length = c & 0x3f;
      }
      if (sp >= src.length) {
        throw new Error(`incomplete CPY`);
      }
      let cp = dst.length - (src[sp++] + 1);
      if (cp < 0) {
        throw new Error(`CPY underflow`);
      }
      // Byte-by-byte so a copy may overlap the bytes it is producing.
      for (let i = 0; i < length + 4; i++) {
        dst.push(dst[cp++]);
      }
    } else {
      // LIT
      if (sp + c >= src.length) {
        throw new Error(`LIT out of bounds: ${sp} + ${c} >= ${src.length}`);
      }
      for (let i = 0; i < c + 1; i++) {
        dst.push(src[sp++]);
      }
    }
  }
  return new Uint8Array(dst);
}

/**
 * Shortest match worth encoding as CPY; also the bias the decoder adds to
 * every CPY length.
 */
const MIN_MAX_LENGTH = 4;

/**
 * Finds the longest match for the data at `sp` among the `dlen` bytes that
 * precede it.
 *
 * A candidate may be longer than its distance from `sp`: the comparison wraps
 * around the candidate window, which mirrors the decoder's overlapping copy.
 *
 * @param src Input being compressed.
 * @param sp Position of the data to match.
 * @param dlen Number of preceding bytes to search.
 * @param slen Maximum match length to consider.
 * @returns `[position, length]` of the best match; length is 0 when there is
 *   none. Among equally long matches the earliest position wins, except that a
 *   match of the full `slen` is returned as soon as it is found.
 */
function findBestMatch(
  src: Uint8Array,
  sp: number,
  dlen: number,
  slen: number
): [number, number] {
  let bmlen = 0;
  let bmp = 0;
  let dp = sp - dlen;
  // `dlen` is always the distance from the candidate `dp` to `sp`.
  for (; dlen; dp++, dlen--) {
    let i = 0;
    for (; ; i++) {
      if (i === slen) {
        return [dp, i];
      }
      if (src[sp + i] !== src[dp + (i % dlen)]) {
        break;
      }
    }
    if (i > bmlen) {
      bmlen = i;
      bmp = dp;
    }
  }
  return [bmp, bmlen];
}

/**
 * Compresses `src` into the ULZ format read by {@link ulzDecode}.
 *
 * At each position the longest match within the previous 256 bytes is looked
 * up. Matches of at least {@link MIN_MAX_LENGTH} bytes become CPY commands;
 * everything else is gathered into LIT runs of at most 128 bytes.
 *
 * @param src Bytes to compress.
 * @returns The compressed bytes.
 */
export function ulzEncode(src: Uint8Array) {
  const dst: number[] = [];
  let sp = 0;
  // Index in `dst` of the control byte of the open LIT run, or -1 if none.
  let litp = -1;
  while (sp < src.length) {
    // The distance is stored in one byte as `distance - 1`, hence 256.
    const dlen = Math.min(sp, 256);
    // The length field holds at most 14 bits, biased by MIN_MAX_LENGTH.
    const slen = Math.min(src.length - sp, 0x3fff + MIN_MAX_LENGTH);
    const [bmp, bmlen] = findBestMatch(src, sp, dlen, slen);
    if (bmlen >= MIN_MAX_LENGTH) {
      // CPY
      const bmctl = bmlen - MIN_MAX_LENGTH;
      if (bmctl > 0x3f) {
        // CPY2
        dst.push((bmctl >> 8) | 0xc0);
        dst.push(bmctl & 0xff);
      } else {
        dst.push(bmctl | 0x80);
      }
      dst.push(sp - bmp - 1);
      sp += bmlen;
      litp = -1;
    } else {
      // LIT
      if (litp >= 0) {
        dst[litp] += 1;
        // 127 is the largest control value with bit 7 clear; close the run.
        if (dst[litp] === 127) {
          litp = -1;
        }
      } else {
        dst.push(0);
        litp = dst.length - 1;
      }
      dst.push(src[sp++]);
    }
  }
  return new Uint8Array(dst);
}