diff options
| author | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-26 22:36:06 -0500 |
|---|---|---|
| committer | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-26 22:36:06 -0500 |
| commit | f5dab0d68b0c3268016e27bd0d1ba2391017cad7 (patch) | |
| tree | 4c216780bd40d6df0c033669da6bd643c559dbb2 /bible | |
| parent | 37fc0ccbba83ac0b22889c12d4c8351e81696e74 (diff) | |
Build dictifier in fullcompressor
Diffstat (limited to 'bible')
| -rw-r--r-- | bible/fullcompressor.clj | 42 |
1 file changed, 10 insertions, 32 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj index 96c16f8..7f3130b 100644 --- a/bible/fullcompressor.clj +++ b/bible/fullcompressor.clj @@ -57,48 +57,26 @@ ) -;;; To encode the most common words, we use 1DDDDDDD <- 7 bits for the 127 most common words -;;; To encode the rest, we use 0DDDDDDD DDDDDDDD <- 15 bits for the other dictionary entries -(comment - (let [sorted-toks (sort-by val > (frequencies optimized-tokens)) - totalcount (reduce + (map val sorted-toks)) - top127 (take 128 sorted-toks) - top127count (reduce + (map val top127))] - {:total totalcount - :topwords top127count - :percent-savings (* 100 (double (/ top127count totalcount))) - :total-remaining-words (- totalcount top127count)}) - ; {:total 965223, - ; :topwords 715122, - ; :percent-savings 74.08878570029931, - ; :total-remaining-words 250101} - ) +;;; First we'll dictionary-encode our tokens -;;; We'll start off by bit-packing our representations a bit. +;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj (def word-ids (let [sorted-toks (sort-by val > (frequencies optimized-tokens)) - top128 (take 128 sorted-toks) - rest128 (drop 128 sorted-toks) - top128-reprs + token-reprs (into {} (map-indexed (fn [id [tok _freq]] - [tok (unchecked-byte (bit-or 0x80 id))]) - top128)) - rest128-reprs - (into {} - (map-indexed - (fn [id [tok _freq]] - [tok [(unchecked-byte (bit-or 2r10000000 (bit-shift-right id 8))) - (unchecked-byte (bit-and 0x00FF id))]]) - rest128)) - token-reprs (merge top128-reprs rest128-reprs)] - (assert (= (count token-reprs) (+ (count top128-reprs) (count rest128-reprs)))) + [tok [(unchecked-byte (bit-shift-right id 8)) (unchecked-byte (bit-and 0x00FF id))]]) + sorted-toks))] token-reprs)) (def dict-id-compressed-text (flatten (map word-ids optimized-tokens))) (comment - (count dict-id-compressed-text) ;Whittled it down to 1212042 total bytes + (count dict-id-compressed-text) ;Whittled it down to 1925736 total bytes with 16 bit 
indices ) + +;;; TODO: Build dictionary + +;;; Next, we'll run LZSS on our stream of tokens |
