aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Patrick Kingston <patrick@pkingston.xyz> 2026-01-26 22:36:06 -0500
committer Patrick Kingston <patrick@pkingston.xyz> 2026-01-26 22:36:06 -0500
commit f5dab0d68b0c3268016e27bd0d1ba2391017cad7 (patch)
tree 4c216780bd40d6df0c033669da6bd643c559dbb2
parent 37fc0ccbba83ac0b22889c12d4c8351e81696e74 (diff)
Build dictifier in fullcompressor
-rw-r--r-- bible/fullcompressor.clj 42
1 files changed, 10 insertions, 32 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj
index 96c16f8..7f3130b 100644
--- a/bible/fullcompressor.clj
+++ b/bible/fullcompressor.clj
@@ -57,48 +57,26 @@
)
-;;; To encode the most common words, we use 1DDDDDDD <- 7 bits for the 127 most common words
-;;; To encode the rest, we use 0DDDDDDD DDDDDDDD <- 15 bits for the other dictionary entries
-(comment
- (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
- totalcount (reduce + (map val sorted-toks))
- top127 (take 128 sorted-toks)
- top127count (reduce + (map val top127))]
- {:total totalcount
- :topwords top127count
- :percent-savings (* 100 (double (/ top127count totalcount)))
- :total-remaining-words (- totalcount top127count)})
- ; {:total 965223,
- ; :topwords 715122,
- ; :percent-savings 74.08878570029931,
- ; :total-remaining-words 250101}
- )
+;;; First we'll dictionary-encode our tokens
-;;; We'll start off by bit-packing our representations a bit.
+;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj
;; word-ids (as it stands after this patch): a map from each distinct token in
;; optimized-tokens to a two-element vector of bytes [high low]. The id is the
;; token's index once tokens are sorted by descending frequency; the first byte
;; is (id >> 8) and the second is (id & 0xFF), each narrowed with unchecked-byte.
;; NOTE(review): unchecked-byte keeps only the low 8 bits of (id >> 8), so ids of
;; 65536 and above would silently collide with lower ids -- confirm the number of
;; distinct tokens stays below that.
(def word-ids
(let [sorted-toks (sort-by val > (frequencies optimized-tokens))
- top128 (take 128 sorted-toks)
- rest128 (drop 128 sorted-toks)
- top128-reprs
+ token-reprs
(into {}
(map-indexed
(fn [id [tok _freq]]
- [tok (unchecked-byte (bit-or 0x80 id))])
- top128))
- rest128-reprs
- (into {}
- (map-indexed
- (fn [id [tok _freq]]
- [tok [(unchecked-byte (bit-or 2r10000000 (bit-shift-right id 8)))
- (unchecked-byte (bit-and 0x00FF id))]])
- rest128))
- token-reprs (merge top128-reprs rest128-reprs)]
- (assert (= (count token-reprs) (+ (count top128-reprs) (count rest128-reprs))))
+ [tok [(unchecked-byte (bit-shift-right id 8)) (unchecked-byte (bit-and 0x00FF id))]])
+ sorted-toks))]
token-reprs))
;; The whole token stream as one flat sequence: every token in optimized-tokens
;; is looked up in word-ids, in order, and the per-token results are flattened.
(def dict-id-compressed-text
(flatten (map word-ids optimized-tokens)))
(comment
- (count dict-id-compressed-text) ;Whittled it down to 1212042 total bytes
+ (count dict-id-compressed-text) ;Whittled it down to 1925736 total bytes with 16 bit indices
)
+
+;;; TODO: Build dictionary
+
+;;; Next, we'll run LZSS on our stream of tokens