Build dictifier in fullcompressor

author: Patrick Kingston <patrick@pkingston.xyz> 2026-01-26 22:36:06 -0500
committer: Patrick Kingston <patrick@pkingston.xyz> 2026-01-26 22:36:06 -0500
commit: f5dab0d68b0c3268016e27bd0d1ba2391017cad7 (patch)
tree: 4c216780bd40d6df0c033669da6bd643c559dbb2
parent: 37fc0ccbba83ac0b22889c12d4c8351e81696e74 (diff)
1 files changed, 10 insertions, 32 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj
index 96c16f8..7f3130b 100644
--- a/bible/fullcompressor.clj
+++ b/bible/fullcompressor.clj
@@ -57,48 +57,26 @@
          ) 
 
 
-;;; To encode the most common words, we use 1DDDDDDD <- 7 bits for the 127 most common words
-;;; To encode the rest, we use 0DDDDDDD DDDDDDDD <- 15 bits for the other dictionary entries
-(comment 
-  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
-        totalcount (reduce + (map val sorted-toks))
-        top127 (take 128 sorted-toks)
-        top127count (reduce + (map val top127))]
-    {:total totalcount
-     :topwords top127count
-     :percent-savings (* 100 (double (/ top127count totalcount)))
-     :total-remaining-words (- totalcount top127count)})
-  ; {:total 965223,
-     ;  :topwords 715122,
-     ;  :percent-savings 74.08878570029931,
-     ;  :total-remaining-words 250101}
-  )
+;;; First we'll dictionary-encode our tokens
 
-;;; We'll start off by bit-packing our representations a bit.
+;; For an alternative encoding algorithm that is less efficient in the long term, see dictionary-packed.clj
 (def word-ids
   (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
-        top128 (take 128 sorted-toks)
-        rest128 (drop 128 sorted-toks)
-        top128-reprs 
+        token-reprs 
         (into {} 
               (map-indexed 
                 (fn [id [tok _freq]]
-                  [tok (unchecked-byte (bit-or 0x80 id))])
-                top128))
-        rest128-reprs 
-        (into {} 
-              (map-indexed
-                (fn [id [tok _freq]]
-                  [tok [(unchecked-byte (bit-or 2r10000000 (bit-shift-right id 8)))
-                        (unchecked-byte (bit-and 0x00FF id))]])
-                rest128))
-        token-reprs (merge top128-reprs rest128-reprs)]
-    (assert (= (count token-reprs) (+ (count top128-reprs) (count rest128-reprs))))
+                  [tok [(unchecked-byte (bit-shift-right id 8)) (unchecked-byte (bit-and 0x00FF id))]])
+                sorted-toks))]
     token-reprs))
 
 (def dict-id-compressed-text
   (flatten (map word-ids optimized-tokens)))
 
 (comment
-  (count dict-id-compressed-text) ;Whittled it down to 1212042 total bytes
+  (count dict-id-compressed-text) ; Whittled it down to 1925736 total bytes with 16-bit indices
   )
+
+;;; TODO: Build dictionary
+
+;;; Next, we'll run LZSS on our stream of tokens
author	Patrick Kingston <patrick@pkingston.xyz>	2026-01-26 22:36:06 -0500
committer	Patrick Kingston <patrick@pkingston.xyz>	2026-01-26 22:36:06 -0500
commit	f5dab0d68b0c3268016e27bd0d1ba2391017cad7 (patch)
tree	4c216780bd40d6df0c033669da6bd643c559dbb2
parent	37fc0ccbba83ac0b22889c12d4c8351e81696e74 (diff)