aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPatrick Kingston <patrick@pkingston.xyz>2026-01-28 19:23:13 -0500
committerPatrick Kingston <patrick@pkingston.xyz>2026-01-28 19:23:13 -0500
commitfc8569a88200ec88c3773c338fdcf16b16ea51d5 (patch)
tree32b7616811d5248ef53dc8d98bd996652f25b967
parent0e17cdb8d895589dbf1470f5b06a49b1b94e2e4d (diff)
Get compressor working
-rw-r--r--bible/fullcompressor.clj20
-rw-r--r--bible/huffman.clj10
2 files changed, 20 insertions, 10 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj
index 65a035b..e06a242 100644
--- a/bible/fullcompressor.clj
+++ b/bible/fullcompressor.clj
@@ -78,7 +78,7 @@
;;; TODO: Build dictionary
;;; Next, we'll run LZSS on our tok-id-list
-(def WINDOW-SIZE 1024) ; The maximum distance we look back, only allowing 1k RAM
+(def WINDOW-SIZE 512) ; The maximum distance we look back; only allowing 1k RAM, i.e. 512 shorts
(def MIN-MATCH 3) ; Minimum length to bother with a reference
(def MATCH-FLAG (count (frequencies dict-id-compressed-text)))
@@ -128,7 +128,23 @@
(if (and best-match (>= (:len best-match) MIN-MATCH))
;; CASE A: Match Found
- (recur (+ cursor (:len best-match))
+ #_(let [match-len (:len best-match)] ; This is an optimization the LLM came up with
+ (recur (+ cursor match-len)
+ ;; EAGER INDEXING: Add all skipped positions to the index
+ (reduce
+ (fn [idx i]
+ (let [sub-triplet (if (<= (+ i MIN-MATCH) data-len)
+ (subvec data-vec i (+ i MIN-MATCH))
+ nil)]
+ (if sub-triplet
+ (assoc idx sub-triplet (conj (get idx sub-triplet []) i))
+ idx)))
+ index
+ (range cursor (+ cursor match-len)))
+ (conj! out {:type :match
+ :dist (:dist best-match)
+ :len match-len})))
+ (recur (+ cursor (:len best-match))
;; Note: We are still "Lazy Indexing" here for speed.
;; To maximize compression, you'd loop here to add skipped parts to `index`.
(assoc index triplet (conj (get index triplet []) cursor))
diff --git a/bible/huffman.clj b/bible/huffman.clj
index c219b7e..fccdae1 100644
--- a/bible/huffman.clj
+++ b/bible/huffman.clj
@@ -149,14 +149,8 @@
(if (= basecount len)
base
(str (apply str (take (- len basecount) (repeat "0"))) base))))
-
;; --- little class for Huffman Codewords
-(def sorted-huffman-tree-codewords
- (->> huffman-tree-syms
- (sort-by (juxt (comp count val) key))
- (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))
-
(defn build-canonical-encodings
"Build canonical huffman encodings from a huffman tree
takes [symbols] a list of huffman codes derived from a code-tree"
@@ -197,7 +191,7 @@
(build-canonical-encodings huffman-tree))
-(assert
+#_(assert
(= (map #(.length %1) sorted-huffman-tree-codewords)
(map #(.length %1) sorted-huffman-tree-codewords))
"Some of the codes changed length when canonicalizing")
@@ -228,7 +222,7 @@
{:bits totalbits
:bytes totalbytes
:kb totalkb
- :mb totalmb}))
+ :mb totalmb}))
(def stage1-optimized-token-encoding-length
(get-encoded-length canonical-encodings tokens)))