Get compressor working

author: Patrick Kingston <patrick@pkingston.xyz> 2026-01-28 19:23:13 -0500
committer: Patrick Kingston <patrick@pkingston.xyz> 2026-01-28 19:23:13 -0500
commit: fc8569a88200ec88c3773c338fdcf16b16ea51d5 (patch)
tree: 32b7616811d5248ef53dc8d98bd996652f25b967
parent: 0e17cdb8d895589dbf1470f5b06a49b1b94e2e4d (diff)
2 files changed, 20 insertions, 10 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj
index 65a035b..e06a242 100644
--- a/bible/fullcompressor.clj
+++ b/bible/fullcompressor.clj
@@ -78,7 +78,7 @@
 ;;; TODO: Build dictionary
 
 ;;; Next, we'll run LZSS on our tok-id-list
-(def WINDOW-SIZE 1024) ; The maximum distance we look back, only allowing 1k RAM
+(def WINDOW-SIZE 512) ; The maximum distance we look back: 512 shorts, which fits the 1k RAM budget
 (def MIN-MATCH 3)    ; Minimum length to bother with a reference
 
 (def MATCH-FLAG (count (frequencies dict-id-compressed-text))) 
@@ -128,7 +128,23 @@
 
             (if (and best-match (>= (:len best-match) MIN-MATCH))
               ;; CASE A: Match Found
-              (recur (+ cursor (:len best-match))
+              #_(let [match-len (:len best-match)] ;This is an optimization the LLM came up with
+                (recur (+ cursor match-len)
+                       ;; EAGER INDEXING: Add all skipped positions to the index
+                       (reduce 
+                         (fn [idx i]
+                           (let [sub-triplet (if (<= (+ i MIN-MATCH) data-len)
+                                               (subvec data-vec i (+ i MIN-MATCH))
+                                               nil)]
+                             (if sub-triplet
+                               (assoc idx sub-triplet (conj (get idx sub-triplet []) i))
+                               idx)))
+                         index
+                         (range cursor (+ cursor match-len)))
+                       (conj! out {:type :match 
+                                   :dist (:dist best-match) 
+                                   :len match-len})))
+              (recur (+ cursor (:len best-match)) 
                      ;; Note: We are still "Lazy Indexing" here for speed. 
                      ;; To maximize compression, you'd loop here to add skipped parts to `index`.
                      (assoc index triplet (conj (get index triplet []) cursor))
diff --git a/bible/huffman.clj b/bible/huffman.clj
index c219b7e..fccdae1 100644
--- a/bible/huffman.clj
+++ b/bible/huffman.clj
@@ -149,14 +149,8 @@
     (if (= basecount len)
       base
       (str (apply str (take (- len basecount) (repeat "0"))) base))))
-
 ;; --- little class for Huffman Codewords
 
-(def sorted-huffman-tree-codewords 
-  (->> huffman-tree-syms
-       (sort-by (juxt (comp count val) key))
-       (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))
-
 (defn build-canonical-encodings
   "Build canonical huffman encodings from a huffman tree
    takes [symbols] a list of huffman codes derived from a code-tree"
@@ -197,7 +191,7 @@
   (build-canonical-encodings huffman-tree))
 
 
-(assert
+#_(assert
   (= (map #(.length %1) sorted-huffman-tree-codewords)
      (map #(.length %1) sorted-huffman-tree-codewords))
   "Some of the codes changed length when canonicalizing")
@@ -228,7 +222,7 @@
       {:bits totalbits
        :bytes totalbytes
        :kb totalkb
-       :mb totalmb}))
+       :mb totalmb})) 
   (def stage1-optimized-token-encoding-length
     (get-encoded-length canonical-encodings tokens)))
author	Patrick Kingston <patrick@pkingston.xyz>	2026-01-28 19:23:13 -0500
committer	Patrick Kingston <patrick@pkingston.xyz>	2026-01-28 19:23:13 -0500
commit	fc8569a88200ec88c3773c338fdcf16b16ea51d5 (patch)
tree	32b7616811d5248ef53dc8d98bd996652f25b967
parent	0e17cdb8d895589dbf1470f5b06a49b1b94e2e4d (diff)