diff options
| author | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-28 19:23:13 -0500 |
|---|---|---|
| committer | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-28 19:23:13 -0500 |
| commit | fc8569a88200ec88c3773c338fdcf16b16ea51d5 (patch) | |
| tree | 32b7616811d5248ef53dc8d98bd996652f25b967 /bible | |
| parent | 0e17cdb8d895589dbf1470f5b06a49b1b94e2e4d (diff) | |
Get compressor working
Diffstat (limited to 'bible')
| -rw-r--r-- | bible/fullcompressor.clj | 20 | ||||
| -rw-r--r-- | bible/huffman.clj | 10 |
2 files changed, 20 insertions, 10 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj index 65a035b..e06a242 100644 --- a/bible/fullcompressor.clj +++ b/bible/fullcompressor.clj @@ -78,7 +78,7 @@ ;;; TODO: Build dictionary ;;; Next, we'll run LZSS on our tok-id-list -(def WINDOW-SIZE 1024) ; The maximum distance we look back, only allowing 1k RAM +(def WINDOW-SIZE 512) ; The maximum distance we look back, only allowing 1k RAM (512 shorts) (def MIN-MATCH 3) ; Minimum length to bother with a reference (def MATCH-FLAG (count (frequencies dict-id-compressed-text))) @@ -128,7 +128,23 @@ (if (and best-match (>= (:len best-match) MIN-MATCH)) ;; CASE A: Match Found - (recur (+ cursor (:len best-match)) + #_(let [match-len (:len best-match)] ;This is an optimization the LLM came up with + (recur (+ cursor match-len) + ;; EAGER INDEXING: Add all skipped positions to the index + (reduce + (fn [idx i] + (let [sub-triplet (if (<= (+ i MIN-MATCH) data-len) + (subvec data-vec i (+ i MIN-MATCH)) + nil)] + (if sub-triplet + (assoc idx sub-triplet (conj (get idx sub-triplet []) i)) + idx))) + index + (range cursor (+ cursor match-len))) + (conj! out {:type :match + :dist (:dist best-match) + :len match-len}))) + (recur (+ cursor (:len best-match)) ;; Note: We are still "Lazy Indexing" here for speed. ;; To maximize compression, you'd loop here to add skipped parts to `index`. 
(assoc index triplet (conj (get index triplet []) cursor)) diff --git a/bible/huffman.clj b/bible/huffman.clj index c219b7e..fccdae1 100644 --- a/bible/huffman.clj +++ b/bible/huffman.clj @@ -149,14 +149,8 @@ (if (= basecount len) base (str (apply str (take (- len basecount) (repeat "0"))) base)))) - ;; --- little class for Huffman Codewords -(def sorted-huffman-tree-codewords - (->> huffman-tree-syms - (sort-by (juxt (comp count val) key)) - (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1))))))) - (defn build-canonical-encodings "Build canonical huffman encodings from a huffman tree takes [symbols] a list of huffman codes derived from a code-tree" @@ -197,7 +191,7 @@ (build-canonical-encodings huffman-tree)) -(assert +#_(assert (= (map #(.length %1) sorted-huffman-tree-codewords) (map #(.length %1) sorted-huffman-tree-codewords)) "Some of the codes changed length when canonicalizing") @@ -228,7 +222,7 @@ {:bits totalbits :bytes totalbytes :kb totalkb - :mb totalmb})) + :mb totalmb})) (def stage1-optimized-token-encoding-length (get-encoded-length canonical-encodings tokens))) |
