diff options
| author | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-28 19:23:13 -0500 |
|---|---|---|
| committer | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-28 19:23:13 -0500 |
| commit | fc8569a88200ec88c3773c338fdcf16b16ea51d5 (patch) | |
| tree | 32b7616811d5248ef53dc8d98bd996652f25b967 /bible/fullcompressor.clj | |
| parent | 0e17cdb8d895589dbf1470f5b06a49b1b94e2e4d (diff) | |
Get compressor working
Diffstat (limited to 'bible/fullcompressor.clj')
| -rw-r--r-- | bible/fullcompressor.clj | 20 |
1 files changed, 18 insertions, 2 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj index 65a035b..e06a242 100644 --- a/bible/fullcompressor.clj +++ b/bible/fullcompressor.clj @@ -78,7 +78,7 @@ ;;; TODO: Build dictionary ;;; Next, we'll run LZSS on our tok-id-list -(def WINDOW-SIZE 1024) ; The maximum distance we look back, only allowing 1k RAM +(def WINDOW-SIZE 512) ; The maximum distance we look back, only allowing 1k RAM 512 shorts (def MIN-MATCH 3) ; Minimum length to bother with a reference (def MATCH-FLAG (count (frequencies dict-id-compressed-text))) @@ -128,7 +128,23 @@ (if (and best-match (>= (:len best-match) MIN-MATCH)) ;; CASE A: Match Found - (recur (+ cursor (:len best-match)) + #_(let [match-len (:len best-match)] ;This is an optimziation the LLM came up with + (recur (+ cursor match-len) + ;; EAGER INDEXING: Add all skipped positions to the index + (reduce + (fn [idx i] + (let [sub-triplet (if (<= (+ i MIN-MATCH) data-len) + (subvec data-vec i (+ i MIN-MATCH)) + nil)] + (if sub-triplet + (assoc idx sub-triplet (conj (get idx sub-triplet []) i)) + idx))) + index + (range cursor (+ cursor match-len))) + (conj! out {:type :match + :dist (:dist best-match) + :len match-len}))) + (recur (+ cursor (:len best-match)) ;; Note: We are still "Lazy Indexing" here for speed. ;; To maximize compression, you'd loop here to add skipped parts to `index`. (assoc index triplet (conj (get index triplet []) cursor)) |
