aboutsummaryrefslogtreecommitdiff
path: root/bible/fullcompressor.clj
diff options
context:
space:
mode:
Diffstat (limited to 'bible/fullcompressor.clj')
-rw-r--r-- bible/fullcompressor.clj 20
1 files changed, 18 insertions, 2 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj
index 65a035b..e06a242 100644
--- a/bible/fullcompressor.clj
+++ b/bible/fullcompressor.clj
@@ -78,7 +78,7 @@
;;; TODO: Build dictionary
;;; Next, we'll run LZSS on our tok-id-list
-(def WINDOW-SIZE 1024) ; The maximum distance we look back, only allowing 1k RAM
+(def WINDOW-SIZE 512) ; The maximum distance we look back: 512 shorts, which is all the 1k RAM budget allows
(def MIN-MATCH 3) ; Minimum length to bother with a reference
(def MATCH-FLAG (count (frequencies dict-id-compressed-text)))
@@ -128,7 +128,23 @@
(if (and best-match (>= (:len best-match) MIN-MATCH))
;; CASE A: Match Found
- (recur (+ cursor (:len best-match))
+ #_(let [match-len (:len best-match)] ; This is an optimization the LLM came up with
+ (recur (+ cursor match-len)
+ ;; EAGER INDEXING: Add all skipped positions to the index
+ (reduce
+ (fn [idx i]
+ (let [sub-triplet (if (<= (+ i MIN-MATCH) data-len)
+ (subvec data-vec i (+ i MIN-MATCH))
+ nil)]
+ (if sub-triplet
+ (assoc idx sub-triplet (conj (get idx sub-triplet []) i))
+ idx)))
+ index
+ (range cursor (+ cursor match-len)))
+ (conj! out {:type :match
+ :dist (:dist best-match)
+ :len match-len})))
+ (recur (+ cursor (:len best-match))
;; Note: We are still "Lazy Indexing" here for speed.
;; To maximize compression, you'd loop here to add skipped parts to `index`.
(assoc index triplet (conj (get index triplet []) cursor))