diff options
Diffstat (limited to 'bible/fullcompressor.clj')
| -rw-r--r-- | bible/fullcompressor.clj | 123 |
1 file changed, 119 insertions(+), 4 deletions(-)
;; --- Reconstructed post-patch state of bible/fullcompressor.clj (hunk @@ -66,17 +66,132 @@),
;; --- recovered from a whitespace-mangled unified-diff paste.
;; NOTE(review): the first form below is the *tail* of a definition whose opening
;; lines sit above this hunk; it is reproduced as-is and cannot stand alone.

  (into {}
        (map-indexed (fn [id [tok _freq]]
                       ;; One 16-bit id per token (replaces the old [hi lo]
                       ;; two-byte pair; the discarded #_ byte-pair form was
                       ;; removed as dead code).
                       [tok (unchecked-short (bit-and 0xffff id))])
                     sorted-toks))]
    token-reprs))

;; The whole text as a stream of 16-bit token ids, one per word.
;; NOTE(review): `map` is lazy and no longer flattened — downstream code must
;; force it (compress-tokens does, via `vec`); confirm no consumer still
;; expects the old flat byte seq.
(def dict-id-compressed-text
  (map word-ids optimized-tokens))

(comment
  (count dict-id-compressed-text) ; Whittled it down to 1925736 total bytes with 16 bit indices (it's 962868 shorts)
  )

;;; TODO: Build dictionary
;;; Next, we'll run LZSS on our tok-id-list

(def WINDOW-SIZE 1024) ; Maximum look-back distance — decompressor needs only 1 KiB of history RAM
(def MIN-MATCH 3)      ; Minimum run length worth emitting a back-reference for

;; Escape value for back-references: token ids are 0..(total-1), so the first
;; unused id (the distinct-token count) can mean "a match follows". Encoding
;; sketch: MATCH-FLAG, then 10 bits of offset (window of 1024) and 8 bits of
;; length (up to 255 tokens). (Fixed: earlier note said 4 bits, but a length
;; of 255 needs 8 bits — and compress-tokens caps matches at 255.)
;; NOTE(review): assumes the distinct-token count itself still fits in the same
;; 16-bit space as the ids (< 65536) — confirm before emitting the stream.
(def MATCH-FLAG (count (frequencies dict-id-compressed-text)))

(defn- get-match-len
  "Length of the common prefix of data-vec[pos..] and data-vec[match-pos..],
  capped at max-len. Caller guarantees pos + max-len <= (count data-vec)."
  [data-vec pos match-pos max-len]
  (loop [len 0]
    (if (and (< len max-len)
             (= (nth data-vec (+ pos len))
                (nth data-vec (+ match-pos len))))
      (recur (inc len))
      len)))

;; (An earlier reader-discarded #_ prototype, `compress`, which emitted
;;  "<offset|len>" strings instead of data maps, was removed as dead code;
;;  recover it from version-control history if needed.)

(defn compress-tokens
  "LZSS over a sequence of 16-bit token ids.
  Returns a vector of maps:
    {:type :lit   :val v}           — a literal token id
    {:type :match :dist d :len l}   — a back-reference, 1 <= d <= WINDOW-SIZE,
                                      MIN-MATCH <= l <= 255
  A hash index from id-triplets to past positions gives O(1) candidate lookup.
  Indexing is lazy (only the first position of each emitted item is recorded),
  trading a little compression ratio for speed."
  [data]
  (let [data-vec (vec data)
        data-len (count data-vec)]
    (loop [cursor 0
           index  {}            ; [t0 t1 t2] triplet -> vector of past positions
           out    (transient [])]
      (if (>= cursor data-len)
        (persistent! out)
        (let [;; Cap match length at 255 so it fits an 8-bit length field.
              max-search-len (min (- data-len cursor) 255)
              ;; Triplet key for the index; nil near end-of-data.
              triplet        (when (>= max-search-len MIN-MATCH)
                               (subvec data-vec cursor (+ cursor MIN-MATCH)))
              candidates     (get index triplet)
              ;; Longest match among in-window candidate positions.
              best-match     (when (and triplet candidates)
                               (reduce
                                (fn [best pos]
                                  (if (> pos (- cursor WINDOW-SIZE)) ; inside window?
                                    (let [len (get-match-len data-vec cursor pos
                                                             max-search-len)]
                                      (if (> len (:len best 0))
                                        {:dist (- cursor pos) :len len}
                                        best))
                                    best)) ; too old — skip
                                nil
                                candidates))
              ;; Record the current position under its triplet (hoisted: both
              ;; branches below performed this same update).
              index'         (if triplet
                               (assoc index triplet
                                      (conj (get index triplet []) cursor))
                               index)]
          (if (and best-match (>= (:len best-match) MIN-MATCH))
            ;; CASE A: match found — skip ahead by its length.
            (recur (+ cursor (:len best-match))
                   index'
                   (conj! out {:type :match
                               :dist (:dist best-match)
                               :len  (:len best-match)}))
            ;; CASE B: literal — emit one token and advance by one.
            (recur (inc cursor)
                   index'
                   (conj! out {:type :lit
                               :val  (nth data-vec cursor)}))))))))

(def lzss-compressed-dict-ids (compress-tokens dict-id-compressed-text))
;; The emitted stream is literal shorts; a short equal to MATCH-FLAG introduces
;; a back-reference, which is not Huffman encoded.
