aboutsummaryrefslogtreecommitdiff
path: root/bible/fullcompressor.clj
diff options
context:
space:
mode:
Diffstat (limited to 'bible/fullcompressor.clj')
-rw-r--r--bible/fullcompressor.clj123
1 files changed, 119 insertions, 4 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj
index 7f3130b..5606b8b 100644
--- a/bible/fullcompressor.clj
+++ b/bible/fullcompressor.clj
@@ -66,17 +66,132 @@
(into {}
(map-indexed
(fn [id [tok _freq]]
- [tok [(unchecked-byte (bit-shift-right id 8)) (unchecked-byte (bit-and 0x00FF id))]])
+ [tok (unchecked-short (bit-and 0xffff id))#_[(unchecked-byte (bit-shift-right id 8)) (unchecked-byte (bit-and 0x00FF id))]])
sorted-toks))]
token-reprs))
(def dict-id-compressed-text
- (flatten (map word-ids optimized-tokens)))
+ (map word-ids optimized-tokens))
(comment
- (count dict-id-compressed-text) ;Whittled it down to 1925736 total bytes with 16 bit indices
+ (count dict-id-compressed-text) ;Whittled it down to 1925736 total bytes with 16 bit indices (it's 962868 shorts)
)
;;; TODO: Build dictionary
-;;; Next, we'll run LZSS on our stream of tokens
+;;; Next, we'll run LZSS on our tok-id-list
(def WINDOW-SIZE 1024) ; Maximum look-back distance; keeps the sliding window in ~1k of RAM
(def MIN-MATCH 3) ; Shortest run worth emitting a back-reference for

(def MATCH-FLAG (count (frequencies dict-id-compressed-text)))
;; First unused token id: real token ids occupy 0..(total-1), so this value can
;; never appear as a literal and safely marks the start of a match record.
;; Planned record layout: MATCH-FLAG, then 10 bits of offset (1k window), then
;; a length field. NOTE(review): the original comment said "4 bits for the
;; length (255 toks)", but a 255-token max length needs 8 bits — confirm the
;; intended field width before serializing.
+
(defn- get-match-len
  "Returns the length (0..max-len) of the common run between the tokens
   starting at `pos` and the tokens starting at `match-pos` in `data-vec`.
   Comparison proceeds element-by-element, so overlapping matches
   (match-pos + len reaching past pos) are handled the LZ way."
  [data-vec pos match-pos max-len]
  (let [same-at? (fn [k]
                   (= (data-vec (+ pos k))
                      (data-vec (+ match-pos k))))]
    ;; First index where the two runs disagree is exactly the match length;
    ;; if no disagreement occurs within max-len, the whole window matched.
    (or (first (remove same-at? (range max-len)))
        max-len)))
+
;; NOTE(review): reader-ignored (#_) prototype kept for reference. It emitted a
;; single string with textual "<offset|len>" markers rather than structured
;; data; compress-tokens (defined in this file) is the live replacement.
#_(defn compress
  "LZSS using transient vectors and a hash-index for O(1) lookups."
  [data]
  (let [data-vec (vec data)
        data-len (count data-vec)
        #_#_MIN-MATCH 3] ;; Standard MIN-MATCH
    (loop [cursor 0
           ;; map of [c1 c2 c3] -> list of indices
           index {}
           ;; Transient vector for high-performance output collection
           out (transient [])]
      (if (>= cursor data-len)
        (apply str (persistent! out)) ;; Finalize the transient and join

        (let [;; 1. Check if we have a potential match using the index
              triplet (subvec data-vec cursor (min (+ cursor MIN-MATCH) data-len))
              match-indices (get index triplet)

              ;; 2. Find the best match among the candidates
              best-match (when (>= (count triplet) MIN-MATCH)
                           (reduce (fn [best idx]
                                     (let [len (get-match-len data-vec cursor idx (min (- data-len cursor) WINDOW-SIZE))]
                                       (if (> len (:length best 0))
                                         {:offset (- cursor idx) :length len}
                                         best)))
                                   nil
                                   (filter #(> % (- cursor WINDOW-SIZE)) match-indices)))]

          (if (and best-match (>= (:length best-match) MIN-MATCH))
            ;; Match found
            (let [len (:length best-match)
                  ref-str (str "<" (:offset best-match) "|" len ">")
                  #_#_ref-str "<00|00"]
              (recur (+ cursor len)
                     ;; Update index for every triplet we skip (simplification: just first)
                     (assoc index triplet (conj (get index triplet []) cursor))
                     (conj! out ref-str)))

            ;; No match found
            (let [char-lit (str (nth data-vec cursor))]
              (recur (inc cursor)
                     (if (>= (count triplet) MIN-MATCH)
                       (assoc index triplet (conj (get index triplet []) cursor))
                       index)
                     (conj! out char-lit)))))))))
+
(defn compress-tokens
  "LZSS over a sequence of 16-bit token IDs.
   Returns a vector of ops: {:type :lit :val v} for literals and
   {:type :match :dist d :len l} for back-references into the already-emitted
   stream. Matching is greedy with a lazy index (positions skipped over by a
   match are not indexed), trading a little compression for speed."
  [data]
  (let [toks (vec data)
        n    (count toks)
        longest-match
        (fn [candidates pos limit]
          ;; Scan earlier occurrences of the current triplet, keeping the
          ;; longest extension among those still inside the window.
          (reduce (fn [best start]
                    (if (> start (- pos WINDOW-SIZE))
                      (let [l (get-match-len toks pos start limit)]
                        (if (> l (:len best 0))
                          {:dist (- pos start) :len l}
                          best))
                      best)) ;; candidate too old — skip it
                  nil
                  candidates))]
    (loop [pos  0
           seen {}            ;; triplet -> vector of positions where it began
           acc  (transient [])]
      (if (>= pos n)
        (persistent! acc)     ;; done: return the clean vector of ops
        (let [limit  (min (- n pos) 255) ;; cap match length for an 8-bit field
              triple (when (>= limit MIN-MATCH)
                       (subvec toks pos (+ pos MIN-MATCH)))
              best   (when triple
                       (longest-match (seen triple) pos limit))
              ;; Index the triplet at the current position (lazy indexing:
              ;; positions inside an emitted match are never added).
              seen'  (if triple
                       (update seen triple (fnil conj []) pos)
                       seen)]
          (if (and best (>= (:len best) MIN-MATCH))
            ;; CASE A: worthwhile match — emit a back-reference and jump ahead.
            (recur (+ pos (:len best))
                   seen'
                   (conj! acc {:type :match
                               :dist (:dist best)
                               :len  (:len best)}))
            ;; CASE B: no usable match — emit the token as a literal.
            (recur (inc pos)
                   seen'
                   (conj! acc {:type :lit
                               :val  (toks pos)}))))))))
+
;; LZSS-compressed stream of dictionary token ids, as a vector of op maps.
(def lzss-compressed-dict-ids (compress-tokens dict-id-compressed-text))
;; Planned serialization: literal shorts pass through as-is, and a short equal
;; to MATCH-FLAG introduces a match record that is not Huffman-encoded.
;; NOTE(review): compress-tokens currently returns {:type ...} maps, not
;; shorts — the MATCH-FLAG framing described here is a later encoding step.