diff options
Diffstat (limited to 'bible/fullcompressor.clj')
| -rw-r--r-- | bible/fullcompressor.clj | 123 |
1 file changed, 119 insertions(+), 4 deletions(-)
;; --- Reconstructed post-patch state of bible/fullcompressor.clj (hunk @@ -66,17 +66,132 @@),
;; --- recovered from a whitespace-mangled unified-diff paste.
;; NOTE(review): the first form below is the *tail* of a definition whose opening
;; lines sit above this hunk; it is reproduced as-is and cannot stand alone.

  (into {}
        (map-indexed (fn [id [tok _freq]]
                       ;; One 16-bit id per token (replaces the old [hi lo]
                       ;; two-byte pair; the discarded #_ byte-pair form was
                       ;; removed as dead code).
                       [tok (unchecked-short (bit-and 0xffff id))])
                     sorted-toks))]
    token-reprs))

;; The whole text as a stream of 16-bit token ids, one per word.
;; NOTE(review): `map` is lazy and no longer flattened — downstream code must
;; force it (compress-tokens does, via `vec`); confirm no consumer still
;; expects the old flat byte seq.
(def dict-id-compressed-text
  (map word-ids optimized-tokens))

(comment
  (count dict-id-compressed-text) ; Whittled it down to 1925736 total bytes with 16 bit indices (it's 962868 shorts)
  )

;;; TODO: Build dictionary
;;; Next, we'll run LZSS on our tok-id-list

(def WINDOW-SIZE 1024) ; Maximum look-back distance — decompressor needs only 1 KiB of history RAM
(def MIN-MATCH 3)      ; Minimum run length worth emitting a back-reference for

;; Escape value for back-references: token ids are 0..(total-1), so the first
;; unused id (the distinct-token count) can mean "a match follows". Encoding
;; sketch: MATCH-FLAG, then 10 bits of offset (window of 1024) and 8 bits of
;; length (up to 255 tokens). (Fixed: earlier note said 4 bits, but a length
;; of 255 needs 8 bits — and compress-tokens caps matches at 255.)
;; NOTE(review): assumes the distinct-token count itself still fits in the same
;; 16-bit space as the ids (< 65536) — confirm before emitting the stream.
(def MATCH-FLAG (count (frequencies dict-id-compressed-text)))

(defn- get-match-len
  "Length of the common prefix of data-vec[pos..] and data-vec[match-pos..],
  capped at max-len. Caller guarantees pos + max-len <= (count data-vec)."
  [data-vec pos match-pos max-len]
  (loop [len 0]
    (if (and (< len max-len)
             (= (nth data-vec (+ pos len))
                (nth data-vec (+ match-pos len))))
      (recur (inc len))
      len)))

;; (An earlier reader-discarded #_ prototype, `compress`, which emitted
;;  "<offset|len>" strings instead of data maps, was removed as dead code;
;;  recover it from version-control history if needed.)

(defn compress-tokens
  "LZSS over a sequence of 16-bit token ids.
  Returns a vector of maps:
    {:type :lit   :val v}           — a literal token id
    {:type :match :dist d :len l}   — a back-reference, 1 <= d <= WINDOW-SIZE,
                                      MIN-MATCH <= l <= 255
  A hash index from id-triplets to past positions gives O(1) candidate lookup.
  Indexing is lazy (only the first position of each emitted item is recorded),
  trading a little compression ratio for speed."
  [data]
  (let [data-vec (vec data)
        data-len (count data-vec)]
    (loop [cursor 0
           index  {}            ; [t0 t1 t2] triplet -> vector of past positions
           out    (transient [])]
      (if (>= cursor data-len)
        (persistent! out)
        (let [;; Cap match length at 255 so it fits an 8-bit length field.
              max-search-len (min (- data-len cursor) 255)
              ;; Triplet key for the index; nil near end-of-data.
              triplet        (when (>= max-search-len MIN-MATCH)
                               (subvec data-vec cursor (+ cursor MIN-MATCH)))
              candidates     (get index triplet)
              ;; Longest match among in-window candidate positions.
              best-match     (when (and triplet candidates)
                               (reduce
                                (fn [best pos]
                                  (if (> pos (- cursor WINDOW-SIZE)) ; inside window?
                                    (let [len (get-match-len data-vec cursor pos
                                                             max-search-len)]
                                      (if (> len (:len best 0))
                                        {:dist (- cursor pos) :len len}
                                        best))
                                    best)) ; too old — skip
                                nil
                                candidates))
              ;; Record the current position under its triplet (hoisted: both
              ;; branches below performed this same update).
              index'         (if triplet
                               (assoc index triplet
                                      (conj (get index triplet []) cursor))
                               index)]
          (if (and best-match (>= (:len best-match) MIN-MATCH))
            ;; CASE A: match found — skip ahead by its length.
            (recur (+ cursor (:len best-match))
                   index'
                   (conj! out {:type :match
                               :dist (:dist best-match)
                               :len  (:len best-match)}))
            ;; CASE B: literal — emit one token and advance by one.
            (recur (inc cursor)
                   index'
                   (conj! out {:type :lit
                               :val  (nth data-vec cursor)}))))))))

(def lzss-compressed-dict-ids (compress-tokens dict-id-compressed-text))
;; The emitted stream is literal shorts; a short equal to MATCH-FLAG introduces
;; a back-reference, which is not Huffman encoded.
