diff options
Diffstat (limited to 'bible/fullcompressor.clj')
| -rw-r--r-- | bible/fullcompressor.clj | 221 |
1 files changed, 173 insertions, 48 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj index d9963ea..65a035b 100644 --- a/bible/fullcompressor.clj +++ b/bible/fullcompressor.clj @@ -93,52 +93,6 @@ (recur (inc len)) len))) -#_(defn compress - "LZSS using transient vectors and a hash-index for O(1) lookups." - [data] - (let [data-vec (vec data) - data-len (count data-vec) - #_#_MIN-MATCH 3] ;; Standard MIN-MATCH - (loop [cursor 0 - ;; map of [c1 c2 c3] -> list of indices - index {} - ;; Transient vector for high-performance output collection - out (transient [])] - (if (>= cursor data-len) - (apply str (persistent! out)) ;; Finalize the transient and join - - (let [;; 1. Check if we have a potential match using the index - triplet (subvec data-vec cursor (min (+ cursor MIN-MATCH) data-len)) - match-indices (get index triplet) - - ;; 2. Find the best match among the candidates - best-match (when (>= (count triplet) MIN-MATCH) - (reduce (fn [best idx] - (let [len (get-match-len data-vec cursor idx (min (- data-len cursor) WINDOW-SIZE))] - (if (> len (:length best 0)) - {:offset (- cursor idx) :length len} - best))) - nil - (filter #(> % (- cursor WINDOW-SIZE)) match-indices)))] - - (if (and best-match (>= (:length best-match) MIN-MATCH)) - ;; Match found - (let [len (:length best-match) - ref-str (str "<" (:offset best-match) "|" len ">") - #_#_ref-str "<00|00"] - (recur (+ cursor len) - ;; Update index for every triplet we skip (simplification: just first) - (assoc index triplet (conj (get index triplet []) cursor)) - (conj! out ref-str))) - - ;; No match found - (let [char-lit (str (nth data-vec cursor))] - (recur (inc cursor) - (if (>= (count triplet) MIN-MATCH) - (assoc index triplet (conj (get index triplet []) cursor)) - index) - (conj! out char-lit))))))))) - (defn compress-tokens "Takes a sequence of 16-bit token IDs. Returns a vector of maps: {:type :lit :val v} or {:type :match :off o :len l}" @@ -191,5 +145,176 @@ :val (nth data-vec cursor)}))))))))) (def lzss-compressed-dict-ids (compress-tokens dict-id-compressed-text)) -; The way this works is it outputs literal shorts, and then if there's a short -; that says MATCH-FLAG it doesn't bother huffman encoding. + +;; Some stats on lzss-compression +(comment + (let [totalbytes (reduce + (map (fn [lzsscode] + (if (= (:type lzsscode) :lit) + 2 + 3)) lzss-compressed-dict-ids))];A match requires 3 bytes, one for MATCH-FLAG 2 for offset and length + totalbytes) ; down to 1604373 total bytes! + ) + +;;; Third, we huffman encode + +;; If I were to write the LZSS encoding I would write just a bunch of shorts +;; Since we're huffman-encoding this, the MATCH-FLAG will be a huffman symbol +;; Which, when writing, we'll just write the 16 bit length-offset sequence in +;; un huffman-encode, so for instance 100101010100 HUFFIFIED-MATCH-FLAG 0xFFFF (but 10 bits are length, 4 are offset) + + +(def match-queue ; We'll use this when writing the file per above + (remove #(= (:type %1) :lit) lzss-compressed-dict-ids)) + +(assert (= (count lzss-compressed-dict-ids) + (+ (count match-queue) + (count (filter #(= (:type %1) :lit) lzss-compressed-dict-ids))))) + +(def huffman-friendly-lzss-compressed-dict-ids + (map (fn [lzsscode] + (if (= (:type lzsscode) :lit) + (:val lzsscode) + MATCH-FLAG)) ;MATCH-FlAG is a symbol + lzss-compressed-dict-ids)) + +(assert (= (count match-queue) + (get (frequencies huffman-friendly-lzss-compressed-dict-ids) MATCH-FLAG)) + "Somehow dropped/gained a match sequence") + +(def huffmanable-lzss-toks-freq-table + (frequencies huffman-friendly-lzss-compressed-dict-ids)) + +;; Huffman tree node +(defrecord Node [left right sym probability]) +(defn symboltable->pq [symbolfreqs] + (into (pm/priority-map-keyfn (juxt first second)) + (map #(vector + (->Node nil nil (first %1) (second %1)) + [(second %1) (first %1)]) + symbolfreqs))) + +(assert (= (count huffmanable-lzss-toks-freq-table) + (count (symboltable->pq huffmanable-lzss-toks-freq-table))) + "Priority queue has fewer symbols than symbol list") + +(defn pq->huffman-tree + "Builds a huffman-tree out of a priority-queue" + [queue] + (if (= 1 (count queue)) + (first (peek queue)) ; Repeat until there is only one parentless node left + (let [[lowest-node [lowest-prob _]] (peek queue) + [second-node [second-prob _]] (peek (pop queue)) ; Step 2 + new-prob (+ lowest-prob second-prob) ; Step 4 + new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order + next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] + (recur next-queue)))) + +(def huffman-tree (-> huffmanable-lzss-toks-freq-table + symboltable->pq + pq->huffman-tree)) + +(assert (= (.probability huffman-tree) + (reduce + (map val huffmanable-lzss-toks-freq-table))) + "Probability of root node is not equal to the sum of all probabilities") + +(defrecord HuffmanCodeword [sym code length]) +(defn huffman-tree->symbol-encodings + "Builds a list of symbol encodings out of a huffman-tree + [tree] tree to symbol encodings + [node encodings curr-encoding] recursive builder" + ([tree] + (huffman-tree->symbol-encodings tree [] 0 0)) + ([node encodings curr-encoding curr-length] + (if (.sym node) + (conj encodings (->HuffmanCodeword (.sym node) curr-encoding curr-length)) + (let [left-traversed + (huffman-tree->symbol-encodings + (.left node) + encodings + (unchecked-int (bit-or 0x00000001 (bit-shift-left curr-encoding 1))) + (inc curr-length))] + (huffman-tree->symbol-encodings + (.right node) + left-traversed + (bit-shift-left curr-encoding 1) + (inc curr-length)))))) + +(def huffman-symbol-encodings (huffman-tree->symbol-encodings huffman-tree)) + +(defn canonicalize-codewords + "Converts a collection of HuffmanCodeword records into canonical form." + [codewords] + (let [;; 1. Sort by length (primary) and symbol value (secondary) + sorted-codewords (sort-by (juxt :length :sym) codewords)] + (loop [[cw & more] sorted-codewords + last-code (unchecked-int 0) + last-length (unchecked-int (or (:length (first sorted-codewords)) 0)) + result []] + (if-not cw + result + (let [current-length (unchecked-int (:length cw)) + ;; 2. Calculate the new code: + ;; If length increased, shift the incremented previous code. + ;; If this is the very first code, it stays 0. + new-code (if (empty? result) + (unchecked-int 0) + (unchecked-int + (bit-shift-left (unchecked-inc-int last-code) + (- current-length last-length)))) + + updated-cw (assoc cw :code new-code)] + (recur more + new-code + current-length + (conj result updated-cw))))))) + +(defn verify-huffman-integrity [codewords] + (let [kraft-sum (reduce + (map #(Math/pow 2 (- (:length %))) codewords)) + sorted-cw (sort-by (juxt :length :sym) codewords) + ;; Check if codes are strictly increasing + codes-unique? (apply < (map :code sorted-cw))] + + {:kraft-sum kraft-sum + :is-complete? (== kraft-sum 1.0) + :is-valid? (<= kraft-sum 1.0) + :ordered-correctly? codes-unique?})) + +(def canonicalized-codewords (canonicalize-codewords huffman-symbol-encodings)) +(def symbol->canonical-code (into {} (map #(vector (unchecked-short (.sym %1)) [(.code %1) (.length %1)]) canonicalized-codewords))) + +(assert (= (into #{} (map :sym canonicalized-codewords)) + (into #{} (map key huffmanable-lzss-toks-freq-table)))) + +(defn bit-pack-dist-len [{:keys [dist len]}] + (let [outputbase (unchecked-short 0) + with-dist (unchecked-short + (bit-or + outputbase + (bit-shift-left dist 6))) + with-len (unchecked-short + (bit-or + with-dist + (bit-and 2r111111 len)))] + with-len)) + +(assert (= (unchecked-short 2r1111111111111111) + (bit-pack-dist-len {:dist (unchecked-short 2r1111111111) + :len (unchecked-short 2r111111)}))) +(assert (= (unchecked-short 2r1010101010000001) + (bit-pack-dist-len {:dist (unchecked-short 2r1010101010) + :len (unchecked-short 2r000001)}))) + +(def canonical-huffman-encoded-bit-sequence + (let [mq (volatile! match-queue)] + (mapcat + (fn [symb] + (if (= MATCH-FLAG symb) + (do + (vswap! mq rest) + [(symbol->canonical-code symb) [(bit-pack-dist-len (first @mq)) 16]]) + [(symbol->canonical-code symb)] + )) + huffman-friendly-lzss-compressed-dict-ids))) + +(math/ceil (/ (reduce + (map second canonical-huffman-encoded-bit-sequence)) 8 1024)) + |
