aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--bible/fullcompressor.clj221
1 files changed, 173 insertions, 48 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj
index d9963ea..65a035b 100644
--- a/bible/fullcompressor.clj
+++ b/bible/fullcompressor.clj
@@ -93,52 +93,6 @@
(recur (inc len))
len)))
-#_(defn compress
- "LZSS using transient vectors and a hash-index for O(1) lookups."
- [data]
- (let [data-vec (vec data)
- data-len (count data-vec)
- #_#_MIN-MATCH 3] ;; Standard MIN-MATCH
- (loop [cursor 0
- ;; map of [c1 c2 c3] -> list of indices
- index {}
- ;; Transient vector for high-performance output collection
- out (transient [])]
- (if (>= cursor data-len)
- (apply str (persistent! out)) ;; Finalize the transient and join
-
- (let [;; 1. Check if we have a potential match using the index
- triplet (subvec data-vec cursor (min (+ cursor MIN-MATCH) data-len))
- match-indices (get index triplet)
-
- ;; 2. Find the best match among the candidates
- best-match (when (>= (count triplet) MIN-MATCH)
- (reduce (fn [best idx]
- (let [len (get-match-len data-vec cursor idx (min (- data-len cursor) WINDOW-SIZE))]
- (if (> len (:length best 0))
- {:offset (- cursor idx) :length len}
- best)))
- nil
- (filter #(> % (- cursor WINDOW-SIZE)) match-indices)))]
-
- (if (and best-match (>= (:length best-match) MIN-MATCH))
- ;; Match found
- (let [len (:length best-match)
- ref-str (str "<" (:offset best-match) "|" len ">")
- #_#_ref-str "<00|00"]
- (recur (+ cursor len)
- ;; Update index for every triplet we skip (simplification: just first)
- (assoc index triplet (conj (get index triplet []) cursor))
- (conj! out ref-str)))
-
- ;; No match found
- (let [char-lit (str (nth data-vec cursor))]
- (recur (inc cursor)
- (if (>= (count triplet) MIN-MATCH)
- (assoc index triplet (conj (get index triplet []) cursor))
- index)
- (conj! out char-lit)))))))))
-
(defn compress-tokens
"Takes a sequence of 16-bit token IDs.
Returns a vector of maps: {:type :lit :val v} or {:type :match :off o :len l}"
@@ -191,5 +145,176 @@
:val (nth data-vec cursor)})))))))))
(def lzss-compressed-dict-ids (compress-tokens dict-id-compressed-text))
-; The way this works is it outputs literal shorts, and then if there's a short
-; that says MATCH-FLAG it doesn't bother huffman encoding.
+
+;; Some stats on lzss-compression
+(comment
+ (let [totalbytes (reduce + (map (fn [lzsscode]
+ (if (= (:type lzsscode) :lit)
+ 2
+ 3)) lzss-compressed-dict-ids))];A match requires 3 bytes, one for MATCH-FLAG 2 for offset and length
+ totalbytes) ; down to 1604373 total bytes!
+ )
+
+;;; Third, we huffman encode
+
+;; If I were to write the LZSS encoding I would write just a bunch of shorts
+;; Since we're huffman-encoding this, the MATCH-FLAG will be a huffman symbol
+;; Which, when writing, we'll just write the 16 bit length-offset sequence in
+;; un huffman-encode, so for instance 100101010100 HUFFIFIED-MATCH-FLAG 0xFFFF (but 10 bits are length, 4 are offset)
+
+
+(def match-queue ; We'll use this when writing the file per above
+ (remove #(= (:type %1) :lit) lzss-compressed-dict-ids))
+
+(assert (= (count lzss-compressed-dict-ids)
+ (+ (count match-queue)
+ (count (filter #(= (:type %1) :lit) lzss-compressed-dict-ids)))))
+
+(def huffman-friendly-lzss-compressed-dict-ids
+ (map (fn [lzsscode]
+ (if (= (:type lzsscode) :lit)
+ (:val lzsscode)
+ MATCH-FLAG)) ;MATCH-FlAG is a symbol
+ lzss-compressed-dict-ids))
+
+(assert (= (count match-queue)
+ (get (frequencies huffman-friendly-lzss-compressed-dict-ids) MATCH-FLAG))
+ "Somehow dropped/gained a match sequence")
+
+(def huffmanable-lzss-toks-freq-table
+ (frequencies huffman-friendly-lzss-compressed-dict-ids))
+
+;; Huffman tree node
+(defrecord Node [left right sym probability])
+(defn symboltable->pq [symbolfreqs]
+ (into (pm/priority-map-keyfn (juxt first second))
+ (map #(vector
+ (->Node nil nil (first %1) (second %1))
+ [(second %1) (first %1)])
+ symbolfreqs)))
+
+(assert (= (count huffmanable-lzss-toks-freq-table)
+ (count (symboltable->pq huffmanable-lzss-toks-freq-table)))
+ "Priority queue has fewer symbols than symbol list")
+
+(defn pq->huffman-tree
+ "Builds a huffman-tree out of a priority-queue"
+ [queue]
+ (if (= 1 (count queue))
+ (first (peek queue)) ; Repeat until there is only one parentless node left
+ (let [[lowest-node [lowest-prob _]] (peek queue)
+ [second-node [second-prob _]] (peek (pop queue)) ; Step 2
+ new-prob (+ lowest-prob second-prob) ; Step 4
+ new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order
+ next-queue (assoc (pop (pop queue)) new-node [new-prob nil])]
+ (recur next-queue))))
+
+(def huffman-tree (-> huffmanable-lzss-toks-freq-table
+ symboltable->pq
+ pq->huffman-tree))
+
+(assert (= (.probability huffman-tree)
+ (reduce + (map val huffmanable-lzss-toks-freq-table)))
+ "Probability of root node is not equal to the sum of all probabilities")
+
+(defrecord HuffmanCodeword [sym code length])
+(defn huffman-tree->symbol-encodings
+ "Builds a list of symbol encodings out of a huffman-tree
+ [tree] tree to symbol encodings
+ [node encodings curr-encoding] recursive builder"
+ ([tree]
+ (huffman-tree->symbol-encodings tree [] 0 0))
+ ([node encodings curr-encoding curr-length]
+ (if (.sym node)
+ (conj encodings (->HuffmanCodeword (.sym node) curr-encoding curr-length))
+ (let [left-traversed
+ (huffman-tree->symbol-encodings
+ (.left node)
+ encodings
+ (unchecked-int (bit-or 0x00000001 (bit-shift-left curr-encoding 1)))
+ (inc curr-length))]
+ (huffman-tree->symbol-encodings
+ (.right node)
+ left-traversed
+ (bit-shift-left curr-encoding 1)
+ (inc curr-length))))))
+
+(def huffman-symbol-encodings (huffman-tree->symbol-encodings huffman-tree))
+
+(defn canonicalize-codewords
+ "Converts a collection of HuffmanCodeword records into canonical form."
+ [codewords]
+ (let [;; 1. Sort by length (primary) and symbol value (secondary)
+ sorted-codewords (sort-by (juxt :length :sym) codewords)]
+ (loop [[cw & more] sorted-codewords
+ last-code (unchecked-int 0)
+ last-length (unchecked-int (or (:length (first sorted-codewords)) 0))
+ result []]
+ (if-not cw
+ result
+ (let [current-length (unchecked-int (:length cw))
+ ;; 2. Calculate the new code:
+ ;; If length increased, shift the incremented previous code.
+ ;; If this is the very first code, it stays 0.
+ new-code (if (empty? result)
+ (unchecked-int 0)
+ (unchecked-int
+ (bit-shift-left (unchecked-inc-int last-code)
+ (- current-length last-length))))
+
+ updated-cw (assoc cw :code new-code)]
+ (recur more
+ new-code
+ current-length
+ (conj result updated-cw)))))))
+
+(defn verify-huffman-integrity [codewords]
+ (let [kraft-sum (reduce + (map #(Math/pow 2 (- (:length %))) codewords))
+ sorted-cw (sort-by (juxt :length :sym) codewords)
+ ;; Check if codes are strictly increasing
+ codes-unique? (apply < (map :code sorted-cw))]
+
+ {:kraft-sum kraft-sum
+ :is-complete? (== kraft-sum 1.0)
+ :is-valid? (<= kraft-sum 1.0)
+ :ordered-correctly? codes-unique?}))
+
+(def canonicalized-codewords (canonicalize-codewords huffman-symbol-encodings))
+(def symbol->canonical-code (into {} (map #(vector (unchecked-short (.sym %1)) [(.code %1) (.length %1)]) canonicalized-codewords)))
+
+(assert (= (into #{} (map :sym canonicalized-codewords))
+ (into #{} (map key huffmanable-lzss-toks-freq-table))))
+
+(defn bit-pack-dist-len [{:keys [dist len]}]
+ (let [outputbase (unchecked-short 0)
+ with-dist (unchecked-short
+ (bit-or
+ outputbase
+ (bit-shift-left dist 6)))
+ with-len (unchecked-short
+ (bit-or
+ with-dist
+ (bit-and 2r111111 len)))]
+ with-len))
+
+(assert (= (unchecked-short 2r1111111111111111)
+ (bit-pack-dist-len {:dist (unchecked-short 2r1111111111)
+ :len (unchecked-short 2r111111)})))
+(assert (= (unchecked-short 2r1010101010000001)
+ (bit-pack-dist-len {:dist (unchecked-short 2r1010101010)
+ :len (unchecked-short 2r000001)})))
+
+(def canonical-huffman-encoded-bit-sequence
+ (let [mq (volatile! match-queue)]
+ (mapcat
+ (fn [symb]
+ (if (= MATCH-FLAG symb)
+ (do
+ (vswap! mq rest)
+ [(symbol->canonical-code symb) [(bit-pack-dist-len (first @mq)) 16]])
+ [(symbol->canonical-code symb)]
+ ))
+ huffman-friendly-lzss-compressed-dict-ids)))
+
+(math/ceil (/ (reduce + (map second canonical-huffman-encoded-bit-sequence)) 8 1024))
+