1 files changed, 173 insertions, 48 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj
index d9963ea..65a035b 100644
--- a/bible/fullcompressor.clj
+++ b/bible/fullcompressor.clj
@@ -93,52 +93,6 @@
       (recur (inc len))
       len)))
 
-#_(defn compress
-  "LZSS using transient vectors and a hash-index for O(1) lookups."
-  [data]
-  (let [data-vec (vec data)
-        data-len (count data-vec)
-        #_#_MIN-MATCH 3] ;; Standard MIN-MATCH
-    (loop [cursor 0
-           ;; map of [c1 c2 c3] -> list of indices
-           index {} 
-           ;; Transient vector for high-performance output collection
-           out (transient [])]
-      (if (>= cursor data-len)
-        (apply str (persistent! out)) ;; Finalize the transient and join
-        
-        (let [;; 1. Check if we have a potential match using the index
-              triplet (subvec data-vec cursor (min (+ cursor MIN-MATCH) data-len))
-              match-indices (get index triplet)
-              
-              ;; 2. Find the best match among the candidates
-              best-match (when (>= (count triplet) MIN-MATCH)
-                           (reduce (fn [best idx]
-                                     (let [len (get-match-len data-vec cursor idx (min (- data-len cursor) WINDOW-SIZE))]
-                                       (if (> len (:length best 0))
-                                         {:offset (- cursor idx) :length len}
-                                         best)))
-                                   nil
-                                   (filter #(> % (- cursor WINDOW-SIZE)) match-indices)))]
-          
-          (if (and best-match (>= (:length best-match) MIN-MATCH))
-            ;; Match found
-            (let [len (:length best-match)
-                  ref-str (str "<" (:offset best-match) "|" len ">")
-                  #_#_ref-str "<00|00"]
-              (recur (+ cursor len)
-                     ;; Update index for every triplet we skip (simplification: just first)
-                     (assoc index triplet (conj (get index triplet []) cursor))
-                     (conj! out ref-str)))
-            
-            ;; No match found
-            (let [char-lit (str (nth data-vec cursor))]
-              (recur (inc cursor)
-                     (if (>= (count triplet) MIN-MATCH)
-                       (assoc index triplet (conj (get index triplet []) cursor))
-                       index)
-                     (conj! out char-lit)))))))))
-
 (defn compress-tokens
   "Takes a sequence of 16-bit token IDs. 
    Returns a vector of maps: {:type :lit :val v} or {:type :match :off o :len l}"
@@ -191,5 +145,176 @@
                                  :val (nth data-vec cursor)})))))))))
 
 (def lzss-compressed-dict-ids (compress-tokens dict-id-compressed-text))
-; The way this works is it outputs literal shorts, and then if there's a short
-; that says MATCH-FLAG it doesn't bother huffman encoding. 
+
+;; Some stats on lzss-compression
+(comment
+  (let [totalbytes (reduce + (map (fn [lzsscode]
+         (if (= (:type lzsscode) :lit)
+           2
+           3)) lzss-compressed-dict-ids))];A match requires 3 bytes, one for MATCH-FLAG 2 for offset and length
+    totalbytes) ; down to 1604373 total bytes!
+  ) 
+
+;;; Third, we huffman encode
+
+;; If I were to write the LZSS encoding I would write just a bunch of shorts
+;; Since we're huffman-encoding this, the MATCH-FLAG will be a huffman symbol
+;; Which, when writing, we'll just write the 16 bit length-offset sequence in
+;; un huffman-encode, so for instance 100101010100 HUFFIFIED-MATCH-FLAG 0xFFFF (but 10 bits are length, 4 are offset)
+
+
+(def match-queue ; We'll use this when writing the file per above
+  (remove #(= (:type %1) :lit) lzss-compressed-dict-ids))
+
+(assert (= (count lzss-compressed-dict-ids)
+           (+ (count match-queue)
+              (count (filter #(= (:type %1) :lit) lzss-compressed-dict-ids)))))
+
+(def huffman-friendly-lzss-compressed-dict-ids
+  (map (fn [lzsscode]
+         (if (= (:type lzsscode) :lit)
+           (:val lzsscode)
+           MATCH-FLAG)) ;MATCH-FlAG is a symbol
+       lzss-compressed-dict-ids))
+
+(assert (= (count match-queue)
+           (get (frequencies huffman-friendly-lzss-compressed-dict-ids) MATCH-FLAG))
+        "Somehow dropped/gained a match sequence")
+
+(def huffmanable-lzss-toks-freq-table
+  (frequencies huffman-friendly-lzss-compressed-dict-ids)) 
+
+;; Huffman tree node
+(defrecord Node [left right sym probability])
+(defn symboltable->pq [symbolfreqs]
+  (into (pm/priority-map-keyfn (juxt first second))
+        (map #(vector
+                (->Node nil nil (first %1) (second %1))
+                [(second %1) (first %1)])
+             symbolfreqs)))
+
+(assert (= (count huffmanable-lzss-toks-freq-table) 
+           (count (symboltable->pq huffmanable-lzss-toks-freq-table))) 
+        "Priority queue has fewer symbols than symbol list")
+
+(defn pq->huffman-tree
+  "Builds a huffman-tree out of a priority-queue"
+  [queue]
+  (if (= 1 (count queue))
+    (first (peek queue))  ; Repeat until there is only one parentless node left
+    (let [[lowest-node [lowest-prob _]] (peek queue)
+          [second-node [second-prob _]] (peek (pop queue))  ; Step 2
+          new-prob (+ lowest-prob second-prob) ; Step 4
+          new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order
+          next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] 
+      (recur next-queue))))
+
+(def huffman-tree (-> huffmanable-lzss-toks-freq-table
+                  symboltable->pq
+                  pq->huffman-tree))
+
+(assert (= (.probability huffman-tree)
+           (reduce + (map val huffmanable-lzss-toks-freq-table)))
+        "Probability of root node is not equal to the sum of all probabilities")
+
+(defrecord HuffmanCodeword [sym code length])
+(defn huffman-tree->symbol-encodings 
+  "Builds a list of symbol encodings out of a huffman-tree
+   [tree] tree to symbol encodings
+   [node encodings curr-encoding] recursive builder"
+   ([tree]
+    (huffman-tree->symbol-encodings tree [] 0 0))
+  ([node encodings curr-encoding curr-length]
+  (if (.sym node)
+    (conj encodings (->HuffmanCodeword (.sym node) curr-encoding curr-length))
+    (let [left-traversed
+          (huffman-tree->symbol-encodings 
+            (.left node) 
+            encodings 
+            (unchecked-int (bit-or 0x00000001 (bit-shift-left curr-encoding 1)))
+            (inc curr-length))]
+      (huffman-tree->symbol-encodings 
+        (.right node) 
+        left-traversed 
+        (bit-shift-left curr-encoding 1)
+        (inc curr-length))))))
+
+(def huffman-symbol-encodings (huffman-tree->symbol-encodings huffman-tree))
+
+(defn canonicalize-codewords
+  "Converts a collection of HuffmanCodeword records into canonical form."
+  [codewords]
+  (let [;; 1. Sort by length (primary) and symbol value (secondary)
+        sorted-codewords (sort-by (juxt :length :sym) codewords)]
+    (loop [[cw & more] sorted-codewords
+           last-code   (unchecked-int 0)
+           last-length (unchecked-int (or (:length (first sorted-codewords)) 0))
+           result      []]
+      (if-not cw
+        result
+        (let [current-length (unchecked-int (:length cw))
+              ;; 2. Calculate the new code:
+              ;; If length increased, shift the incremented previous code.
+              ;; If this is the very first code, it stays 0.
+              new-code (if (empty? result)
+                         (unchecked-int 0)
+                         (unchecked-int 
+                           (bit-shift-left (unchecked-inc-int last-code) 
+                                           (- current-length last-length))))
+              
+              updated-cw (assoc cw :code new-code)]
+          (recur more 
+                 new-code 
+                 current-length 
+                 (conj result updated-cw)))))))
+
+(defn verify-huffman-integrity [codewords]
+  (let [kraft-sum (reduce + (map #(Math/pow 2 (- (:length %))) codewords))
+        sorted-cw (sort-by (juxt :length :sym) codewords)
+        ;; Check if codes are strictly increasing
+        codes-unique? (apply < (map :code sorted-cw))]
+    
+    {:kraft-sum kraft-sum
+     :is-complete? (== kraft-sum 1.0)
+     :is-valid? (<= kraft-sum 1.0)
+     :ordered-correctly? codes-unique?}))
+
+(def canonicalized-codewords (canonicalize-codewords huffman-symbol-encodings))
+(def symbol->canonical-code (into {} (map #(vector (unchecked-short (.sym %1)) [(.code %1) (.length %1)]) canonicalized-codewords)))
+
+(assert (= (into #{} (map :sym canonicalized-codewords))
+           (into #{} (map key huffmanable-lzss-toks-freq-table))))
+
+(defn bit-pack-dist-len [{:keys [dist len]}]
+  (let [outputbase (unchecked-short 0)
+        with-dist (unchecked-short
+                    (bit-or 
+                      outputbase
+                      (bit-shift-left dist 6)))
+        with-len (unchecked-short
+                   (bit-or
+                     with-dist
+                     (bit-and 2r111111 len)))]
+    with-len))
+
+(assert (= (unchecked-short 2r1111111111111111)
+           (bit-pack-dist-len {:dist (unchecked-short 2r1111111111)
+                               :len (unchecked-short 2r111111)})))
+(assert (= (unchecked-short 2r1010101010000001)
+           (bit-pack-dist-len {:dist (unchecked-short 2r1010101010)
+                               :len (unchecked-short 2r000001)})))
+
+(def canonical-huffman-encoded-bit-sequence
+  (let [mq (volatile! match-queue)]
+    (mapcat
+      (fn [symb]
+           (if (= MATCH-FLAG symb)
+             (do 
+               (vswap! mq rest)
+               [(symbol->canonical-code symb) [(bit-pack-dist-len (first @mq)) 16]])
+             [(symbol->canonical-code symb)]
+             ))
+      huffman-friendly-lzss-compressed-dict-ids)))
+
+(math/ceil (/ (reduce + (map second canonical-huffman-encoded-bit-sequence)) 8 1024))
+