Move encode to huffman

author: Patrick Kingston <patrick@pkingston.xyz> 2026-01-23 16:38:00 -0500
committer: Patrick Kingston <patrick@pkingston.xyz> 2026-01-23 16:38:00 -0500
commit: d47553f0a7456868418ad9a4c4a5af182528f254 (patch)
tree: 397be2f873641f1982a3fd5e37d36d8f77cd5f90
parent: bcab304fb841814ea0acb4b912c8a7df5a395dab (diff)
1 files changed, 93 insertions, 15 deletions
diff --git a/bible/encode.clj b/bible/huffman.clj
index abd5365..c219b7e 100644
--- a/bible/encode.clj
+++ b/bible/huffman.clj
@@ -31,6 +31,7 @@
 
 (def full-text (slurp "bbe-newlines-nochaps.txt"))
 
+;; This is "Stage1-optimized-tokens"
 (def tokens
   (-> full-text 
       (str/lower-case)
@@ -104,17 +105,28 @@
            (reduce + (map val symbol-freqs)))
         "Probability of root node is not equal to the sum of all probabilities")
 
-(defn huffman-tree-to-symbol-encodings [node encodings curr-encoding]
+(defn huffman-tree-to-symbol-encodings 
+  "Builds a list of symbol encodings out of a huffman-tree
+   [tree] tree to symbol encodings
+   [node encodings curr-encoding] recursive builder"
+   ([tree]
+    (huffman-tree-to-symbol-encodings tree {} ""))
+  ([node encodings curr-encoding]
   (if (.sym node)
     (assoc encodings (.sym node) curr-encoding)
     (merge
       (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
-      (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))
+      (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding))))))
 
 (def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))
+
+(assert (= (huffman-tree-to-symbol-encodings huffman-tree {} "")
+           (huffman-tree-to-symbol-encodings huffman-tree))
+        "Multi-arity def does not work as intended")
 (assert (= (count huffman-tree-syms)
-           (count pq)
-           (count symbol-freqs)))
+           (count (symboltable->pq symbol-freqs))
+           (count symbol-freqs))
+        "Some function is adding or removing symbols")
 
 ;;; Build the canonical encodings
 
@@ -126,7 +138,7 @@
 ;;     When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.
 
 
-;; little class for Huffman Codewords
+;; --- little class for Huffman Codewords
 (defrecord HuffmanCodeword [sym code length])
 (defn get-codeword-string 
   "Takes a [HuffmanCodeword] and returns a binary str of the code"
@@ -138,16 +150,23 @@
       base
       (str (apply str (take (- len basecount) (repeat "0"))) base))))
 
+;; --- little class for Huffman Codewords
+
 (def sorted-huffman-tree-codewords 
   (->> huffman-tree-syms
        (sort-by (juxt (comp count val) key))
        (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))
 
 (defn build-canonical-encodings
-  "Build canonical huffman encodings from a sorted list of huffman tree codewords
+  "Build canonical huffman encodings from a huffman tree
    takes [symbols] a list of huffman codes derived from a code-tree"
-  ([symbols]
-   (let [first-sym (first symbols)
+  ([tree]
+   (let [symbols (->> tree
+                      (huffman-tree-to-symbol-encodings)
+                      (sort-by (juxt (comp count val) key))
+
+                      (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1))))))
+         first-sym (first symbols)
          seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))]
      (build-canonical-encodings [seed-symbol] (rest symbols))))
   ([codes symbols]
@@ -175,7 +194,7 @@
      codes)))
 
 (def canonical-encodings
-  (build-canonical-encodings sorted-huffman-tree-codewords))
+  (build-canonical-encodings huffman-tree))
 
 
 (assert
@@ -192,20 +211,29 @@
   (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings))
         totalbits (reduce + (map encodingtable tokenlist))
         totalbytes (math/ceil (/ totalbits 8))
-        totalkb (math/ceil (/ totalbytes 1024))
-        totalmb (math/ceil (/ totalkb 1024))]
+        totalkb (/ totalbytes 1024)
+        totalmb (/ totalkb 1024)]
     {:bits totalbits
      :bytes totalbytes
      :kb totalkb
      :mb totalmb}))
 
-(count (frequencies (str/replace full-text #"\*" "")))
 
-(def raw-token-ascii-encoding
-  (get-encoded-length canonical-encodings tokens))
+(comment
+  (def base-file-size
+    (let [totalbits (* 8 (count full-text))
+          totalbytes (math/ceil (/ totalbits 8))
+          totalkb (/ totalbytes 1024)
+          totalmb (/ totalkb 1024)]
+      {:bits totalbits
+       :bytes totalbytes
+       :kb totalkb
+       :mb totalmb}))
+  (def stage1-optimized-token-encoding-length
+    (get-encoded-length canonical-encodings tokens)))
 
 (comment
-  "Figuring out how to make the shortest bit smaller"
+  "Some scratchwork figuring out how to make the shortest bit smaller"
   (defn get-highest-freq-percent [freqlist]
     (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes
     [(key most-freq)
@@ -219,3 +247,53 @@
   (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens
   )
 
+;;; NOTE FOR VIDEO:
+;; First, consider some kind of bit-packing scheme 
+;;  - 65 total chars
+;;  - Shave it down to 63
+;;  - Remove caps/reduce even furhter to try and get it to 31
+;; Then introduce huffman encodings
+;;  - Raw text
+;;  - Text with some of the above optimizations
+;;  - Tokenize at spaces and punctuation
+;; Then consider tokenizing I started off by tokenizing the text with some optimizations in place with stage1-optimized-token-encoding-length
+
+(comment
+  "Huffman on the raw text (65 total symbols)"
+  (let [tokenized (seq full-text)
+        symbol-freqs (frequencies tokenized)
+        huff-tree (build-huffman-tree symbol-freqs)
+        canonical-codes (build-canonical-encodings huff-tree)]
+    (get-encoded-length canonical-codes tokenized))
+  "Huffman on the above with normalized space and without * (63 total symbols)"
+  (let [tokenized (seq (-> full-text 
+                           (str/replace #"\s+" " ")
+                           (str/replace #"\*" " ")))
+        symbol-freqs (frequencies tokenized)
+        huff-tree (build-huffman-tree symbol-freqs)
+        canonical-codes (build-canonical-encodings huff-tree)]
+    (get-encoded-length canonical-codes tokenized))
+  "Huffman on the above in all lowercase (38 total symbols)"
+  (let [tokenized (seq (-> full-text 
+                           (str/replace #"\s+" " ")
+                           (str/replace #"[\*]" "")
+                           (str/lower-case)))
+        symbol-freqs (frequencies tokenized)
+        huff-tree (build-huffman-tree symbol-freqs)
+        canonical-codes (build-canonical-encodings huff-tree)]
+    (get-encoded-length canonical-codes tokenized))
+  "Huffman on the above with char changes —()?!:; (31 total symbols)"
+  (let [tokenized (seq (-> full-text 
+                           (str/replace #"—" "-") ; Replace emdash with hyphen
+                           (str/replace #"[\*]" "") ; Get rid of chars
+                           (str/replace #"\([^)]*\)" "") ; Get rid of everything between parens
+                           (str/replace #"[?!]" ".") ; Replace all final punct with periods
+                           (str/replace #"[;:]" ",") ; Replace all middle punct with commas
+                           (str/replace #"\s+" " ") ; Normalize space
+                           (str/lower-case)))
+        symbol-freqs (frequencies tokenized)
+        huff-tree (build-huffman-tree symbol-freqs)
+        canonical-codes (build-canonical-encodings huff-tree)]
+    (get-encoded-length canonical-codes tokenized))
+  )
+
author	Patrick Kingston <patrick@pkingston.xyz>	2026-01-23 16:38:00 -0500
committer	Patrick Kingston <patrick@pkingston.xyz>	2026-01-23 16:38:00 -0500
commit	d47553f0a7456868418ad9a4c4a5af182528f254 (patch)
tree	397be2f873641f1982a3fd5e37d36d8f77cd5f90
parent	bcab304fb841814ea0acb4b912c8a7df5a395dab (diff)