From d47553f0a7456868418ad9a4c4a5af182528f254 Mon Sep 17 00:00:00 2001
From: Patrick Kingston <patrick@pkingston.xyz>
Date: Fri, 23 Jan 2026 16:38:00 -0500
Subject: Move encode to huffman

---
 bible/encode.clj  | 221 ----------------------------------------
 bible/huffman.clj | 299 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 299 insertions(+), 221 deletions(-)
 delete mode 100644 bible/encode.clj
 create mode 100644 bible/huffman.clj

diff --git a/bible/encode.clj b/bible/encode.clj
deleted file mode 100644
index abd5365..0000000
--- a/bible/encode.clj
+++ /dev/null
@@ -1,221 +0,0 @@
-(require '[clojure.string :as str])
-
-(require '[clojure.java.io :as io])
-(require '[babashka.fs :as fs])
-
-(require '[clojure.data.priority-map :as pm])
-
-(require '[clojure.math :as math])
-
-(comment
-  "Build the base file"
-  (def files (fs/glob "./base_files/" "**.txt"))
-
-  (defn get-book-num [filename]
-    (let [[_ _ book _ _]
-          (str/split (str filename) #"_")
-          #_#_chap (int _chap)]
-      (Integer/parseInt book)))
-
-  (defn get-chap-num [filename]
-    (let [[_ _ _ _ chap]
-          (str/split (str filename) #"_")
-          #_#_chap (int _chap)]
-      (Integer/parseInt chap)))
-
-  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
-    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
-      (with-open [reader (io/reader (fs/file f))]
-        (doseq [line (drop 2 (line-seq reader))]
-          (.write writer (str line "\n")))))))
-
-(def full-text (slurp "bbe-newlines-nochaps.txt"))
-
-(def tokens
-  (-> full-text 
-      (str/lower-case)
-      (str/replace #"\s+" " ")
-      (str/replace #"'s" " AS ") ;; Apostrophe_S
-      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
-      (str/split #" ")
-      (#(remove str/blank? %1))))
-
-(def symbol-freqs (frequencies tokens))
-
-(comment
-  "Do some basic statistics and print a list of tokens"
-  (spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))
-
-  (sort-by val > symbol-freqs)  ; Greatest to lease frequency
-  (reduce + (map val symbol-freqs))  ; Total tokens
-  (count symbol-freqs)  ; Total unique tokens
-  (reduce + (take 512 (map val symbol-freqs)))  ; Number of the top 100 common tokens
-
-  (reduce + (map count symbol-freqs))  ; Total chars needed for dict vals
-
-  (def two-grams (frequencies (partition 2 1 tokens)))
-  (sort-by val > two-grams))
-
-
-
-
-;;; Make the huffman tree for the symbols (13)
-(defrecord Node [left right sym probability])
-
-; Create a prioirity-queue of parentless nodes for each symbol
-(defn symboltable->pq [symbolfreqs]
-  (into (pm/priority-map-keyfn (juxt first second))
-        (map #(vector
-                (->Node nil nil (first %1) (second %1))
-                [(second %1) (first %1)])
-             symbolfreqs)))
-
-(assert (= (count symbol-freqs) (count (symboltable->pq symbol-freqs))) "Priority queue has fewer symbols than symbol list")
-
-;; From: https://michaeldipperstein.github.io/huffman.html#decode
-;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability.
-;; Step 2. Select the two parentless nodes with the lowest probabilities.
-;; Step 3. Create a new node which is the parent of the two lowest probability nodes.
-;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities.
-;; Step 5. Repeat from Step 2 until there is only one parentless node left.
-
-;; NOTE: This is an inefficient algorithm because we could use the 
-;;       two-queue version on wikipedia
-(defn build-huffman-tree-from-queue
-  "Builds a huffman-tree out of a priority-queue"
-  [queue]
-  (if (= 1 (count queue))
-    (first (peek queue))  ; Repeat until there is only one parentless node left
-    (let [[lowest-node [lowest-prob _]] (peek queue)
-          [second-node [second-prob _]] (peek (pop queue))  ; Step 2
-          new-prob (+ lowest-prob second-prob) ; Step 4
-          new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order
-          next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] 
-      (recur next-queue))))
-
-(defn build-huffman-tree
-  "builds a huffman-tree out of a frequency table of symbols"
-  [symbol-freq-table]
-  (build-huffman-tree-from-queue (symboltable->pq symbol-freq-table)))
-
-(def huffman-tree (build-huffman-tree symbol-freqs))
-
-(assert (= (.probability huffman-tree)
-           (reduce + (map val symbol-freqs)))
-        "Probability of root node is not equal to the sum of all probabilities")
-
-(defn huffman-tree-to-symbol-encodings [node encodings curr-encoding]
-  (if (.sym node)
-    (assoc encodings (.sym node) curr-encoding)
-    (merge
-      (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
-      (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))
-
-(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))
-(assert (= (count huffman-tree-syms)
-           (count pq)
-           (count symbol-freqs)))
-
-;;; Build the canonical encodings
-
-;; From: https://en.wikipedia.org/wiki/Canonical_Huffman_code
-;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm:
-
-;;     The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0').
-;;     Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value.
-;;     When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.
-
-
-;; little class for Huffman Codewords
-(defrecord HuffmanCodeword [sym code length])
-(defn get-codeword-string 
-  "Takes a [HuffmanCodeword] and returns a binary str of the code"
-  [codeword]
-  (let [base (Long/toBinaryString (.code codeword))
-        basecount (count base)
-        len (.length codeword)]
-    (if (= basecount len)
-      base
-      (str (apply str (take (- len basecount) (repeat "0"))) base))))
-
-(def sorted-huffman-tree-codewords 
-  (->> huffman-tree-syms
-       (sort-by (juxt (comp count val) key))
-       (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))
-
-(defn build-canonical-encodings
-  "Build canonical huffman encodings from a sorted list of huffman tree codewords
-   takes [symbols] a list of huffman codes derived from a code-tree"
-  ([symbols]
-   (let [first-sym (first symbols)
-         seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))]
-     (build-canonical-encodings [seed-symbol] (rest symbols))))
-  ([codes symbols]
-   (if (not-empty symbols)
-     (let [prev-codeword (last codes)
-           current-codeword (first symbols)
-           next-sym (.sym current-codeword)
-           next-base-code (unchecked-inc (.code prev-codeword))
-           prev-len (.length prev-codeword)
-           next-codeword 
-           (if (= (.length current-codeword)
-                  (.length prev-codeword))
-             (->HuffmanCodeword 
-               next-sym
-               next-base-code
-               prev-len)
-             (->HuffmanCodeword 
-               next-sym
-               (bit-shift-left 
-                 next-base-code 
-                 (- prev-len
-                    (- 63 (Long/numberOfLeadingZeros next-base-code)))) ;Not 100% confident about this
-               (inc prev-len)))]
-       (recur (conj codes next-codeword) (rest symbols)))
-     codes)))
-
-(def canonical-encodings
-  (build-canonical-encodings sorted-huffman-tree-codewords))
-
-
-(assert
-  (= (map #(.length %1) sorted-huffman-tree-codewords)
-     (map #(.length %1) sorted-huffman-tree-codewords))
-  "Some of the codes changed length when canonicalizing")
-
-(assert
-  (= (count canonical-encodings)
-     (count (set (map #(Long/toBinaryString (.code %1)) canonical-encodings))))
-  "There appears to be duplicate canonical encodings")
-
-(defn get-encoded-length [encodings tokenlist]
-  (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings))
-        totalbits (reduce + (map encodingtable tokenlist))
-        totalbytes (math/ceil (/ totalbits 8))
-        totalkb (math/ceil (/ totalbytes 1024))
-        totalmb (math/ceil (/ totalkb 1024))]
-    {:bits totalbits
-     :bytes totalbytes
-     :kb totalkb
-     :mb totalmb}))
-
-(count (frequencies (str/replace full-text #"\*" "")))
-
-(def raw-token-ascii-encoding
-  (get-encoded-length canonical-encodings tokens))
-
-(comment
-  "Figuring out how to make the shortest bit smaller"
-  (defn get-highest-freq-percent [freqlist]
-    (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes
-    [(key most-freq)
-     (double 
-      (/ (second most-freq)
-         (reduce + (map second symbol-freqs))))]))
-
-  (get-highest-freq-percent symbol-freqs) ;most frequent symbol is 6.83%
-  (get-highest-freq-percent (sort-by val > (frequencies (map first tokens)))) ;most frequent 1-letter prefix
-  (get-highest-freq-percent (sort-by val > (frequencies (map last tokens)))) ;most frequent 1-letter suffix
-  (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens
-  )
-
diff --git a/bible/huffman.clj b/bible/huffman.clj
new file mode 100644
index 0000000..c219b7e
--- /dev/null
+++ b/bible/huffman.clj
@@ -0,0 +1,299 @@
+(require '[clojure.string :as str])
+
+(require '[clojure.java.io :as io])
+(require '[babashka.fs :as fs])
+
+(require '[clojure.data.priority-map :as pm])
+
+(require '[clojure.math :as math])
+
+(comment
+  "Build the base file"
+  (def files (fs/glob "./base_files/" "**.txt"))
+
+  (defn get-book-num [filename]
+    (let [[_ _ book _ _]
+          (str/split (str filename) #"_")
+          #_#_chap (int _chap)]
+      (Integer/parseInt book)))
+
+  (defn get-chap-num [filename]
+    (let [[_ _ _ _ chap]
+          (str/split (str filename) #"_")
+          #_#_chap (int _chap)]
+      (Integer/parseInt chap)))
+
+  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
+    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
+      (with-open [reader (io/reader (fs/file f))]
+        (doseq [line (drop 2 (line-seq reader))]
+          (.write writer (str line "\n")))))))
+
+(def full-text (slurp "bbe-newlines-nochaps.txt"))
+
+;; This is "Stage1-optimized-tokens"
+(def tokens
+  (-> full-text 
+      (str/lower-case)
+      (str/replace #"\s+" " ")
+      (str/replace #"'s" " AS ") ;; Apostrophe_S
+      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
+      (str/split #" ")
+      (#(remove str/blank? %1))))
+
+(def symbol-freqs (frequencies tokens))
+
+(comment
+  "Do some basic statistics and print a list of tokens"
+  (spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))
+
+  (sort-by val > symbol-freqs)  ; Greatest to lease frequency
+  (reduce + (map val symbol-freqs))  ; Total tokens
+  (count symbol-freqs)  ; Total unique tokens
+  (reduce + (take 512 (map val symbol-freqs)))  ; Number of the top 100 common tokens
+
+  (reduce + (map count symbol-freqs))  ; Total chars needed for dict vals
+
+  (def two-grams (frequencies (partition 2 1 tokens)))
+  (sort-by val > two-grams))
+
+
+
+
+;;; Make the huffman tree for the symbols (13)
+(defrecord Node [left right sym probability])
+
+; Create a prioirity-queue of parentless nodes for each symbol
+(defn symboltable->pq [symbolfreqs]
+  (into (pm/priority-map-keyfn (juxt first second))
+        (map #(vector
+                (->Node nil nil (first %1) (second %1))
+                [(second %1) (first %1)])
+             symbolfreqs)))
+
+(assert (= (count symbol-freqs) (count (symboltable->pq symbol-freqs))) "Priority queue has fewer symbols than symbol list")
+
+;; From: https://michaeldipperstein.github.io/huffman.html#decode
+;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability.
+;; Step 2. Select the two parentless nodes with the lowest probabilities.
+;; Step 3. Create a new node which is the parent of the two lowest probability nodes.
+;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities.
+;; Step 5. Repeat from Step 2 until there is only one parentless node left.
+
+;; NOTE: This is an inefficient algorithm because we could use the 
+;;       two-queue version on wikipedia
+(defn build-huffman-tree-from-queue
+  "Builds a huffman-tree out of a priority-queue"
+  [queue]
+  (if (= 1 (count queue))
+    (first (peek queue))  ; Repeat until there is only one parentless node left
+    (let [[lowest-node [lowest-prob _]] (peek queue)
+          [second-node [second-prob _]] (peek (pop queue))  ; Step 2
+          new-prob (+ lowest-prob second-prob) ; Step 4
+          new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order
+          next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] 
+      (recur next-queue))))
+
+(defn build-huffman-tree
+  "builds a huffman-tree out of a frequency table of symbols"
+  [symbol-freq-table]
+  (build-huffman-tree-from-queue (symboltable->pq symbol-freq-table)))
+
+(def huffman-tree (build-huffman-tree symbol-freqs))
+
+(assert (= (.probability huffman-tree)
+           (reduce + (map val symbol-freqs)))
+        "Probability of root node is not equal to the sum of all probabilities")
+
+(defn huffman-tree-to-symbol-encodings 
+  "Builds a list of symbol encodings out of a huffman-tree
+   [tree] tree to symbol encodings
+   [node encodings curr-encoding] recursive builder"
+   ([tree]
+    (huffman-tree-to-symbol-encodings tree {} ""))
+  ([node encodings curr-encoding]
+  (if (.sym node)
+    (assoc encodings (.sym node) curr-encoding)
+    (merge
+      (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
+      (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding))))))
+
+(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))
+
+(assert (= (huffman-tree-to-symbol-encodings huffman-tree {} "")
+           (huffman-tree-to-symbol-encodings huffman-tree))
+        "Multi-arity def does not work as intended")
+(assert (= (count huffman-tree-syms)
+           (count (symboltable->pq symbol-freqs))
+           (count symbol-freqs))
+        "Some function is adding or removing symbols")
+
+;;; Build the canonical encodings
+
+;; From: https://en.wikipedia.org/wiki/Canonical_Huffman_code
+;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm:
+
+;;     The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0').
+;;     Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value.
+;;     When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.
+
+
+;; --- little class for Huffman Codewords
+(defrecord HuffmanCodeword [sym code length])
+(defn get-codeword-string 
+  "Takes a [HuffmanCodeword] and returns a binary str of the code"
+  [codeword]
+  (let [base (Long/toBinaryString (.code codeword))
+        basecount (count base)
+        len (.length codeword)]
+    (if (= basecount len)
+      base
+      (str (apply str (take (- len basecount) (repeat "0"))) base))))
+
+;; --- little class for Huffman Codewords
+
+(def sorted-huffman-tree-codewords 
+  (->> huffman-tree-syms
+       (sort-by (juxt (comp count val) key))
+       (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))
+
+(defn build-canonical-encodings
+  "Build canonical huffman encodings from a huffman tree
+   takes [symbols] a list of huffman codes derived from a code-tree"
+  ([tree]
+   (let [symbols (->> tree
+                      (huffman-tree-to-symbol-encodings)
+                      (sort-by (juxt (comp count val) key))
+
+                      (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1))))))
+         first-sym (first symbols)
+         seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))]
+     (build-canonical-encodings [seed-symbol] (rest symbols))))
+  ([codes symbols]
+   (if (not-empty symbols)
+     (let [prev-codeword (last codes)
+           current-codeword (first symbols)
+           next-sym (.sym current-codeword)
+           next-base-code (unchecked-inc (.code prev-codeword))
+           prev-len (.length prev-codeword)
+           next-codeword 
+           (if (= (.length current-codeword)
+                  (.length prev-codeword))
+             (->HuffmanCodeword 
+               next-sym
+               next-base-code
+               prev-len)
+             (->HuffmanCodeword 
+               next-sym
+               (bit-shift-left 
+                 next-base-code 
+                 (- prev-len
+                    (- 63 (Long/numberOfLeadingZeros next-base-code)))) ;Not 100% confident about this
+               (inc prev-len)))]
+       (recur (conj codes next-codeword) (rest symbols)))
+     codes)))
+
+(def canonical-encodings
+  (build-canonical-encodings huffman-tree))
+
+
+(assert
+  (= (map #(.length %1) sorted-huffman-tree-codewords)
+     (map #(.length %1) sorted-huffman-tree-codewords))
+  "Some of the codes changed length when canonicalizing")
+
+(assert
+  (= (count canonical-encodings)
+     (count (set (map #(Long/toBinaryString (.code %1)) canonical-encodings))))
+  "There appears to be duplicate canonical encodings")
+
+(defn get-encoded-length [encodings tokenlist]
+  (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings))
+        totalbits (reduce + (map encodingtable tokenlist))
+        totalbytes (math/ceil (/ totalbits 8))
+        totalkb (/ totalbytes 1024)
+        totalmb (/ totalkb 1024)]
+    {:bits totalbits
+     :bytes totalbytes
+     :kb totalkb
+     :mb totalmb}))
+
+
+(comment
+  (def base-file-size
+    (let [totalbits (* 8 (count full-text))
+          totalbytes (math/ceil (/ totalbits 8))
+          totalkb (/ totalbytes 1024)
+          totalmb (/ totalkb 1024)]
+      {:bits totalbits
+       :bytes totalbytes
+       :kb totalkb
+       :mb totalmb}))
+  (def stage1-optimized-token-encoding-length
+    (get-encoded-length canonical-encodings tokens)))
+
+(comment
+  "Some scratchwork figuring out how to make the shortest bit smaller"
+  (defn get-highest-freq-percent [freqlist]
+    (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes
+    [(key most-freq)
+     (double 
+      (/ (second most-freq)
+         (reduce + (map second symbol-freqs))))]))
+
+  (get-highest-freq-percent symbol-freqs) ;most frequent symbol is 6.83%
+  (get-highest-freq-percent (sort-by val > (frequencies (map first tokens)))) ;most frequent 1-letter prefix
+  (get-highest-freq-percent (sort-by val > (frequencies (map last tokens)))) ;most frequent 1-letter suffix
+  (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens
+  )
+
+;;; NOTE FOR VIDEO:
+;; First, consider some kind of bit-packing scheme 
+;;  - 65 total chars
+;;  - Shave it down to 63
+;;  - Remove caps/reduce even furhter to try and get it to 31
+;; Then introduce huffman encodings
+;;  - Raw text
+;;  - Text with some of the above optimizations
+;;  - Tokenize at spaces and punctuation
+;; Then consider tokenizing I started off by tokenizing the text with some optimizations in place with stage1-optimized-token-encoding-length
+
+(comment
+  "Huffman on the raw text (65 total symbols)"
+  (let [tokenized (seq full-text)
+        symbol-freqs (frequencies tokenized)
+        huff-tree (build-huffman-tree symbol-freqs)
+        canonical-codes (build-canonical-encodings huff-tree)]
+    (get-encoded-length canonical-codes tokenized))
+  "Huffman on the above with normalized space and without * (63 total symbols)"
+  (let [tokenized (seq (-> full-text 
+                           (str/replace #"\s+" " ")
+                           (str/replace #"\*" " ")))
+        symbol-freqs (frequencies tokenized)
+        huff-tree (build-huffman-tree symbol-freqs)
+        canonical-codes (build-canonical-encodings huff-tree)]
+    (get-encoded-length canonical-codes tokenized))
+  "Huffman on the above in all lowercase (38 total symbols)"
+  (let [tokenized (seq (-> full-text 
+                           (str/replace #"\s+" " ")
+                           (str/replace #"[\*]" "")
+                           (str/lower-case)))
+        symbol-freqs (frequencies tokenized)
+        huff-tree (build-huffman-tree symbol-freqs)
+        canonical-codes (build-canonical-encodings huff-tree)]
+    (get-encoded-length canonical-codes tokenized))
+  "Huffman on the above with char changes —()?!:; (31 total symbols)"
+  (let [tokenized (seq (-> full-text 
+                           (str/replace #"—" "-") ; Replace emdash with hyphen
+                           (str/replace #"[\*]" "") ; Get rid of chars
+                           (str/replace #"\([^)]*\)" "") ; Get rid of everything between parens
+                           (str/replace #"[?!]" ".") ; Replace all final punct with periods
+                           (str/replace #"[;:]" ",") ; Replace all middle punct with commas
+                           (str/replace #"\s+" " ") ; Normalize space
+                           (str/lower-case)))
+        symbol-freqs (frequencies tokenized)
+        huff-tree (build-huffman-tree symbol-freqs)
+        canonical-codes (build-canonical-encodings huff-tree)]
+    (get-encoded-length canonical-codes tokenized))
+  )
+
-- 
cgit v1.2.3