Start working on Canonical Encodings

Note: I didn't get them working (I think)
author: Patrick Kingston <patrick@pkingston.xyz> 2026-01-23 01:08:41 -0500
committer: Patrick Kingston <patrick@pkingston.xyz> 2026-01-23 01:08:41 -0500
commit: 0dd40156ddf58ad15ae1007106ad904719de116e (patch)
tree: b7f01e3415eaf24d246a675f321a3642ed8c1fa4 /bible
parent: 9a39da15012067fb7cc2966d219942d1f409ac7b (diff)
1 files changed, 53 insertions, 9 deletions
diff --git a/bible/encode.clj b/bible/encode.clj
index 031b6aa..903cb2b 100644
--- a/bible/encode.clj
+++ b/bible/encode.clj
@@ -5,21 +5,21 @@
 
 (require '[clojure.data.priority-map :as pm])
 
-(def files (fs/glob "./base_files/" "**.txt"))
+#_(def files (fs/glob "./base_files/" "**.txt"))
 
-(defn get-book-num [filename]
+#_(defn get-book-num [filename]
   (let [[_ _ book _ _]
         (str/split (str filename) #"_")
         #_#_chap (int _chap)]
     (Integer/parseInt book)))
 
-(defn get-chap-num [filename]
+#_(defn get-chap-num [filename]
   (let [[_ _ _ _ chap]
         (str/split (str filename) #"_")
         #_#_chap (int _chap)]
     (Integer/parseInt chap)))
 
-(with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
+#_(with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
   (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
     (with-open [reader (io/reader (fs/file f))]
       (doseq [line (drop 2 (line-seq reader))]
@@ -41,12 +41,12 @@
 #_(spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))
 
 
-(sort-by val > symbol-freqs)  ; Greatest to lease frequency
-(reduce + (map val symbol-freqs))  ; Total tokens
-(count symbol-freqs)  ; Total unique tokens
-(reduce + (take 512 (map val symbol-freqs)))  ; Number of the top 100 common tokens
+#_(sort-by val > symbol-freqs)  ; Greatest to lease frequency
+#_(reduce + (map val symbol-freqs))  ; Total tokens
+#_(count symbol-freqs)  ; Total unique tokens
+#_(reduce + (take 512 (map val symbol-freqs)))  ; Number of the top 100 common tokens
 
-(reduce + (map count symbol-freqs))  ; Total chars needed for dict vals
+#_(reduce + (map count symbol-freqs))  ; Total chars needed for dict vals
 
 #_(def two-grams (frequencies (partition 2 1 tokens)))
 #_(sort-by val > two-grams)
@@ -104,3 +104,47 @@
            (count pq)
            (count symbol-freqs)))
 
+;;; Build the canonical encodings
+
+;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm:
+
+;;     The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0').
+;;     Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value.
+;;     When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.
+
+(defrecord HuffmanCodeword [sym code length])
+
+(def sorted-huffman-tree-codewords 
+  (->> huffman-tree-syms
+       (sort-by (juxt (comp count val) key))
+       (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))
+
+(defn build-canonical-encodings
+  "Build canonical huffman encodings from a sorted list of huffman tree codewords"
+  ([symbols]
+   (let [first-sym (first symbols)
+         seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))]
+     (build-canonical-encodings [seed-symbol] (rest symbols))))
+  ([codes symbols]
+   (if (not-empty symbols)
+     (let [prev-codeword (last codes)
+         current-codeword (first symbols)
+         next-sym (.sym current-codeword)
+         next-base-code (unchecked-inc (.code prev-codeword))
+         prev-len (.length prev-codeword)
+         next-codeword (if (= (.length current-codeword)
+                              (.length prev-codeword))
+                         (->HuffmanCodeword next-sym
+                                            next-base-code
+                                            prev-len)
+                         (->HuffmanCodeword next-sym
+                                            (bit-shift-left next-base-code (inc prev-len))
+                                            (inc prev-len)))]
+     (recur (conj codes next-codeword) (rest symbols)))
+     codes)))
+
+
+(def canonical-encodings
+  (build-canonical-encodings sorted-huffman-tree-codewords))
+
+(map #(Long/toBinaryString (.code %1)) (take 100 canonical-encodings))
author	Patrick Kingston <patrick@pkingston.xyz>	2026-01-23 01:08:41 -0500
committer	Patrick Kingston <patrick@pkingston.xyz>	2026-01-23 01:08:41 -0500
commit	0dd40156ddf58ad15ae1007106ad904719de116e (patch)
tree	b7f01e3415eaf24d246a675f321a3642ed8c1fa4 /bible
parent	9a39da15012067fb7cc2966d219942d1f409ac7b (diff)