aboutsummaryrefslogtreecommitdiff
path: root/bible
diff options
context:
space:
mode:
authorPatrick Kingston <patrick@pkingston.xyz>2026-01-23 01:08:41 -0500
committerPatrick Kingston <patrick@pkingston.xyz>2026-01-23 01:08:41 -0500
commit0dd40156ddf58ad15ae1007106ad904719de116e (patch)
treeb7f01e3415eaf24d246a675f321a3642ed8c1fa4 /bible
parent9a39da15012067fb7cc2966d219942d1f409ac7b (diff)
Start working on Canonical Encodings
Note: I didn't get them working (I think)
Diffstat (limited to 'bible')
-rw-r--r--bible/encode.clj62
1 files changed, 53 insertions, 9 deletions
diff --git a/bible/encode.clj b/bible/encode.clj
index 031b6aa..903cb2b 100644
--- a/bible/encode.clj
+++ b/bible/encode.clj
@@ -5,21 +5,21 @@
(require '[clojure.data.priority-map :as pm])
-(def files (fs/glob "./base_files/" "**.txt"))
+#_(def files (fs/glob "./base_files/" "**.txt"))
-(defn get-book-num [filename]
+#_(defn get-book-num [filename]
(let [[_ _ book _ _]
(str/split (str filename) #"_")
#_#_chap (int _chap)]
(Integer/parseInt book)))
-(defn get-chap-num [filename]
+#_(defn get-chap-num [filename]
(let [[_ _ _ _ chap]
(str/split (str filename) #"_")
#_#_chap (int _chap)]
(Integer/parseInt chap)))
-(with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
+#_(with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
(doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
(with-open [reader (io/reader (fs/file f))]
(doseq [line (drop 2 (line-seq reader))]
@@ -41,12 +41,12 @@
#_(spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))
-(sort-by val > symbol-freqs) ; Greatest to lease frequency
-(reduce + (map val symbol-freqs)) ; Total tokens
-(count symbol-freqs) ; Total unique tokens
-(reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens
+#_(sort-by val > symbol-freqs) ; Greatest to lease frequency
+#_(reduce + (map val symbol-freqs)) ; Total tokens
+#_(count symbol-freqs) ; Total unique tokens
+#_(reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens
-(reduce + (map count symbol-freqs)) ; Total chars needed for dict vals
+#_(reduce + (map count symbol-freqs)) ; Total chars needed for dict vals
#_(def two-grams (frequencies (partition 2 1 tokens)))
#_(sort-by val > two-grams)
@@ -104,3 +104,47 @@
(count pq)
(count symbol-freqs)))
+;;; Build the canonical encodings
+
+;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm:
+
+;; The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0').
+;; Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value.
+;; When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.
+
+(defrecord HuffmanCodeword [sym code length])
+
+(def sorted-huffman-tree-codewords
+ (->> huffman-tree-syms
+ (sort-by (juxt (comp count val) key))
+ (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))
+
+(defn build-canonical-encodings
+ "Build canonical huffman encodings from a sorted list of huffman tree codewords"
+ ([symbols]
+ (let [first-sym (first symbols)
+ seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))]
+ (build-canonical-encodings [seed-symbol] (rest symbols))))
+ ([codes symbols]
+ (if (not-empty symbols)
+ (let [prev-codeword (last codes)
+ current-codeword (first symbols)
+ next-sym (.sym current-codeword)
+ next-base-code (unchecked-inc (.code prev-codeword))
+ prev-len (.length prev-codeword)
+ next-codeword (if (= (.length current-codeword)
+ (.length prev-codeword))
+ (->HuffmanCodeword next-sym
+ next-base-code
+ prev-len)
+ (->HuffmanCodeword next-sym
+ (bit-shift-left next-base-code (inc prev-len))
+ (inc prev-len)))]
+ (recur (conj codes next-codeword) (rest symbols)))
+ codes)))
+
+
+(def canonical-encodings
+ (build-canonical-encodings sorted-huffman-tree-codewords))
+
+(map #(Long/toBinaryString (.code %1)) (take 100 canonical-encodings))