From d47553f0a7456868418ad9a4c4a5af182528f254 Mon Sep 17 00:00:00 2001 From: Patrick Kingston Date: Fri, 23 Jan 2026 16:38:00 -0500 Subject: Move encode to huffman --- bible/encode.clj | 221 ---------------------------------------- bible/huffman.clj | 299 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 299 insertions(+), 221 deletions(-) delete mode 100644 bible/encode.clj create mode 100644 bible/huffman.clj diff --git a/bible/encode.clj b/bible/encode.clj deleted file mode 100644 index abd5365..0000000 --- a/bible/encode.clj +++ /dev/null @@ -1,221 +0,0 @@ -(require '[clojure.string :as str]) - -(require '[clojure.java.io :as io]) -(require '[babashka.fs :as fs]) - -(require '[clojure.data.priority-map :as pm]) - -(require '[clojure.math :as math]) - -(comment - "Build the base file" - (def files (fs/glob "./base_files/" "**.txt")) - - (defn get-book-num [filename] - (let [[_ _ book _ _] - (str/split (str filename) #"_") - #_#_chap (int _chap)] - (Integer/parseInt book))) - - (defn get-chap-num [filename] - (let [[_ _ _ _ chap] - (str/split (str filename) #"_") - #_#_chap (int _chap)] - (Integer/parseInt chap))) - - (with-open [writer (io/writer "bbe-newlines-nochaps.txt")] - (doseq [f (sort-by (juxt get-book-num get-chap-num) files)] - (with-open [reader (io/reader (fs/file f))] - (doseq [line (drop 2 (line-seq reader))] - (.write writer (str line "\n"))))))) - -(def full-text (slurp "bbe-newlines-nochaps.txt")) - -(def tokens - (-> full-text - (str/lower-case) - (str/replace #"\s+" " ") - (str/replace #"'s" " AS ") ;; Apostrophe_S - (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " ")) - (str/split #" ") - (#(remove str/blank? %1)))) - -(def symbol-freqs (frequencies tokens)) - -(comment - "Do some basic statistics and print a list of tokens" - (spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs)))) - - (sort-by val > symbol-freqs) ; Greatest to lease frequency - (reduce + (map val symbol-freqs)) ; Total tokens - (count symbol-freqs) ; Total unique tokens - (reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens - - (reduce + (map count symbol-freqs)) ; Total chars needed for dict vals - - (def two-grams (frequencies (partition 2 1 tokens))) - (sort-by val > two-grams)) - - - - -;;; Make the huffman tree for the symbols (13) -(defrecord Node [left right sym probability]) - -; Create a prioirity-queue of parentless nodes for each symbol -(defn symboltable->pq [symbolfreqs] - (into (pm/priority-map-keyfn (juxt first second)) - (map #(vector - (->Node nil nil (first %1) (second %1)) - [(second %1) (first %1)]) - symbolfreqs))) - -(assert (= (count symbol-freqs) (count (symboltable->pq symbol-freqs))) "Priority queue has fewer symbols than symbol list") - -;; From: https://michaeldipperstein.github.io/huffman.html#decode -;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability. -;; Step 2. Select the two parentless nodes with the lowest probabilities. -;; Step 3. Create a new node which is the parent of the two lowest probability nodes. -;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities. -;; Step 5. Repeat from Step 2 until there is only one parentless node left. - -;; NOTE: This is an inefficient algorithm because we could use the -;; two-queue version on wikipedia -(defn build-huffman-tree-from-queue - "Builds a huffman-tree out of a priority-queue" - [queue] - (if (= 1 (count queue)) - (first (peek queue)) ; Repeat until there is only one parentless node left - (let [[lowest-node [lowest-prob _]] (peek queue) - [second-node [second-prob _]] (peek (pop queue)) ; Step 2 - new-prob (+ lowest-prob second-prob) ; Step 4 - new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order - next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] - (recur next-queue)))) - -(defn build-huffman-tree - "builds a huffman-tree out of a frequency table of symbols" - [symbol-freq-table] - (build-huffman-tree-from-queue (symboltable->pq symbol-freq-table))) - -(def huffman-tree (build-huffman-tree symbol-freqs)) - -(assert (= (.probability huffman-tree) - (reduce + (map val symbol-freqs))) - "Probability of root node is not equal to the sum of all probabilities") - -(defn huffman-tree-to-symbol-encodings [node encodings curr-encoding] - (if (.sym node) - (assoc encodings (.sym node) curr-encoding) - (merge - (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding)) - (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding))))) - -(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} "")) -(assert (= (count huffman-tree-syms) - (count pq) - (count symbol-freqs))) - -;;; Build the canonical encodings - -;; From: https://en.wikipedia.org/wiki/Canonical_Huffman_code -;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm: - -;; The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0'). -;; Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value. -;; When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift. - - -;; little class for Huffman Codewords -(defrecord HuffmanCodeword [sym code length]) -(defn get-codeword-string - "Takes a [HuffmanCodeword] and returns a binary str of the code" - [codeword] - (let [base (Long/toBinaryString (.code codeword)) - basecount (count base) - len (.length codeword)] - (if (= basecount len) - base - (str (apply str (take (- len basecount) (repeat "0"))) base)))) - -(def sorted-huffman-tree-codewords - (->> huffman-tree-syms - (sort-by (juxt (comp count val) key)) - (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1))))))) - -(defn build-canonical-encodings - "Build canonical huffman encodings from a sorted list of huffman tree codewords - takes [symbols] a list of huffman codes derived from a code-tree" - ([symbols] - (let [first-sym (first symbols) - seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))] - (build-canonical-encodings [seed-symbol] (rest symbols)))) - ([codes symbols] - (if (not-empty symbols) - (let [prev-codeword (last codes) - current-codeword (first symbols) - next-sym (.sym current-codeword) - next-base-code (unchecked-inc (.code prev-codeword)) - prev-len (.length prev-codeword) - next-codeword - (if (= (.length current-codeword) - (.length prev-codeword)) - (->HuffmanCodeword - next-sym - next-base-code - prev-len) - (->HuffmanCodeword - next-sym - (bit-shift-left - next-base-code - (- prev-len - (- 63 (Long/numberOfLeadingZeros next-base-code)))) ;Not 100% confident about this - (inc prev-len)))] - (recur (conj codes next-codeword) (rest symbols))) - codes))) - -(def canonical-encodings - (build-canonical-encodings sorted-huffman-tree-codewords)) - - -(assert - (= (map #(.length %1) sorted-huffman-tree-codewords) - (map #(.length %1) sorted-huffman-tree-codewords)) - "Some of the codes changed length when canonicalizing") - -(assert - (= (count canonical-encodings) - (count (set (map #(Long/toBinaryString (.code %1)) canonical-encodings)))) - "There appears to be duplicate canonical encodings") - -(defn get-encoded-length [encodings tokenlist] - (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings)) - totalbits (reduce + (map encodingtable tokenlist)) - totalbytes (math/ceil (/ totalbits 8)) - totalkb (math/ceil (/ totalbytes 1024)) - totalmb (math/ceil (/ totalkb 1024))] - {:bits totalbits - :bytes totalbytes - :kb totalkb - :mb totalmb})) - -(count (frequencies (str/replace full-text #"\*" ""))) - -(def raw-token-ascii-encoding - (get-encoded-length canonical-encodings tokens)) - -(comment - "Figuring out how to make the shortest bit smaller" - (defn get-highest-freq-percent [freqlist] - (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes - [(key most-freq) - (double - (/ (second most-freq) - (reduce + (map second symbol-freqs))))])) - - (get-highest-freq-percent symbol-freqs) ;most frequent symbol is 6.83% - (get-highest-freq-percent (sort-by val > (frequencies (map first tokens)))) ;most frequent 1-letter prefix - (get-highest-freq-percent (sort-by val > (frequencies (map last tokens)))) ;most frequent 1-letter suffix - (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens - ) - diff --git a/bible/huffman.clj b/bible/huffman.clj new file mode 100644 index 0000000..c219b7e --- /dev/null +++ b/bible/huffman.clj @@ -0,0 +1,299 @@ +(require '[clojure.string :as str]) + +(require '[clojure.java.io :as io]) +(require '[babashka.fs :as fs]) + +(require '[clojure.data.priority-map :as pm]) + +(require '[clojure.math :as math]) + +(comment + "Build the base file" + (def files (fs/glob "./base_files/" "**.txt")) + + (defn get-book-num [filename] + (let [[_ _ book _ _] + (str/split (str filename) #"_") + #_#_chap (int _chap)] + (Integer/parseInt book))) + + (defn get-chap-num [filename] + (let [[_ _ _ _ chap] + (str/split (str filename) #"_") + #_#_chap (int _chap)] + (Integer/parseInt chap))) + + (with-open [writer (io/writer "bbe-newlines-nochaps.txt")] + (doseq [f (sort-by (juxt get-book-num get-chap-num) files)] + (with-open [reader (io/reader (fs/file f))] + (doseq [line (drop 2 (line-seq reader))] + (.write writer (str line "\n"))))))) + +(def full-text (slurp "bbe-newlines-nochaps.txt")) + +;; This is "Stage1-optimized-tokens" +(def tokens + (-> full-text + (str/lower-case) + (str/replace #"\s+" " ") + (str/replace #"'s" " AS ") ;; Apostrophe_S + (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " ")) + (str/split #" ") + (#(remove str/blank? %1)))) + +(def symbol-freqs (frequencies tokens)) + +(comment + "Do some basic statistics and print a list of tokens" + (spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs)))) + + (sort-by val > symbol-freqs) ; Greatest to lease frequency + (reduce + (map val symbol-freqs)) ; Total tokens + (count symbol-freqs) ; Total unique tokens + (reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens + + (reduce + (map count symbol-freqs)) ; Total chars needed for dict vals + + (def two-grams (frequencies (partition 2 1 tokens))) + (sort-by val > two-grams)) + + + + +;;; Make the huffman tree for the symbols (13) +(defrecord Node [left right sym probability]) + +; Create a prioirity-queue of parentless nodes for each symbol +(defn symboltable->pq [symbolfreqs] + (into (pm/priority-map-keyfn (juxt first second)) + (map #(vector + (->Node nil nil (first %1) (second %1)) + [(second %1) (first %1)]) + symbolfreqs))) + +(assert (= (count symbol-freqs) (count (symboltable->pq symbol-freqs))) "Priority queue has fewer symbols than symbol list") + +;; From: https://michaeldipperstein.github.io/huffman.html#decode +;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability. +;; Step 2. Select the two parentless nodes with the lowest probabilities. +;; Step 3. Create a new node which is the parent of the two lowest probability nodes. +;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities. +;; Step 5. Repeat from Step 2 until there is only one parentless node left. + +;; NOTE: This is an inefficient algorithm because we could use the +;; two-queue version on wikipedia +(defn build-huffman-tree-from-queue + "Builds a huffman-tree out of a priority-queue" + [queue] + (if (= 1 (count queue)) + (first (peek queue)) ; Repeat until there is only one parentless node left + (let [[lowest-node [lowest-prob _]] (peek queue) + [second-node [second-prob _]] (peek (pop queue)) ; Step 2 + new-prob (+ lowest-prob second-prob) ; Step 4 + new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order + next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] + (recur next-queue)))) + +(defn build-huffman-tree + "builds a huffman-tree out of a frequency table of symbols" + [symbol-freq-table] + (build-huffman-tree-from-queue (symboltable->pq symbol-freq-table))) + +(def huffman-tree (build-huffman-tree symbol-freqs)) + +(assert (= (.probability huffman-tree) + (reduce + (map val symbol-freqs))) + "Probability of root node is not equal to the sum of all probabilities") + +(defn huffman-tree-to-symbol-encodings + "Builds a list of symbol encodings out of a huffman-tree + [tree] tree to symbol encodings + [node encodings curr-encoding] recursive builder" + ([tree] + (huffman-tree-to-symbol-encodings tree {} "")) + ([node encodings curr-encoding] + (if (.sym node) + (assoc encodings (.sym node) curr-encoding) + (merge + (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding)) + (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))) + +(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} "")) + +(assert (= (huffman-tree-to-symbol-encodings huffman-tree {} "") + (huffman-tree-to-symbol-encodings huffman-tree)) + "Multi-arity def does not work as intended") +(assert (= (count huffman-tree-syms) + (count (symboltable->pq symbol-freqs)) + (count symbol-freqs)) + "Some function is adding or removing symbols") + +;;; Build the canonical encodings + +;; From: https://en.wikipedia.org/wiki/Canonical_Huffman_code +;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm: + +;; The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0'). +;; Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value. +;; When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift. + + +;; --- little class for Huffman Codewords +(defrecord HuffmanCodeword [sym code length]) +(defn get-codeword-string + "Takes a [HuffmanCodeword] and returns a binary str of the code" + [codeword] + (let [base (Long/toBinaryString (.code codeword)) + basecount (count base) + len (.length codeword)] + (if (= basecount len) + base + (str (apply str (take (- len basecount) (repeat "0"))) base)))) + +;; --- little class for Huffman Codewords + +(def sorted-huffman-tree-codewords + (->> huffman-tree-syms + (sort-by (juxt (comp count val) key)) + (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1))))))) + +(defn build-canonical-encodings + "Build canonical huffman encodings from a huffman tree + takes [symbols] a list of huffman codes derived from a code-tree" + ([tree] + (let [symbols (->> tree + (huffman-tree-to-symbol-encodings) + (sort-by (juxt (comp count val) key)) + + (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))) + first-sym (first symbols) + seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))] + (build-canonical-encodings [seed-symbol] (rest symbols)))) + ([codes symbols] + (if (not-empty symbols) + (let [prev-codeword (last codes) + current-codeword (first symbols) + next-sym (.sym current-codeword) + next-base-code (unchecked-inc (.code prev-codeword)) + prev-len (.length prev-codeword) + next-codeword + (if (= (.length current-codeword) + (.length prev-codeword)) + (->HuffmanCodeword + next-sym + next-base-code + prev-len) + (->HuffmanCodeword + next-sym + (bit-shift-left + next-base-code + (- prev-len + (- 63 (Long/numberOfLeadingZeros next-base-code)))) ;Not 100% confident about this + (inc prev-len)))] + (recur (conj codes next-codeword) (rest symbols))) + codes))) + +(def canonical-encodings + (build-canonical-encodings huffman-tree)) + + +(assert + (= (map #(.length %1) sorted-huffman-tree-codewords) + (map #(.length %1) sorted-huffman-tree-codewords)) + "Some of the codes changed length when canonicalizing") + +(assert + (= (count canonical-encodings) + (count (set (map #(Long/toBinaryString (.code %1)) canonical-encodings)))) + "There appears to be duplicate canonical encodings") + +(defn get-encoded-length [encodings tokenlist] + (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings)) + totalbits (reduce + (map encodingtable tokenlist)) + totalbytes (math/ceil (/ totalbits 8)) + totalkb (/ totalbytes 1024) + totalmb (/ totalkb 1024)] + {:bits totalbits + :bytes totalbytes + :kb totalkb + :mb totalmb})) + + +(comment + (def base-file-size + (let [totalbits (* 8 (count full-text)) + totalbytes (math/ceil (/ totalbits 8)) + totalkb (/ totalbytes 1024) + totalmb (/ totalkb 1024)] + {:bits totalbits + :bytes totalbytes + :kb totalkb + :mb totalmb})) + (def stage1-optimized-token-encoding-length + (get-encoded-length canonical-encodings tokens))) + +(comment + "Some scratchwork figuring out how to make the shortest bit smaller" + (defn get-highest-freq-percent [freqlist] + (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes + [(key most-freq) + (double + (/ (second most-freq) + (reduce + (map second symbol-freqs))))])) + + (get-highest-freq-percent symbol-freqs) ;most frequent symbol is 6.83% + (get-highest-freq-percent (sort-by val > (frequencies (map first tokens)))) ;most frequent 1-letter prefix + (get-highest-freq-percent (sort-by val > (frequencies (map last tokens)))) ;most frequent 1-letter suffix + (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens + ) + +;;; NOTE FOR VIDEO: +;; First, consider some kind of bit-packing scheme +;; - 65 total chars +;; - Shave it down to 63 +;; - Remove caps/reduce even furhter to try and get it to 31 +;; Then introduce huffman encodings +;; - Raw text +;; - Text with some of the above optimizations +;; - Tokenize at spaces and punctuation +;; Then consider tokenizing I started off by tokenizing the text with some optimizations in place with stage1-optimized-token-encoding-length + +(comment + "Huffman on the raw text (65 total symbols)" + (let [tokenized (seq full-text) + symbol-freqs (frequencies tokenized) + huff-tree (build-huffman-tree symbol-freqs) + canonical-codes (build-canonical-encodings huff-tree)] + (get-encoded-length canonical-codes tokenized)) + "Huffman on the above with normalized space and without * (63 total symbols)" + (let [tokenized (seq (-> full-text + (str/replace #"\s+" " ") + (str/replace #"\*" " "))) + symbol-freqs (frequencies tokenized) + huff-tree (build-huffman-tree symbol-freqs) + canonical-codes (build-canonical-encodings huff-tree)] + (get-encoded-length canonical-codes tokenized)) + "Huffman on the above in all lowercase (38 total symbols)" + (let [tokenized (seq (-> full-text + (str/replace #"\s+" " ") + (str/replace #"[\*]" "") + (str/lower-case))) + symbol-freqs (frequencies tokenized) + huff-tree (build-huffman-tree symbol-freqs) + canonical-codes (build-canonical-encodings huff-tree)] + (get-encoded-length canonical-codes tokenized)) + "Huffman on the above with char changes —()?!:; (31 total symbols)" + (let [tokenized (seq (-> full-text + (str/replace #"—" "-") ; Replace emdash with hyphen + (str/replace #"[\*]" "") ; Get rid of chars + (str/replace #"\([^)]*\)" "") ; Get rid of everything between parens + (str/replace #"[?!]" ".") ; Replace all final punct with periods + (str/replace #"[;:]" ",") ; Replace all middle punct with commas + (str/replace #"\s+" " ") ; Normalize space + (str/lower-case))) + symbol-freqs (frequencies tokenized) + huff-tree (build-huffman-tree symbol-freqs) + canonical-codes (build-canonical-encodings huff-tree)] + (get-encoded-length canonical-codes tokenized)) + ) + -- cgit v1.2.3