diff options
| author | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-23 16:38:00 -0500 |
|---|---|---|
| committer | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-23 16:38:00 -0500 |
| commit | d47553f0a7456868418ad9a4c4a5af182528f254 (patch) | |
| tree | 397be2f873641f1982a3fd5e37d36d8f77cd5f90 | |
| parent | bcab304fb841814ea0acb4b912c8a7df5a395dab (diff) | |
Move encode to huffman
| -rw-r--r-- | bible/huffman.clj (renamed from bible/encode.clj) | 108 |
1 files changed, 93 insertions, 15 deletions
diff --git a/bible/encode.clj b/bible/huffman.clj index abd5365..c219b7e 100644 --- a/bible/encode.clj +++ b/bible/huffman.clj @@ -31,6 +31,7 @@ (def full-text (slurp "bbe-newlines-nochaps.txt")) +;; This is "Stage1-optimized-tokens" (def tokens (-> full-text (str/lower-case) @@ -104,17 +105,28 @@ (reduce + (map val symbol-freqs))) "Probability of root node is not equal to the sum of all probabilities") -(defn huffman-tree-to-symbol-encodings [node encodings curr-encoding] +(defn huffman-tree-to-symbol-encodings + "Builds a list of symbol encodings out of a huffman-tree + [tree] tree to symbol encodings + [node encodings curr-encoding] recursive builder" + ([tree] + (huffman-tree-to-symbol-encodings tree {} "")) + ([node encodings curr-encoding] (if (.sym node) (assoc encodings (.sym node) curr-encoding) (merge (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding)) - (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding))))) + (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))) (def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} "")) + +(assert (= (huffman-tree-to-symbol-encodings huffman-tree {} "") + (huffman-tree-to-symbol-encodings huffman-tree)) + "Multi-arity def does not work as intended") (assert (= (count huffman-tree-syms) - (count pq) - (count symbol-freqs))) + (count (symboltable->pq symbol-freqs)) + (count symbol-freqs)) + "Some function is adding or removing symbols") ;;; Build the canonical encodings @@ -126,7 +138,7 @@ ;; When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift. -;; little class for Huffman Codewords +;; --- little class for Huffman Codewords (defrecord HuffmanCodeword [sym code length]) (defn get-codeword-string "Takes a [HuffmanCodeword] and returns a binary str of the code" @@ -138,16 +150,23 @@ base (str (apply str (take (- len basecount) (repeat "0"))) base)))) +;; --- little class for Huffman Codewords + (def sorted-huffman-tree-codewords (->> huffman-tree-syms (sort-by (juxt (comp count val) key)) (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1))))))) (defn build-canonical-encodings - "Build canonical huffman encodings from a sorted list of huffman tree codewords + "Build canonical huffman encodings from a huffman tree takes [symbols] a list of huffman codes derived from a code-tree" - ([symbols] - (let [first-sym (first symbols) + ([tree] + (let [symbols (->> tree + (huffman-tree-to-symbol-encodings) + (sort-by (juxt (comp count val) key)) + + (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))) + first-sym (first symbols) seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))] (build-canonical-encodings [seed-symbol] (rest symbols)))) ([codes symbols] @@ -175,7 +194,7 @@ codes))) (def canonical-encodings - (build-canonical-encodings sorted-huffman-tree-codewords)) + (build-canonical-encodings huffman-tree)) (assert @@ -192,20 +211,29 @@ (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings)) totalbits (reduce + (map encodingtable tokenlist)) totalbytes (math/ceil (/ totalbits 8)) - totalkb (math/ceil (/ totalbytes 1024)) - totalmb (math/ceil (/ totalkb 1024))] + totalkb (/ totalbytes 1024) + totalmb (/ totalkb 1024)] {:bits totalbits :bytes totalbytes :kb totalkb :mb totalmb})) -(count (frequencies (str/replace full-text #"\*" ""))) -(def raw-token-ascii-encoding - (get-encoded-length canonical-encodings tokens)) +(comment + (def base-file-size + (let [totalbits (* 8 (count full-text)) + totalbytes (math/ceil (/ totalbits 8)) + totalkb (/ totalbytes 1024) + totalmb (/ totalkb 1024)] + {:bits totalbits + :bytes totalbytes + :kb totalkb + :mb totalmb})) + (def stage1-optimized-token-encoding-length + (get-encoded-length canonical-encodings tokens))) (comment - "Figuring out how to make the shortest bit smaller" + "Some scratchwork figuring out how to make the shortest bit smaller" (defn get-highest-freq-percent [freqlist] (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes [(key most-freq) @@ -219,3 +247,53 @@ (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens ) +;;; NOTE FOR VIDEO: +;; First, consider some kind of bit-packing scheme +;; - 65 total chars +;; - Shave it down to 63 +;; - Remove caps/reduce even furhter to try and get it to 31 +;; Then introduce huffman encodings +;; - Raw text +;; - Text with some of the above optimizations +;; - Tokenize at spaces and punctuation +;; Then consider tokenizing I started off by tokenizing the text with some optimizations in place with stage1-optimized-token-encoding-length + +(comment + "Huffman on the raw text (65 total symbols)" + (let [tokenized (seq full-text) + symbol-freqs (frequencies tokenized) + huff-tree (build-huffman-tree symbol-freqs) + canonical-codes (build-canonical-encodings huff-tree)] + (get-encoded-length canonical-codes tokenized)) + "Huffman on the above with normalized space and without * (63 total symbols)" + (let [tokenized (seq (-> full-text + (str/replace #"\s+" " ") + (str/replace #"\*" " "))) + symbol-freqs (frequencies tokenized) + huff-tree (build-huffman-tree symbol-freqs) + canonical-codes (build-canonical-encodings huff-tree)] + (get-encoded-length canonical-codes tokenized)) + "Huffman on the above in all lowercase (38 total symbols)" + (let [tokenized (seq (-> full-text + (str/replace #"\s+" " ") + (str/replace #"[\*]" "") + (str/lower-case))) + symbol-freqs (frequencies tokenized) + huff-tree (build-huffman-tree symbol-freqs) + canonical-codes (build-canonical-encodings huff-tree)] + (get-encoded-length canonical-codes tokenized)) + "Huffman on the above with char changes —()?!:; (31 total symbols)" + (let [tokenized (seq (-> full-text + (str/replace #"—" "-") ; Replace emdash with hyphen + (str/replace #"[\*]" "") ; Get rid of chars + (str/replace #"\([^)]*\)" "") ; Get rid of everything between parens + (str/replace #"[?!]" ".") ; Replace all final punct with periods + (str/replace #"[;:]" ",") ; Replace all middle punct with commas + (str/replace #"\s+" " ") ; Normalize space + (str/lower-case))) + symbol-freqs (frequencies tokenized) + huff-tree (build-huffman-tree symbol-freqs) + canonical-codes (build-canonical-encodings huff-tree)] + (get-encoded-length canonical-codes tokenized)) + ) + |
