aboutsummaryrefslogtreecommitdiff
path: root/bible
diff options
context:
space:
mode:
Diffstat (limited to 'bible')
-rw-r--r--bible/huffman.clj (renamed from bible/encode.clj)108
1 files changed, 93 insertions, 15 deletions
diff --git a/bible/encode.clj b/bible/huffman.clj
index abd5365..c219b7e 100644
--- a/bible/encode.clj
+++ b/bible/huffman.clj
@@ -31,6 +31,7 @@
(def full-text (slurp "bbe-newlines-nochaps.txt"))
+;; This is "Stage1-optimized-tokens"
(def tokens
(-> full-text
(str/lower-case)
@@ -104,17 +105,28 @@
(reduce + (map val symbol-freqs)))
"Probability of root node is not equal to the sum of all probabilities")
-(defn huffman-tree-to-symbol-encodings [node encodings curr-encoding]
+(defn huffman-tree-to-symbol-encodings
+ "Builds a list of symbol encodings out of a huffman-tree
+ [tree] tree to symbol encodings
+ [node encodings curr-encoding] recursive builder"
+ ([tree]
+ (huffman-tree-to-symbol-encodings tree {} ""))
+ ([node encodings curr-encoding]
(if (.sym node)
(assoc encodings (.sym node) curr-encoding)
(merge
(huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
- (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))
+ (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding))))))
(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))
+
+(assert (= (huffman-tree-to-symbol-encodings huffman-tree {} "")
+ (huffman-tree-to-symbol-encodings huffman-tree))
+ "Multi-arity def does not work as intended")
(assert (= (count huffman-tree-syms)
- (count pq)
- (count symbol-freqs)))
+ (count (symboltable->pq symbol-freqs))
+ (count symbol-freqs))
+ "Some function is adding or removing symbols")
;;; Build the canonical encodings
@@ -126,7 +138,7 @@
;; When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.
-;; little class for Huffman Codewords
+;; --- little class for Huffman Codewords
(defrecord HuffmanCodeword [sym code length])
(defn get-codeword-string
"Takes a [HuffmanCodeword] and returns a binary str of the code"
@@ -138,16 +150,23 @@
base
(str (apply str (take (- len basecount) (repeat "0"))) base))))
+;; --- little class for Huffman Codewords
+
(def sorted-huffman-tree-codewords
(->> huffman-tree-syms
(sort-by (juxt (comp count val) key))
(map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))
(defn build-canonical-encodings
- "Build canonical huffman encodings from a sorted list of huffman tree codewords
+ "Build canonical huffman encodings from a huffman tree
takes [symbols] a list of huffman codes derived from a code-tree"
- ([symbols]
- (let [first-sym (first symbols)
+ ([tree]
+ (let [symbols (->> tree
+ (huffman-tree-to-symbol-encodings)
+ (sort-by (juxt (comp count val) key))
+
+ (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1))))))
+ first-sym (first symbols)
seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))]
(build-canonical-encodings [seed-symbol] (rest symbols))))
([codes symbols]
@@ -175,7 +194,7 @@
codes)))
(def canonical-encodings
- (build-canonical-encodings sorted-huffman-tree-codewords))
+ (build-canonical-encodings huffman-tree))
(assert
@@ -192,20 +211,29 @@
(let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings))
totalbits (reduce + (map encodingtable tokenlist))
totalbytes (math/ceil (/ totalbits 8))
- totalkb (math/ceil (/ totalbytes 1024))
- totalmb (math/ceil (/ totalkb 1024))]
+ totalkb (/ totalbytes 1024)
+ totalmb (/ totalkb 1024)]
{:bits totalbits
:bytes totalbytes
:kb totalkb
:mb totalmb}))
-(count (frequencies (str/replace full-text #"\*" "")))
-(def raw-token-ascii-encoding
- (get-encoded-length canonical-encodings tokens))
+(comment
+ (def base-file-size
+ (let [totalbits (* 8 (count full-text))
+ totalbytes (math/ceil (/ totalbits 8))
+ totalkb (/ totalbytes 1024)
+ totalmb (/ totalkb 1024)]
+ {:bits totalbits
+ :bytes totalbytes
+ :kb totalkb
+ :mb totalmb}))
+ (def stage1-optimized-token-encoding-length
+ (get-encoded-length canonical-encodings tokens)))
(comment
- "Figuring out how to make the shortest bit smaller"
+ "Some scratchwork figuring out how to make the shortest bit smaller"
(defn get-highest-freq-percent [freqlist]
(let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes
[(key most-freq)
@@ -219,3 +247,53 @@
(get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens
)
+;;; NOTE FOR VIDEO:
+;; First, consider some kind of bit-packing scheme
+;; - 65 total chars
+;; - Shave it down to 63
+;; - Remove caps/reduce even furhter to try and get it to 31
+;; Then introduce huffman encodings
+;; - Raw text
+;; - Text with some of the above optimizations
+;; - Tokenize at spaces and punctuation
+;; Then consider tokenizing I started off by tokenizing the text with some optimizations in place with stage1-optimized-token-encoding-length
+
+(comment
+ "Huffman on the raw text (65 total symbols)"
+ (let [tokenized (seq full-text)
+ symbol-freqs (frequencies tokenized)
+ huff-tree (build-huffman-tree symbol-freqs)
+ canonical-codes (build-canonical-encodings huff-tree)]
+ (get-encoded-length canonical-codes tokenized))
+ "Huffman on the above with normalized space and without * (63 total symbols)"
+ (let [tokenized (seq (-> full-text
+ (str/replace #"\s+" " ")
+ (str/replace #"\*" " ")))
+ symbol-freqs (frequencies tokenized)
+ huff-tree (build-huffman-tree symbol-freqs)
+ canonical-codes (build-canonical-encodings huff-tree)]
+ (get-encoded-length canonical-codes tokenized))
+ "Huffman on the above in all lowercase (38 total symbols)"
+ (let [tokenized (seq (-> full-text
+ (str/replace #"\s+" " ")
+ (str/replace #"[\*]" "")
+ (str/lower-case)))
+ symbol-freqs (frequencies tokenized)
+ huff-tree (build-huffman-tree symbol-freqs)
+ canonical-codes (build-canonical-encodings huff-tree)]
+ (get-encoded-length canonical-codes tokenized))
+ "Huffman on the above with char changes —()?!:; (31 total symbols)"
+ (let [tokenized (seq (-> full-text
+ (str/replace #"—" "-") ; Replace emdash with hyphen
+ (str/replace #"[\*]" "") ; Get rid of chars
+ (str/replace #"\([^)]*\)" "") ; Get rid of everything between parens
+ (str/replace #"[?!]" ".") ; Replace all final punct with periods
+ (str/replace #"[;:]" ",") ; Replace all middle punct with commas
+ (str/replace #"\s+" " ") ; Normalize space
+ (str/lower-case)))
+ symbol-freqs (frequencies tokenized)
+ huff-tree (build-huffman-tree symbol-freqs)
+ canonical-codes (build-canonical-encodings huff-tree)]
+ (get-encoded-length canonical-codes tokenized))
+ )
+