aboutsummaryrefslogtreecommitdiff
path: root/bible
diff options
context:
space:
mode:
authorPatrick Kingston <patrick@pkingston.xyz>2026-01-23 12:54:20 -0500
committerPatrick Kingston <patrick@pkingston.xyz>2026-01-23 12:54:20 -0500
commitbcab304fb841814ea0acb4b912c8a7df5a395dab (patch)
tree374aabc77f9f680a9a7bd149382b538e08ffc875 /bible
parent905c5403d139b22ebf19dc752697e91eac87a060 (diff)
Start working on optimizing
Diffstat (limited to 'bible')
-rw-r--r--bible/encode.clj71
1 files changed, 59 insertions, 12 deletions
diff --git a/bible/encode.clj b/bible/encode.clj
index ac6195d..abd5365 100644
--- a/bible/encode.clj
+++ b/bible/encode.clj
@@ -5,6 +5,8 @@
(require '[clojure.data.priority-map :as pm])
+(require '[clojure.math :as math])
+
(comment
"Build the base file"
(def files (fs/glob "./base_files/" "**.txt"))
@@ -33,7 +35,7 @@
(-> full-text
(str/lower-case)
(str/replace #"\s+" " ")
- (str/replace #"'s" " APOSTROPHE_S ")
+ (str/replace #"'s" " AS ") ;; Apostrophe_S
(str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
(str/split #" ")
(#(remove str/blank? %1))))
@@ -61,14 +63,14 @@
(defrecord Node [left right sym probability])
; Create a prioirity-queue of parentless nodes for each symbol
-(def pq
+(defn symboltable->pq [symbolfreqs]
(into (pm/priority-map-keyfn (juxt first second))
(map #(vector
(->Node nil nil (first %1) (second %1))
[(second %1) (first %1)])
- symbol-freqs)))
+ symbolfreqs)))
-(assert (= (count symbol-freqs) (count pq)) "Priority queue has fewer symbols than symbol list")
+(assert (= (count symbol-freqs) (count (symboltable->pq symbol-freqs))) "Priority queue has fewer symbols than symbol list")
;; From: https://michaeldipperstein.github.io/huffman.html#decode
;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability.
@@ -79,7 +81,9 @@
;; NOTE: This is an inefficient algorithm because we could use the
;; two-queue version on wikipedia
-(defn build-huffman-tree [queue]
+(defn build-huffman-tree-from-queue
+ "Builds a huffman-tree out of a priority-queue"
+ [queue]
(if (= 1 (count queue))
(first (peek queue)) ; Repeat until there is only one parentless node left
(let [[lowest-node [lowest-prob _]] (peek queue)
@@ -89,7 +93,12 @@
next-queue (assoc (pop (pop queue)) new-node [new-prob nil])]
(recur next-queue))))
-(def huffman-tree (build-huffman-tree pq))
+(defn build-huffman-tree
+ "builds a huffman-tree out of a frequency table of symbols"
+ [symbol-freq-table]
+ (build-huffman-tree-from-queue (symboltable->pq symbol-freq-table)))
+
+(def huffman-tree (build-huffman-tree symbol-freqs))
(assert (= (.probability huffman-tree)
(reduce + (map val symbol-freqs)))
@@ -116,7 +125,18 @@
;; Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value.
;; When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.
+
+;; little class for Huffman Codewords
(defrecord HuffmanCodeword [sym code length])
+(defn get-codeword-string
+ "Takes a [HuffmanCodeword] and returns a binary str of the code"
+ [codeword]
+ (let [base (Long/toBinaryString (.code codeword))
+ basecount (count base)
+ len (.length codeword)]
+ (if (= basecount len)
+ base
+ (str (apply str (take (- len basecount) (repeat "0"))) base))))
(def sorted-huffman-tree-codewords
(->> huffman-tree-syms
@@ -124,7 +144,8 @@
(map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))
(defn build-canonical-encodings
- "Build canonical huffman encodings from a sorted list of huffman tree codewords"
+ "Build canonical huffman encodings from a sorted list of huffman tree codewords
+ takes [symbols] a list of huffman codes derived from a code-tree"
([symbols]
(let [first-sym (first symbols)
seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))]
@@ -156,6 +177,7 @@
(def canonical-encodings
(build-canonical-encodings sorted-huffman-tree-codewords))
+
(assert
(= (map #(.length %1) sorted-huffman-tree-codewords)
(map #(.length %1) sorted-huffman-tree-codewords))
@@ -166,9 +188,34 @@
(count (set (map #(Long/toBinaryString (.code %1)) canonical-encodings))))
"There appears to be duplicate canonical encodings")
+(defn get-encoded-length [encodings tokenlist]
+ (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings))
+ totalbits (reduce + (map encodingtable tokenlist))
+ totalbytes (math/ceil (/ totalbits 8))
+ totalkb (math/ceil (/ totalbytes 1024))
+ totalmb (math/ceil (/ totalkb 1024))]
+ {:bits totalbits
+ :bytes totalbytes
+ :kb totalkb
+ :mb totalmb}))
+
+(count (frequencies (str/replace full-text #"\*" "")))
+
+(def raw-token-ascii-encoding
+ (get-encoded-length canonical-encodings tokens))
+
(comment
- "Some basic stuff"
- (take 10 (sort-by (juxt (comp count val) key) huffman-tree-syms))
- (take 10 sorted-huffman-tree-codewords)
- (take 10 canonical-encodings)
- (take 10 (map #(Long/toBinaryString (.code %1)) canonical-encodings)))
+ "Figuring out how to make the shortest bit smaller"
+ (defn get-highest-freq-percent [freqlist]
+ (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes
+ [(key most-freq)
+ (double
+ (/ (second most-freq)
+ (reduce + (map second symbol-freqs))))]))
+
+ (get-highest-freq-percent symbol-freqs) ;most frequent symbol is 6.83%
+ (get-highest-freq-percent (sort-by val > (frequencies (map first tokens)))) ;most frequent 1-letter prefix
+ (get-highest-freq-percent (sort-by val > (frequencies (map last tokens)))) ;most frequent 1-letter suffix
+ (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens
+ )
+