diff options
Diffstat (limited to 'bible/encode.clj')
| -rw-r--r-- | bible/encode.clj | 71 |
1 files changed, 59 insertions, 12 deletions
diff --git a/bible/encode.clj b/bible/encode.clj index ac6195d..abd5365 100644 --- a/bible/encode.clj +++ b/bible/encode.clj @@ -5,6 +5,8 @@ (require '[clojure.data.priority-map :as pm]) +(require '[clojure.math :as math]) + (comment "Build the base file" (def files (fs/glob "./base_files/" "**.txt")) @@ -33,7 +35,7 @@ (-> full-text (str/lower-case) (str/replace #"\s+" " ") - (str/replace #"'s" " APOSTROPHE_S ") + (str/replace #"'s" " AS ") ;; Apostrophe_S (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " ")) (str/split #" ") (#(remove str/blank? %1)))) @@ -61,14 +63,14 @@ (defrecord Node [left right sym probability]) ; Create a prioirity-queue of parentless nodes for each symbol -(def pq +(defn symboltable->pq [symbolfreqs] (into (pm/priority-map-keyfn (juxt first second)) (map #(vector (->Node nil nil (first %1) (second %1)) [(second %1) (first %1)]) - symbol-freqs))) + symbolfreqs))) -(assert (= (count symbol-freqs) (count pq)) "Priority queue has fewer symbols than symbol list") +(assert (= (count symbol-freqs) (count (symboltable->pq symbol-freqs))) "Priority queue has fewer symbols than symbol list") ;; From: https://michaeldipperstein.github.io/huffman.html#decode ;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability. @@ -79,7 +81,9 @@ ;; NOTE: This is an inefficient algorithm because we could use the ;; two-queue version on wikipedia -(defn build-huffman-tree [queue] +(defn build-huffman-tree-from-queue + "Builds a huffman-tree out of a priority-queue" + [queue] (if (= 1 (count queue)) (first (peek queue)) ; Repeat until there is only one parentless node left (let [[lowest-node [lowest-prob _]] (peek queue) @@ -89,7 +93,12 @@ next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] (recur next-queue)))) -(def huffman-tree (build-huffman-tree pq)) +(defn build-huffman-tree + "builds a huffman-tree out of a frequency table of symbols" + [symbol-freq-table] + (build-huffman-tree-from-queue (symboltable->pq symbol-freq-table))) + +(def huffman-tree (build-huffman-tree symbol-freqs)) (assert (= (.probability huffman-tree) (reduce + (map val symbol-freqs))) @@ -116,7 +125,18 @@ ;; Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value. ;; When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift. + +;; little class for Huffman Codewords (defrecord HuffmanCodeword [sym code length]) +(defn get-codeword-string + "Takes a [HuffmanCodeword] and returns a binary str of the code" + [codeword] + (let [base (Long/toBinaryString (.code codeword)) + basecount (count base) + len (.length codeword)] + (if (= basecount len) + base + (str (apply str (take (- len basecount) (repeat "0"))) base)))) (def sorted-huffman-tree-codewords (->> huffman-tree-syms @@ -124,7 +144,8 @@ (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1))))))) (defn build-canonical-encodings - "Build canonical huffman encodings from a sorted list of huffman tree codewords" + "Build canonical huffman encodings from a sorted list of huffman tree codewords + takes [symbols] a list of huffman codes derived from a code-tree" ([symbols] (let [first-sym (first symbols) seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))] @@ -156,6 +177,7 @@ (def canonical-encodings (build-canonical-encodings sorted-huffman-tree-codewords)) + (assert (= (map #(.length %1) sorted-huffman-tree-codewords) (map #(.length %1) sorted-huffman-tree-codewords)) @@ -166,9 +188,34 @@ (count (set (map #(Long/toBinaryString (.code %1)) canonical-encodings)))) "There appears to be duplicate canonical encodings") +(defn get-encoded-length [encodings tokenlist] + (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings)) + totalbits (reduce + (map encodingtable tokenlist)) + totalbytes (math/ceil (/ totalbits 8)) + totalkb (math/ceil (/ totalbytes 1024)) + totalmb (math/ceil (/ totalkb 1024))] + {:bits totalbits + :bytes totalbytes + :kb totalkb + :mb totalmb})) + +(count (frequencies (str/replace full-text #"\*" ""))) + +(def raw-token-ascii-encoding + (get-encoded-length canonical-encodings tokens)) + (comment - "Some basic stuff" - (take 10 (sort-by (juxt (comp count val) key) huffman-tree-syms)) - (take 10 sorted-huffman-tree-codewords) - (take 10 canonical-encodings) - (take 10 (map #(Long/toBinaryString (.code %1)) canonical-encodings))) + "Figuring out how to make the shortest bit smaller" + (defn get-highest-freq-percent [freqlist] + (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes + [(key most-freq) + (double + (/ (second most-freq) + (reduce + (map second symbol-freqs))))])) + + (get-highest-freq-percent symbol-freqs) ;most frequent symbol is 6.83% + (get-highest-freq-percent (sort-by val > (frequencies (map first tokens)))) ;most frequent 1-letter prefix + (get-highest-freq-percent (sort-by val > (frequencies (map last tokens)))) ;most frequent 1-letter suffix + (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens + ) + |
