1 files changed, 59 insertions, 12 deletions
diff --git a/bible/encode.clj b/bible/encode.clj
index ac6195d..abd5365 100644
--- a/bible/encode.clj
+++ b/bible/encode.clj
@@ -5,6 +5,8 @@
 
 (require '[clojure.data.priority-map :as pm])
 
+(require '[clojure.math :as math])
+
 (comment
   "Build the base file"
   (def files (fs/glob "./base_files/" "**.txt"))
@@ -33,7 +35,7 @@
   (-> full-text 
       (str/lower-case)
       (str/replace #"\s+" " ")
-      (str/replace #"'s" " APOSTROPHE_S ")
+      (str/replace #"'s" " AS ") ;; Apostrophe_S
       (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
       (str/split #" ")
       (#(remove str/blank? %1))))
@@ -61,14 +63,14 @@
 (defrecord Node [left right sym probability])
 
 ; Create a prioirity-queue of parentless nodes for each symbol
-(def pq
+(defn symboltable->pq [symbolfreqs]
   (into (pm/priority-map-keyfn (juxt first second))
         (map #(vector
                 (->Node nil nil (first %1) (second %1))
                 [(second %1) (first %1)])
-             symbol-freqs)))
+             symbolfreqs)))
 
-(assert (= (count symbol-freqs) (count pq)) "Priority queue has fewer symbols than symbol list")
+(assert (= (count symbol-freqs) (count (symboltable->pq symbol-freqs))) "Priority queue has fewer symbols than symbol list")
 
 ;; From: https://michaeldipperstein.github.io/huffman.html#decode
 ;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability.
@@ -79,7 +81,9 @@
 
 ;; NOTE: This is an inefficient algorithm because we could use the 
 ;;       two-queue version on wikipedia
-(defn build-huffman-tree [queue]
+(defn build-huffman-tree-from-queue
+  "Builds a huffman-tree out of a priority-queue"
+  [queue]
   (if (= 1 (count queue))
     (first (peek queue))  ; Repeat until there is only one parentless node left
     (let [[lowest-node [lowest-prob _]] (peek queue)
@@ -89,7 +93,12 @@
           next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] 
       (recur next-queue))))
 
-(def huffman-tree (build-huffman-tree pq))
+(defn build-huffman-tree
+  "builds a huffman-tree out of a frequency table of symbols"
+  [symbol-freq-table]
+  (build-huffman-tree-from-queue (symboltable->pq symbol-freq-table)))
+
+(def huffman-tree (build-huffman-tree symbol-freqs))
 
 (assert (= (.probability huffman-tree)
            (reduce + (map val symbol-freqs)))
@@ -116,7 +125,18 @@
 ;;     Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value.
 ;;     When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.
 
+
+;; little class for Huffman Codewords
 (defrecord HuffmanCodeword [sym code length])
+(defn get-codeword-string 
+  "Takes a [HuffmanCodeword] and returns a binary str of the code"
+  [codeword]
+  (let [base (Long/toBinaryString (.code codeword))
+        basecount (count base)
+        len (.length codeword)]
+    (if (= basecount len)
+      base
+      (str (apply str (take (- len basecount) (repeat "0"))) base))))
 
 (def sorted-huffman-tree-codewords 
   (->> huffman-tree-syms
@@ -124,7 +144,8 @@
        (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))
 
 (defn build-canonical-encodings
-  "Build canonical huffman encodings from a sorted list of huffman tree codewords"
+  "Build canonical huffman encodings from a sorted list of huffman tree codewords
+   takes [symbols] a list of huffman codes derived from a code-tree"
   ([symbols]
    (let [first-sym (first symbols)
          seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))]
@@ -156,6 +177,7 @@
 (def canonical-encodings
   (build-canonical-encodings sorted-huffman-tree-codewords))
 
+
 (assert
   (= (map #(.length %1) sorted-huffman-tree-codewords)
      (map #(.length %1) sorted-huffman-tree-codewords))
@@ -166,9 +188,34 @@
      (count (set (map #(Long/toBinaryString (.code %1)) canonical-encodings))))
   "There appears to be duplicate canonical encodings")
 
+(defn get-encoded-length [encodings tokenlist]
+  (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings))
+        totalbits (reduce + (map encodingtable tokenlist))
+        totalbytes (math/ceil (/ totalbits 8))
+        totalkb (math/ceil (/ totalbytes 1024))
+        totalmb (math/ceil (/ totalkb 1024))]
+    {:bits totalbits
+     :bytes totalbytes
+     :kb totalkb
+     :mb totalmb}))
+
+(count (frequencies (str/replace full-text #"\*" "")))
+
+(def raw-token-ascii-encoding
+  (get-encoded-length canonical-encodings tokens))
+
 (comment
-  "Some basic stuff"
-  (take 10 (sort-by (juxt (comp count val) key) huffman-tree-syms))
-  (take 10 sorted-huffman-tree-codewords)
-  (take 10 canonical-encodings)
-  (take 10 (map #(Long/toBinaryString (.code %1)) canonical-encodings)))
+  "Figuring out how to make the shortest bit smaller"
+  (defn get-highest-freq-percent [freqlist]
+    (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes
+    [(key most-freq)
+     (double 
+      (/ (second most-freq)
+         (reduce + (map second symbol-freqs))))]))
+
+  (get-highest-freq-percent symbol-freqs) ;most frequent symbol is 6.83%
+  (get-highest-freq-percent (sort-by val > (frequencies (map first tokens)))) ;most frequent 1-letter prefix
+  (get-highest-freq-percent (sort-by val > (frequencies (map last tokens)))) ;most frequent 1-letter suffix
+  (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens
+  )
+