diff options
Diffstat (limited to 'bible/encode.clj')
| -rw-r--r-- | bible/encode.clj | 106 |
1 files changed, 106 insertions, 0 deletions
diff --git a/bible/encode.clj b/bible/encode.clj new file mode 100644 index 0000000..031b6aa --- /dev/null +++ b/bible/encode.clj @@ -0,0 +1,106 @@ +(require '[clojure.string :as str]) + +(require '[clojure.java.io :as io]) +(require '[babashka.fs :as fs]) + +(require '[clojure.data.priority-map :as pm]) + +(def files (fs/glob "./base_files/" "**.txt")) + +(defn get-book-num [filename] + (let [[_ _ book _ _] + (str/split (str filename) #"_") + #_#_chap (int _chap)] + (Integer/parseInt book))) + +(defn get-chap-num [filename] + (let [[_ _ _ _ chap] + (str/split (str filename) #"_") + #_#_chap (int _chap)] + (Integer/parseInt chap))) + +(with-open [writer (io/writer "bbe-newlines-nochaps.txt")] + (doseq [f (sort-by (juxt get-book-num get-chap-num) files)] + (with-open [reader (io/reader (fs/file f))] + (doseq [line (drop 2 (line-seq reader))] + (.write writer (str line "\n")))))) + +(def full-text (slurp "bbe-newlines-nochaps.txt")) + +(def tokens + (-> full-text + (str/lower-case) + (str/replace #"\s+" " ") + (str/replace #"'s" " APOSTROPHE_S ") + (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " ")) + (str/split #" ") + (#(remove str/blank? %1)))) + +(def symbol-freqs (frequencies tokens)) + +#_(spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs)))) + + +(sort-by val > symbol-freqs) ; Greatest to lease frequency +(reduce + (map val symbol-freqs)) ; Total tokens +(count symbol-freqs) ; Total unique tokens +(reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens + +(reduce + (map count symbol-freqs)) ; Total chars needed for dict vals + +#_(def two-grams (frequencies (partition 2 1 tokens))) +#_(sort-by val > two-grams) + + + + +;;; Make the huffman tree for the symbols (13) +(defrecord Node [left right sym probability]) + +; Create a prioirity-queue of parentless nodes for each symbol +(def pq + (into (pm/priority-map-keyfn (juxt first second)) + (map #(vector + (->Node nil nil (first %1) (second %1)) + [(second %1) (first %1)]) + symbol-freqs))) + +(assert (= (count symbol-freqs) (count pq)) "Priority queue has fewer symbols than symbol list") + +;; From: https://michaeldipperstein.github.io/huffman.html#decode +;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability. +;; Step 2. Select the two parentless nodes with the lowest probabilities. +;; Step 3. Create a new node which is the parent of the two lowest probability nodes. +;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities. +;; Step 5. Repeat from Step 2 until there is only one parentless node left. + +;; NOTE: This is an inefficient algorithm because we could use the +;; two-queue version on wikipedia +(defn build-huffman-tree [queue] + (if (= 1 (count queue)) + (first (peek queue)) ; Repeat until there is only one parentless node left + (let [[lowest-node [lowest-prob _]] (peek queue) + [second-node [second-prob _]] (peek (pop queue)) ; Step 2 + new-prob (+ lowest-prob second-prob) ; Step 4 + new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order + next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] + (recur next-queue)))) + +(def huffman-tree (build-huffman-tree pq)) + +(assert (= (.probability huffman-tree) + (reduce + (map val symbol-freqs))) + "Probability of root node is not equal to the sum of all probabilities") + +(defn huffman-tree-to-symbol-encodings [node encodings curr-encoding] + (if (.sym node) + (assoc encodings (.sym node) curr-encoding) + (merge + (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding)) + (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding))))) + +(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} "")) +(assert (= (count huffman-tree-syms) + (count pq) + (count symbol-freqs))) + |
