(require '[clojure.string :as str]) (require '[clojure.java.io :as io]) (require '[babashka.fs :as fs]) (require '[clojure.data.priority-map :as pm]) (def files (fs/glob "./base_files/" "**.txt")) (defn get-book-num [filename] (let [[_ _ book _ _] (str/split (str filename) #"_") #_#_chap (int _chap)] (Integer/parseInt book))) (defn get-chap-num [filename] (let [[_ _ _ _ chap] (str/split (str filename) #"_") #_#_chap (int _chap)] (Integer/parseInt chap))) (with-open [writer (io/writer "bbe-newlines-nochaps.txt")] (doseq [f (sort-by (juxt get-book-num get-chap-num) files)] (with-open [reader (io/reader (fs/file f))] (doseq [line (drop 2 (line-seq reader))] (.write writer (str line "\n")))))) (def full-text (slurp "bbe-newlines-nochaps.txt")) (def tokens (-> full-text (str/lower-case) (str/replace #"\s+" " ") (str/replace #"'s" " APOSTROPHE_S ") (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " ")) (str/split #" ") (#(remove str/blank? %1)))) (def symbol-freqs (frequencies tokens)) #_(spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs)))) (sort-by val > symbol-freqs) ; Greatest to lease frequency (reduce + (map val symbol-freqs)) ; Total tokens (count symbol-freqs) ; Total unique tokens (reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens (reduce + (map count symbol-freqs)) ; Total chars needed for dict vals #_(def two-grams (frequencies (partition 2 1 tokens))) #_(sort-by val > two-grams) ;;; Make the huffman tree for the symbols (13) (defrecord Node [left right sym probability]) ; Create a prioirity-queue of parentless nodes for each symbol (def pq (into (pm/priority-map-keyfn (juxt first second)) (map #(vector (->Node nil nil (first %1) (second %1)) [(second %1) (first %1)]) symbol-freqs))) (assert (= (count symbol-freqs) (count pq)) "Priority queue has fewer symbols than symbol list") ;; From: https://michaeldipperstein.github.io/huffman.html#decode ;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability. ;; Step 2. Select the two parentless nodes with the lowest probabilities. ;; Step 3. Create a new node which is the parent of the two lowest probability nodes. ;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities. ;; Step 5. Repeat from Step 2 until there is only one parentless node left. ;; NOTE: This is an inefficient algorithm because we could use the ;; two-queue version on wikipedia (defn build-huffman-tree [queue] (if (= 1 (count queue)) (first (peek queue)) ; Repeat until there is only one parentless node left (let [[lowest-node [lowest-prob _]] (peek queue) [second-node [second-prob _]] (peek (pop queue)) ; Step 2 new-prob (+ lowest-prob second-prob) ; Step 4 new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] (recur next-queue)))) (def huffman-tree (build-huffman-tree pq)) (assert (= (.probability huffman-tree) (reduce + (map val symbol-freqs))) "Probability of root node is not equal to the sum of all probabilities") (defn huffman-tree-to-symbol-encodings [node encodings curr-encoding] (if (.sym node) (assoc encodings (.sym node) curr-encoding) (merge (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding)) (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding))))) (def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} "")) (assert (= (count huffman-tree-syms) (count pq) (count symbol-freqs)))