aboutsummaryrefslogtreecommitdiff
path: root/bible/encode.clj
diff options
context:
space:
mode:
Diffstat (limited to 'bible/encode.clj')
-rw-r--r--bible/encode.clj106
1 files changed, 106 insertions, 0 deletions
diff --git a/bible/encode.clj b/bible/encode.clj
new file mode 100644
index 0000000..031b6aa
--- /dev/null
+++ b/bible/encode.clj
@@ -0,0 +1,106 @@
+(require '[clojure.string :as str])
+
+(require '[clojure.java.io :as io])
+(require '[babashka.fs :as fs])
+
+(require '[clojure.data.priority-map :as pm])
+
+(def files (fs/glob "./base_files/" "**.txt"))
+
+(defn get-book-num [filename]
+ (let [[_ _ book _ _]
+ (str/split (str filename) #"_")
+ #_#_chap (int _chap)]
+ (Integer/parseInt book)))
+
+(defn get-chap-num [filename]
+ (let [[_ _ _ _ chap]
+ (str/split (str filename) #"_")
+ #_#_chap (int _chap)]
+ (Integer/parseInt chap)))
+
+(with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
+ (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
+ (with-open [reader (io/reader (fs/file f))]
+ (doseq [line (drop 2 (line-seq reader))]
+ (.write writer (str line "\n"))))))
+
+(def full-text (slurp "bbe-newlines-nochaps.txt"))
+
+(def tokens
+ (-> full-text
+ (str/lower-case)
+ (str/replace #"\s+" " ")
+ (str/replace #"'s" " APOSTROPHE_S ")
+ (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
+ (str/split #" ")
+ (#(remove str/blank? %1))))
+
+(def symbol-freqs (frequencies tokens))
+
+#_(spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))
+
+
+(sort-by val > symbol-freqs) ; Greatest to lease frequency
+(reduce + (map val symbol-freqs)) ; Total tokens
+(count symbol-freqs) ; Total unique tokens
+(reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens
+
+(reduce + (map count symbol-freqs)) ; Total chars needed for dict vals
+
+#_(def two-grams (frequencies (partition 2 1 tokens)))
+#_(sort-by val > two-grams)
+
+
+
+
+;;; Make the huffman tree for the symbols (13)
+(defrecord Node [left right sym probability])
+
+; Create a prioirity-queue of parentless nodes for each symbol
+(def pq
+ (into (pm/priority-map-keyfn (juxt first second))
+ (map #(vector
+ (->Node nil nil (first %1) (second %1))
+ [(second %1) (first %1)])
+ symbol-freqs)))
+
+(assert (= (count symbol-freqs) (count pq)) "Priority queue has fewer symbols than symbol list")
+
+;; From: https://michaeldipperstein.github.io/huffman.html#decode
+;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability.
+;; Step 2. Select the two parentless nodes with the lowest probabilities.
+;; Step 3. Create a new node which is the parent of the two lowest probability nodes.
+;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities.
+;; Step 5. Repeat from Step 2 until there is only one parentless node left.
+
+;; NOTE: This is an inefficient algorithm because we could use the
+;; two-queue version on wikipedia
+(defn build-huffman-tree [queue]
+ (if (= 1 (count queue))
+ (first (peek queue)) ; Repeat until there is only one parentless node left
+ (let [[lowest-node [lowest-prob _]] (peek queue)
+ [second-node [second-prob _]] (peek (pop queue)) ; Step 2
+ new-prob (+ lowest-prob second-prob) ; Step 4
+ new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order
+ next-queue (assoc (pop (pop queue)) new-node [new-prob nil])]
+ (recur next-queue))))
+
+(def huffman-tree (build-huffman-tree pq))
+
+(assert (= (.probability huffman-tree)
+ (reduce + (map val symbol-freqs)))
+ "Probability of root node is not equal to the sum of all probabilities")
+
+(defn huffman-tree-to-symbol-encodings [node encodings curr-encoding]
+ (if (.sym node)
+ (assoc encodings (.sym node) curr-encoding)
+ (merge
+ (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
+ (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))
+
+(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))
+(assert (= (count huffman-tree-syms)
+ (count pq)
+ (count symbol-freqs)))
+