Add comments

author: Patrick Kingston <patrick@pkingston.xyz> 2026-01-23 01:11:01 -0500
committer: Patrick Kingston <patrick@pkingston.xyz> 2026-01-23 01:11:01 -0500
commit: aed836605a48bd09294e459169599ef72eecd126 (patch)
tree: 2f32d4e37e8644333c5d0d6db816be16c8c2127e
parent: 162b35af9e54a573351bb19b239fb95fb4f57cc8 (diff)
1 files changed, 49 insertions, 46 deletions
diff --git a/bible/encode.clj b/bible/encode.clj
index 8a2976b..5d91bd4 100644
--- a/bible/encode.clj
+++ b/bible/encode.clj
@@ -5,51 +5,54 @@
 
 (require '[clojure.data.priority-map :as pm])
 
-#_(def files (fs/glob "./base_files/" "**.txt"))
+(comment
+  "Build the base file"
+  (def files (fs/glob "./base_files/" "**.txt"))
 
-#_(defn get-book-num [filename]
-  (let [[_ _ book _ _]
-        (str/split (str filename) #"_")
-        #_#_chap (int _chap)]
-    (Integer/parseInt book)))
+  (defn get-book-num [filename]
+    (let [[_ _ book _ _]
+          (str/split (str filename) #"_")
+          #_#_chap (int _chap)]
+      (Integer/parseInt book)))
 
-#_(defn get-chap-num [filename]
-  (let [[_ _ _ _ chap]
-        (str/split (str filename) #"_")
-        #_#_chap (int _chap)]
-    (Integer/parseInt chap)))
+  (defn get-chap-num [filename]
+    (let [[_ _ _ _ chap]
+          (str/split (str filename) #"_")
+          #_#_chap (int _chap)]
+      (Integer/parseInt chap)))
 
-#_(with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
-  (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
-    (with-open [reader (io/reader (fs/file f))]
-      (doseq [line (drop 2 (line-seq reader))]
-        (.write writer (str line "\n"))))))
+  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
+    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
+      (with-open [reader (io/reader (fs/file f))]
+        (doseq [line (drop 2 (line-seq reader))]
+          (.write writer (str line "\n")))))))
 
 (def full-text (slurp "bbe-newlines-nochaps.txt"))
 
 (def tokens
   (-> full-text 
-    (str/lower-case)
-    (str/replace #"\s+" " ")
-    (str/replace #"'s" " APOSTROPHE_S ")
-    (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
-    (str/split #" ")
-    (#(remove str/blank? %1))))
+      (str/lower-case)
+      (str/replace #"\s+" " ")
+      (str/replace #"'s" " APOSTROPHE_S ")
+      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
+      (str/split #" ")
+      (#(remove str/blank? %1))))
 
 (def symbol-freqs (frequencies tokens))
 
-#_(spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))
+(comment
+  "Do some basic statistics and print a list of tokens"
+  (spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))
 
+  (sort-by val > symbol-freqs)  ; Greatest to least frequency
+  (reduce + (map val symbol-freqs))  ; Total tokens
+  (count symbol-freqs)  ; Total unique tokens
+  (reduce + (take 512 (map val symbol-freqs)))  ; Summed counts of the first 512 entries (unsorted map order)
 
-#_(sort-by val > symbol-freqs)  ; Greatest to lease frequency
-#_(reduce + (map val symbol-freqs))  ; Total tokens
-#_(count symbol-freqs)  ; Total unique tokens
-#_(reduce + (take 512 (map val symbol-freqs)))  ; Number of the top 100 common tokens
+  (reduce + (map count symbol-freqs))  ; Total chars needed for dict vals
 
-#_(reduce + (map count symbol-freqs))  ; Total chars needed for dict vals
-
-#_(def two-grams (frequencies (partition 2 1 tokens)))
-#_(sort-by val > two-grams)
+  (def two-grams (frequencies (partition 2 1 tokens)))
+  (sort-by val > two-grams))
 
 
 
@@ -96,8 +99,8 @@
   (if (.sym node)
     (assoc encodings (.sym node) curr-encoding)
     (merge
-        (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
-        (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))
+      (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
+      (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))
 
 (def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))
 (assert (= (count huffman-tree-syms)
@@ -128,19 +131,19 @@
   ([codes symbols]
    (if (not-empty symbols)
      (let [prev-codeword (last codes)
-         current-codeword (first symbols)
-         next-sym (.sym current-codeword)
-         next-base-code (unchecked-inc (.code prev-codeword))
-         prev-len (.length prev-codeword)
-         next-codeword (if (= (.length current-codeword)
-                              (.length prev-codeword))
-                         (->HuffmanCodeword next-sym
-                                            next-base-code
-                                            prev-len)
-                         (->HuffmanCodeword next-sym
-                                            (bit-shift-left next-base-code (inc prev-len))
-                                            (inc prev-len)))]
-     (recur (conj codes next-codeword) (rest symbols)))
+           current-codeword (first symbols)
+           next-sym (.sym current-codeword)
+           next-base-code (unchecked-inc (.code prev-codeword))
+           prev-len (.length prev-codeword)
+           next-codeword (if (= (.length current-codeword)
+                                (.length prev-codeword))
+                           (->HuffmanCodeword next-sym
+                                              next-base-code
+                                              prev-len)
+                           (->HuffmanCodeword next-sym
+                                              (bit-shift-left next-base-code (inc prev-len))
+                                              (inc prev-len)))]
+       (recur (conj codes next-codeword) (rest symbols)))
      codes)))
author	Patrick Kingston <patrick@pkingston.xyz>	2026-01-23 01:11:01 -0500
committer	Patrick Kingston <patrick@pkingston.xyz>	2026-01-23 01:11:01 -0500
commit	aed836605a48bd09294e459169599ef72eecd126 (patch)
tree	2f32d4e37e8644333c5d0d6db816be16c8c2127e
parent	162b35af9e54a573351bb19b239fb95fb4f57cc8 (diff)