aboutsummaryrefslogtreecommitdiff
path: root/bible
diff options
context:
space:
mode:
authorPatrick Kingston <patrick@pkingston.xyz>2026-01-23 01:11:01 -0500
committerPatrick Kingston <patrick@pkingston.xyz>2026-01-23 01:11:01 -0500
commitaed836605a48bd09294e459169599ef72eecd126 (patch)
tree2f32d4e37e8644333c5d0d6db816be16c8c2127e /bible
parent162b35af9e54a573351bb19b239fb95fb4f57cc8 (diff)
Add comments
Diffstat (limited to 'bible')
-rw-r--r--bible/encode.clj95
1 files changed, 49 insertions, 46 deletions
diff --git a/bible/encode.clj b/bible/encode.clj
index 8a2976b..5d91bd4 100644
--- a/bible/encode.clj
+++ b/bible/encode.clj
@@ -5,51 +5,54 @@
(require '[clojure.data.priority-map :as pm])
-#_(def files (fs/glob "./base_files/" "**.txt"))
+(comment
+ "Build the base file"
+ (def files (fs/glob "./base_files/" "**.txt"))
-#_(defn get-book-num [filename]
- (let [[_ _ book _ _]
- (str/split (str filename) #"_")
- #_#_chap (int _chap)]
- (Integer/parseInt book)))
+ (defn get-book-num [filename]
+ (let [[_ _ book _ _]
+ (str/split (str filename) #"_")
+ #_#_chap (int _chap)]
+ (Integer/parseInt book)))
-#_(defn get-chap-num [filename]
- (let [[_ _ _ _ chap]
- (str/split (str filename) #"_")
- #_#_chap (int _chap)]
- (Integer/parseInt chap)))
+ (defn get-chap-num [filename]
+ (let [[_ _ _ _ chap]
+ (str/split (str filename) #"_")
+ #_#_chap (int _chap)]
+ (Integer/parseInt chap)))
-#_(with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
- (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
- (with-open [reader (io/reader (fs/file f))]
- (doseq [line (drop 2 (line-seq reader))]
- (.write writer (str line "\n"))))))
+ (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
+ (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
+ (with-open [reader (io/reader (fs/file f))]
+ (doseq [line (drop 2 (line-seq reader))]
+ (.write writer (str line "\n")))))))
(def full-text (slurp "bbe-newlines-nochaps.txt"))
(def tokens
(-> full-text
- (str/lower-case)
- (str/replace #"\s+" " ")
- (str/replace #"'s" " APOSTROPHE_S ")
- (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
- (str/split #" ")
- (#(remove str/blank? %1))))
+ (str/lower-case)
+ (str/replace #"\s+" " ")
+ (str/replace #"'s" " APOSTROPHE_S ")
+ (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
+ (str/split #" ")
+ (#(remove str/blank? %1))))
(def symbol-freqs (frequencies tokens))
-#_(spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))
+(comment
+ "Do some basic statistics and print a list of tokens"
+ (spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))
+ (sort-by val > symbol-freqs) ; Greatest to least frequency
+ (reduce + (map val symbol-freqs)) ; Total tokens
+ (count symbol-freqs) ; Total unique tokens
+ (reduce + (take 512 (map val symbol-freqs))) ; Summed counts of the first 512 entries (symbol-freqs is unsorted here, so not necessarily the most common tokens)
-#_(sort-by val > symbol-freqs) ; Greatest to lease frequency
-#_(reduce + (map val symbol-freqs)) ; Total tokens
-#_(count symbol-freqs) ; Total unique tokens
-#_(reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens
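+ (reduce + (map count symbol-freqs)) ; Intended: total chars needed for dict vals. NOTE(review): count of each [token freq] entry is always 2 -- probably wants (map (comp count key) symbol-freqs); confirm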
-#_(reduce + (map count symbol-freqs)) ; Total chars needed for dict vals
-
-#_(def two-grams (frequencies (partition 2 1 tokens)))
-#_(sort-by val > two-grams)
+ (def two-grams (frequencies (partition 2 1 tokens)))
+ (sort-by val > two-grams))
@@ -96,8 +99,8 @@
(if (.sym node)
(assoc encodings (.sym node) curr-encoding)
(merge
- (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
- (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))
+ (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
+ (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))
(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))
(assert (= (count huffman-tree-syms)
@@ -128,19 +131,19 @@
([codes symbols]
(if (not-empty symbols)
(let [prev-codeword (last codes)
- current-codeword (first symbols)
- next-sym (.sym current-codeword)
- next-base-code (unchecked-inc (.code prev-codeword))
- prev-len (.length prev-codeword)
- next-codeword (if (= (.length current-codeword)
- (.length prev-codeword))
- (->HuffmanCodeword next-sym
- next-base-code
- prev-len)
- (->HuffmanCodeword next-sym
- (bit-shift-left next-base-code (inc prev-len))
- (inc prev-len)))]
- (recur (conj codes next-codeword) (rest symbols)))
+ current-codeword (first symbols)
+ next-sym (.sym current-codeword)
+ next-base-code (unchecked-inc (.code prev-codeword))
+ prev-len (.length prev-codeword)
+ next-codeword (if (= (.length current-codeword)
+ (.length prev-codeword))
+ (->HuffmanCodeword next-sym
+ next-base-code
+ prev-len)
+ (->HuffmanCodeword next-sym
+ (bit-shift-left next-base-code (inc prev-len))
+ (inc prev-len)))]
+ (recur (conj codes next-codeword) (rest symbols)))
codes)))