diff options
| author | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-23 01:11:01 -0500 |
|---|---|---|
| committer | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-23 01:11:01 -0500 |
| commit | aed836605a48bd09294e459169599ef72eecd126 (patch) | |
| tree | 2f32d4e37e8644333c5d0d6db816be16c8c2127e | |
| parent | 162b35af9e54a573351bb19b239fb95fb4f57cc8 (diff) | |
Add comments
| -rw-r--r-- | bible/encode.clj | 95 |
1 files changed, 49 insertions, 46 deletions
diff --git a/bible/encode.clj b/bible/encode.clj index 8a2976b..5d91bd4 100644 --- a/bible/encode.clj +++ b/bible/encode.clj @@ -5,51 +5,54 @@ (require '[clojure.data.priority-map :as pm]) -#_(def files (fs/glob "./base_files/" "**.txt")) +(comment + "Build the base file" + (def files (fs/glob "./base_files/" "**.txt")) -#_(defn get-book-num [filename] - (let [[_ _ book _ _] - (str/split (str filename) #"_") - #_#_chap (int _chap)] - (Integer/parseInt book))) + (defn get-book-num [filename] + (let [[_ _ book _ _] + (str/split (str filename) #"_") + #_#_chap (int _chap)] + (Integer/parseInt book))) -#_(defn get-chap-num [filename] - (let [[_ _ _ _ chap] - (str/split (str filename) #"_") - #_#_chap (int _chap)] - (Integer/parseInt chap))) + (defn get-chap-num [filename] + (let [[_ _ _ _ chap] + (str/split (str filename) #"_") + #_#_chap (int _chap)] + (Integer/parseInt chap))) -#_(with-open [writer (io/writer "bbe-newlines-nochaps.txt")] - (doseq [f (sort-by (juxt get-book-num get-chap-num) files)] - (with-open [reader (io/reader (fs/file f))] - (doseq [line (drop 2 (line-seq reader))] - (.write writer (str line "\n")))))) + (with-open [writer (io/writer "bbe-newlines-nochaps.txt")] + (doseq [f (sort-by (juxt get-book-num get-chap-num) files)] + (with-open [reader (io/reader (fs/file f))] + (doseq [line (drop 2 (line-seq reader))] + (.write writer (str line "\n"))))))) (def full-text (slurp "bbe-newlines-nochaps.txt")) (def tokens (-> full-text - (str/lower-case) - (str/replace #"\s+" " ") - (str/replace #"'s" " APOSTROPHE_S ") - (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " ")) - (str/split #" ") - (#(remove str/blank? %1)))) + (str/lower-case) + (str/replace #"\s+" " ") + (str/replace #"'s" " APOSTROPHE_S ") + (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " ")) + (str/split #" ") + (#(remove str/blank? %1)))) (def symbol-freqs (frequencies tokens)) -#_(spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs)))) +(comment + "Do some basic statistics and print a list of tokens" + (spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs)))) + (sort-by val > symbol-freqs) ; Greatest to lease frequency + (reduce + (map val symbol-freqs)) ; Total tokens + (count symbol-freqs) ; Total unique tokens + (reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens -#_(sort-by val > symbol-freqs) ; Greatest to lease frequency -#_(reduce + (map val symbol-freqs)) ; Total tokens -#_(count symbol-freqs) ; Total unique tokens -#_(reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens + (reduce + (map count symbol-freqs)) ; Total chars needed for dict vals -#_(reduce + (map count symbol-freqs)) ; Total chars needed for dict vals - -#_(def two-grams (frequencies (partition 2 1 tokens))) -#_(sort-by val > two-grams) + (def two-grams (frequencies (partition 2 1 tokens))) + (sort-by val > two-grams)) @@ -96,8 +99,8 @@ (if (.sym node) (assoc encodings (.sym node) curr-encoding) (merge - (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding)) - (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding))))) + (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding)) + (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding))))) (def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} "")) (assert (= (count huffman-tree-syms) @@ -128,19 +131,19 @@ ([codes symbols] (if (not-empty symbols) (let [prev-codeword (last codes) - current-codeword (first symbols) - next-sym (.sym current-codeword) - next-base-code (unchecked-inc (.code prev-codeword)) - prev-len (.length prev-codeword) - next-codeword (if (= (.length current-codeword) - (.length prev-codeword)) - (->HuffmanCodeword next-sym - next-base-code - prev-len) - (->HuffmanCodeword next-sym - (bit-shift-left next-base-code (inc prev-len)) - (inc prev-len)))] - (recur (conj codes next-codeword) (rest symbols))) + current-codeword (first symbols) + next-sym (.sym current-codeword) + next-base-code (unchecked-inc (.code prev-codeword)) + prev-len (.length prev-codeword) + next-codeword (if (= (.length current-codeword) + (.length prev-codeword)) + (->HuffmanCodeword next-sym + next-base-code + prev-len) + (->HuffmanCodeword next-sym + (bit-shift-left next-base-code (inc prev-len)) + (inc prev-len)))] + (recur (conj codes next-codeword) (rest symbols))) codes))) |
