diff options
| author | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-23 20:01:42 -0500 |
|---|---|---|
| committer | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-23 20:01:42 -0500 |
| commit | a5cb8e19665de19f21d5b8f19719b9d448c99b67 (patch) | |
| tree | b7690b6c3942943fb06399fa2b5b6d4afbf69da4 /bible/dictionary.clj | |
| parent | d47553f0a7456868418ad9a4c4a5af182528f254 (diff) | |
write a basic dictionary compressor
Diffstat (limited to 'bible/dictionary.clj')
| -rw-r--r-- | bible/dictionary.clj | 90 |
1 file changed, 90 insertions, 0 deletions
;; bible/dictionary.clj -- reconstructed from a cgit diff view
;; (new file in commit a5cb8e1, "write a basic dictionary compressor").
;;
;; Exploratory script: estimate how small the BBE bible text would be
;; under a simple word-dictionary compression scheme.
(require '[clojure.string :as str])

(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])

(require '[clojure.data.priority-map :as pm])

(require '[clojure.math :as math])

(comment
  "Build the base file"
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Filenames are underscore-separated; field 3 holds the book number
  ;; and field 5 the chapter number -- TODO confirm against base_files/.
  (defn get-book-num [filename]
    (let [[_ _ book _ _] (str/split (str filename) #"_")]
      (Integer/parseInt book)))

  (defn get-chap-num [filename]
    (let [[_ _ _ _ chap] (str/split (str filename) #"_")]
      (Integer/parseInt chap)))

  ;; Concatenate every chapter file (ordered by book, then chapter) into
  ;; one newline-separated file, dropping each file's two header lines.
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

(defn sizetable
  "Express a size of `bits` bits as a map of :bits, :bytes, :kbs and
  :mbs. Bytes are rounded up to the next whole byte (via math/ceil, so
  the value is a double); :kbs and :mbs may be non-integral."
  [bits]
  (let [bytect (math/ceil (/ bits 8))
        kbs    (/ bytect 1024)
        mbs    (/ kbs 1024)]
    {:bits  bits
     :bytes bytect
     :kbs   kbs
     :mbs   mbs}))

(defn- tokenize
  "Shared tokenization pipeline (was copy-pasted between the two
  experiments below): collapse every whitespace run into the sentinel
  token SP, split the listed punctuation characters out as their own
  tokens, and return the sequence of non-blank tokens."
  [text]
  (->> (-> text
           (str/replace #"\s+" " SP ") ;; all spaces become one space
           (str/replace #"[,.;:!?\(\)\[\]'\*-]" #(str " " %1 " "))
           (str/replace #"\s+" " ") ;; all spaces normalized to one space
           (str/split #" "))
       (remove str/blank?)))

(def full-text (slurp "bbe-newlines-nochaps.txt"))

;; Baseline: the raw text at 8 bits per character.
(def base-size (sizetable (* 8 (count full-text))))

"Naiive, just tokenize with spaces"
(let [toks  (tokenize full-text)
      freqs (frequencies toks)] ;; 6689 unique tokens
  (count freqs)
  #_(sizetable (* 13 (count toks))))

"Lower-case, remove 's, remove word final s tokenize"
;; NOTE(review): despite the caption, only 's is stripped (to the AS
;; sentinel); no word-final s handling is present in the code.
(let [toks  (-> full-text
                (str/lower-case)
                (str/replace #"'s" " AS ") ;; Apostrophe S
                tokenize)
      freqs (frequencies toks)] ;; 5998 unique tokens
  (count freqs)
  #_(sizetable (* 13 (count toks))))

"Takes 13 bits for this dictionary"
(Integer/toBinaryString 5998)
"_-_10111 01101110"

"The text uses only 1000 unique words (not counting proper nouns?)->Possible to
get each word to 10 bits? Maybe two bytes?"
(math/pow 2 10)

"Dictionary compression is a dead end for me (I think), but it was worth looking
into."
