aboutsummaryrefslogtreecommitdiff
path: root/bible
diff options
context:
space:
mode:
author Patrick Kingston <patrick@pkingston.xyz> 2026-01-23 20:01:42 -0500
committer Patrick Kingston <patrick@pkingston.xyz> 2026-01-23 20:01:42 -0500
commit a5cb8e19665de19f21d5b8f19719b9d448c99b67 (patch)
tree b7690b6c3942943fb06399fa2b5b6d4afbf69da4 /bible
parent d47553f0a7456868418ad9a4c4a5af182528f254 (diff)
write a basic dictionary compressor
Diffstat (limited to 'bible')
-rw-r--r-- bible/dictionary.clj 90
1 files changed, 90 insertions, 0 deletions
diff --git a/bible/dictionary.clj b/bible/dictionary.clj
new file mode 100644
index 0000000..477b14e
--- /dev/null
+++ b/bible/dictionary.clj
@@ -0,0 +1,90 @@
+(require '[clojure.string :as str])
+
+(require '[clojure.java.io :as io])
+(require '[babashka.fs :as fs])
+
+(require '[clojure.data.priority-map :as pm])
+
+(require '[clojure.math :as math])
+
+(comment
+  "Build the base file"
+  (def files (fs/glob "./base_files/" "**.txt"))
+
+  ;; Third "_"-separated piece of the file name, parsed as an integer.
+  (defn get-book-num [filename]
+    (let [pieces (str/split (str filename) #"_")]
+      (Integer/parseInt (nth pieces 2 nil))))
+
+  ;; Fifth "_"-separated piece of the file name, parsed as an integer.
+  (defn get-chap-num [filename]
+    (let [pieces (str/split (str filename) #"_")]
+      (Integer/parseInt (nth pieces 4 nil))))
+
+  ;; Concatenate every globbed file, ordered by (book, chapter), into one
+  ;; output file; the first two lines of each input file are skipped.
+  (with-open [out (io/writer "bbe-newlines-nochaps.txt")]
+    (doseq [path (sort-by (juxt get-book-num get-chap-num) files)]
+      (with-open [in (io/reader (fs/file path))]
+        (doseq [text-line (drop 2 (line-seq in))]
+          (.write out (str text-line "\n")))))))
+
+(defn sizetable
+  "Map describing a size of `bits` bits: the bit count, bytes rounded up to a whole byte, then bytes/1024 and bytes/1024/1024."
+  [bits]
+  (let [bytect (math/ceil (/ bits 8))]
+    {:bits  bits
+     :bytes bytect
+     :kbs   (/ bytect 1024)
+     :mbs   (/ bytect 1024 1024)}))
+
+(def full-text (slurp "bbe-newlines-nochaps.txt"))
+;; Baseline size at 8 bits per character of full-text (assumes one byte per char -- TODO confirm the file is plain ASCII).
+(def base-size (-> full-text count (* 8) sizetable))
+
+"Naive, just tokenize on whitespace"
+(let [;; case is preserved in this variant: the str/lower-case step stays disabled
+      marked        (str/replace full-text #"\s+" " SP ") ;; every whitespace run becomes the SP marker token
+      padded        (str/replace marked #"[,.;:!?\(\)\[\]'\*-]" #(str " " %1 " ")) ;; pad punctuation so it splits off
+      single-spaced (str/replace padded #"\s+" " ") ;; collapse whitespace runs to a single space
+      toks          (remove str/blank? (str/split single-spaced #" "))
+      freqs         (frequencies toks) ;6689 unique tokens
+      #_#_sorted-freqs (sort-by val > freqs)
+      #_#_dictionary (vec
+                      (zipmap
+                       (range (count sorted-freqs))
+                       (map first sorted-freqs)))]
+  ;; The form evaluates to the number of distinct tokens.
+  (count freqs)
+  ;; Disabled: size of the token stream if every token took 13 bits.
+  #_(sizetable (* 13 (count toks))))
+
+
+"Lower-case, split off 's as its own AS token, then tokenize (word-final s is not stripped yet)"
+(let [lowered       (str/lower-case full-text)
+      tagged        (str/replace lowered #"'s" " AS ") ;; Apostrophe S becomes the AS token
+      marked        (str/replace tagged #"\s+" " SP ") ;; every whitespace run becomes the SP marker token
+      padded        (str/replace marked #"[,.;:!?\(\)\[\]'\*-]" #(str " " %1 " ")) ;; pad punctuation so it splits off
+      single-spaced (str/replace padded #"\s+" " ") ;; collapse whitespace runs to a single space
+      toks          (remove str/blank? (str/split single-spaced #" "))
+      freqs         (frequencies toks) ;; 5998 unique tokens
+      #_#_sorted-freqs (sort-by val > freqs)
+      #_#_dictionary (vec
+                      (zipmap
+                       (range (count sorted-freqs))
+                       (map first sorted-freqs)))]
+  ;; The form evaluates to the number of distinct tokens.
+  (count freqs)
+  ;; Disabled: size of the token stream if every token took 13 bits.
+  #_(sizetable (* 13 (count toks))))
+
+"Takes 13 bits for this dictionary"
+(Integer/toBinaryString 5998) ; => "1011101101110", i.e. 13 binary digits
+"_-_10111 01101110"
+
+"The text uses only 1000 unique words (not counting proper nouns?)->Possible to
+get each word to 10 bits? Maybe two bytes?"
+(math/pow 2 10) ; => 1024.0, the number of distinct values 10 bits can index
+
+"Dictionary compression is a dead end for me (I think), but it was worth looking
+into."