diff options
| author | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-23 20:01:42 -0500 |
|---|---|---|
| committer | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-23 20:01:42 -0500 |
| commit | a5cb8e19665de19f21d5b8f19719b9d448c99b67 (patch) | |
| tree | b7690b6c3942943fb06399fa2b5b6d4afbf69da4 /bible/dictionary.clj | |
| parent | d47553f0a7456868418ad9a4c4a5af182528f254 (diff) | |
write a basic dictionary compressor
Diffstat (limited to 'bible/dictionary.clj')
| -rw-r--r-- | bible/dictionary.clj | 90 |
1 file changed, 90 insertions, 0 deletions
;; bible/dictionary.clj -- reconstructed from a cgit diff view
;; (new file in commit a5cb8e1, "write a basic dictionary compressor").
;;
;; Exploratory script: estimate how small the BBE bible text would be
;; under a simple word-dictionary compression scheme.
(require '[clojure.string :as str])

(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])

(require '[clojure.data.priority-map :as pm])

(require '[clojure.math :as math])

(comment
  "Build the base file"
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Filenames are underscore-separated; field 3 holds the book number
  ;; and field 5 the chapter number -- TODO confirm against base_files/.
  (defn get-book-num [filename]
    (let [[_ _ book _ _] (str/split (str filename) #"_")]
      (Integer/parseInt book)))

  (defn get-chap-num [filename]
    (let [[_ _ _ _ chap] (str/split (str filename) #"_")]
      (Integer/parseInt chap)))

  ;; Concatenate every chapter file (ordered by book, then chapter) into
  ;; one newline-separated file, dropping each file's two header lines.
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

(defn sizetable
  "Express a size of `bits` bits as a map of :bits, :bytes, :kbs and
  :mbs. Bytes are rounded up to the next whole byte (via math/ceil, so
  the value is a double); :kbs and :mbs may be non-integral."
  [bits]
  (let [bytect (math/ceil (/ bits 8))
        kbs    (/ bytect 1024)
        mbs    (/ kbs 1024)]
    {:bits  bits
     :bytes bytect
     :kbs   kbs
     :mbs   mbs}))

(defn- tokenize
  "Shared tokenization pipeline (was copy-pasted between the two
  experiments below): collapse every whitespace run into the sentinel
  token SP, split the listed punctuation characters out as their own
  tokens, and return the sequence of non-blank tokens."
  [text]
  (->> (-> text
           (str/replace #"\s+" " SP ") ;; all spaces become one space
           (str/replace #"[,.;:!?\(\)\[\]'\*-]" #(str " " %1 " "))
           (str/replace #"\s+" " ") ;; all spaces normalized to one space
           (str/split #" "))
       (remove str/blank?)))

(def full-text (slurp "bbe-newlines-nochaps.txt"))

;; Baseline: the raw text at 8 bits per character.
(def base-size (sizetable (* 8 (count full-text))))

"Naiive, just tokenize with spaces"
(let [toks  (tokenize full-text)
      freqs (frequencies toks)] ;; 6689 unique tokens
  (count freqs)
  #_(sizetable (* 13 (count toks))))

"Lower-case, remove 's, remove word final s tokenize"
;; NOTE(review): despite the caption, only 's is stripped (to the AS
;; sentinel); no word-final s handling is present in the code.
(let [toks  (-> full-text
                (str/lower-case)
                (str/replace #"'s" " AS ") ;; Apostrophe S
                tokenize)
      freqs (frequencies toks)] ;; 5998 unique tokens
  (count freqs)
  #_(sizetable (* 13 (count toks))))

"Takes 13 bits for this dictionary"
(Integer/toBinaryString 5998)
"_-_10111 01101110"

"The text uses only 1000 unique words (not counting proper nouns?)->Possible to
get each word to 10 bits? Maybe two bytes?"
(math/pow 2 10)

"Dictionary compression is a dead end for me (I think), but it was worth looking
into."
