(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])

;; One-off build step, kept inside `comment` so it never runs on load.
;; Evaluate the forms by hand to regenerate "bbe-newlines-nochaps.txt".
(comment
  "Build the base file"
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Both helpers split the full path string on "_" and parse one field as an
  ;; integer: the 3rd field is the book number, the 5th the chapter number.
  ;; NOTE(review): the 5th field must be purely numeric (no ".txt" suffix)
  ;; for Integer/parseInt to succeed -- confirm against the real file names.
  (defn get-book-num [filename]
    (let [[_ _ book] (str/split (str filename) #"_")]
      (Integer/parseInt book)))

  (defn get-chap-num [filename]
    (let [[_ _ _ _ chap] (str/split (str filename) #"_")]
      (Integer/parseInt chap)))

  ;; Concatenate every chapter file in (book, chapter) order, skipping the
  ;; first two lines of each file, into a single newline-delimited text file.
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

(defn sizetable
  "Describe a size given in `bits` as a map with keys :bits, :bytes, :kbs
  and :mbs. :bytes is the bit count divided by 8 and rounded up (a double,
  because it comes from math/ceil); :kbs and :mbs divide by 1024 each step."
  [bits]
  (let [bytect (math/ceil (/ bits 8))
        kbs    (/ bytect 1024)
        mbs    (/ kbs 1024)]
    {:bits bits :bytes bytect :kbs kbs :mbs mbs}))

;; Read eagerly at load time; throws if the base file has not been built yet.
(def full-text (slurp "bbe-newlines-nochaps.txt"))

;; Baseline: 8 bits per character of the text.
;; NOTE(review): (count full-text) counts characters, not bytes, so this only
;; equals the on-disk size if every character is a single byte -- confirm.
(def base-size (sizetable (* 8 (count full-text))))

(defn tokenize
  "Split `text` into a seq of word, punctuation and marker tokens.

  Every run of whitespace becomes the marker token SP, and each punctuation
  character in the set ,.;:!?()[]'*- becomes a token of its own.

  Options (all default to false):
    :lower-case?   lower-case the text before tokenizing
    :apostrophe-s? emit every 's as the single marker token AS

  Whitespace is marked BEFORE 's is expanded. Doing it the other way round
  turns the padding spaces around AS into extra SP tokens that do not
  correspond to any whitespace in the input."
  [text {:keys [lower-case? apostrophe-s?]}]
  (let [marked (cond-> text
                 lower-case?   str/lower-case
                 ;; each whitespace run -> one SP marker token
                 true          (str/replace #"\s+" " SP ")
                 ;; Apostrophe S -> AS marker token
                 apostrophe-s? (str/replace #"'s" " AS "))]
    (->> (-> marked
             ;; pad each punctuation character so it splits off as a token
             (str/replace #"[,.;:!?\(\)\[\]'\*-]" #(str " " %1 " "))
             ;; collapse the padding down to single spaces
             (str/replace #"\s+" " ")
             (str/split #" "))
         (remove str/blank?))))

"Naiive, just tokenize with spaces"
(let [toks  (tokenize full-text {})
      freqs (frequencies toks) ; 6689 unique tokens
      #_#_sorted-freqs (sort-by val > freqs)
      #_#_dictionary (vec (zipmap (range (count sorted-freqs)) (map first sorted-freqs)))]
  (count freqs)
  #_(sizetable (* 13 (count toks))))

"Lower-case, remove 's, remove word final s tokenize"
;; Only lower-casing and the 's -> AS substitution are implemented here;
;; word-final "s" is not stripped.
(let [toks  (tokenize full-text {:lower-case? true :apostrophe-s? true})
      freqs (frequencies toks) ; 5998 unique tokens
      #_#_sorted-freqs (sort-by val > freqs)
      #_#_dictionary (vec (zipmap (range (count sorted-freqs)) (map first sorted-freqs)))]
  (count freqs)
  #_(sizetable (* 13 (count toks))))

"Takes 13 bits for this dictionary"
;; 5998 in binary is 1011101101110, i.e. 13 significant bits.
(Integer/toBinaryString 5998)
;; The same value laid out in two bytes; "_-_" marks the three unused bits.
"_-_10111 01101110"

"The text uses only 1000 unique words (not counting proper nouns?)->Possible to get each word to 10 bits? Maybe two bytes?"
;; 2^10 = 1024 distinct codes are available with 10 bits.
(math/pow 2 10)

"Dictionary compression is a dead end for me (I think), but it was worth looking into."