(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])

;;; The full compressor of bible in basic english

;; Build the base txt file out of individual txt files.
;; Dev-time only: wrapped in `comment` so loading the file doesn't re-run it.
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  (defn get-book-num
    "Extract the book number from an underscore-delimited filename.
    Assumes the book number is the third `_`-separated field — TODO confirm
    against the actual base_files naming scheme."
    [filename]
    (let [[_ _ book _ _] (str/split (str filename) #"_")]
      (Integer/parseInt book)))

  (defn get-chap-num
    "Extract the chapter number from an underscore-delimited filename.
    Assumes the chapter number is the fifth `_`-separated field — TODO confirm
    against the actual base_files naming scheme."
    [filename]
    (let [[_ _ _ _ chap] (str/split (str filename) #"_")]
      (Integer/parseInt chap)))

  ;; Concatenate every chapter file (in book/chapter order) into one text,
  ;; dropping the first two lines of each file (presumably headers — verify).
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

;;; The full text as a file
(def full-text (slurp "./bbe-newlines-nochaps.txt"))

;; Normalize the text for tokenization:
;;  - lowercase everything
;;  - rewrite multi-char tokens ('s, ..., ***) as sentinel words so they
;;    survive the punctuation split as single tokens
;;  - normalize em dash to hyphen
;;  - pad remaining punctuation with spaces so it splits into its own token
;;  - collapse all whitespace runs to single spaces
(def optimized-string
  (-> full-text
      (str/lower-case)
      (str/replace #"'s" " AS ")
      (str/replace #"\.\.\." " DOTDOTDOT ")
      (str/replace #"\*\*\*" " STARSTARSTAR ")
      (str/replace #"—" "-")
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      (str/replace #"\s+" " ")))

(spit "foo.txt" optimized-string)

(def optimized-tokens (str/split optimized-string #" "))

(comment
  ;; Some basic stats on our work so far
  (count full-text)                     ; total chars 4207465
  (count optimized-tokens)              ; total tokens 962868
  (count (into #{} optimized-tokens))   ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens)))
  ;; max word is 17 chars long "straightforwardly" -> 1 nyble to represent?
  ;; We could maybe do some domain modeling and do like
  ;; "suffix-s" or "suffix-ly"s like with "'s" right now
  )

;;; First we'll dictionary-encode our tokens
;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj

(def word-ids
  "Map from token -> [high-byte low-byte] pair encoding its 16-bit id.
  Ids are assigned by descending frequency, so the most common token gets
  id 0. 5997 unique tokens fit comfortably in 16 bits."
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
        token-reprs (into {}
                          (map-indexed
                           (fn [id [tok _freq]]
                             [tok [(unchecked-byte (bit-shift-right id 8))
                                   (unchecked-byte (bit-and 0x00FF id))]])
                           sorted-toks))]
    token-reprs))

;; The whole text as a flat seq of bytes: each token replaced by its 2-byte id.
(def dict-id-compressed-text (flatten (map word-ids optimized-tokens)))

(comment
  (count dict-id-compressed-text) ; Whittled it down to 1925736 total bytes with 16 bit indices
  )

;;; TODO: Build dictionary
;;; Next, we'll run LZSS on our stream of tokens