;;; The full compressor of the Bible in Basic English.
;;;
;;; Pipeline: concatenate the per-chapter source files, normalise the text
;;; into a stream of space-separated tokens, then assign every distinct
;;; token a variable-length byte code: 1 byte for the 128 most common
;;; words, 2 bytes for everything else.

(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])

;; Build the base txt file out of the individual per-chapter txt files.
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; NOTE(review): filenames appear to be underscore-separated with the book
  ;; number in field 3 and the chapter number in field 5 -- confirm against
  ;; the actual files in ./base_files/.
  (defn get-book-num [filename]
    (let [[_ _ book _ _] (str/split (str filename) #"_")]
      (Integer/parseInt book)))

  (defn get-chap-num [filename]
    (let [[_ _ _ _ chap] (str/split (str filename) #"_")]
      (Integer/parseInt chap)))

  ;; Concatenate all chapters in canonical (book, chapter) order, dropping
  ;; the two header lines at the top of each file.
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

;;; The full text as a file.
(def full-text (slurp "./bbe-newlines-nochaps.txt"))

;; Normalised text: lower-cased, a few multi-character sequences replaced by
;; sentinel tokens (uppercase so they cannot collide with real lower-cased
;; words), punctuation split out into stand-alone tokens, and all runs of
;; whitespace collapsed to single spaces.
(def optimized-string
  (-> full-text
      (str/lower-case)
      (str/replace #"'s" " AS ")
      (str/replace #"\.\.\." " DOTDOTDOT ")
      (str/replace #"\*\*\*" " STARSTARSTAR ")
      (str/replace #"—" "-")
      ;; Surround every remaining punctuation mark with spaces so it
      ;; tokenises on its own.
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      (str/replace #"\s+" " ")))

(spit "foo.txt" optimized-string)

(def optimized-tokens (str/split optimized-string #" "))

(comment
  ;; Some basic stats on our work so far.
  (count full-text)                                   ; total chars 4207465
  (count optimized-tokens)                            ; total tokens 962868
  (count (into #{} optimized-tokens))                 ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long
                                                      ; "straightforwardly" -> 1 nyble to represent?
  ;; We could maybe do some domain modeling and emit e.g. "suffix-s" or
  ;; "suffix-ly" tokens, like we already do with "'s".
  )

;;; Encoding scheme:
;;;   1DDDDDDD          <- 7 bits for the 128 most common words
;;;   0DDDDDDD DDDDDDDD <- 15 bits for the other dictionary entries

(comment
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
        totalcount  (reduce + (map val sorted-toks))
        top128      (take 128 sorted-toks)
        top128count (reduce + (map val top128))]
    {:total totalcount
     :topwords top128count
     :percent-savings (* 100 (double (/ top128count totalcount)))
     :total-remaining-words (- totalcount top128count)})
  ; {:total 965223,
  ;  :topwords 715122,
  ;  :percent-savings 74.08878570029931,
  ;  :total-remaining-words 250101}
  )

;;; We'll start off by bit-packing our representations a bit.

;; Map from token -> byte representation.
;;
;; The 128 most frequent tokens get a single byte with the high bit SET
;; (1DDDDDDD). Every other token gets a two-byte vector whose first byte has
;; the high bit CLEAR (0DDDDDDD DDDDDDDD), so a decoder can distinguish the
;; two cases by inspecting the top bit of the first byte it reads.
(def word-ids
  (let [sorted-toks   (sort-by val > (frequencies optimized-tokens))
        top128        (take 128 sorted-toks)
        rest128       (drop 128 sorted-toks)
        top128-reprs  (into {}
                            (map-indexed
                             (fn [id [tok _freq]]
                               [tok (unchecked-byte (bit-or 0x80 id))])
                             top128))
        rest128-reprs (into {}
                            (map-indexed
                             (fn [id [tok _freq]]
                               ;; BUGFIX: the first byte previously had its
                               ;; high bit forced on via (bit-or 2r10000000 ...),
                               ;; which made two-byte codes indistinguishable
                               ;; from one-byte codes. Per the
                               ;; 0DDDDDDD DDDDDDDD scheme it must stay 0.
                               [tok [(unchecked-byte (bit-shift-right id 8))
                                     (unchecked-byte (bit-and 0x00FF id))]])
                             rest128))
        token-reprs   (merge top128-reprs rest128-reprs)]
    ;; Two-byte codes only carry 15 payload bits.
    (assert (< (count rest128) 0x8000))
    ;; No token may appear in both maps.
    (assert (= (count token-reprs)
               (+ (count top128-reprs) (count rest128-reprs))))
    token-reprs))

(def dict-id-compressed-text (flatten (map word-ids optimized-tokens)))

(comment
  (count dict-id-compressed-text) ; Whittled it down to 1212042 total bytes
  )