(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match]) ; NOTE(review): currently unused — kept for later work

;;; The full compressor of bible in basic english
;;; Mk2 - compressing with common n-grams, ditching LZSS (maybe)

;; Build the base txt file out of individual txt files.
;; One-shot REPL step: concatenates every per-chapter file (sorted by book
;; then chapter number parsed out of the filename) into a single corpus,
;; dropping the first two header lines of each file.
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  (defn get-book-num
    "Book number parsed from an underscore-delimited filename (3rd field)."
    [filename]
    (let [[_ _ book _ _] (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt book)))

  (defn get-chap-num
    "Chapter number parsed from an underscore-delimited filename (5th field)."
    [filename]
    (let [[_ _ _ _ chap] (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt chap)))

  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        ;; drop 2: skip the per-file header lines before the verse text
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

;;; The full text as a file
(def full-text (slurp "./bbe-newlines-nochaps.txt"))

;; Normalize the corpus into a single space-separated token stream:
;; lower-case everything, rewrite multi-char punctuation runs to sentinel
;; words (AS / DOTDOTDOT / STARSTARSTAR), then pad every remaining
;; punctuation mark with spaces so it splits into its own token.
(def optimized-string
  (-> full-text
      (str/lower-case)
      #_(str/replace #"\s+" " SPACE ") ;; Added in mk2 if it turns out it's better to break hapax into individual chars to kill dictionary bloat
      (str/replace #"'s" " AS ")
      (str/replace #"\.\.\." " DOTDOTDOT ")
      (str/replace #"\*\*\*" " STARSTARSTAR ")
      (str/replace #"—" "-")
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      (str/replace #"\s+" " ")))

(def optimized-tokens (str/split optimized-string #" "))

(comment
  ;; Some basic stats on our work so far
  (count full-text) ; total chars 4207465
  (count optimized-tokens) ; total tokens 962868
  (count (into #{} optimized-tokens)) ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nyble to represent?
  ;; We could maybe do some domain modeling and do like
  ;; "suffix-s" or "suffix-ly"s like with "'s" right now
  )

;;; First we'll dictionary-encode our tokens
;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj
;; Map from token -> 16-bit id, ids assigned in descending frequency order
;; (most common token gets id 0).
(def word-ids
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
        token-reprs (into {}
                          (map-indexed
                           (fn [id [tok _freq]]
                             [tok (unchecked-short (bit-and 0xffff id))
                              #_[(unchecked-byte (bit-shift-right id 8))
                                 (unchecked-byte (bit-and 0x00FF id))]])
                           sorted-toks))]
    token-reprs))

(def dict-id-compressed-text (map word-ids optimized-tokens))

(comment
  (count dict-id-compressed-text) ; Whittled it down to 1925736 total bytes with 16 bit indices (it's 962868 shorts)
  (- (dec (math/pow 2 13)) (count (into #{} optimized-tokens)))
  ;; !!NOTE!! that we're leaving 2194 whole tokens on the table.
  )

;;; Then we apply an optimal-n-grams finder:
(comment
  ;; some scratch work on how big our n can be
  (defn get-most-common
    "Top-10 most frequent n-grams (overlapping windows of n tokens)."
    [n]
    (take 10 (->> optimized-tokens
                  (partition n 1)
                  (frequencies)
                  (sort-by second >))))

  (def ngrams-data (map get-most-common (range 2 11)))
  ngrams-data)

;;; We could do this with a bunch of tokens but we instead apply recursive
;;; n-grams find
(comment
  (defn make-ngram
    "Synthetic token naming the pair t1+t2, e.g. \"THE_LORD\"."
    [t1 t2]
    (str (str/upper-case t1) "_" (str/upper-case t2)))

  (defn replace-with-ngram
    "Replace every non-overlapping [t1 t2] pair in tokens with new-token.
     freqs holds NEGATED counts (priority-map, most frequent first), so a
     removed occurrence is inc and an added one is dec.
     Returns [new-token-vector updated-freqs]."
    [tokens [t1 t2] new-token infreqs]
    (loop [in tokens
           out (transient [])
           freqs infreqs]
      (cond
        (empty? in) [(persistent! out) freqs]

        (and (= (first in) t1) (= (second in) t2))
        (recur (drop 2 in)
               (conj! out new-token)
               (-> freqs
                   (assoc t1 (inc (freqs t1)))
                   (assoc t2 (inc (freqs t2)))
                   (assoc new-token (dec (get freqs new-token 0)))))

        :else (recur (rest in) (conj! out (first in)) freqs))))

  (defn find-recursive-ngrams
    "Repeatedly fuse the single most common adjacent token pair into a new
     synthetic token until the dictionary reaches 2^13 entries.
     Returns the rewritten token stream."
    [intokens]
    (loop [tokens intokens
           ;; negate counts so the priority-map orders most-frequent first
           freqs (->> intokens
                      (frequencies)
                      (map (fn [[k v]] [k (- v)]))
                      (into (pm/priority-map)))]
      (if (<= (inc (math/pow 2 13)) (count freqs))
        tokens
        (let [ngrams (partition 2 1 tokens) ;find ngrams
              [ngram _ct] (apply max-key second (frequencies ngrams)) ;find the most common ngram
              [t1 t2] ngram
              [nexttoks nextfreqs] (replace-with-ngram tokens ngram (make-ngram t1 t2) freqs)]
          (println "Working on: " ngram "with freqcount" (count freqs))
          (recur nexttoks nextfreqs)))))

  ;; Persist the (slow) result so later sessions can just slurp it back in.
  (spit "optimized-token-stream.txt"
        (apply str (interpose "\n" optimized-ngrams))))

#_(def optimized-ngrams (find-recursive-ngrams optimized-tokens))
(def optimized-ngrams (str/split (slurp "optimized-token-stream.txt") #"\n"))

(count optimized-tokens) ; 962868 tokens
(count optimized-ngrams) ; 608165 tokens

;;; TODO: Build dictionary