(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])

;;; The full compressor of the Bible in Basic English.

;; Build the base txt file out of the individual per-chapter txt files.
;; Dev-only: evaluate the forms inside `comment` by hand; the resulting
;; file is what `full-text` below reads.
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Filenames are underscore-separated with the book number in field 3
  ;; and the chapter number in field 5; parse them so the chapters
  ;; concatenate in canonical order.
  (defn get-book-num [filename]
    (let [[_ _ book _ _] (str/split (str filename) #"_")]
      (Integer/parseInt book)))

  (defn get-chap-num [filename]
    (let [[_ _ _ _ chap] (str/split (str filename) #"_")]
      (Integer/parseInt chap)))

  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        ;; The first two lines of each chapter file are headers -- skip them.
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

;;; The full text as a file
(def full-text (slurp "./bbe-newlines-nochaps.txt"))

;; Normalized text: lower-cased, with possessive 's / ellipses / *** rewritten
;; to sentinel words ("AS", "DOTDOTDOT", "STARSTARSTAR" -- uppercase on purpose,
;; so they can never collide with a real lower-cased word), em-dashes folded to
;; hyphens, punctuation split into standalone tokens, and all runs of
;; whitespace collapsed to a single space.
(def optimized-string
  (-> full-text
      (str/lower-case)
      (str/replace #"'s" " AS ")
      (str/replace #"\.\.\." " DOTDOTDOT ")
      (str/replace #"\*\*\*" " STARSTARSTAR ")
      (str/replace #"—" "-")
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      (str/replace #"\s+" " ")))

(def optimized-tokens (str/split optimized-string #" "))

(comment
  ;; Some basic stats on our work so far
  (count full-text)                                   ; total chars 4207465
  (count optimized-tokens)                            ; total tokens 962868
  (count (into #{} optimized-tokens))                 ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nyble to represent?
  ;; We could maybe do some domain modeling and do like
  ;; "suffix-s" or "suffix-ly"s like with "'s" right now
  )

;;; First we'll dictionary-encode our tokens
;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj

;; Map of token -> 16-bit id. Ids are assigned in descending frequency order,
;; so the most common tokens get the smallest ids.
(def word-ids
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))]
    (into {}
          (map-indexed (fn [id [tok _freq]]
                         [tok (unchecked-short (bit-and 0xffff id))])
                       sorted-toks))))

(def dict-id-compressed-text (map word-ids optimized-tokens))

(comment
  (count dict-id-compressed-text)
  ;; Whittled it down to 1925736 total bytes with 16 bit indices (it's 962868 shorts)
  )

;;; TODO: Build dictionary

;;; Next, we'll run LZSS on our tok-id-list
(def WINDOW-SIZE 1024) ; The maximum distance we look back, only allowing 1k RAM
(def MIN-MATCH 3)      ; Minimum length to bother with a back-reference

;; MATCH-FLAG is the first id NOT used by any token: token ids run from 0 to
;; (count-1), so (count word-ids) is free to mark the start of a reference.
;; (Every dictionary entry appears at least once in the id stream, so this
;; equals the old `(count (frequencies dict-id-compressed-text))` without a
;; full extra pass over ~1M shorts.)
;; A reference is MATCH-FLAG, then 10 bits of offset (1k window) and
;; 8 bits of length (up to 255 toks).
(def MATCH-FLAG (count word-ids))

(defn- get-match-len
  "Length of the common prefix of data-vec starting at `pos` vs `match-pos`,
  capped at `max-len`. Caller guarantees pos + max-len <= (count data-vec)
  and match-pos < pos, so every `nth` stays in bounds."
  [data-vec pos match-pos max-len]
  (loop [len 0]
    (if (and (< len max-len)
             (= (nth data-vec (+ pos len))
                (nth data-vec (+ match-pos len))))
      (recur (inc len))
      len)))

;; (An earlier prototype `compress` that emitted "<offset|len>" strings was
;; reader-discarded dead code and has been removed; `compress-tokens` below
;; supersedes it.)

(defn compress-tokens
  "LZSS over a sequence of 16-bit token IDs.
  Returns a vector of maps: {:type :lit :val v} for a literal token, or
  {:type :match :dist d :len l} for a back-reference reaching `d` positions
  back and running `l` tokens. Candidate matches are found through a hash
  index from MIN-MATCH-length prefixes to the positions where they occurred."
  [data]
  (let [data-vec (vec data)
        data-len (count data-vec)]
    (loop [cursor 0
           index  {}            ; [t1 t2 t3] -> vector of start positions
           out    (transient [])]
      (if (>= cursor data-len)
        (persistent! out) ; Return clean vector of data structures
        ;; 1. Setup search parameters
        (let [max-search-len (min (- data-len cursor) 255) ; cap for the 8-bit length field
              triplet        (if (>= max-search-len MIN-MATCH)
                               (subvec data-vec cursor (+ cursor MIN-MATCH))
                               nil)
              match-indices  (get index triplet)
              ;; 2. Find the best (longest) match among in-window candidates
              best-match (when (and triplet match-indices)
                           (reduce (fn [best candidate-idx]
                                     ;; Check if candidate is within WINDOW-SIZE
                                     (if (> candidate-idx (- cursor WINDOW-SIZE))
                                       (let [len (get-match-len data-vec cursor
                                                                candidate-idx
                                                                max-search-len)]
                                         (if (> len (:len best 0))
                                           {:dist (- cursor candidate-idx) :len len}
                                           best))
                                       best)) ; too old, skip
                                   nil
                                   match-indices))]
          (if (and best-match (>= (:len best-match) MIN-MATCH))
            ;; CASE A: Match found
            (recur (+ cursor (:len best-match))
                   ;; NOTE: lazy indexing for speed -- only the triplet at
                   ;; `cursor` is recorded. Indexing the skipped positions too
                   ;; would improve compression at some speed cost.
                   (assoc index triplet (conj (get index triplet []) cursor))
                   (conj! out {:type :match
                               :dist (:dist best-match)
                               :len  (:len best-match)}))
            ;; CASE B: Literal
            (recur (inc cursor)
                   (if triplet
                     (assoc index triplet (conj (get index triplet []) cursor))
                     index)
                   (conj! out {:type :lit :val (nth data-vec cursor)}))))))))

(def lzss-compressed-dict-ids (compress-tokens dict-id-compressed-text))

;; Serialization idea: literal shorts are emitted as-is; a short equal to
;; MATCH-FLAG marks a back-reference, which is stored raw rather than being
;; Huffman encoded.