(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])

;;; The full compressor of bible in basic english

;; Build the base txt file out of individual txt files.
;; Kept inside `comment`: run by hand once to regenerate the corpus file.
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Filenames are underscore-delimited; the 3rd field is the book number
  ;; and the 5th the chapter number — TODO confirm against base_files naming.
  (defn get-book-num [filename]
    (let [[_ _ book _ _] (str/split (str filename) #"_")]
      (Integer/parseInt book)))

  (defn get-chap-num [filename]
    (let [[_ _ _ _ chap] (str/split (str filename) #"_")]
      (Integer/parseInt chap)))

  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        ;; The first two lines of each chapter file are skipped (headers).
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

;;; The full text as a file
(def full-text (slurp "./bbe-newlines-nochaps.txt"))

;; Normalize the raw text so it splits cleanly on single spaces:
;; fold case, rewrite a few multi-char sequences into sentinel words,
;; space-pad punctuation so each mark becomes its own token, then
;; collapse whitespace runs. The order of the replacements matters.
(def optimized-string
  (-> full-text
      (str/lower-case)
      (str/replace #"'s" " AS ")
      (str/replace #"\.\.\." " DOTDOTDOT ")
      (str/replace #"\*\*\*" " STARSTARSTAR ")
      (str/replace #"—" "-")
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      (str/replace #"\s+" " ")))

(def optimized-tokens (str/split optimized-string #" "))

(comment
  ;; Some basic stats on our work so far
  (count full-text) ; total chars 4207465
  (count optimized-tokens) ; total tokens 962868
  (count (into #{} optimized-tokens)) ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nyble to represent?
  ;; We could maybe do some domain modeling and do like
  ;; "suffix-s" or "suffix-ly"s like with "'s" right now
  )

;;; First we'll dictionary-encode our tokens
;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj

;; Map each distinct token to a 16-bit id, most frequent token first
;; (id 0 = most common), so the later huffman stage sees a skewed id
;; distribution.
(def word-ids
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
        token-reprs (into {}
                          (map-indexed
                           (fn [id [tok _freq]]
                             [tok (unchecked-short (bit-and 0xffff id))])
                           sorted-toks))]
    token-reprs))

(def dict-id-compressed-text (map word-ids optimized-tokens))

(comment
  (count dict-id-compressed-text)
  ;; Whittled it down to 1925736 total bytes with 16 bit indices (it's 962868 shorts)
  )

;;; TODO: Build dictionary

;;; Next, we'll run LZSS on our tok-id-list
(def WINDOW-SIZE 1024) ; The maximum distance we look back, only allowing 1k RAM
(def MIN-MATCH 3) ; Minimum length to bother with a reference
(def MAX-MATCH 63) ; Longest back-reference; must fit the 6-bit length field of bit-pack-dist-len
(def MATCH-FLAG (count (frequencies dict-id-compressed-text)))
;; Basically it's the total number of tokens; ids only need 0-(total-1).
;; MATCH-FLAG is the escape symbol. A packed 16-bit word follows it:
;; 10 bits for the distance (1k window) and 6 bits for the length (<= 63 toks).

(defn- get-match-len
  "Length of the common prefix of data-vec at `pos` and at `match-pos`,
  capped at `max-len`."
  [data-vec pos match-pos max-len]
  (loop [len 0]
    (if (and (< len max-len)
             (= (nth data-vec (+ pos len))
                (nth data-vec (+ match-pos len))))
      (recur (inc len))
      len)))

(defn compress-tokens
  "Takes a sequence of 16-bit token IDs.
  Returns a vector of maps: {:type :lit :val v} or {:type :match :dist d :len l}"
  [data]
  (let [data-vec (vec data)
        data-len (count data-vec)]
    (loop [cursor 0
           index {}
           out (transient [])]
      (if (>= cursor data-len)
        (persistent! out) ;; Return clean vector of data structures
        ;; 1. Setup search parameters
        (let [;; BUGFIX: cap at MAX-MATCH (63), not 255. The packed output only
              ;; has a 6-bit length field, so a longer match would be silently
              ;; truncated by bit-pack-dist-len and corrupt the stream.
              max-search-len (min (- data-len cursor) MAX-MATCH)
              triplet (if (>= max-search-len MIN-MATCH)
                        (subvec data-vec cursor (+ cursor MIN-MATCH))
                        nil)
              match-indices (get index triplet)]
          ;; 2. Find Best Match
          (let [best-match (when (and triplet match-indices)
                             (reduce (fn [best candidate-idx]
                                       ;; Check if candidate is within WINDOW-SIZE
                                       (if (> candidate-idx (- cursor WINDOW-SIZE))
                                         (let [len (get-match-len data-vec cursor candidate-idx max-search-len)]
                                           (if (> len (:len best 0))
                                             {:dist (- cursor candidate-idx) :len len}
                                             best))
                                         best)) ;; Too old, skip
                                     nil
                                     match-indices))]
            (if (and best-match (>= (:len best-match) MIN-MATCH))
              ;; CASE A: Match Found
              (recur (+ cursor (:len best-match))
                     ;; Note: We are still "Lazy Indexing" here for speed.
                     ;; To maximize compression, you'd loop here to add skipped parts to `index`.
                     (assoc index triplet (conj (get index triplet []) cursor))
                     (conj! out {:type :match :dist (:dist best-match) :len (:len best-match)}))
              ;; CASE B: Literal
              (recur (inc cursor)
                     (if triplet
                       (assoc index triplet (conj (get index triplet []) cursor))
                       index)
                     (conj! out {:type :lit :val (nth data-vec cursor)})))))))))

(def lzss-compressed-dict-ids (compress-tokens dict-id-compressed-text))

;; Some stats on lzss-compression
(comment
  (let [totalbytes (reduce + (map (fn [lzsscode]
                                    (if (= (:type lzsscode) :lit) 2 3))
                                  lzss-compressed-dict-ids))]
    ;; A match requires 3 bytes: one for MATCH-FLAG, 2 for distance and length
    totalbytes) ; down to 1604373 total bytes!
  )

;;; Third, we huffman encode
;; If I were to write the LZSS encoding I would write just a bunch of shorts
;; Since we're huffman-encoding this, the MATCH-FLAG will be a huffman symbol
;; Which, when writing, we'll just write the 16 bit dist-length word
;; un-huffman-encoded, so for instance 100101010100 HUFFIFIED-MATCH-FLAG 0xFFFF
;; (10 bits are distance, 6 are length)

(def match-queue ; We'll use this when writing the file per above
  (remove #(= (:type %1) :lit) lzss-compressed-dict-ids))

(assert (= (count lzss-compressed-dict-ids)
           (+ (count match-queue)
              (count (filter #(= (:type %1) :lit) lzss-compressed-dict-ids)))))

;; Replace every match record with the single MATCH-FLAG symbol; literals
;; stay as their token id. This is the symbol stream huffman actually sees.
(def huffman-friendly-lzss-compressed-dict-ids
  (map (fn [lzsscode]
         (if (= (:type lzsscode) :lit)
           (:val lzsscode)
           MATCH-FLAG)) ; MATCH-FLAG is a symbol
       lzss-compressed-dict-ids))

(assert (= (count match-queue)
           (get (frequencies huffman-friendly-lzss-compressed-dict-ids) MATCH-FLAG))
        "Somehow dropped/gained a match sequence")

(def huffmanable-lzss-toks-freq-table
  (frequencies huffman-friendly-lzss-compressed-dict-ids))

;; Huffman tree node
(defrecord Node [left right sym probability])

(defn symboltable->pq
  "Seed a priority queue with one leaf Node per [symbol freq] entry, ordered
  by frequency (ties broken by symbol) so tree construction is deterministic."
  [symbolfreqs]
  (into (pm/priority-map-keyfn (juxt first second))
        (map #(vector (->Node nil nil (first %1) (second %1))
                      [(second %1) (first %1)])
             symbolfreqs)))

(assert (= (count huffmanable-lzss-toks-freq-table)
           (count (symboltable->pq huffmanable-lzss-toks-freq-table)))
        "Priority queue has fewer symbols than symbol list")

(defn pq->huffman-tree
  "Builds a huffman-tree out of a priority-queue"
  [queue]
  (if (= 1 (count queue))
    (first (peek queue)) ; Repeat until there is only one parentless node left
    (let [[lowest-node [lowest-prob _]] (peek queue)
          [second-node [second-prob _]] (peek (pop queue)) ; Step 2
          new-prob (+ lowest-prob second-prob) ; Step 4
          new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order
          next-queue (assoc (pop (pop queue)) new-node [new-prob nil])]
      (recur next-queue))))

(def huffman-tree
  (-> huffmanable-lzss-toks-freq-table
      symboltable->pq
      pq->huffman-tree))

(assert (= (.probability huffman-tree)
           (reduce + (map val huffmanable-lzss-toks-freq-table)))
        "Probability of root node is not equal to the sum of all probabilities")

(defrecord HuffmanCodeword [sym code length])

(defn huffman-tree->symbol-encodings
  "Builds a list of symbol encodings out of a huffman-tree.
  [tree] tree to symbol encodings
  [node encodings curr-encoding curr-length] recursive builder.
  Only the code *lengths* matter downstream; the codes themselves are
  rebuilt by canonicalize-codewords."
  ([tree] (huffman-tree->symbol-encodings tree [] 0 0))
  ([node encodings curr-encoding curr-length]
   (if (.sym node)
     (conj encodings (->HuffmanCodeword (.sym node) curr-encoding curr-length))
     (let [left-traversed (huffman-tree->symbol-encodings
                           (.left node)
                           encodings
                           (unchecked-int (bit-or 0x00000001 (bit-shift-left curr-encoding 1)))
                           (inc curr-length))]
       (huffman-tree->symbol-encodings
        (.right node)
        left-traversed
        (bit-shift-left curr-encoding 1)
        (inc curr-length))))))

(def huffman-symbol-encodings (huffman-tree->symbol-encodings huffman-tree))

(defn canonicalize-codewords
  "Converts a collection of HuffmanCodeword records into canonical form."
  [codewords]
  (let [;; 1. Sort by length (primary) and symbol value (secondary)
        sorted-codewords (sort-by (juxt :length :sym) codewords)]
    (loop [[cw & more] sorted-codewords
           last-code (unchecked-int 0)
           last-length (unchecked-int (or (:length (first sorted-codewords)) 0))
           result []]
      (if-not cw
        result
        (let [current-length (unchecked-int (:length cw))
              ;; 2. Calculate the new code:
              ;; If length increased, shift the incremented previous code.
              ;; If this is the very first code, it stays 0.
              new-code (if (empty? result)
                         (unchecked-int 0)
                         (unchecked-int (bit-shift-left (unchecked-inc-int last-code)
                                                        (- current-length last-length))))
              updated-cw (assoc cw :code new-code)]
          (recur more new-code current-length (conj result updated-cw)))))))

(defn verify-huffman-integrity [codewords]
  (let [kraft-sum (reduce + (map #(Math/pow 2 (- (:length %))) codewords))
        sorted-cw (sort-by (juxt :length :sym) codewords)
        ;; Check if codes are strictly increasing
        codes-unique? (apply < (map :code sorted-cw))]
    {:kraft-sum kraft-sum
     :is-complete? (== kraft-sum 1.0)
     :is-valid? (<= kraft-sum 1.0)
     :ordered-correctly? codes-unique?}))

(def canonicalized-codewords (canonicalize-codewords huffman-symbol-encodings))

(def symbol->canonical-code
  (into {}
        (map #(vector (unchecked-short (.sym %1)) [(.code %1) (.length %1)])
             canonicalized-codewords)))

(assert (= (into #{} (map :sym canonicalized-codewords))
           (into #{} (map key huffmanable-lzss-toks-freq-table))))

(defn bit-pack-dist-len
  "Pack a match's distance (10 bits, high) and length (6 bits, low) into
  one 16-bit short."
  [{:keys [dist len]}]
  (let [outputbase (unchecked-short 0)
        with-dist (unchecked-short (bit-or outputbase (bit-shift-left dist 6)))
        with-len (unchecked-short (bit-or with-dist (bit-and 2r111111 len)))]
    with-len))

(assert (= (unchecked-short 2r1111111111111111)
           (bit-pack-dist-len {:dist (unchecked-short 2r1111111111)
                               :len (unchecked-short 2r111111)})))
(assert (= (unchecked-short 2r1010101010000001)
           (bit-pack-dist-len {:dist (unchecked-short 2r1010101010)
                               :len (unchecked-short 2r000001)})))

;; The output bit stream as [value bit-count] pairs. A literal is its huffman
;; codeword; each MATCH-FLAG codeword is immediately followed by its raw
;; 16-bit packed dist/len word, consumed from match-queue in order.
(def canonical-huffman-encoded-bit-sequence
  (let [mq (volatile! match-queue)]
    (mapcat (fn [symb]
              (if (= MATCH-FLAG symb)
                ;; BUGFIX: read the head of the queue *before* popping it.
                ;; The old order (pop, then read) skipped the first match's
                ;; payload and read nil off the exhausted queue for the last
                ;; MATCH-FLAG, which blew up in bit-pack-dist-len.
                (let [m (first @mq)]
                  (vswap! mq rest)
                  [(symbol->canonical-code symb)
                   [(bit-pack-dist-len m) 16]])
                [(symbol->canonical-code symb)]))
            huffman-friendly-lzss-compressed-dict-ids)))

;; Estimated output size in KiB
(math/ceil (/ (reduce + (map second canonical-huffman-encoded-bit-sequence)) 8 1024))