(require '[clojure.string :as str]) (require '[clojure.java.io :as io]) (require '[babashka.fs :as fs]) (require '[clojure.data.priority-map :as pm]) (require '[clojure.math :as math]) (require '[clojure.core.match :as match]) (comment "Build the base file" (def files (fs/glob "./base_files/" "**.txt")) (defn get-book-num [filename] (let [[_ _ book _ _] (str/split (str filename) #"_") #_#_chap (int _chap)] (Integer/parseInt book))) (defn get-chap-num [filename] (let [[_ _ _ _ chap] (str/split (str filename) #"_") #_#_chap (int _chap)] (Integer/parseInt chap))) (with-open [writer (io/writer "bbe-newlines-nochaps.txt")] (doseq [f (sort-by (juxt get-book-num get-chap-num) files)] (with-open [reader (io/reader (fs/file f))] (doseq [line (drop 2 (line-seq reader))] (.write writer (str line "\n"))))))) (def full-text (slurp "./bbe-newlines-nochaps.txt")) (defn sizetable [bits] (let [bytect (math/ceil (/ bits 8)) kbs (/ bytect 1024) mbs (/ kbs 1024)] {:bits bits :bytes bytect :kbs kbs :mbs mbs})) (def mildlyprocessed (str/replace full-text #"—" "-")) #_#_(println (sizetable (* 8 (count full-text)))) (println (sizetable (* 8 (count (encode-lzss (take 100000 mildlyprocessed)))))) (def WINDOW-SIZE (* 32 1024)#_(+ 512 1024)) ; The maximum distance we look back (def MIN-MATCH 3) ; Minimum length to bother with a reference ; START, two byte num, SEP, two byte num (defn- get-match-len [data-vec pos match-pos max-len] (loop [len 0] (if (and (< len max-len) (= (nth data-vec (+ pos len)) (nth data-vec (+ match-pos len)))) (recur (inc len)) len))) (defn compress "LZSS using transient vectors and a hash-index for O(1) lookups." [data] (let [data-vec (vec data) data-len (count data-vec) #_#_MIN-MATCH 3] ;; Standard MIN-MATCH (loop [cursor 0 ;; map of [c1 c2 c3] -> list of indices index {} ;; Transient vector for high-performance output collection out (transient [])] (if (>= cursor data-len) (apply str (persistent! out)) ;; Finalize the transient and join (let [;; 1. Check if we have a potential match using the index triplet (subvec data-vec cursor (min (+ cursor MIN-MATCH) data-len)) match-indices (get index triplet) ;; 2. Find the best match among the candidates best-match (when (>= (count triplet) MIN-MATCH) (reduce (fn [best idx] (let [len (get-match-len data-vec cursor idx (min (- data-len cursor) WINDOW-SIZE))] (if (> len (:length best 0)) {:offset (- cursor idx) :length len} best))) nil (filter #(> % (- cursor WINDOW-SIZE)) match-indices)))] (if (and best-match (>= (:length best-match) MIN-MATCH)) ;; Match found (let [len (:length best-match) #_#_ref-str (str "<" (:offset best-match) "|" len ">") ref-str "<00|00"] (recur (+ cursor len) ;; Update index for every triplet we skip (simplification: just first) (assoc index triplet (conj (get index triplet []) cursor)) (conj! out ref-str))) ;; No match found (let [char-lit (str (nth data-vec cursor))] (recur (inc cursor) (if (>= (count triplet) MIN-MATCH) (assoc index triplet (conj (get index triplet []) cursor)) index) (conj! out char-lit))))))))) ;; --- Testing --- (let [input "When in the course of human events it becomes necessary to yada yada yada yada"] (println "Input: " input) (println "Output:" (compress input))) (defn compare-sizes "Calculates actual LZSS bit-cost by parsing tags vs literals." [original compressed] (let [;; Regex to find your specific tag format tag-pattern #"<00|00" ;; 1. Identify all matches in the compressed string all-tags (re-seq tag-pattern compressed) num-matches (count all-tags) ;; 2. Calculate literal count: ;; We remove the tags from the string; what's left are the raw characters literals-str (clojure.string/replace compressed tag-pattern "") num-literals (count literals-str) ;; 3. Calculate Real Bit Sizes ;; Original is always just 8 bits per char orig-bits (* 8 (count original)) ;; Compressed: 17 bits per match, 9 bits per literal ;; ACTUALLY: 16 bits per match (only need 10 bits to say 1024), 8 bits per literal (1 bit flag, only need 6 bits for char) comp-bits (+ (* num-matches 16 #_17) (* num-literals 8 #_9)) ;; Standard unit conversion helper metrics (fn [b] (let [bytes (double (/ b 8)) kb (/ bytes 1024.0)] {:bits b :bytes bytes :kb kb}))] {:original (metrics orig-bits) :compressed (metrics comp-bits) :stats {:match-count num-matches :literal-count num-literals} :comparison {:bits-saved (- orig-bits comp-bits) :savings-pct (double (* 100 (- 1 (/ comp-bits orig-bits))))}})) (let [test-seq (take 2000000 mildlyprocessed) compressed-str (compress test-seq)] (compare-sizes test-seq compressed-str))