diff options
Diffstat (limited to 'bible/lzss.clj')
| -rw-r--r-- | bible/lzss.clj | 156 |
1 files changed, 156 insertions, 0 deletions
diff --git a/bible/lzss.clj b/bible/lzss.clj new file mode 100644 index 0000000..d6f5a32 --- /dev/null +++ b/bible/lzss.clj @@ -0,0 +1,156 @@ +(require '[clojure.string :as str]) + +(require '[clojure.java.io :as io]) +(require '[babashka.fs :as fs]) + +(require '[clojure.data.priority-map :as pm]) + +(require '[clojure.math :as math]) + +(require '[clojure.core.match :as match]) + +(comment + "Build the base file" + (def files (fs/glob "./base_files/" "**.txt")) + + (defn get-book-num [filename] + (let [[_ _ book _ _] + (str/split (str filename) #"_") + #_#_chap (int _chap)] + (Integer/parseInt book))) + + (defn get-chap-num [filename] + (let [[_ _ _ _ chap] + (str/split (str filename) #"_") + #_#_chap (int _chap)] + (Integer/parseInt chap))) + + (with-open [writer (io/writer "bbe-newlines-nochaps.txt")] + (doseq [f (sort-by (juxt get-book-num get-chap-num) files)] + (with-open [reader (io/reader (fs/file f))] + (doseq [line (drop 2 (line-seq reader))] + (.write writer (str line "\n"))))))) + +(def full-text (slurp "./bbe-newlines-nochaps.txt")) + +(defn sizetable [bits] + (let [bytect (math/ceil (/ bits 8)) + kbs (/ bytect 1024) + mbs (/ kbs 1024)] + {:bits bits + :bytes bytect + :kbs kbs + :mbs mbs})) + +(def mildlyprocessed (str/replace full-text #"—" "-")) + +#_#_(println (sizetable (* 8 (count full-text)))) +(println (sizetable (* 8 (count (encode-lzss (take 100000 mildlyprocessed)))))) + +(def WINDOW-SIZE (* 32 1024)#_(+ 512 1024)) ; The maximum distance we look back +(def MIN-MATCH 3) ; Minimum length to bother with a reference +; START, two byte num, SEP, two byte num + +(defn- get-match-len [data-vec pos match-pos max-len] + (loop [len 0] + (if (and (< len max-len) + (= (nth data-vec (+ pos len)) + (nth data-vec (+ match-pos len)))) + (recur (inc len)) + len))) + +(defn compress + "LZSS using transient vectors and a hash-index for O(1) lookups." + [data] + (let [data-vec (vec data) + data-len (count data-vec) + #_#_MIN-MATCH 3] ;; Standard MIN-MATCH + (loop [cursor 0 + ;; map of [c1 c2 c3] -> list of indices + index {} + ;; Transient vector for high-performance output collection + out (transient [])] + (if (>= cursor data-len) + (apply str (persistent! out)) ;; Finalize the transient and join + + (let [;; 1. Check if we have a potential match using the index + triplet (subvec data-vec cursor (min (+ cursor MIN-MATCH) data-len)) + match-indices (get index triplet) + + ;; 2. Find the best match among the candidates + best-match (when (>= (count triplet) MIN-MATCH) + (reduce (fn [best idx] + (let [len (get-match-len data-vec cursor idx (min (- data-len cursor) WINDOW-SIZE))] + (if (> len (:length best 0)) + {:offset (- cursor idx) :length len} + best))) + nil + (filter #(> % (- cursor WINDOW-SIZE)) match-indices)))] + + (if (and best-match (>= (:length best-match) MIN-MATCH)) + ;; Match found + (let [len (:length best-match) + #_#_ref-str (str "<" (:offset best-match) "|" len ">") + ref-str "<00|00"] + (recur (+ cursor len) + ;; Update index for every triplet we skip (simplification: just first) + (assoc index triplet (conj (get index triplet []) cursor)) + (conj! out ref-str))) + + ;; No match found + (let [char-lit (str (nth data-vec cursor))] + (recur (inc cursor) + (if (>= (count triplet) MIN-MATCH) + (assoc index triplet (conj (get index triplet []) cursor)) + index) + (conj! out char-lit))))))))) + +;; --- Testing --- +(let [input "When in the course of human events it becomes necessary to yada yada yada yada"] + (println "Input: " input) + (println "Output:" (compress input))) + + + + +(defn compare-sizes + "Calculates actual LZSS bit-cost by parsing tags vs literals." + [original compressed] + (let [;; Regex to find your specific tag format + tag-pattern #"<00|00" + + ;; 1. Identify all matches in the compressed string + all-tags (re-seq tag-pattern compressed) + num-matches (count all-tags) + + ;; 2. Calculate literal count: + ;; We remove the tags from the string; what's left are the raw characters + literals-str (clojure.string/replace compressed tag-pattern "") + num-literals (count literals-str) + + ;; 3. Calculate Real Bit Sizes + ;; Original is always just 8 bits per char + orig-bits (* 8 (count original)) + + ;; Compressed: 17 bits per match, 9 bits per literal + ;; ACTUALLY: 16 bits per match (only need 10 bits to say 1024), 8 bits per literal (1 bit flag, only need 6 bits for char) + comp-bits (+ (* num-matches 16 #_17) + (* num-literals 8 #_9)) + + ;; Standard unit conversion helper + metrics (fn [b] + (let [bytes (double (/ b 8)) + kb (/ bytes 1024.0)] + {:bits b :bytes bytes :kb kb}))] + + {:original (metrics orig-bits) + :compressed (metrics comp-bits) + :stats {:match-count num-matches + :literal-count num-literals} + :comparison {:bits-saved (- orig-bits comp-bits) + :savings-pct (double (* 100 (- 1 (/ comp-bits orig-bits))))}})) + + +(let [test-seq (take 2000000 mildlyprocessed) + compressed-str (compress test-seq)] + (compare-sizes test-seq compressed-str)) |
