aboutsummaryrefslogtreecommitdiff
path: root/bible/lzss.clj
diff options
context:
space:
mode:
authorPatrick Kingston <patrick@pkingston.xyz>2026-01-26 22:29:57 -0500
committerPatrick Kingston <patrick@pkingston.xyz>2026-01-26 22:29:57 -0500
commit8eab01054bf2d484f16be29767baab00535b87b1 (patch)
tree588bbd84f70302a1799d00e00a7afeab93755f2f /bible/lzss.clj
parenta5cb8e19665de19f21d5b8f19719b9d448c99b67 (diff)
Keep on compressing
Diffstat (limited to 'bible/lzss.clj')
-rw-r--r--bible/lzss.clj156
1 files changed, 156 insertions, 0 deletions
diff --git a/bible/lzss.clj b/bible/lzss.clj
new file mode 100644
index 0000000..d6f5a32
--- /dev/null
+++ b/bible/lzss.clj
@@ -0,0 +1,156 @@
+(require '[clojure.string :as str])
+
+(require '[clojure.java.io :as io])
+(require '[babashka.fs :as fs])
+
+(require '[clojure.data.priority-map :as pm])
+
+(require '[clojure.math :as math])
+
+(require '[clojure.core.match :as match])
+
+(comment
+ "Build the base file"
+ (def files (fs/glob "./base_files/" "**.txt"))
+
+ (defn get-book-num [filename]
+ (let [[_ _ book _ _]
+ (str/split (str filename) #"_")
+ #_#_chap (int _chap)]
+ (Integer/parseInt book)))
+
+ (defn get-chap-num [filename]
+ (let [[_ _ _ _ chap]
+ (str/split (str filename) #"_")
+ #_#_chap (int _chap)]
+ (Integer/parseInt chap)))
+
+ (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
+ (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
+ (with-open [reader (io/reader (fs/file f))]
+ (doseq [line (drop 2 (line-seq reader))]
+ (.write writer (str line "\n")))))))
+
+(def full-text (slurp "./bbe-newlines-nochaps.txt"))
+
+(defn sizetable [bits]
+ (let [bytect (math/ceil (/ bits 8))
+ kbs (/ bytect 1024)
+ mbs (/ kbs 1024)]
+ {:bits bits
+ :bytes bytect
+ :kbs kbs
+ :mbs mbs}))
+
+(def mildlyprocessed (str/replace full-text #"—" "-"))
+
+#_#_(println (sizetable (* 8 (count full-text))))
+(println (sizetable (* 8 (count (encode-lzss (take 100000 mildlyprocessed))))))
+
+(def WINDOW-SIZE (* 32 1024)#_(+ 512 1024)) ; The maximum distance we look back
+(def MIN-MATCH 3) ; Minimum length to bother with a reference
+; START, two byte num, SEP, two byte num
+
+(defn- get-match-len [data-vec pos match-pos max-len]
+ (loop [len 0]
+ (if (and (< len max-len)
+ (= (nth data-vec (+ pos len))
+ (nth data-vec (+ match-pos len))))
+ (recur (inc len))
+ len)))
+
+(defn compress
+ "LZSS using transient vectors and a hash-index for O(1) lookups."
+ [data]
+ (let [data-vec (vec data)
+ data-len (count data-vec)
+ #_#_MIN-MATCH 3] ;; Standard MIN-MATCH
+ (loop [cursor 0
+ ;; map of [c1 c2 c3] -> list of indices
+ index {}
+ ;; Transient vector for high-performance output collection
+ out (transient [])]
+ (if (>= cursor data-len)
+ (apply str (persistent! out)) ;; Finalize the transient and join
+
+ (let [;; 1. Check if we have a potential match using the index
+ triplet (subvec data-vec cursor (min (+ cursor MIN-MATCH) data-len))
+ match-indices (get index triplet)
+
+ ;; 2. Find the best match among the candidates
+ best-match (when (>= (count triplet) MIN-MATCH)
+ (reduce (fn [best idx]
+ (let [len (get-match-len data-vec cursor idx (min (- data-len cursor) WINDOW-SIZE))]
+ (if (> len (:length best 0))
+ {:offset (- cursor idx) :length len}
+ best)))
+ nil
+ (filter #(> % (- cursor WINDOW-SIZE)) match-indices)))]
+
+ (if (and best-match (>= (:length best-match) MIN-MATCH))
+ ;; Match found
+ (let [len (:length best-match)
+ #_#_ref-str (str "<" (:offset best-match) "|" len ">")
+ ref-str "<00|00"]
+ (recur (+ cursor len)
+ ;; Update index for every triplet we skip (simplification: just first)
+ (assoc index triplet (conj (get index triplet []) cursor))
+ (conj! out ref-str)))
+
+ ;; No match found
+ (let [char-lit (str (nth data-vec cursor))]
+ (recur (inc cursor)
+ (if (>= (count triplet) MIN-MATCH)
+ (assoc index triplet (conj (get index triplet []) cursor))
+ index)
+ (conj! out char-lit)))))))))
+
+;; --- Testing ---
+(let [input "When in the course of human events it becomes necessary to yada yada yada yada"]
+ (println "Input: " input)
+ (println "Output:" (compress input)))
+
+
+
+
+(defn compare-sizes
+ "Calculates actual LZSS bit-cost by parsing tags vs literals."
+ [original compressed]
+ (let [;; Regex to find your specific tag format
+ tag-pattern #"<00|00"
+
+ ;; 1. Identify all matches in the compressed string
+ all-tags (re-seq tag-pattern compressed)
+ num-matches (count all-tags)
+
+ ;; 2. Calculate literal count:
+ ;; We remove the tags from the string; what's left are the raw characters
+ literals-str (clojure.string/replace compressed tag-pattern "")
+ num-literals (count literals-str)
+
+ ;; 3. Calculate Real Bit Sizes
+ ;; Original is always just 8 bits per char
+ orig-bits (* 8 (count original))
+
+ ;; Compressed: 17 bits per match, 9 bits per literal
+ ;; ACTUALLY: 16 bits per match (only need 10 bits to say 1024), 8 bits per literal (1 bit flag, only need 6 bits for char)
+ comp-bits (+ (* num-matches 16 #_17)
+ (* num-literals 8 #_9))
+
+ ;; Standard unit conversion helper
+ metrics (fn [b]
+ (let [bytes (double (/ b 8))
+ kb (/ bytes 1024.0)]
+ {:bits b :bytes bytes :kb kb}))]
+
+ {:original (metrics orig-bits)
+ :compressed (metrics comp-bits)
+ :stats {:match-count num-matches
+ :literal-count num-literals}
+ :comparison {:bits-saved (- orig-bits comp-bits)
+ :savings-pct (double (* 100 (- 1 (/ comp-bits orig-bits))))}}))
+
+
+(let [test-seq (take 2000000 mildlyprocessed)
+ compressed-str (compress test-seq)]
+ (compare-sizes test-seq compressed-str))