blob: d6f5a32e70dff11cde661efbbec20a20df647d74 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
|
(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])
(comment
"Build the base file"
(def files (fs/glob "./base_files/" "**.txt"))
(defn get-book-num [filename]
(let [[_ _ book _ _]
(str/split (str filename) #"_")
#_#_chap (int _chap)]
(Integer/parseInt book)))
(defn get-chap-num [filename]
(let [[_ _ _ _ chap]
(str/split (str filename) #"_")
#_#_chap (int _chap)]
(Integer/parseInt chap)))
(with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
(doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
(with-open [reader (io/reader (fs/file f))]
(doseq [line (drop 2 (line-seq reader))]
(.write writer (str line "\n")))))))
(def full-text (slurp "./bbe-newlines-nochaps.txt"))
(defn sizetable [bits]
(let [bytect (math/ceil (/ bits 8))
kbs (/ bytect 1024)
mbs (/ kbs 1024)]
{:bits bits
:bytes bytect
:kbs kbs
:mbs mbs}))
(def mildlyprocessed (str/replace full-text #"—" "-"))
#_#_(println (sizetable (* 8 (count full-text))))
(println (sizetable (* 8 (count (encode-lzss (take 100000 mildlyprocessed))))))
(def WINDOW-SIZE (* 32 1024)#_(+ 512 1024)) ; The maximum distance we look back
(def MIN-MATCH 3) ; Minimum length to bother with a reference
; START, two byte num, SEP, two byte num
(defn- get-match-len [data-vec pos match-pos max-len]
(loop [len 0]
(if (and (< len max-len)
(= (nth data-vec (+ pos len))
(nth data-vec (+ match-pos len))))
(recur (inc len))
len)))
(defn compress
"LZSS using transient vectors and a hash-index for O(1) lookups."
[data]
(let [data-vec (vec data)
data-len (count data-vec)
#_#_MIN-MATCH 3] ;; Standard MIN-MATCH
(loop [cursor 0
;; map of [c1 c2 c3] -> list of indices
index {}
;; Transient vector for high-performance output collection
out (transient [])]
(if (>= cursor data-len)
(apply str (persistent! out)) ;; Finalize the transient and join
(let [;; 1. Check if we have a potential match using the index
triplet (subvec data-vec cursor (min (+ cursor MIN-MATCH) data-len))
match-indices (get index triplet)
;; 2. Find the best match among the candidates
best-match (when (>= (count triplet) MIN-MATCH)
(reduce (fn [best idx]
(let [len (get-match-len data-vec cursor idx (min (- data-len cursor) WINDOW-SIZE))]
(if (> len (:length best 0))
{:offset (- cursor idx) :length len}
best)))
nil
(filter #(> % (- cursor WINDOW-SIZE)) match-indices)))]
(if (and best-match (>= (:length best-match) MIN-MATCH))
;; Match found
(let [len (:length best-match)
#_#_ref-str (str "<" (:offset best-match) "|" len ">")
ref-str "<00|00"]
(recur (+ cursor len)
;; Update index for every triplet we skip (simplification: just first)
(assoc index triplet (conj (get index triplet []) cursor))
(conj! out ref-str)))
;; No match found
(let [char-lit (str (nth data-vec cursor))]
(recur (inc cursor)
(if (>= (count triplet) MIN-MATCH)
(assoc index triplet (conj (get index triplet []) cursor))
index)
(conj! out char-lit)))))))))
;; --- Testing ---
(let [input "When in the course of human events it becomes necessary to yada yada yada yada"]
(println "Input: " input)
(println "Output:" (compress input)))
(defn compare-sizes
"Calculates actual LZSS bit-cost by parsing tags vs literals."
[original compressed]
(let [;; Regex to find your specific tag format
tag-pattern #"<00|00"
;; 1. Identify all matches in the compressed string
all-tags (re-seq tag-pattern compressed)
num-matches (count all-tags)
;; 2. Calculate literal count:
;; We remove the tags from the string; what's left are the raw characters
literals-str (clojure.string/replace compressed tag-pattern "")
num-literals (count literals-str)
;; 3. Calculate Real Bit Sizes
;; Original is always just 8 bits per char
orig-bits (* 8 (count original))
;; Compressed: 17 bits per match, 9 bits per literal
;; ACTUALLY: 16 bits per match (only need 10 bits to say 1024), 8 bits per literal (1 bit flag, only need 6 bits for char)
comp-bits (+ (* num-matches 16 #_17)
(* num-literals 8 #_9))
;; Standard unit conversion helper
metrics (fn [b]
(let [bytes (double (/ b 8))
kb (/ bytes 1024.0)]
{:bits b :bytes bytes :kb kb}))]
{:original (metrics orig-bits)
:compressed (metrics comp-bits)
:stats {:match-count num-matches
:literal-count num-literals}
:comparison {:bits-saved (- orig-bits comp-bits)
:savings-pct (double (* 100 (- 1 (/ comp-bits orig-bits))))}}))
(let [test-seq (take 2000000 mildlyprocessed)
compressed-str (compress test-seq)]
(compare-sizes test-seq compressed-str))
|