blob: d9963ea1f9ebcca022385b698a32b48302833135 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
|
(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])
;;; The full compressor for the Bible in Basic English (BBE)
;; Build the combined base txt file out of the individual per-chapter txt files
(comment
  ;; One-off build step: stitch the per-chapter source files into a single
  ;; text file, ordered by book then chapter. Filenames are underscore-
  ;; delimited with the book number in field 2 and chapter number in field 4.
  (def files (fs/glob "./base_files/" "**.txt"))
  (defn- filename-field
    "Parse the n-th underscore-separated field of `filename` as an integer.
    Shared by get-book-num / get-chap-num (previously duplicated logic)."
    [filename n]
    (Integer/parseInt (nth (str/split (str filename) #"_") n)))
  (defn get-book-num [filename] (filename-field filename 2))
  (defn get-chap-num [filename] (filename-field filename 4))
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        ;; drop 2: skip the per-file header lines before the verse text
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))
;;; The full text as a file
;; Reads the concatenated corpus produced by the builder comment-block above.
(def full-text (slurp "./bbe-newlines-nochaps.txt"))
(def optimized-string
  ;; Normalize the raw corpus into a single space-delimited token stream:
  ;; lower-case everything, replace multi-char marks ('s, ..., ***) with
  ;; sentinel words, fold em-dash into hyphen, pad each punctuation mark
  ;; with spaces so it becomes its own token, then collapse whitespace runs.
  ;; Rules are applied in order; the punctuation rule uses a fn replacement
  ;; to wrap the matched character in spaces.
  (let [rules [[#"'s" " AS "]
               [#"\.\.\." " DOTDOTDOT "]
               [#"\*\*\*" " STARSTARSTAR "]
               [#"—" "-"]
               [#"[,.;:!?()\[\]'\*-]" #(str " " %1 " ")]
               [#"\s+" " "]]]
    (reduce (fn [s [pattern replacement]]
              (str/replace s pattern replacement))
            (str/lower-case full-text)
            rules)))
(def optimized-tokens
  ;; The normalized corpus as a flat vector of word/punctuation tokens.
  (str/split optimized-string #" "))
(comment ;Some basic stats on our work so far
  (count full-text) ; total chars 4207465
  (count optimized-tokens) ; total tokens 962868
  (count (into #{} optimized-tokens)) ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; longest word is 17 chars ("straightforwardly") -> could a 1-nibble length field represent word lengths?
  ; We could maybe do some domain modeling, e.g. split off
  ; "suffix-s" or "suffix-ly" tokens the way "'s" is handled now
  )
;;; First we'll dictionary-encode our tokens
;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj
(def word-ids
  ;; Dictionary encoding: token string -> 16-bit id. Ids are assigned in
  ;; descending frequency order, so the most common tokens receive the
  ;; smallest ids. Each id is stored as a (possibly negative) short via
  ;; unchecked truncation of the low 16 bits of the index.
  (into {}
        (map-indexed (fn [id [token _freq]]
                       [token (unchecked-short (bit-and 0xffff id))])
                     (sort-by val > (frequencies optimized-tokens)))))
(def dict-id-compressed-text
  ;; The corpus as a vector of 16-bit dictionary ids.
  ;; mapv instead of map: this def holds the head anyway, and the sequence
  ;; is traversed multiple times downstream (MATCH-FLAG count, compress-tokens'
  ;; own (vec data)), so realizing it once as a vector avoids re-walking a
  ;; lazy seq. Still seqable, so all existing callers work unchanged.
  (mapv word-ids optimized-tokens))
(comment
(count dict-id-compressed-text) ;Whittled it down to 1925736 total bytes with 16 bit indices (it's 962868 shorts)
)
;;; TODO: Build dictionary
;;; Next, we'll run LZSS on our tok-id-list
(def WINDOW-SIZE 1024) ; The maximum distance we look back, only allowing 1k RAM
(def MIN-MATCH 3) ; Minimum length to bother with a reference
(def MATCH-FLAG (count word-ids))
;; Same value as (count (frequencies dict-id-compressed-text)) — every distinct
;; token maps to exactly one distinct id, so the number of distinct ids in the
;; stream equals the dictionary size — but without re-walking ~1M ids.
;; Ids occupy 0..(MATCH-FLAG - 1), so MATCH-FLAG itself is free to serve as the
;; match-escape value: it is followed by a 10-bit offset (window of 1024) and a
;; length field. NOTE(review): the original comment said "4 bits for the length
;; (255 toks)", but a 255 cap needs 8 bits — confirm the intended field width.
(defn- get-match-len
  "Length of the common run (capped at max-len) between the tokens of
  data-vec starting at pos and those starting at match-pos.
  Assumes (+ pos max-len) <= (count data-vec); callers enforce this."
  [data-vec pos match-pos max-len]
  (loop [n 0]
    (cond
      (>= n max-len) n
      (= (nth data-vec (+ pos n))
         (nth data-vec (+ match-pos n))) (recur (inc n))
      :else n)))
;; NOTE(review): reader-discarded (#_) earlier prototype of the LZSS compressor
;; that emitted "<offset|len>" strings instead of data maps; superseded by
;; `compress-tokens` below. Kept verbatim for reference.
#_(defn compress
"LZSS using transient vectors and a hash-index for O(1) lookups."
[data]
(let [data-vec (vec data)
data-len (count data-vec)
#_#_MIN-MATCH 3] ;; Standard MIN-MATCH
(loop [cursor 0
;; map of [c1 c2 c3] -> list of indices
index {}
;; Transient vector for high-performance output collection
out (transient [])]
(if (>= cursor data-len)
(apply str (persistent! out)) ;; Finalize the transient and join
(let [;; 1. Check if we have a potential match using the index
triplet (subvec data-vec cursor (min (+ cursor MIN-MATCH) data-len))
match-indices (get index triplet)
;; 2. Find the best match among the candidates
best-match (when (>= (count triplet) MIN-MATCH)
(reduce (fn [best idx]
(let [len (get-match-len data-vec cursor idx (min (- data-len cursor) WINDOW-SIZE))]
(if (> len (:length best 0))
{:offset (- cursor idx) :length len}
best)))
nil
(filter #(> % (- cursor WINDOW-SIZE)) match-indices)))]
(if (and best-match (>= (:length best-match) MIN-MATCH))
;; Match found
(let [len (:length best-match)
ref-str (str "<" (:offset best-match) "|" len ">")
#_#_ref-str "<00|00"]
(recur (+ cursor len)
;; Update index for every triplet we skip (simplification: just first)
(assoc index triplet (conj (get index triplet []) cursor))
(conj! out ref-str)))
;; No match found
(let [char-lit (str (nth data-vec cursor))]
(recur (inc cursor)
(if (>= (count triplet) MIN-MATCH)
(assoc index triplet (conj (get index triplet []) cursor))
index)
(conj! out char-lit)))))))))
(defn compress-tokens
  "Takes a sequence of 16-bit token IDs and LZSS-compresses it.
  Returns a vector of maps: {:type :lit :val v} or {:type :match :dist o :len l}
  (o = distance back to the match start, l = match length in tokens).
  `index` maps each MIN-MATCH-long subvec (a 'triplet') to the positions where
  it started, giving O(1) candidate lookup. Matches are capped at 255 tokens
  and must start within WINDOW-SIZE positions of the cursor."
  [data]
  (let [data-vec (vec data)
        data-len (count data-vec)]
    (loop [cursor 0
           index {}              ; triplet -> vector of start positions
           out (transient [])]   ; output symbols, built transiently for speed
      (if (>= cursor data-len)
        (persistent! out) ;; Return clean vector of data structures
        ;; 1. Setup search parameters
        (let [max-search-len (min (- data-len cursor) 255) ;; Cap length (e.g. 255) for 8-bit length fields
              ;; Only search for a match when at least MIN-MATCH tokens remain.
              triplet (if (>= max-search-len MIN-MATCH)
                        (subvec data-vec cursor (+ cursor MIN-MATCH))
                        nil)
              match-indices (get index triplet)]
          ;; 2. Find Best Match among all recorded positions of this triplet.
          ;; NOTE(review): positions older than WINDOW-SIZE are skipped but
          ;; never pruned from `index`, so it grows with the input size.
          (let [best-match (when (and triplet match-indices)
                             (reduce (fn [best candidate-idx]
                                       ;; Check if candidate is within WINDOW-SIZE
                                       (if (> candidate-idx (- cursor WINDOW-SIZE))
                                         (let [len (get-match-len data-vec cursor candidate-idx max-search-len)]
                                           (if (> len (:len best 0))
                                             {:dist (- cursor candidate-idx) :len len}
                                             best))
                                         best)) ;; Too old, skip
                                     nil
                                     match-indices))]
            (if (and best-match (>= (:len best-match) MIN-MATCH))
              ;; CASE A: Match Found — emit a back-reference and jump past it.
              (recur (+ cursor (:len best-match))
                     ;; Note: We are still "Lazy Indexing" here for speed.
                     ;; To maximize compression, you'd loop here to add skipped parts to `index`.
                     (assoc index triplet (conj (get index triplet []) cursor))
                     (conj! out {:type :match
                                 :dist (:dist best-match)
                                 :len (:len best-match)}))
              ;; CASE B: Literal — emit the token id and advance one position.
              (recur (inc cursor)
                     (if triplet
                       (assoc index triplet (conj (get index triplet []) cursor))
                       index)
                     (conj! out {:type :lit
                                 :val (nth data-vec cursor)})))))))))
(def lzss-compressed-dict-ids
  ;; The dictionary-id stream after LZSS: a vector of :lit / :match maps.
  (compress-tokens dict-id-compressed-text))
; Planned serialization: literal token ids are written out as shorts; a short
; equal to MATCH-FLAG escapes into a back-reference (offset + length).
; NOTE(review): the original note said matches "don't bother huffman encoding"
; — presumably back-references are emitted raw while literals may later be
; Huffman-coded; confirm before implementing the serializer.
|