bible/huffman.clj


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293

(require '[clojure.string :as str])

(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])

(require '[clojure.data.priority-map :as pm])

(require '[clojure.math :as math])

(comment
  "Build the base file"
  (def files (fs/glob "./base_files/" "**.txt"))

  (defn get-book-num [filename]
    (let [[_ _ book _ _]
          (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt book)))

  (defn get-chap-num [filename]
    (let [[_ _ _ _ chap]
          (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt chap)))

  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

(def full-text (slurp "bbe-newlines-nochaps.txt"))

;; This is "Stage1-optimized-tokens"
(def tokens
  (-> full-text 
      (str/lower-case)
      (str/replace #"\s+" " ")
      (str/replace #"'s" " AS ") ;; Apostrophe_S
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      (str/split #" ")
      (#(remove str/blank? %1))))

(def symbol-freqs (frequencies tokens))

(comment
  "Do some basic statistics and print a list of tokens"
  (spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))

  (sort-by val > symbol-freqs)  ; Greatest to lease frequency
  (reduce + (map val symbol-freqs))  ; Total tokens
  (count symbol-freqs)  ; Total unique tokens
  (reduce + (take 512 (map val symbol-freqs)))  ; Number of the top 100 common tokens

  (reduce + (map count symbol-freqs))  ; Total chars needed for dict vals

  (def two-grams (frequencies (partition 2 1 tokens)))
  (sort-by val > two-grams))


;;; Make the huffman tree for the symbols (13)
(defrecord Node [left right sym probability])

; Create a prioirity-queue of parentless nodes for each symbol
(defn symboltable->pq [symbolfreqs]
  (into (pm/priority-map-keyfn (juxt first second))
        (map #(vector
                (->Node nil nil (first %1) (second %1))
                [(second %1) (first %1)])
             symbolfreqs)))

(assert (= (count symbol-freqs) (count (symboltable->pq symbol-freqs))) "Priority queue has fewer symbols than symbol list")

;; From: https://michaeldipperstein.github.io/huffman.html#decode
;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability.
;; Step 2. Select the two parentless nodes with the lowest probabilities.
;; Step 3. Create a new node which is the parent of the two lowest probability nodes.
;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities.
;; Step 5. Repeat from Step 2 until there is only one parentless node left.

;; NOTE: This is an inefficient algorithm because we could use the 
;;       two-queue version on wikipedia
(defn build-huffman-tree-from-queue
  "Builds a huffman-tree out of a priority-queue"
  [queue]
  (if (= 1 (count queue))
    (first (peek queue))  ; Repeat until there is only one parentless node left
    (let [[lowest-node [lowest-prob _]] (peek queue)
          [second-node [second-prob _]] (peek (pop queue))  ; Step 2
          new-prob (+ lowest-prob second-prob) ; Step 4
          new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order
          next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] 
      (recur next-queue))))

(defn build-huffman-tree
  "builds a huffman-tree out of a frequency table of symbols"
  [symbol-freq-table]
  (build-huffman-tree-from-queue (symboltable->pq symbol-freq-table)))

(def huffman-tree (build-huffman-tree symbol-freqs))

(assert (= (.probability huffman-tree)
           (reduce + (map val symbol-freqs)))
        "Probability of root node is not equal to the sum of all probabilities")

(defn huffman-tree-to-symbol-encodings 
  "Builds a list of symbol encodings out of a huffman-tree
   [tree] tree to symbol encodings
   [node encodings curr-encoding] recursive builder"
   ([tree]
    (huffman-tree-to-symbol-encodings tree {} ""))
  ([node encodings curr-encoding]
  (if (.sym node)
    (assoc encodings (.sym node) curr-encoding)
    (merge
      (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
      (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding))))))

(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))

(assert (= (huffman-tree-to-symbol-encodings huffman-tree {} "")
           (huffman-tree-to-symbol-encodings huffman-tree))
        "Multi-arity def does not work as intended")
(assert (= (count huffman-tree-syms)
           (count (symboltable->pq symbol-freqs))
           (count symbol-freqs))
        "Some function is adding or removing symbols")

;;; Build the canonical encodings

;; From: https://en.wikipedia.org/wiki/Canonical_Huffman_code
;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm:

;;     The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0').
;;     Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value.
;;     When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.


;; --- little class for Huffman Codewords
(defrecord HuffmanCodeword [sym code length])
(defn get-codeword-string 
  "Takes a [HuffmanCodeword] and returns a binary str of the code"
  [codeword]
  (let [base (Long/toBinaryString (.code codeword))
        basecount (count base)
        len (.length codeword)]
    (if (= basecount len)
      base
      (str (apply str (take (- len basecount) (repeat "0"))) base))))
;; --- little class for Huffman Codewords

(defn build-canonical-encodings
  "Build canonical huffman encodings from a huffman tree
   takes [symbols] a list of huffman codes derived from a code-tree"
  ([tree]
   (let [symbols (->> tree
                      (huffman-tree-to-symbol-encodings)
                      (sort-by (juxt (comp count val) key))

                      (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1))))))
         first-sym (first symbols)
         seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))]
     (build-canonical-encodings [seed-symbol] (rest symbols))))
  ([codes symbols]
   (if (not-empty symbols)
     (let [prev-codeword (last codes)
           current-codeword (first symbols)
           next-sym (.sym current-codeword)
           next-base-code (unchecked-inc (.code prev-codeword))
           prev-len (.length prev-codeword)
           next-codeword 
           (if (= (.length current-codeword)
                  (.length prev-codeword))
             (->HuffmanCodeword 
               next-sym
               next-base-code
               prev-len)
             (->HuffmanCodeword 
               next-sym
               (bit-shift-left 
                 next-base-code 
                 (- prev-len
                    (- 63 (Long/numberOfLeadingZeros next-base-code)))) ;Not 100% confident about this
               (inc prev-len)))]
       (recur (conj codes next-codeword) (rest symbols)))
     codes)))

(def canonical-encodings
  (build-canonical-encodings huffman-tree))


#_(assert
  (= (map #(.length %1) sorted-huffman-tree-codewords)
     (map #(.length %1) sorted-huffman-tree-codewords))
  "Some of the codes changed length when canonicalizing")

(assert
  (= (count canonical-encodings)
     (count (set (map #(Long/toBinaryString (.code %1)) canonical-encodings))))
  "There appears to be duplicate canonical encodings")

(defn get-encoded-length [encodings tokenlist]
  (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings))
        totalbits (reduce + (map encodingtable tokenlist))
        totalbytes (math/ceil (/ totalbits 8))
        totalkb (/ totalbytes 1024)
        totalmb (/ totalkb 1024)]
    {:bits totalbits
     :bytes totalbytes
     :kb totalkb
     :mb totalmb}))


(comment
  (def base-file-size
    (let [totalbits (* 8 (count full-text))
          totalbytes (math/ceil (/ totalbits 8))
          totalkb (/ totalbytes 1024)
          totalmb (/ totalkb 1024)]
      {:bits totalbits
       :bytes totalbytes
       :kb totalkb
       :mb totalmb})) 
  (def stage1-optimized-token-encoding-length
    (get-encoded-length canonical-encodings tokens)))

(comment
  "Some scratchwork figuring out how to make the shortest bit smaller"
  (defn get-highest-freq-percent [freqlist]
    (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes
    [(key most-freq)
     (double 
      (/ (second most-freq)
         (reduce + (map second symbol-freqs))))]))

  (get-highest-freq-percent symbol-freqs) ;most frequent symbol is 6.83%
  (get-highest-freq-percent (sort-by val > (frequencies (map first tokens)))) ;most frequent 1-letter prefix
  (get-highest-freq-percent (sort-by val > (frequencies (map last tokens)))) ;most frequent 1-letter suffix
  (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens
  )

;;; NOTE FOR VIDEO:
;; First, consider some kind of bit-packing scheme 
;;  - 65 total chars
;;  - Shave it down to 63
;;  - Remove caps/reduce even furhter to try and get it to 31
;; Then introduce huffman encodings
;;  - Raw text
;;  - Text with some of the above optimizations
;;  - Tokenize at spaces and punctuation
;; Then consider tokenizing I started off by tokenizing the text with some optimizations in place with stage1-optimized-token-encoding-length

(comment
  "Huffman on the raw text (65 total symbols)"
  (let [tokenized (seq full-text)
        symbol-freqs (frequencies tokenized)
        huff-tree (build-huffman-tree symbol-freqs)
        canonical-codes (build-canonical-encodings huff-tree)]
    (get-encoded-length canonical-codes tokenized))
  "Huffman on the above with normalized space and without * (63 total symbols)"
  (let [tokenized (seq (-> full-text 
                           (str/replace #"\s+" " ")
                           (str/replace #"\*" " ")))
        symbol-freqs (frequencies tokenized)
        huff-tree (build-huffman-tree symbol-freqs)
        canonical-codes (build-canonical-encodings huff-tree)]
    (get-encoded-length canonical-codes tokenized))
  "Huffman on the above in all lowercase (38 total symbols)"
  (let [tokenized (seq (-> full-text 
                           (str/replace #"\s+" " ")
                           (str/replace #"[\*]" "")
                           (str/lower-case)))
        symbol-freqs (frequencies tokenized)
        huff-tree (build-huffman-tree symbol-freqs)
        canonical-codes (build-canonical-encodings huff-tree)]
    (get-encoded-length canonical-codes tokenized))
  "Huffman on the above with char changes —()?!:; (31 total symbols)"
  (let [tokenized (seq (-> full-text 
                           (str/replace #"—" "-") ; Replace emdash with hyphen
                           (str/replace #"[\*]" "") ; Get rid of chars
                           (str/replace #"\([^)]*\)" "") ; Get rid of everything between parens
                           (str/replace #"[?!]" ".") ; Replace all final punct with periods
                           (str/replace #"[;:]" ",") ; Replace all middle punct with commas
                           (str/replace #"\s+" " ") ; Normalize space
                           (str/lower-case)))
        symbol-freqs (frequencies tokenized)
        huff-tree (build-huffman-tree symbol-freqs)
        canonical-codes (build-canonical-encodings huff-tree)]
    (get-encoded-length canonical-codes tokenized))
  )