bible/encode.clj


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

(require '[clojure.string :as str])

(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])

(require '[clojure.data.priority-map :as pm])

(comment
  "Build the base file"
  (def files (fs/glob "./base_files/" "**.txt"))

  (defn get-book-num [filename]
    (let [[_ _ book _ _]
          (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt book)))

  (defn get-chap-num [filename]
    (let [[_ _ _ _ chap]
          (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt chap)))

  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

(def full-text (slurp "bbe-newlines-nochaps.txt"))

(def tokens
  (-> full-text 
      (str/lower-case)
      (str/replace #"\s+" " ")
      (str/replace #"'s" " APOSTROPHE_S ")
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      (str/split #" ")
      (#(remove str/blank? %1))))

(def symbol-freqs (frequencies tokens))

(comment
  "Do some basic statistics and print a list of tokens"
  (spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))

  (sort-by val > symbol-freqs)  ; Greatest to lease frequency
  (reduce + (map val symbol-freqs))  ; Total tokens
  (count symbol-freqs)  ; Total unique tokens
  (reduce + (take 512 (map val symbol-freqs)))  ; Number of the top 100 common tokens

  (reduce + (map count symbol-freqs))  ; Total chars needed for dict vals

  (def two-grams (frequencies (partition 2 1 tokens)))
  (sort-by val > two-grams))


;;; Make the huffman tree for the symbols (13)
(defrecord Node [left right sym probability])

; Create a prioirity-queue of parentless nodes for each symbol
(def pq
  (into (pm/priority-map-keyfn (juxt first second))
        (map #(vector
                (->Node nil nil (first %1) (second %1))
                [(second %1) (first %1)])
             symbol-freqs)))

(assert (= (count symbol-freqs) (count pq)) "Priority queue has fewer symbols than symbol list")

;; From: https://michaeldipperstein.github.io/huffman.html#decode
;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability.
;; Step 2. Select the two parentless nodes with the lowest probabilities.
;; Step 3. Create a new node which is the parent of the two lowest probability nodes.
;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities.
;; Step 5. Repeat from Step 2 until there is only one parentless node left.

;; NOTE: This is an inefficient algorithm because we could use the 
;;       two-queue version on wikipedia
(defn build-huffman-tree [queue]
  (if (= 1 (count queue))
    (first (peek queue))  ; Repeat until there is only one parentless node left
    (let [[lowest-node [lowest-prob _]] (peek queue)
          [second-node [second-prob _]] (peek (pop queue))  ; Step 2
          new-prob (+ lowest-prob second-prob) ; Step 4
          new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order
          next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] 
      (recur next-queue))))

(def huffman-tree (build-huffman-tree pq))

(assert (= (.probability huffman-tree)
           (reduce + (map val symbol-freqs)))
        "Probability of root node is not equal to the sum of all probabilities")

(defn huffman-tree-to-symbol-encodings [node encodings curr-encoding]
  (if (.sym node)
    (assoc encodings (.sym node) curr-encoding)
    (merge
      (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
      (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))

(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))
(assert (= (count huffman-tree-syms)
           (count pq)
           (count symbol-freqs)))

;;; Build the canonical encodings

;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm:

;;     The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0').
;;     Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value.
;;     When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.

(defrecord HuffmanCodeword [sym code length])

(def sorted-huffman-tree-codewords 
  (->> huffman-tree-syms
       (sort-by (juxt (comp count val) key))
       (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))

(defn build-canonical-encodings
  "Build canonical huffman encodings from a sorted list of huffman tree codewords"
  ([symbols]
   (let [first-sym (first symbols)
         seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))]
     (build-canonical-encodings [seed-symbol] (rest symbols))))
  ([codes symbols]
   (if (not-empty symbols)
     (let [prev-codeword (last codes)
           current-codeword (first symbols)
           next-sym (.sym current-codeword)
           next-base-code (unchecked-inc (.code prev-codeword))
           prev-len (.length prev-codeword)
           next-codeword (if (= (.length current-codeword)
                                (.length prev-codeword))
                           (->HuffmanCodeword next-sym
                                              next-base-code
                                              prev-len)
                           (->HuffmanCodeword next-sym
                                              (bit-shift-left next-base-code (inc prev-len))
                                              (inc prev-len)))]
       (recur (conj codes next-codeword) (rest symbols)))
     codes)))


(def canonical-encodings
  (build-canonical-encodings sorted-huffman-tree-codewords))

(map #(Long/toBinaryString (.code %1)) (take 100 canonical-encodings)) ;; The results of this *seem* wrong.