blob: 031b6aab3253f2d90a271442888e2b2639bd4710 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
|
(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(def files (fs/glob "./base_files/" "**.txt"))
(defn get-book-num [filename]
(let [[_ _ book _ _]
(str/split (str filename) #"_")
#_#_chap (int _chap)]
(Integer/parseInt book)))
(defn get-chap-num [filename]
(let [[_ _ _ _ chap]
(str/split (str filename) #"_")
#_#_chap (int _chap)]
(Integer/parseInt chap)))
(with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
(doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
(with-open [reader (io/reader (fs/file f))]
(doseq [line (drop 2 (line-seq reader))]
(.write writer (str line "\n"))))))
(def full-text (slurp "bbe-newlines-nochaps.txt"))
(def tokens
(-> full-text
(str/lower-case)
(str/replace #"\s+" " ")
(str/replace #"'s" " APOSTROPHE_S ")
(str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
(str/split #" ")
(#(remove str/blank? %1))))
(def symbol-freqs (frequencies tokens))
#_(spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))
(sort-by val > symbol-freqs) ; Greatest to lease frequency
(reduce + (map val symbol-freqs)) ; Total tokens
(count symbol-freqs) ; Total unique tokens
(reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens
(reduce + (map count symbol-freqs)) ; Total chars needed for dict vals
#_(def two-grams (frequencies (partition 2 1 tokens)))
#_(sort-by val > two-grams)
;;; Make the huffman tree for the symbols (13)
(defrecord Node [left right sym probability])
; Create a prioirity-queue of parentless nodes for each symbol
(def pq
(into (pm/priority-map-keyfn (juxt first second))
(map #(vector
(->Node nil nil (first %1) (second %1))
[(second %1) (first %1)])
symbol-freqs)))
(assert (= (count symbol-freqs) (count pq)) "Priority queue has fewer symbols than symbol list")
;; From: https://michaeldipperstein.github.io/huffman.html#decode
;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability.
;; Step 2. Select the two parentless nodes with the lowest probabilities.
;; Step 3. Create a new node which is the parent of the two lowest probability nodes.
;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities.
;; Step 5. Repeat from Step 2 until there is only one parentless node left.
;; NOTE: This is an inefficient algorithm because we could use the
;; two-queue version on wikipedia
(defn build-huffman-tree [queue]
(if (= 1 (count queue))
(first (peek queue)) ; Repeat until there is only one parentless node left
(let [[lowest-node [lowest-prob _]] (peek queue)
[second-node [second-prob _]] (peek (pop queue)) ; Step 2
new-prob (+ lowest-prob second-prob) ; Step 4
new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order
next-queue (assoc (pop (pop queue)) new-node [new-prob nil])]
(recur next-queue))))
(def huffman-tree (build-huffman-tree pq))
(assert (= (.probability huffman-tree)
(reduce + (map val symbol-freqs)))
"Probability of root node is not equal to the sum of all probabilities")
(defn huffman-tree-to-symbol-encodings [node encodings curr-encoding]
(if (.sym node)
(assoc encodings (.sym node) curr-encoding)
(merge
(huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
(huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))
(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))
(assert (= (count huffman-tree-syms)
(count pq)
(count symbol-freqs)))
|