aboutsummaryrefslogtreecommitdiff
path: root/bible/encode.clj
diff options
context:
space:
mode:
Diffstat (limited to 'bible/encode.clj')
-rw-r--r--bible/encode.clj221
1 files changed, 0 insertions, 221 deletions
diff --git a/bible/encode.clj b/bible/encode.clj
deleted file mode 100644
index abd5365..0000000
--- a/bible/encode.clj
+++ /dev/null
@@ -1,221 +0,0 @@
-(require '[clojure.string :as str])
-
-(require '[clojure.java.io :as io])
-(require '[babashka.fs :as fs])
-
-(require '[clojure.data.priority-map :as pm])
-
-(require '[clojure.math :as math])
-
-(comment
- "Build the base file"
- (def files (fs/glob "./base_files/" "**.txt"))
-
- (defn get-book-num [filename]
- (let [[_ _ book _ _]
- (str/split (str filename) #"_")
- #_#_chap (int _chap)]
- (Integer/parseInt book)))
-
- (defn get-chap-num [filename]
- (let [[_ _ _ _ chap]
- (str/split (str filename) #"_")
- #_#_chap (int _chap)]
- (Integer/parseInt chap)))
-
- (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
- (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
- (with-open [reader (io/reader (fs/file f))]
- (doseq [line (drop 2 (line-seq reader))]
- (.write writer (str line "\n")))))))
-
-(def full-text (slurp "bbe-newlines-nochaps.txt"))
-
-(def tokens
- (-> full-text
- (str/lower-case)
- (str/replace #"\s+" " ")
- (str/replace #"'s" " AS ") ;; Apostrophe_S
- (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
- (str/split #" ")
- (#(remove str/blank? %1))))
-
-(def symbol-freqs (frequencies tokens))
-
-(comment
- "Do some basic statistics and print a list of tokens"
- (spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))
-
- (sort-by val > symbol-freqs) ; Greatest to lease frequency
- (reduce + (map val symbol-freqs)) ; Total tokens
- (count symbol-freqs) ; Total unique tokens
- (reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens
-
- (reduce + (map count symbol-freqs)) ; Total chars needed for dict vals
-
- (def two-grams (frequencies (partition 2 1 tokens)))
- (sort-by val > two-grams))
-
-
-
-
-;;; Make the huffman tree for the symbols (13)
-(defrecord Node [left right sym probability])
-
-; Create a prioirity-queue of parentless nodes for each symbol
-(defn symboltable->pq [symbolfreqs]
- (into (pm/priority-map-keyfn (juxt first second))
- (map #(vector
- (->Node nil nil (first %1) (second %1))
- [(second %1) (first %1)])
- symbolfreqs)))
-
-(assert (= (count symbol-freqs) (count (symboltable->pq symbol-freqs))) "Priority queue has fewer symbols than symbol list")
-
-;; From: https://michaeldipperstein.github.io/huffman.html#decode
-;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability.
-;; Step 2. Select the two parentless nodes with the lowest probabilities.
-;; Step 3. Create a new node which is the parent of the two lowest probability nodes.
-;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities.
-;; Step 5. Repeat from Step 2 until there is only one parentless node left.
-
-;; NOTE: This is an inefficient algorithm because we could use the
-;; two-queue version on wikipedia
-(defn build-huffman-tree-from-queue
- "Builds a huffman-tree out of a priority-queue"
- [queue]
- (if (= 1 (count queue))
- (first (peek queue)) ; Repeat until there is only one parentless node left
- (let [[lowest-node [lowest-prob _]] (peek queue)
- [second-node [second-prob _]] (peek (pop queue)) ; Step 2
- new-prob (+ lowest-prob second-prob) ; Step 4
- new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order
- next-queue (assoc (pop (pop queue)) new-node [new-prob nil])]
- (recur next-queue))))
-
-(defn build-huffman-tree
- "builds a huffman-tree out of a frequency table of symbols"
- [symbol-freq-table]
- (build-huffman-tree-from-queue (symboltable->pq symbol-freq-table)))
-
-(def huffman-tree (build-huffman-tree symbol-freqs))
-
-(assert (= (.probability huffman-tree)
- (reduce + (map val symbol-freqs)))
- "Probability of root node is not equal to the sum of all probabilities")
-
-(defn huffman-tree-to-symbol-encodings [node encodings curr-encoding]
- (if (.sym node)
- (assoc encodings (.sym node) curr-encoding)
- (merge
- (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
- (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))
-
-(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))
-(assert (= (count huffman-tree-syms)
- (count pq)
- (count symbol-freqs)))
-
-;;; Build the canonical encodings
-
-;; From: https://en.wikipedia.org/wiki/Canonical_Huffman_code
-;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm:
-
-;; The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0').
-;; Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value.
-;; When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.
-
-
-;; little class for Huffman Codewords
-(defrecord HuffmanCodeword [sym code length])
-(defn get-codeword-string
- "Takes a [HuffmanCodeword] and returns a binary str of the code"
- [codeword]
- (let [base (Long/toBinaryString (.code codeword))
- basecount (count base)
- len (.length codeword)]
- (if (= basecount len)
- base
- (str (apply str (take (- len basecount) (repeat "0"))) base))))
-
-(def sorted-huffman-tree-codewords
- (->> huffman-tree-syms
- (sort-by (juxt (comp count val) key))
- (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))
-
-(defn build-canonical-encodings
- "Build canonical huffman encodings from a sorted list of huffman tree codewords
- takes [symbols] a list of huffman codes derived from a code-tree"
- ([symbols]
- (let [first-sym (first symbols)
- seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))]
- (build-canonical-encodings [seed-symbol] (rest symbols))))
- ([codes symbols]
- (if (not-empty symbols)
- (let [prev-codeword (last codes)
- current-codeword (first symbols)
- next-sym (.sym current-codeword)
- next-base-code (unchecked-inc (.code prev-codeword))
- prev-len (.length prev-codeword)
- next-codeword
- (if (= (.length current-codeword)
- (.length prev-codeword))
- (->HuffmanCodeword
- next-sym
- next-base-code
- prev-len)
- (->HuffmanCodeword
- next-sym
- (bit-shift-left
- next-base-code
- (- prev-len
- (- 63 (Long/numberOfLeadingZeros next-base-code)))) ;Not 100% confident about this
- (inc prev-len)))]
- (recur (conj codes next-codeword) (rest symbols)))
- codes)))
-
-(def canonical-encodings
- (build-canonical-encodings sorted-huffman-tree-codewords))
-
-
-(assert
- (= (map #(.length %1) sorted-huffman-tree-codewords)
- (map #(.length %1) sorted-huffman-tree-codewords))
- "Some of the codes changed length when canonicalizing")
-
-(assert
- (= (count canonical-encodings)
- (count (set (map #(Long/toBinaryString (.code %1)) canonical-encodings))))
- "There appears to be duplicate canonical encodings")
-
-(defn get-encoded-length [encodings tokenlist]
- (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings))
- totalbits (reduce + (map encodingtable tokenlist))
- totalbytes (math/ceil (/ totalbits 8))
- totalkb (math/ceil (/ totalbytes 1024))
- totalmb (math/ceil (/ totalkb 1024))]
- {:bits totalbits
- :bytes totalbytes
- :kb totalkb
- :mb totalmb}))
-
-(count (frequencies (str/replace full-text #"\*" "")))
-
-(def raw-token-ascii-encoding
- (get-encoded-length canonical-encodings tokens))
-
-(comment
- "Figuring out how to make the shortest bit smaller"
- (defn get-highest-freq-percent [freqlist]
- (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes
- [(key most-freq)
- (double
- (/ (second most-freq)
- (reduce + (map second symbol-freqs))))]))
-
- (get-highest-freq-percent symbol-freqs) ;most frequent symbol is 6.83%
- (get-highest-freq-percent (sort-by val > (frequencies (map first tokens)))) ;most frequent 1-letter prefix
- (get-highest-freq-percent (sort-by val > (frequencies (map last tokens)))) ;most frequent 1-letter suffix
- (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens
- )
-