(require '[clojure.string :as str]) (require '[clojure.java.io :as io]) (require '[babashka.fs :as fs]) (require '[clojure.data.priority-map :as pm]) (require '[clojure.math :as math]) (comment "Build the base file" (def files (fs/glob "./base_files/" "**.txt")) (defn get-book-num [filename] (let [[_ _ book _ _] (str/split (str filename) #"_") #_#_chap (int _chap)] (Integer/parseInt book))) (defn get-chap-num [filename] (let [[_ _ _ _ chap] (str/split (str filename) #"_") #_#_chap (int _chap)] (Integer/parseInt chap))) (with-open [writer (io/writer "bbe-newlines-nochaps.txt")] (doseq [f (sort-by (juxt get-book-num get-chap-num) files)] (with-open [reader (io/reader (fs/file f))] (doseq [line (drop 2 (line-seq reader))] (.write writer (str line "\n"))))))) (def full-text (slurp "bbe-newlines-nochaps.txt")) ;; This is "Stage1-optimized-tokens" (def tokens (-> full-text (str/lower-case) (str/replace #"\s+" " ") (str/replace #"'s" " AS ") ;; Apostrophe_S (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " ")) (str/split #" ") (#(remove str/blank? %1)))) (def symbol-freqs (frequencies tokens)) (comment "Do some basic statistics and print a list of tokens" (spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs)))) (sort-by val > symbol-freqs) ; Greatest to lease frequency (reduce + (map val symbol-freqs)) ; Total tokens (count symbol-freqs) ; Total unique tokens (reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens (reduce + (map count symbol-freqs)) ; Total chars needed for dict vals (def two-grams (frequencies (partition 2 1 tokens))) (sort-by val > two-grams)) ;;; Make the huffman tree for the symbols (13) (defrecord Node [left right sym probability]) ; Create a prioirity-queue of parentless nodes for each symbol (defn symboltable->pq [symbolfreqs] (into (pm/priority-map-keyfn (juxt first second)) (map #(vector (->Node nil nil (first %1) (second %1)) [(second %1) (first %1)]) symbolfreqs))) (assert (= (count symbol-freqs) (count (symboltable->pq symbol-freqs))) "Priority queue has fewer symbols than symbol list") ;; From: https://michaeldipperstein.github.io/huffman.html#decode ;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability. ;; Step 2. Select the two parentless nodes with the lowest probabilities. ;; Step 3. Create a new node which is the parent of the two lowest probability nodes. ;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities. ;; Step 5. Repeat from Step 2 until there is only one parentless node left. ;; NOTE: This is an inefficient algorithm because we could use the ;; two-queue version on wikipedia (defn build-huffman-tree-from-queue "Builds a huffman-tree out of a priority-queue" [queue] (if (= 1 (count queue)) (first (peek queue)) ; Repeat until there is only one parentless node left (let [[lowest-node [lowest-prob _]] (peek queue) [second-node [second-prob _]] (peek (pop queue)) ; Step 2 new-prob (+ lowest-prob second-prob) ; Step 4 new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] (recur next-queue)))) (defn build-huffman-tree "builds a huffman-tree out of a frequency table of symbols" [symbol-freq-table] (build-huffman-tree-from-queue (symboltable->pq symbol-freq-table))) (def huffman-tree (build-huffman-tree symbol-freqs)) (assert (= (.probability huffman-tree) (reduce + (map val symbol-freqs))) "Probability of root node is not equal to the sum of all probabilities") (defn huffman-tree-to-symbol-encodings "Builds a list of symbol encodings out of a huffman-tree [tree] tree to symbol encodings [node encodings curr-encoding] recursive builder" ([tree] (huffman-tree-to-symbol-encodings tree {} "")) ([node encodings curr-encoding] (if (.sym node) (assoc encodings (.sym node) curr-encoding) (merge (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding)) (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding)))))) (def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} "")) (assert (= (huffman-tree-to-symbol-encodings huffman-tree {} "") (huffman-tree-to-symbol-encodings huffman-tree)) "Multi-arity def does not work as intended") (assert (= (count huffman-tree-syms) (count (symboltable->pq symbol-freqs)) (count symbol-freqs)) "Some function is adding or removing symbols") ;;; Build the canonical encodings ;; From: https://en.wikipedia.org/wiki/Canonical_Huffman_code ;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm: ;; The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0'). ;; Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value. ;; When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift. ;; --- little class for Huffman Codewords (defrecord HuffmanCodeword [sym code length]) (defn get-codeword-string "Takes a [HuffmanCodeword] and returns a binary str of the code" [codeword] (let [base (Long/toBinaryString (.code codeword)) basecount (count base) len (.length codeword)] (if (= basecount len) base (str (apply str (take (- len basecount) (repeat "0"))) base)))) ;; --- little class for Huffman Codewords (def sorted-huffman-tree-codewords (->> huffman-tree-syms (sort-by (juxt (comp count val) key)) (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1))))))) (defn build-canonical-encodings "Build canonical huffman encodings from a huffman tree takes [symbols] a list of huffman codes derived from a code-tree" ([tree] (let [symbols (->> tree (huffman-tree-to-symbol-encodings) (sort-by (juxt (comp count val) key)) (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))) first-sym (first symbols) seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))] (build-canonical-encodings [seed-symbol] (rest symbols)))) ([codes symbols] (if (not-empty symbols) (let [prev-codeword (last codes) current-codeword (first symbols) next-sym (.sym current-codeword) next-base-code (unchecked-inc (.code prev-codeword)) prev-len (.length prev-codeword) next-codeword (if (= (.length current-codeword) (.length prev-codeword)) (->HuffmanCodeword next-sym next-base-code prev-len) (->HuffmanCodeword next-sym (bit-shift-left next-base-code (- prev-len (- 63 (Long/numberOfLeadingZeros next-base-code)))) ;Not 100% confident about this (inc prev-len)))] (recur (conj codes next-codeword) (rest symbols))) codes))) (def canonical-encodings (build-canonical-encodings huffman-tree)) (assert (= (map #(.length %1) sorted-huffman-tree-codewords) (map #(.length %1) sorted-huffman-tree-codewords)) "Some of the codes changed length when canonicalizing") (assert (= (count canonical-encodings) (count (set (map #(Long/toBinaryString (.code %1)) canonical-encodings)))) "There appears to be duplicate canonical encodings") (defn get-encoded-length [encodings tokenlist] (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings)) totalbits (reduce + (map encodingtable tokenlist)) totalbytes (math/ceil (/ totalbits 8)) totalkb (/ totalbytes 1024) totalmb (/ totalkb 1024)] {:bits totalbits :bytes totalbytes :kb totalkb :mb totalmb})) (comment (def base-file-size (let [totalbits (* 8 (count full-text)) totalbytes (math/ceil (/ totalbits 8)) totalkb (/ totalbytes 1024) totalmb (/ totalkb 1024)] {:bits totalbits :bytes totalbytes :kb totalkb :mb totalmb})) (def stage1-optimized-token-encoding-length (get-encoded-length canonical-encodings tokens))) (comment "Some scratchwork figuring out how to make the shortest bit smaller" (defn get-highest-freq-percent [freqlist] (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes [(key most-freq) (double (/ (second most-freq) (reduce + (map second symbol-freqs))))])) (get-highest-freq-percent symbol-freqs) ;most frequent symbol is 6.83% (get-highest-freq-percent (sort-by val > (frequencies (map first tokens)))) ;most frequent 1-letter prefix (get-highest-freq-percent (sort-by val > (frequencies (map last tokens)))) ;most frequent 1-letter suffix (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens ) ;;; NOTE FOR VIDEO: ;; First, consider some kind of bit-packing scheme ;; - 65 total chars ;; - Shave it down to 63 ;; - Remove caps/reduce even furhter to try and get it to 31 ;; Then introduce huffman encodings ;; - Raw text ;; - Text with some of the above optimizations ;; - Tokenize at spaces and punctuation ;; Then consider tokenizing I started off by tokenizing the text with some optimizations in place with stage1-optimized-token-encoding-length (comment "Huffman on the raw text (65 total symbols)" (let [tokenized (seq full-text) symbol-freqs (frequencies tokenized) huff-tree (build-huffman-tree symbol-freqs) canonical-codes (build-canonical-encodings huff-tree)] (get-encoded-length canonical-codes tokenized)) "Huffman on the above with normalized space and without * (63 total symbols)" (let [tokenized (seq (-> full-text (str/replace #"\s+" " ") (str/replace #"\*" " "))) symbol-freqs (frequencies tokenized) huff-tree (build-huffman-tree symbol-freqs) canonical-codes (build-canonical-encodings huff-tree)] (get-encoded-length canonical-codes tokenized)) "Huffman on the above in all lowercase (38 total symbols)" (let [tokenized (seq (-> full-text (str/replace #"\s+" " ") (str/replace #"[\*]" "") (str/lower-case))) symbol-freqs (frequencies tokenized) huff-tree (build-huffman-tree symbol-freqs) canonical-codes (build-canonical-encodings huff-tree)] (get-encoded-length canonical-codes tokenized)) "Huffman on the above with char changes —()?!:; (31 total symbols)" (let [tokenized (seq (-> full-text (str/replace #"—" "-") ; Replace emdash with hyphen (str/replace #"[\*]" "") ; Get rid of chars (str/replace #"\([^)]*\)" "") ; Get rid of everything between parens (str/replace #"[?!]" ".") ; Replace all final punct with periods (str/replace #"[;:]" ",") ; Replace all middle punct with commas (str/replace #"\s+" " ") ; Normalize space (str/lower-case))) symbol-freqs (frequencies tokenized) huff-tree (build-huffman-tree symbol-freqs) canonical-codes (build-canonical-encodings huff-tree)] (get-encoded-length canonical-codes tokenized)) )