diff options
| -rw-r--r-- | bible/encode.clj | 62 |
1 files changed, 53 insertions, 9 deletions
diff --git a/bible/encode.clj b/bible/encode.clj index 031b6aa..903cb2b 100644 --- a/bible/encode.clj +++ b/bible/encode.clj @@ -5,21 +5,21 @@ (require '[clojure.data.priority-map :as pm]) -(def files (fs/glob "./base_files/" "**.txt")) +#_(def files (fs/glob "./base_files/" "**.txt")) -(defn get-book-num [filename] +#_(defn get-book-num [filename] (let [[_ _ book _ _] (str/split (str filename) #"_") #_#_chap (int _chap)] (Integer/parseInt book))) -(defn get-chap-num [filename] +#_(defn get-chap-num [filename] (let [[_ _ _ _ chap] (str/split (str filename) #"_") #_#_chap (int _chap)] (Integer/parseInt chap))) -(with-open [writer (io/writer "bbe-newlines-nochaps.txt")] +#_(with-open [writer (io/writer "bbe-newlines-nochaps.txt")] (doseq [f (sort-by (juxt get-book-num get-chap-num) files)] (with-open [reader (io/reader (fs/file f))] (doseq [line (drop 2 (line-seq reader))] @@ -41,12 +41,12 @@ #_(spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs)))) -(sort-by val > symbol-freqs) ; Greatest to lease frequency -(reduce + (map val symbol-freqs)) ; Total tokens -(count symbol-freqs) ; Total unique tokens -(reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens +#_(sort-by val > symbol-freqs) ; Greatest to lease frequency +#_(reduce + (map val symbol-freqs)) ; Total tokens +#_(count symbol-freqs) ; Total unique tokens +#_(reduce + (take 512 (map val symbol-freqs))) ; Number of the top 100 common tokens -(reduce + (map count symbol-freqs)) ; Total chars needed for dict vals +#_(reduce + (map count symbol-freqs)) ; Total chars needed for dict vals #_(def two-grams (frequencies (partition 2 1 tokens))) #_(sort-by val > two-grams) @@ -104,3 +104,47 @@ (count pq) (count symbol-freqs))) +;;; Build the canonical encodings + +;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm: + +;; The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0'). +;; Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value. +;; When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift. + +(defrecord HuffmanCodeword [sym code length]) + +(def sorted-huffman-tree-codewords + (->> huffman-tree-syms + (sort-by (juxt (comp count val) key)) + (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1))))))) + +(defn build-canonical-encodings + "Build canonical huffman encodings from a sorted list of huffman tree codewords" + ([symbols] + (let [first-sym (first symbols) + seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))] + (build-canonical-encodings [seed-symbol] (rest symbols)))) + ([codes symbols] + (if (not-empty symbols) + (let [prev-codeword (last codes) + current-codeword (first symbols) + next-sym (.sym current-codeword) + next-base-code (unchecked-inc (.code prev-codeword)) + prev-len (.length prev-codeword) + next-codeword (if (= (.length current-codeword) + (.length prev-codeword)) + (->HuffmanCodeword next-sym + next-base-code + prev-len) + (->HuffmanCodeword next-sym + (bit-shift-left next-base-code (inc prev-len)) + (inc prev-len)))] + (recur (conj codes next-codeword) (rest symbols))) + codes))) + + +(def canonical-encodings + (build-canonical-encodings sorted-huffman-tree-codewords)) + +(map #(Long/toBinaryString (.code %1)) (take 100 canonical-encodings)) |
