diff options
Diffstat (limited to 'bible/dictionary-packed.clj')
| -rw-r--r-- | bible/dictionary-packed.clj | 104 |
1 files changed, 104 insertions, 0 deletions
;;; bible/dictionary-packed.clj
(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])

;;; The full compressor of the Bible in Basic English

;; Build the base txt file out of the individual txt files.
;; Kept inside `comment` so it only runs when evaluated by hand at the REPL.
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  (defn get-book-num
    "Parses the third `_`-separated field of `filename` as the book number."
    [filename]
    (let [[_ _ book _ _] (str/split (str filename) #"_")]
      (Integer/parseInt book)))

  (defn get-chap-num
    "Parses the fifth `_`-separated field of `filename` as the chapter number."
    [filename]
    (let [[_ _ _ _ chap] (str/split (str filename) #"_")]
      (Integer/parseInt chap)))

  ;; Concatenate every file in (book, chapter) order, dropping the first two
  ;; lines of each file (presumably a per-chapter header -- TODO confirm).
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

;;; The full text as a file
(def full-text
  "Entire corpus as one string, read from ./bbe-newlines-nochaps.txt at load time."
  (slurp "./bbe-newlines-nochaps.txt"))

(def optimized-string
  "`full-text` normalized into a single-space-separated token stream.

  Order matters: the text is lower-cased first so the upper-case marker tokens
  (AS, DOTDOTDOT, STARSTARSTAR) cannot collide with real words, and the
  multi-character sequences ('s, ..., ***) are rewritten before the generic
  punctuation rule would split them into single characters."
  (-> full-text
      (str/lower-case)
      (str/replace #"'s" " AS ")
      (str/replace #"\.\.\." " DOTDOTDOT ")
      (str/replace #"\*\*\*" " STARSTARSTAR ")
      (str/replace #"—" "-")
      ;; Surround each punctuation character with spaces so it becomes its own token.
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      ;; Collapse every whitespace run (including newlines) to one space.
      (str/replace #"\s+" " ")))

;; Debug dump of the normalized text.
(spit "foo.txt" optimized-string)

(def optimized-tokens
  "Vector of tokens obtained by splitting `optimized-string` on single spaces."
  (str/split optimized-string #" "))

(comment ; Some basic stats on our work so far (figures recorded by the author)
  (count full-text) ; total chars 4207465
  (count optimized-tokens) ; total tokens 962868
  (count (into #{} optimized-tokens)) ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nibble to represent?
  ; We could maybe do some domain modeling and do like
  ; "suffix-s" or "suffix-ly"s like with "'s" right now
  )


;;; To encode the most common words, we use 1DDDDDDD          <- 7 bits for the 128 most common words
;;; To encode the rest,              we use 0DDDDDDD DDDDDDDD <- 15 bits for the other dictionary entries
;;; A decoder reads the high bit of a byte to know whether one more byte follows.
(comment
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
        totalcount (reduce + (map val sorted-toks))
        top128 (take 128 sorted-toks)
        top128count (reduce + (map val top128))]
    {:total totalcount
     :topwords top128count
     :percent-savings (* 100 (double (/ top128count totalcount)))
     :total-remaining-words (- totalcount top128count)})
  ; {:total 965223,
  ;  :topwords 715122,
  ;  :percent-savings 74.08878570029931,
  ;  :total-remaining-words 250101}
  )

;;; We'll start off by bit-packing our representations a bit.
(def word-ids
  "Map from token to its packed representation.

  The 128 most frequent tokens map to a single byte 1DDDDDDD (high bit set,
  7-bit id). Every other token maps to a two-element vector of bytes
  [0DDDDDDD DDDDDDDD] (high bit clear, 15-bit id, most significant byte
  first). Ids are assigned in descending frequency order within each group.
  Bytes are JVM signed bytes, so values >= 0x80 appear negative.

  Fails an assertion if there are more rare tokens than fit in 15 bits."
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
        top128 (take 128 sorted-toks)
        rest128 (drop 128 sorted-toks)
        top128-reprs
        (into {}
              (map-indexed
               (fn [id [tok _freq]]
                 [tok (unchecked-byte (bit-or 0x80 id))])
               top128))
        rest128-reprs
        (into {}
              (map-indexed
               (fn [id [tok _freq]]
                 ;; The leading byte must keep its high bit CLEAR: a set high
                 ;; bit is what marks the one-byte form, so setting it here
                 ;; would make the stream impossible to decode.
                 [tok [(unchecked-byte (bit-and 0x7F (bit-shift-right id 8)))
                       (unchecked-byte (bit-and 0x00FF id))]])
               rest128))
        token-reprs (merge top128-reprs rest128-reprs)]
    ;; Only 15 bits are available for rare-token ids; beyond that ids would wrap.
    (assert (<= (count rest128-reprs) 0x8000)
            "too many distinct tokens for the 15-bit two-byte encoding")
    ;; The two groups must be disjoint, otherwise merge silently dropped an entry.
    (assert (= (count token-reprs) (+ (count top128-reprs) (count rest128-reprs))))
    token-reprs))

(def dict-id-compressed-text
  "Flat sequence of bytes: the packed representation of every token in
  `optimized-tokens`, in order. `flatten` splices the two-byte vectors in
  between the single bytes."
  (flatten (map word-ids optimized-tokens)))

(comment
  (count dict-id-compressed-text) ;Whittled it down to 1212042 total bytes
  )
