aboutsummaryrefslogtreecommitdiff
path: root/bible/fullcompressor.clj
diff options
context:
space:
mode:
Diffstat (limited to 'bible/fullcompressor.clj')
-rw-r--r--bible/fullcompressor.clj104
1 files changed, 104 insertions, 0 deletions
diff --git a/bible/fullcompressor.clj b/bible/fullcompressor.clj
new file mode 100644
index 0000000..96c16f8
--- /dev/null
+++ b/bible/fullcompressor.clj
@@ -0,0 +1,104 @@
+(require '[clojure.string :as str])
+(require '[clojure.java.io :as io])
+(require '[babashka.fs :as fs])
+(require '[clojure.data.priority-map :as pm])
+(require '[clojure.math :as math])
+(require '[clojure.core.match :as match])
+
+;;; The full compressor for the Bible in Basic English
+
+;Build the base txt file out of individual txt files
+(comment
+  (def files (fs/glob "./base_files/" "**.txt"))
+
+  ;; Filenames carry underscore-separated fields; the book number sits at
+  ;; position 2 and the chapter number at position 4.
+  ;; NOTE(review): assumes every base_files name splits into >= 5 fields on
+  ;; "_" -- confirm against the actual file names.
+  (defn get-book-num [filename]
+    (Integer/parseInt (nth (str/split (str filename) #"_") 2)))
+
+  (defn get-chap-num [filename]
+    (Integer/parseInt (nth (str/split (str filename) #"_") 4)))
+
+  ;; Concatenate every chapter file -- minus its first two header lines --
+  ;; into one newline-separated corpus file, ordered by (book, chapter).
+  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
+    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
+      (with-open [reader (io/reader (fs/file f))]
+        (doseq [line (drop 2 (line-seq reader))]
+          (.write writer (str line "\n")))))))
+
+;;; The full corpus loaded into memory as one string (produced by the
+;;; build step in the comment block above; ~4.2 MB of plain text).
+(def full-text (slurp "./bbe-newlines-nochaps.txt"))
+
+(def optimized-string
+  ;; Normalise the raw text into a single space-separated token stream:
+  ;; lower-case first, then rewrite multi-char glyphs to UPPERCASE marker
+  ;; words (they survive because lower-casing already happened), space out
+  ;; punctuation so each mark becomes its own token, and collapse runs of
+  ;; whitespace.  Step order matters and is preserved exactly.
+  (let [steps [[#"'s" " AS "]
+               [#"\.\.\." " DOTDOTDOT "]
+               [#"\*\*\*" " STARSTARSTAR "]
+               [#"—" "-"]
+               [#"[,.;:!?()\[\]'\*-]" #(str " " %1 " ")]
+               [#"\s+" " "]]]
+    (reduce (fn [text [pattern replacement]]
+              (str/replace text pattern replacement))
+            (str/lower-case full-text)
+            steps)))
+
+(spit "foo.txt" optimized-string)
+
+;; The whole corpus as a vector of space-delimited tokens.
+(def optimized-tokens
+  (str/split optimized-string #" "))
+
+(comment ;Some basic stats on our work so far
+  (count full-text) ; total chars 4207465
+  (count optimized-tokens) ; total tokens 962868
+  ;; NOTE(review): the token total above (962868) disagrees with the 965223
+  ;; quoted in the encoding-size comment further down -- likely measured on
+  ;; different runs of the normalisation; re-measure before relying on either.
+  (count (into #{} optimized-tokens)) ; 5997 total unique tokens
+  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nybble to represent?
+  ; We could maybe do some domain modeling and do like
+  ; "suffix-s" or "suffix-ly"s like with "'s" right now
+  )
+
+
+;;; To encode the most common words, we use 1DDDDDDD <- 7 bits for the 128 most common words
+;;; To encode the rest, we use 0DDDDDDD DDDDDDDD <- 15 bits for the other dictionary entries
+(comment
+  ; (the "top127" names below actually take 128 entries -- 1DDDDDDD spans
+  ;  128 codes, matching the (take 128) in word-ids)
+  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
+        totalcount (reduce + (map val sorted-toks))
+        top127 (take 128 sorted-toks)
+        top127count (reduce + (map val top127))]
+    {:total totalcount
+     :topwords top127count
+     :percent-savings (* 100 (double (/ top127count totalcount)))
+     :total-remaining-words (- totalcount top127count)})
+  ; {:total 965223,
+  ;  :topwords 715122,
+  ;  :percent-savings 74.08878570029931,
+  ;  :total-remaining-words 250101}
+  )
+
+;;; We'll start off by bit-packing our representations a bit.
+(def word-ids
+  ;; Maps each token to its encoded form:
+  ;;   - the 128 most frequent tokens -> a single byte 1DDDDDDD (high bit SET)
+  ;;   - every other token           -> a byte pair  0DDDDDDD DDDDDDDD
+  ;; One-byte codes always have the high bit set and the first byte of every
+  ;; two-byte code has it clear, so the encoding is prefix-free and decodable.
+  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
+        top128 (take 128 sorted-toks)
+        rest128 (drop 128 sorted-toks)
+        top128-reprs
+        (into {}
+              (map-indexed
+               (fn [id [tok _freq]]
+                 [tok (unchecked-byte (bit-or 0x80 id))])
+               top128))
+        rest128-reprs
+        (into {}
+              (map-indexed
+               (fn [id [tok _freq]]
+                 ;; BUGFIX: the first byte previously had 2r10000000 or'd in,
+                 ;; so e.g. rest-id 0 encoded as 0x80 -- byte-identical to the
+                 ;; one-byte code of the most frequent token, making the
+                 ;; stream ambiguous.  Per the 0DDDDDDD DDDDDDDD scheme the
+                 ;; high bit of the first byte must stay clear.
+                 [tok [(unchecked-byte (bit-shift-right id 8))
+                       (unchecked-byte (bit-and 0x00FF id))]])
+               rest128))
+        token-reprs (merge top128-reprs rest128-reprs)]
+    ;; 15-bit ids cap the long-tail dictionary at 32768 entries; with only
+    ;; ~6k unique tokens this holds, but fail loudly if the corpus grows.
+    (assert (< (count rest128) 32768) "two-byte id space exhausted")
+    (assert (= (count token-reprs) (+ (count top128-reprs) (count rest128-reprs))))
+    token-reprs))
+
+(def dict-id-compressed-text
+  ;; Encoded corpus as one flat lazy seq of bytes: word-ids yields either a
+  ;; single byte or a two-byte vector per token, and flatten splices the
+  ;; pairs in place while passing lone bytes through unchanged.
+  (flatten (map word-ids optimized-tokens)))
+
+(comment
+  (count dict-id-compressed-text) ;Whittled it down to 1212042 total bytes
+  )