diff options
| author | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-29 01:06:52 -0500 |
|---|---|---|
| committer | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-29 01:06:52 -0500 |
| commit | f5de7c4dc4f3fce22aa6891f329bbaade87d0e2a (patch) | |
| tree | cf5d16b626f5c180d9679404d3ccab15d6e784e4 /bible/fullcompressorcommonngrams.clj | |
| parent | fc8569a88200ec88c3773c338fdcf16b16ea51d5 (diff) | |
Diffstat (limited to 'bible/fullcompressorcommonngrams.clj')
| -rw-r--r-- | bible/fullcompressorcommonngrams.clj | 141 |
1 files changed, 141 insertions, 0 deletions
(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])

;;; The full compressor of bible in basic english
;;; Mk2 - compressing with common n-grams, ditching LZSS (maybe)

;Build the base txt file out of individual txt files
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Filenames encode book/chapter as underscore-separated fields;
  ;; field 3 (0-indexed 2) is the book number.
  (defn get-book-num [filename]
    (let [[_ _ book _ _]
          (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt book)))

  ;; Field 5 (0-indexed 4) is the chapter number.
  (defn get-chap-num [filename]
    (let [[_ _ _ _ chap]
          (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt chap)))

  ;; Concatenate all chapter files (book order, then chapter order),
  ;; dropping the first two lines of each (headers) into one text file.
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

;;; The full text as a file
(def full-text (slurp "./bbe-newlines-nochaps.txt"))

;; Normalize the text so that every token of interest is separated by a
;; single space: lowercase everything, turn multi-char glyphs ('s, ..., ***)
;; into sentinel words, and pad punctuation with spaces so it tokenizes.
(def optimized-string
  (-> full-text
      (str/lower-case)
      #_(str/replace #"\s+" " SPACE ") ;; Added in mk2 if it turns out it's better to break hapax into individual chars to kill dictionary bloat
      (str/replace #"'s" " AS ")
      (str/replace #"\.\.\." " DOTDOTDOT ")
      (str/replace #"\*\*\*" " STARSTARSTAR ")
      (str/replace #"—" "-")
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      (str/replace #"\s+" " ")))

(def optimized-tokens
  (str/split optimized-string #" "))

(comment ;Some basic stats on our work so far
  (count full-text) ; total chars 4207465
  (count optimized-tokens) ; total tokens 962868
  (count (into #{} optimized-tokens)) ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nyble to represent?
  ; We could maybe do some domain modeling and do like
  ; "suffix-s" or "suffix-ly"s like with "'s" right now
  )


;;; First we'll dictionary-encode our tokens

;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj
;; Map each unique token to a 16-bit id, ids assigned in descending
;; frequency order (most common token gets id 0).
(def word-ids
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
        token-reprs
        (into {}
              (map-indexed
               (fn [id [tok _freq]]
                 [tok (unchecked-short (bit-and 0xffff id))
                  #_[(unchecked-byte (bit-shift-right id 8)) (unchecked-byte (bit-and 0x00FF id))]])
               sorted-toks))]
    token-reprs))

(def dict-id-compressed-text
  (map word-ids optimized-tokens))

(comment
  (count dict-id-compressed-text) ;Whittled it down to 1925736 total bytes with 16 bit indices (it's 962868 shorts)
  (- (dec (math/pow 2 13)) (count (into #{} optimized-tokens))) ; !!NOTE!! that we're leaving 2194 whole tokens on the table.
  )

;;; Then we apply an optimal-n-grams finder:
(comment
  ; some scratch work on how big our n can be
  (defn get-most-common [n]
    (take 10
          (->> optimized-tokens
               (partition n 1)
               (frequencies)
               (sort-by second >))))
  (def ngrams-data (map get-most-common (range 2 11)))
  ngrams-data)

;;; We could do this with a bunch of tokens but we instead apply recursive
;;; n-grams find

(comment
  ;; Synthetic token name for a merged bigram, e.g. "THE_LORD".
  (defn make-ngram [t1 t2]
    (str (str/upper-case t1) "_" (str/upper-case t2)))

  ;; Greedy left-to-right single pass: every non-overlapping occurrence of
  ;; the pair [t1 t2] is replaced by new-token.  `infreqs` is a priority
  ;; map of token -> NEGATED count (so the priority map behaves as a
  ;; max-heap on counts); we keep it in sync incrementally.
  ;; Returns [new-token-vector updated-freqs].
  ;;
  ;; BUGFIX: the original threaded `(assoc t1 (inc (freqs t1)))` etc.,
  ;; where `(freqs t1)` read the OUTER binding rather than the threaded
  ;; map.  When t1 = t2 (a token paired with itself), the second assoc
  ;; overwrote the first with the same value, decrementing the count once
  ;; instead of twice.  `update` threads the intermediate map correctly
  ;; and is equivalent when t1 /= t2.
  ;;
  ;; NOTE(review): tokens whose count reaches 0 are never dissoc'ed from
  ;; freqs, so `(count freqs)` in find-recursive-ngrams also counts dead
  ;; tokens — confirm whether dictionary slots should be reclaimed.
  (defn replace-with-ngram [tokens [t1 t2] new-token infreqs]
    (loop [in tokens
           out (transient [])
           freqs infreqs]
      (cond
        (empty? in)
        [(persistent! out) freqs]

        (and (= (first in) t1) (= (second in) t2))
        (recur (drop 2 in)
               (conj! out new-token)
               (-> freqs
                   (update t1 inc)                 ; counts are negated: inc = one fewer occurrence
                   (update t2 inc)
                   (update new-token (fnil dec 0)))) ; dec = one more occurrence; 0 default for a brand-new token

        :else
        (recur (rest in) (conj! out (first in)) freqs))))

  ;; Repeatedly merge the single most common adjacent pair into a new
  ;; token until the vocabulary reaches the 2^13-entry dictionary budget.
  ;; Returns the rewritten token stream.
  (defn find-recursive-ngrams
    [intokens]
    (loop [tokens intokens
           freqs (->> intokens
                      (frequencies)
                      (map (fn [[k v]] [k (- v)])) ; negate so priority-map pops the most frequent first
                      (into (pm/priority-map)))]
      (if (<= (inc (math/pow 2 13)) (count freqs))
        tokens
        (let [ngrams (partition 2 1 tokens) ;find ngrams
              [ngram _ct] (apply max-key second (frequencies ngrams)) ;find the most common ngram
              [t1 t2] ngram
              [nexttoks nextfreqs] (replace-with-ngram tokens ngram (make-ngram t1 t2) freqs)]

          (println "Working on: " ngram "with freqcount" (count freqs))
          (recur nexttoks nextfreqs)))))

  (spit "optimized-token-stream.txt" (apply str (interpose "\n" optimized-ngrams))))

#_(def optimized-ngrams (find-recursive-ngrams optimized-tokens))
;; Cached result of the (expensive) recursive n-gram pass above.
(def optimized-ngrams (str/split (slurp "optimized-token-stream.txt") #"\n"))

(count optimized-tokens) ; 962868 tokens
(count optimized-ngrams) ; 608165 tokens

;;; TODO: Build dictionary
