diff options
| author | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-29 01:06:52 -0500 |
|---|---|---|
| committer | Patrick Kingston <patrick@pkingston.xyz> | 2026-01-29 01:06:52 -0500 |
| commit | f5de7c4dc4f3fce22aa6891f329bbaade87d0e2a (patch) | |
| tree | cf5d16b626f5c180d9679404d3ccab15d6e784e4 /bible/fullcompressorcommonngrams.clj | |
| parent | fc8569a88200ec88c3773c338fdcf16b16ea51d5 (diff) | |
Diffstat (limited to 'bible/fullcompressorcommonngrams.clj')
| -rw-r--r-- | bible/fullcompressorcommonngrams.clj | 141 |
1 files changed, 141 insertions, 0 deletions
(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])

;;; The full compressor of bible in basic english
;;; Mk2 - compressing with common n-grams, ditching LZSS (maybe)

;Build the base txt file out of individual txt files
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Filenames encode book/chapter as underscore-separated fields;
  ;; field 3 (0-indexed 2) is the book number.
  (defn get-book-num [filename]
    (let [[_ _ book _ _]
          (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt book)))

  ;; Field 5 (0-indexed 4) is the chapter number.
  (defn get-chap-num [filename]
    (let [[_ _ _ _ chap]
          (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt chap)))

  ;; Concatenate all chapter files (book order, then chapter order),
  ;; dropping the first two lines of each (headers) into one text file.
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

;;; The full text as a file
(def full-text (slurp "./bbe-newlines-nochaps.txt"))

;; Normalize the text so that every token of interest is separated by a
;; single space: lowercase everything, turn multi-char glyphs ('s, ..., ***)
;; into sentinel words, and pad punctuation with spaces so it tokenizes.
(def optimized-string
  (-> full-text
      (str/lower-case)
      #_(str/replace #"\s+" " SPACE ") ;; Added in mk2 if it turns out it's better to break hapax into individual chars to kill dictionary bloat
      (str/replace #"'s" " AS ")
      (str/replace #"\.\.\." " DOTDOTDOT ")
      (str/replace #"\*\*\*" " STARSTARSTAR ")
      (str/replace #"—" "-")
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      (str/replace #"\s+" " ")))

(def optimized-tokens
  (str/split optimized-string #" "))

(comment ;Some basic stats on our work so far
  (count full-text) ; total chars 4207465
  (count optimized-tokens) ; total tokens 962868
  (count (into #{} optimized-tokens)) ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nyble to represent?
  ; We could maybe do some domain modeling and do like
  ; "suffix-s" or "suffix-ly"s like with "'s" right now
  )


;;; First we'll dictionary-encode our tokens

;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj
;; Map each unique token to a 16-bit id, ids assigned in descending
;; frequency order (most common token gets id 0).
(def word-ids
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
        token-reprs
        (into {}
              (map-indexed
               (fn [id [tok _freq]]
                 [tok (unchecked-short (bit-and 0xffff id))
                  #_[(unchecked-byte (bit-shift-right id 8)) (unchecked-byte (bit-and 0x00FF id))]])
               sorted-toks))]
    token-reprs))

(def dict-id-compressed-text
  (map word-ids optimized-tokens))

(comment
  (count dict-id-compressed-text) ;Whittled it down to 1925736 total bytes with 16 bit indices (it's 962868 shorts)
  (- (dec (math/pow 2 13)) (count (into #{} optimized-tokens))) ; !!NOTE!! that we're leaving 2194 whole tokens on the table.
  )

;;; Then we apply an optimal-n-grams finder:
(comment
  ; some scratch work on how big our n can be
  (defn get-most-common [n]
    (take 10
          (->> optimized-tokens
               (partition n 1)
               (frequencies)
               (sort-by second >))))
  (def ngrams-data (map get-most-common (range 2 11)))
  ngrams-data)

;;; We could do this with a bunch of tokens but we instead apply recursive
;;; n-grams find

(comment
  ;; Synthetic token name for a merged bigram, e.g. "THE_LORD".
  (defn make-ngram [t1 t2]
    (str (str/upper-case t1) "_" (str/upper-case t2)))

  ;; Greedy left-to-right single pass: every non-overlapping occurrence of
  ;; the pair [t1 t2] is replaced by new-token.  `infreqs` is a priority
  ;; map of token -> NEGATED count (so the priority map behaves as a
  ;; max-heap on counts); we keep it in sync incrementally.
  ;; Returns [new-token-vector updated-freqs].
  ;;
  ;; BUGFIX: the original threaded `(assoc t1 (inc (freqs t1)))` etc.,
  ;; where `(freqs t1)` read the OUTER binding rather than the threaded
  ;; map.  When t1 = t2 (a token paired with itself), the second assoc
  ;; overwrote the first with the same value, decrementing the count once
  ;; instead of twice.  `update` threads the intermediate map correctly
  ;; and is equivalent when t1 /= t2.
  ;;
  ;; NOTE(review): tokens whose count reaches 0 are never dissoc'ed from
  ;; freqs, so `(count freqs)` in find-recursive-ngrams also counts dead
  ;; tokens — confirm whether dictionary slots should be reclaimed.
  (defn replace-with-ngram [tokens [t1 t2] new-token infreqs]
    (loop [in tokens
           out (transient [])
           freqs infreqs]
      (cond
        (empty? in)
        [(persistent! out) freqs]

        (and (= (first in) t1) (= (second in) t2))
        (recur (drop 2 in)
               (conj! out new-token)
               (-> freqs
                   (update t1 inc)                 ; counts are negated: inc = one fewer occurrence
                   (update t2 inc)
                   (update new-token (fnil dec 0)))) ; dec = one more occurrence; 0 default for a brand-new token

        :else
        (recur (rest in) (conj! out (first in)) freqs))))

  ;; Repeatedly merge the single most common adjacent pair into a new
  ;; token until the vocabulary reaches the 2^13-entry dictionary budget.
  ;; Returns the rewritten token stream.
  (defn find-recursive-ngrams
    [intokens]
    (loop [tokens intokens
           freqs (->> intokens
                      (frequencies)
                      (map (fn [[k v]] [k (- v)])) ; negate so priority-map pops the most frequent first
                      (into (pm/priority-map)))]
      (if (<= (inc (math/pow 2 13)) (count freqs))
        tokens
        (let [ngrams (partition 2 1 tokens) ;find ngrams
              [ngram _ct] (apply max-key second (frequencies ngrams)) ;find the most common ngram
              [t1 t2] ngram
              [nexttoks nextfreqs] (replace-with-ngram tokens ngram (make-ngram t1 t2) freqs)]

          (println "Working on: " ngram "with freqcount" (count freqs))
          (recur nexttoks nextfreqs)))))

  (spit "optimized-token-stream.txt" (apply str (interpose "\n" optimized-ngrams))))

#_(def optimized-ngrams (find-recursive-ngrams optimized-tokens))
;; Cached result of the (expensive) recursive n-gram pass above.
(def optimized-ngrams (str/split (slurp "optimized-token-stream.txt") #"\n"))

(count optimized-tokens) ; 962868 tokens
(count optimized-ngrams) ; 608165 tokens

;;; TODO: Build dictionary
