;; bible/fullcompressorcommonngrams.clj
;; (cgit web-viewer page header and line-number gutter stripped from the scraped copy)
(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])

;;; The full compressor of bible in basic english
;;; Mk2 - compressing with common n-grams, ditching LZSS (maybe)

;Build the base txt file out of individual txt files
(comment 
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Filenames encode book and chapter as underscore-separated fields:
  ;; field 2 is the book number, field 4 the chapter number — TODO confirm
  ;; against the actual base_files naming scheme. Factored out of the two
  ;; near-identical accessors below (which also carried dead #_#_ debris).
  (defn- parse-filename-field [filename idx]
    (Integer/parseInt (nth (str/split (str filename) #"_") idx)))

  (defn get-book-num [filename]
    (parse-filename-field filename 2))

  (defn get-chap-num [filename]
    (parse-filename-field filename 4))

  ;; Concatenate every chapter file — sorted by (book, chapter) — into one
  ;; newline-separated text, dropping each file's 2-line header.
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

;;; The full text as a file
;; Whole corpus read eagerly into a single string; produced by the
;; (comment ...) build step that concatenates the per-chapter files.
(def full-text (slurp "./bbe-newlines-nochaps.txt"))

(def optimized-string
  ;; Tokenizer-friendly normalization: lower-case everything, protect
  ;; multi-char glyphs ('s, ..., ***) behind sentinel tokens, replace the
  ;; em-dash, pad every remaining punctuation mark with spaces, then collapse
  ;; all whitespace runs to single spaces. Passes run strictly in order.
  (let [passes [[#"'s" " AS "]
                ;; [#"\s+" " SPACE "] ; mk2 idea: explicit space tokens to
                ;;                    ; break hapax into chars and shrink dict
                [#"\.\.\." " DOTDOTDOT "]
                [#"\*\*\*" " STARSTARSTAR "]
                [#"—" "-"]
                [#"[,.;:!?()\[\]'\*-]" #(str " " %1 " ")]
                [#"\s+" " "]]]
    (reduce (fn [text [pattern replacement]]
              (str/replace text pattern replacement))
            (str/lower-case full-text)
            passes)))

(def optimized-tokens
  ;; Token vector: the normalized text split on the single spaces the
  ;; final #"\s+" pass above guarantees.
  (str/split optimized-string #" "))

(comment ;Some basic stats on our work so far
         (count full-text) ; total chars 4207465
         (count optimized-tokens) ; total tokens 962868
         (count (into #{} optimized-tokens)) ; 5997 total unique tokens
         (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nyble to represent?
         ; We could maybe do some domain modeling and do like 
         ; "suffix-s" or "suffix-ly"s like with "'s" right now
         ) 


;;; First we'll dictionary-encode our tokens

;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj
(def word-ids
  ;; Token -> 16-bit id. Ids are handed out in descending frequency order so
  ;; the most common tokens get the smallest ids.
  (let [ranked-tokens (->> optimized-tokens
                           frequencies
                           (sort-by val >)
                           (map key))
        short-ids (map #(unchecked-short (bit-and 0xffff %)) (range))]
    (zipmap ranked-tokens short-ids)))

(def dict-id-compressed-text
  ;; Lazy seq of 16-bit ids, one per corpus token, in corpus order.
  ;; NOTE: `map` is lazy — the encoding is realized on first traversal.
  (map word-ids optimized-tokens))

(comment
  (count dict-id-compressed-text) ;Whittled it down to 1925736 total bytes with 16 bit indices (it's 962868 shorts)
  (- (dec (math/pow 2 13)) (count (into #{} optimized-tokens))) ; !!NOTE!! that we're leaving 2194 whole tokens on the table.
  )

;;; Then we apply an optimal-n-grams finder:
(comment
  ; Scratch work sizing n: the ten most frequent n-grams for each n in 2..10.
  (defn get-most-common [n]
    (->> optimized-tokens
         (partition n 1)
         frequencies
         (sort-by second >)
         (take 10)))
  (def ngrams-data (map get-most-common (range 2 11)))
  ngrams-data)

;;; We could do this with a bunch of tokens but we instead apply recursive
;;; n-grams find

(comment
  ;; Name for the merged token of an adjacent pair, e.g. "of" "the" -> "OF_THE".
  (defn make-ngram [t1 t2]
    (str (str/upper-case t1) "_" (str/upper-case t2)))

  ;; Replace every non-overlapping adjacent [t1 t2] pair in `tokens` with
  ;; `new-token`, scanning left to right, keeping `infreqs` (priority map of
  ;; token -> NEGATED count) in sync: each merge removes one occurrence of t1
  ;; and of t2 (negated count moves toward 0, hence `inc`) and adds one
  ;; occurrence of new-token (hence `dec`). Returns [new-token-vec new-freqs].
  (defn replace-with-ngram [tokens [t1 t2] new-token infreqs]
    (loop [in tokens
           out (transient [])
           freqs infreqs]
      (cond
        (empty? in) 
        [(persistent! out) freqs]

        (and (= (first in) t1) (= (second in) t2))
        ;; BUGFIX: the old (-> freqs (assoc t1 (inc (freqs t1))) (assoc t2 (inc (freqs t2))))
        ;; read both counts from the PRE-update map, so when t1 = t2 the second
        ;; assoc clobbered the first and the pair was only counted once.
        ;; `update` threads the intermediate map, counting both occurrences.
        (recur (drop 2 in)
               (conj! out new-token)
               (-> freqs
                   (update t1 inc)
                   (update t2 inc)
                   (update new-token (fnil dec 0))))

        :else
        (recur (rest in) (conj! out (first in)) freqs))))

  ;; Repeatedly merge the currently most frequent adjacent pair until the
  ;; vocabulary reaches 2^13 entries, then return the merged token stream.
  ;; NOTE(review): tokens whose count reaches 0 are never dissoc'ed from
  ;; `freqs`, so the stop condition also counts extinct tokens — confirm that
  ;; is intended before the final dictionary is built.
  (defn find-recursive-ngrams 
    [intokens]
    (loop [tokens intokens
           freqs (->> intokens 
                      (frequencies)
                      (map (fn [[k v]] [k (- v)]))
                      (into (pm/priority-map)))]
      (if (<= (inc (math/pow 2 13)) (count freqs))
        tokens
        (let [ngrams (partition 2 1 tokens) ;find ngrams
              [ngram _ct] (apply max-key second (frequencies ngrams)) ;find the most common ngram
              [t1 t2] ngram
              [nexttoks nextfreqs] (replace-with-ngram tokens ngram (make-ngram t1 t2) freqs)]

          (println "Working on: " ngram "with freqcount" (count freqs))
          (recur nexttoks nextfreqs)))))

  ;; Persist the merged stream so later runs can skip the (slow) search.
  (spit "optimized-token-stream.txt" (apply str (interpose "\n" optimized-ngrams))))
;; One-shot expensive search (reader-discarded after the first run); the
;; cached stream on disk is the source of truth from then on.
#_(def optimized-ngrams (find-recursive-ngrams optimized-tokens))
(def optimized-ngrams (str/split (slurp "optimized-token-stream.txt") #"\n"))

;; Top-level sanity checks, evaluated at load: the merge shrinks the stream
;; from 962868 to 608165 tokens.
(count optimized-tokens) ; 962868 tokens
(count optimized-ngrams) ; 608165 tokens

;;; TODO: Build dictionary