blob: 7f3130b2d0ab5ec1e46fac5c8a1957dd0cf592d9 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])
;;; The full compressor of bible in basic english
;Build the base txt file out of individual txt files
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Filenames carry underscore-separated fields; fields 2 and 4 (0-based,
  ;; after splitting the full path on "_") are the book and chapter numbers.
  ;; NOTE(review): field positions inferred from the original destructuring
  ;; patterns [_ _ book _ _] and [_ _ _ _ chap] -- confirm against actual
  ;; base_files naming if the glob root ever changes (path segments that
  ;; contain "_" shift the indices).
  (defn- filename-field-num
    "Parse the idx-th underscore-separated field of filename as an int."
    [filename idx]
    (Integer/parseInt (nth (str/split (str filename) #"_") idx)))

  (defn get-book-num [filename]
    (filename-field-num filename 2))

  (defn get-chap-num [filename]
    (filename-field-num filename 4))

  ;; Concatenate every chapter file in canonical (book, chapter) order into
  ;; one newline-separated corpus, dropping each file's 2-line header.
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))
;;; The full text as a file
;; Entire corpus loaded into memory as one string (file produced by the
;; comment block above; ~4.2 MB per the stats below).
(def full-text (slurp "./bbe-newlines-nochaps.txt"))
(def optimized-string
  ;; Normalize the corpus into a single-space-delimited token stream:
  ;; lower-case everything, swap a few multi-character sequences for
  ;; sentinel words, then pad remaining punctuation with spaces and
  ;; collapse all whitespace runs. Order matters -- the sentinel
  ;; substitutions must run before the punctuation padding.
  (reduce (fn [text [pattern replacement]]
            (str/replace text pattern replacement))
          (str/lower-case full-text)
          [[#"'s" " AS "]
           [#"\.\.\." " DOTDOTDOT "]
           [#"\*\*\*" " STARSTARSTAR "]
           [#"—" "-"]
           [#"[,.;:!?()\[\]'\*-]" (fn [match] (str " " match " "))]
           [#"\s+" " "]]))
;; Debug dump of the normalized text for manual inspection.
(spit "foo.txt" optimized-string)
;; The normalized text split on single spaces into individual tokens
;; (words, punctuation marks, and sentinel words like DOTDOTDOT).
(def optimized-tokens
  (str/split optimized-string #" "))
;; NOTE(review): figures below were evaluated manually at the REPL against
;; the current corpus; they will drift if base_files change.
(comment ;Some basic stats on our work so far
  (count full-text) ; total chars 4207465
  (count optimized-tokens) ; total tokens 962868
  (count (into #{} optimized-tokens)) ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nyble to represent?
  ; We could maybe do some domain modeling and do like
  ; "suffix-s" or "suffix-ly"s like with "'s" right now
  )
;;; First we'll dictionary-encode our tokens
;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj
(def word-ids
  ;; Dictionary encoding: map each distinct token to a big-endian 2-byte id,
  ;; ids assigned in descending frequency order (most frequent token -> id 0)
  ;; so the hottest tokens get the smallest ids. 5997 unique tokens fit
  ;; comfortably in 16 bits. (The original bound the map in a `let` and
  ;; returned the binding unchanged; collapsed to the expression itself.)
  (into {}
        (map-indexed
         (fn [id [tok _freq]]
           [tok [(unchecked-byte (bit-shift-right id 8)) ; high byte
                 (unchecked-byte (bit-and 0x00FF id))]]) ; low byte
         (sort-by val > (frequencies optimized-tokens)))))
(def dict-id-compressed-text
  ;; The whole corpus as a flat sequence of bytes: each token replaced by its
  ;; 2-byte dictionary id. `mapcat` is the idiomatic one-level concatenation
  ;; here; the original's `flatten` walks the result deeply and is both
  ;; slower and riskier (it would silently splice any nested seq).
  (mapcat word-ids optimized-tokens))
;; NOTE(review): 962868 tokens * 2 bytes = 1925736 -- consistent with the
;; token count recorded in the stats comment above.
(comment
  (count dict-id-compressed-text) ;Whittled it down to 1925736 total bytes with 16 bit indices
  )
;;; TODO: Build dictionary
;;; Next, we'll run LZSS on our stream of tokens
|