;; NOTE(review): removed cgit page-scrape residue (site header, blob hash,
;; and the 1..104 line-number gutter) that preceded the actual Clojure source.
(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])

;;; A dictionary-based compressor for the Bible in Basic English (BBE)

;Build the base txt file out of individual txt files
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Filenames carry their ordering as underscore-separated fields:
  ;; book number at index 2, chapter number at index 4.
  ;; NOTE(review): assumes every filename splits into at least 5 fields and
  ;; that fields 2 and 4 are pure digits — TODO confirm against base_files.
  (defn- parse-field-num [filename idx]
    (-> (str filename)
        (str/split #"_")
        (nth idx)
        (Integer/parseInt)))

  (defn get-book-num [filename]
    (parse-field-num filename 2))

  (defn get-chap-num [filename]
    (parse-field-num filename 4))

  ;; Concatenate every chapter file, in (book, chapter) order, into a single
  ;; newline-separated text, dropping each file's 2-line header.
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

;;; The full text as a file
;; NOTE(review): expects bbe-newlines-nochaps.txt to have been produced by the
;; rich-comment builder above; reads the entire corpus into one string.
(def full-text (slurp "./bbe-newlines-nochaps.txt"))

;; Normalize the raw text into a single-space-delimited token stream:
;; lower-case everything, protect multi-character markers ('s, ..., ***)
;; behind placeholder words, fold em dashes into hyphens, pad punctuation
;; with spaces so it tokenizes on its own, then collapse whitespace runs.
;; Order of the rewrite steps matters and is preserved exactly.
(def optimized-string
  (let [rewrite-steps [[#"'s" " AS "]
                       [#"\.\.\." " DOTDOTDOT "]
                       [#"\*\*\*" " STARSTARSTAR "]
                       [#"—" "-"]
                       [#"[,.;:!?()\[\]'\*-]" #(str " " %1 " ")]
                       [#"\s+" " "]]]
    (reduce (fn [text [pattern replacement]]
              (str/replace text pattern replacement))
            (str/lower-case full-text)
            rewrite-steps)))

;; Debug artifact: dump the normalized text for manual inspection.
(spit "foo.txt" optimized-string)

;; The normalized corpus as a vector of tokens (split on the single spaces
;; guaranteed by optimized-string's final whitespace-collapse step).
(def optimized-tokens
  (str/split optimized-string #" "))

(comment ;Some basic stats on our work so far
         (count full-text) ; total chars 4207465
         (count optimized-tokens) ; total tokens 962868
         (count (into #{} optimized-tokens)) ; 5997 total unique tokens
         (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nybble to represent?
                                                             ; We could maybe do some domain modeling and do like 
                                                             ; "suffix-s" or "suffix-ly"s like with "'s" right now
         ) 


;;; To encode the 128 most common words, we use 1DDDDDDD <- high bit set, 7 bits of id
;;; To encode the rest, we use 0DDDDDDD DDDDDDDD <- high bit clear, 15 bits for the other dictionary entries
;; Estimate how much of the token stream the one-byte codes cover.
(comment
  (let [freq-ordered (sort-by val > (frequencies optimized-tokens))
        top128       (take 128 freq-ordered)
        grand-total  (transduce (map val) + freq-ordered)
        top-total    (transduce (map val) + top128)]
    {:total grand-total
     :topwords top-total
     :percent-savings (* 100 (double (/ top-total grand-total)))
     :total-remaining-words (- grand-total top-total)})
  ; {:total 965223,
     ;  :topwords 715122,
     ;  :percent-savings 74.08878570029931,
     ;  :total-remaining-words 250101}
  )

;;; We'll start off by bit-packing our representations a bit.
;; Maps each token to its byte-level code:
;;   - the 128 most frequent tokens -> one byte,  1DDDDDDD (high bit SET)
;;   - every other token            -> two bytes, 0DDDDDDD DDDDDDDD (high bit CLEAR)
;; BUG FIX: the two-byte form previously OR'ed 2r10000000 into its first byte,
;; setting the marker bit and making every two-byte code indistinguishable
;; from a one-byte code — the stream could never be decoded unambiguously.
(def word-ids
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
        top128 (take 128 sorted-toks)
        rest128 (drop 128 sorted-toks)
        top128-reprs 
        (into {} 
              (map-indexed 
                (fn [id [tok _freq]]
                  ;; 1DDDDDDD: high bit marks the single-byte form.
                  [tok (unchecked-byte (bit-or 0x80 id))])
                top128))
        rest128-reprs 
        (into {} 
              (map-indexed
                (fn [id [tok _freq]]
                  ;; 0DDDDDDD DDDDDDDD: the first byte's high bit must stay
                  ;; clear so a decoder can tell the two forms apart.
                  [tok [(unchecked-byte (bit-and 0x7F (bit-shift-right id 8)))
                        (unchecked-byte (bit-and 0x00FF id))]])
                rest128))
        token-reprs (merge top128-reprs rest128-reprs)]
    ;; Two-byte ids only carry 15 bits of payload; ensure the dictionary fits.
    (assert (< (count rest128) 32768))
    (assert (= (count token-reprs) (+ (count top128-reprs) (count rest128-reprs))))
    token-reprs))

;; The corpus as a flat lazy sequence of code bytes: word-ids yields either a
;; single byte (common word) or a two-byte vector (rare word), so each token's
;; representation is normalized to a seq before concatenation.
(def dict-id-compressed-text
  (mapcat (fn [tok]
            (let [repr (word-ids tok)]
              (if (sequential? repr) repr [repr])))
          optimized-tokens))

(comment
  ;; ~4.2 MB of raw text (4207465 chars, per the stats above) down to ~1.2 MB.
  (count dict-id-compressed-text) ;Whittled it down to 1212042 total bytes
  )