;; blob 96c16f89543c3e83e2b9ec6afeed145b0cc71ba0 (web-viewer scrape residue removed)
(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])
;;; The full compressor of bible in basic english
;Build the base txt file out of individual txt files
(comment
  ;; One-off build step: stitch the individual per-chapter txt files into a
  ;; single newline-separated corpus file, in book/chapter order.
  (def files (fs/glob "./base_files/" "**.txt"))
  ;; Filenames appear to be underscore-separated with the book number in
  ;; field 2 and the chapter number in field 4 — TODO confirm against the
  ;; actual files under ./base_files/.
  (defn- path-field
    "Parse the idx-th underscore-separated field of filename as an int."
    [filename idx]
    (Integer/parseInt (nth (str/split (str filename) #"_") idx)))
  (defn get-book-num [filename] (path-field filename 2))
  (defn get-chap-num [filename] (path-field filename 4))
  ;; Each source file carries a 2-line header that is skipped.
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))
;;; The full text as a file
;; Entire corpus read into one string (the file produced by the build step above).
(def full-text (slurp "./bbe-newlines-nochaps.txt"))
(def optimized-string
  ;; Normalize the corpus into a single space-separated token stream:
  ;; lower-case, rewrite special multi-char sequences into placeholder
  ;; tokens, surround each punctuation mark with spaces so it becomes its
  ;; own token, then collapse all whitespace runs to single spaces.
  (let [placeholder-rewrites [[#"'s" " AS "]
                              [#"\.\.\." " DOTDOTDOT "]
                              [#"\*\*\*" " STARSTARSTAR "]
                              [#"—" "-"]]
        rewritten (reduce (fn [s [pattern replacement]]
                            (str/replace s pattern replacement))
                          (str/lower-case full-text)
                          placeholder-rewrites)
        punct-isolated (str/replace rewritten
                                    #"[,.;:!?()\[\]'\*-]"
                                    #(str " " %1 " "))]
    (str/replace punct-isolated #"\s+" " ")))
;; Debug dump of the normalized text for inspection; not read back anywhere in this file.
(spit "foo.txt" optimized-string)
;; The normalized text as a vector of word/punctuation/placeholder tokens.
(def optimized-tokens
  (str/split optimized-string #" "))
(comment ;Some basic stats on our work so far — evaluate forms individually in the REPL
  (count full-text) ; total chars 4207465
  (count optimized-tokens) ; total tokens 962868
  (count (into #{} optimized-tokens)) ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nybble to represent?
  ; We could maybe do some domain modeling and do like
  ; "suffix-s" or "suffix-ly"s like with "'s" right now
  )
;;; To encode the most common words, we use 1DDDDDDD <- 7 bits for the 128 most common words
;;; To encode the rest, we use 0DDDDDDD DDDDDDDD <- 15 bits for the other dictionary entries
(comment
  ;; How much of the token stream do the 128 most frequent words cover?
  (let [by-freq (sort-by val > (frequencies optimized-tokens))
        total (transduce (map val) + by-freq)
        top128 (take 128 by-freq)
        covered (transduce (map val) + top128)]
    {:total total
     :topwords covered
     :percent-savings (* 100 (double (/ covered total)))
     :total-remaining-words (- total covered)})
  ; {:total 965223,
  ; :topwords 715122,
  ; :percent-savings 74.08878570029931,
  ; :total-remaining-words 250101}
  )
;;; We'll start off by bit-packing our representations a bit.
(def word-ids
  ;; Dictionary mapping each unique token to its byte representation:
  ;;  - the 128 most frequent tokens -> one byte 1DDDDDDD (high bit set,
  ;;    7-bit id);
  ;;  - every other token -> two bytes 0DDDDDDD DDDDDDDD (high bit CLEAR on
  ;;    the first byte, 15-bit id).
  ;; A decoder distinguishes the two forms by the top bit of the first byte.
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
        top128 (take 128 sorted-toks)
        rest128 (drop 128 sorted-toks)
        ;; 15-bit ids only reach 32767; fail loudly if the vocabulary outgrows them.
        _ (assert (< (count rest128) 32768) "two-byte ids must fit in 15 bits")
        top128-reprs
        (into {}
              (map-indexed
               (fn [id [tok _freq]]
                 [tok (unchecked-byte (bit-or 0x80 id))])
               top128))
        rest128-reprs
        (into {}
              (map-indexed
               (fn [id [tok _freq]]
                 ;; BUGFIX: previously this OR-ed 2r10000000 into the first
                 ;; byte, setting the high bit and making two-byte entries
                 ;; collide with one-byte entries. Per the scheme above the
                 ;; first byte's high bit must stay 0.
                 [tok [(unchecked-byte (bit-and 0x7F (bit-shift-right id 8)))
                       (unchecked-byte (bit-and 0x00FF id))]])
               rest128))
        token-reprs (merge top128-reprs rest128-reprs)]
    ;; No token may appear in both maps.
    (assert (= (count token-reprs) (+ (count top128-reprs) (count rest128-reprs))))
    token-reprs))
(def dict-id-compressed-text
  ;; Replace every token with its dictionary representation and splice the
  ;; per-token bytes (single byte or two-byte vector) into one flat byte seq.
  (mapcat (fn [tok]
            (let [repr (word-ids tok)]
              (if (sequential? repr) repr [repr])))
          optimized-tokens))
(comment
  (count dict-id-compressed-text) ;Whittled it down to 1212042 total bytes
  )