blob: 7f3130b2d0ab5ec1e46fac5c8a1957dd0cf592d9 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])
;;; The full compressor of bible in basic english
;Build the base txt file out of individual txt files
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Filenames carry underscore-separated fields; fields 2 and 4 (0-based,
  ;; after splitting the full path on "_") are the book and chapter numbers.
  ;; NOTE(review): field positions inferred from the original destructuring
  ;; patterns [_ _ book _ _] and [_ _ _ _ chap] -- confirm against actual
  ;; base_files naming if the glob root ever changes (path segments that
  ;; contain "_" shift the indices).
  (defn- filename-field-num
    "Parse the idx-th underscore-separated field of filename as an int."
    [filename idx]
    (Integer/parseInt (nth (str/split (str filename) #"_") idx)))

  (defn get-book-num [filename]
    (filename-field-num filename 2))

  (defn get-chap-num [filename]
    (filename-field-num filename 4))

  ;; Concatenate every chapter file in canonical (book, chapter) order into
  ;; one newline-separated corpus, dropping each file's 2-line header.
  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))
;;; The full text as a file
;; Entire corpus loaded into memory as one string (file produced by the
;; comment block above; ~4.2 MB per the stats below).
(def full-text (slurp "./bbe-newlines-nochaps.txt"))
(def optimized-string
  ;; Normalize the corpus into a single-space-delimited token stream:
  ;; lower-case everything, swap a few multi-character sequences for
  ;; sentinel words, then pad remaining punctuation with spaces and
  ;; collapse all whitespace runs. Order matters -- the sentinel
  ;; substitutions must run before the punctuation padding.
  (reduce (fn [text [pattern replacement]]
            (str/replace text pattern replacement))
          (str/lower-case full-text)
          [[#"'s" " AS "]
           [#"\.\.\." " DOTDOTDOT "]
           [#"\*\*\*" " STARSTARSTAR "]
           [#"—" "-"]
           [#"[,.;:!?()\[\]'\*-]" (fn [match] (str " " match " "))]
           [#"\s+" " "]]))
;; Debug dump of the normalized text for manual inspection.
(spit "foo.txt" optimized-string)
;; The normalized text split on single spaces into individual tokens
;; (words, punctuation marks, and sentinel words like DOTDOTDOT).
(def optimized-tokens
  (str/split optimized-string #" "))
;; NOTE(review): figures below were evaluated manually at the REPL against
;; the current corpus; they will drift if base_files change.
(comment ;Some basic stats on our work so far
  (count full-text) ; total chars 4207465
  (count optimized-tokens) ; total tokens 962868
  (count (into #{} optimized-tokens)) ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nyble to represent?
  ; We could maybe do some domain modeling and do like
  ; "suffix-s" or "suffix-ly"s like with "'s" right now
  )
;;; First we'll dictionary-encode our tokens
;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj
(def word-ids
  ;; Dictionary encoding: map each distinct token to a big-endian 2-byte id,
  ;; ids assigned in descending frequency order (most frequent token -> id 0)
  ;; so the hottest tokens get the smallest ids. 5997 unique tokens fit
  ;; comfortably in 16 bits. (The original bound the map in a `let` and
  ;; returned the binding unchanged; collapsed to the expression itself.)
  (into {}
        (map-indexed
         (fn [id [tok _freq]]
           [tok [(unchecked-byte (bit-shift-right id 8)) ; high byte
                 (unchecked-byte (bit-and 0x00FF id))]]) ; low byte
         (sort-by val > (frequencies optimized-tokens)))))
(def dict-id-compressed-text
  ;; The whole corpus as a flat sequence of bytes: each token replaced by its
  ;; 2-byte dictionary id. `mapcat` is the idiomatic one-level concatenation
  ;; here; the original's `flatten` walks the result deeply and is both
  ;; slower and riskier (it would silently splice any nested seq).
  (mapcat word-ids optimized-tokens))
;; NOTE(review): 962868 tokens * 2 bytes = 1925736 -- consistent with the
;; token count recorded in the stats comment above.
(comment
  (count dict-id-compressed-text) ;Whittled it down to 1925736 total bytes with 16 bit indices
  )
;;; TODO: Build dictionary
;;; Next, we'll run LZSS on our stream of tokens
|