blob: 477b14e5aa1168579ab7428bdb52307819bbef99 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
|
(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(comment
  ;; REPL-only recipe: nothing inside this `comment` form runs when the file
  ;; is loaded. Evaluate the forms by hand to regenerate
  ;; "bbe-newlines-nochaps.txt" from the per-chapter files under ./base_files/.
  "Build the base file"

  ;; Every .txt file found under ./base_files/ by the "**.txt" glob.
  (def files (fs/glob "./base_files/" "**.txt"))

  (defn get-book-num
    "Splits (str filename) on \"_\" and parses the third piece as an integer.
    NOTE(review): the split runs over the whole stringified path, so any
    underscore in a directory name counts as a separator too - confirm the
    index against the real file names."
    [filename]
    (-> (str filename)
        (str/split #"_")
        (nth 2 nil)
        Integer/parseInt))

  (defn get-chap-num
    "Splits (str filename) on \"_\" and parses the fifth piece as an integer.
    NOTE(review): same path caveat as get-book-num; the fifth piece must be
    purely numeric for Integer/parseInt to accept it - confirm."
    [filename]
    (-> (str filename)
        (str/split #"_")
        (nth 4 nil)
        Integer/parseInt))

  ;; Concatenate the files ordered by [book chapter], skipping the first two
  ;; lines of each file and terminating every copied line with "\n".
  (with-open [out (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [path (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [in (io/reader (fs/file path))]
        (run! (fn [text-line] (.write out (str text-line "\n")))
              (drop 2 (line-seq in)))))))
(defn sizetable
  "Describes a size given in bits as a map with four entries:
    :bits  - the argument, unchanged
    :bytes - bits / 8, rounded up with math/ceil (hence a double)
    :kbs   - :bytes divided by 1024
    :mbs   - :kbs divided by 1024"
  [bits]
  (let [whole-bytes (math/ceil (/ bits 8))
        shrink      (fn [amount] (/ amount 1024)) ; one step down the unit ladder
        kilo        (shrink whole-bytes)
        mega        (shrink kilo)]
    (zipmap [:bits :bytes :kbs :mbs]
            [bits whole-bytes kilo mega])))
(def full-text
  "Entire contents of bbe-newlines-nochaps.txt, read once when the file loads."
  (-> "bbe-newlines-nochaps.txt"
      slurp))

(def base-size
  "sizetable for the uncompressed text, charging 8 bits per character of
  full-text."
  (-> full-text
      count
      (* 8)
      sizetable))
"Naive, just tokenize with spaces"
;; Case-sensitive tokenization of full-text; evaluates to the number of
;; distinct tokens.
(let [;; every whitespace run becomes the marker token SP
      marked     (str/replace full-text #"\s+" " SP ")
      ;; pad each punctuation character with spaces so it splits off as a token
      padded     (str/replace marked
                              #"[,.;:!?\(\)\[\]'\*-]"
                              (fn [punct] (str " " punct " ")))
      ;; collapse whitespace runs so a single-space split is enough
      squeezed   (str/replace padded #"\s+" " ")
      tokens     (remove str/blank? (str/split squeezed #" "))
      vocabulary (frequencies tokens)] ; 6689 unique tokens
  ;; Parked follow-ups from the original scratch work: a frequency-sorted
  ;; index->token dictionary, and (sizetable (* 13 (count tokens))).
  (count vocabulary))
"Lower-case, remove 's, remove word final s tokenize"
;; Lower-cased tokenization of full-text with "'s" split off as its own AS
;; token; evaluates to the number of distinct tokens.
;; NOTE(review): the heading string above also mentions removing word-final
;; "s", but no such step is present in this pipeline.
(let [lowered    (str/lower-case full-text)
      ;; apostrophe-s becomes the marker token AS
      possessive (str/replace lowered #"'s" " AS ")
      ;; every whitespace run becomes the marker token SP
      marked     (str/replace possessive #"\s+" " SP ")
      ;; pad each punctuation character with spaces so it splits off as a token
      padded     (str/replace marked
                              #"[,.;:!?\(\)\[\]'\*-]"
                              (fn [punct] (str " " punct " ")))
      ;; collapse whitespace runs so a single-space split is enough
      squeezed   (str/replace padded #"\s+" " ")
      tokens     (remove str/blank? (str/split squeezed #" "))
      vocabulary (frequencies tokens)] ; 5998 unique tokens
  ;; Parked follow-ups from the original scratch work: a frequency-sorted
  ;; index->token dictionary, and (sizetable (* 13 (count tokens))).
  (count vocabulary))
"Takes 13 bits for this dictionary"
;; Scratch arithmetic on code widths. The bare strings are evaluated and
;; discarded; they serve as notes.
;; 5998 is the unique-token count noted on the lower-cased tokenization above;
;; its binary form 1011101101110 is 13 digits long, hence 13 bits per code.
(Integer/toBinaryString 5998)
"_-_10111 01101110"
"The text uses only 1000 unique words (not counting proper nouns?)->Possible to
get each word to 10 bits? Maybe two bytes?"
;; 2^10 = 1024.0 (math/pow returns a double): the number of distinct codes
;; that fit in 10 bits.
(math/pow 2 10)
"Dictionary compression is a dead end for me (I think), but it was worth looking
into."
|