;; bible/dictionary.clj


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90

(require '[clojure.string :as str])

(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])

(require '[clojure.data.priority-map :as pm])

(require '[clojure.math :as math])

(comment
  ;; REPL-only recipe (nothing in this form runs when the file is loaded):
  ;; concatenates every chapter file under ./base_files/ into a single text
  ;; file, ordered by book number and then chapter number, skipping the first
  ;; two lines of each chapter file.
  "Build the base file"
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; NOTE(review): both helpers split the *whole path string* on "_", so the
  ;; positions used below presumably rely on the underscore in "base_files"
  ;; being part of that string — confirm before moving the directory.

  (defn get-book-num
    "Parses the third `_`-separated piece of `filename` as an integer."
    [filename]
    (let [pieces (str/split (str filename) #"_")]
      (Integer/parseInt (get pieces 2))))

  (defn get-chap-num
    "Parses the fifth `_`-separated piece of `filename` as an integer."
    [filename]
    (let [pieces (str/split (str filename) #"_")]
      (Integer/parseInt (get pieces 4))))

  (with-open [out (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [chapter-file (sort-by (fn [f] [(get-book-num f) (get-chap-num f)])
                                  files)]
      (with-open [in (io/reader (fs/file chapter-file))]
        ;; the first two lines of every chapter file are dropped
        (run! (fn [text-line] (.write out (str text-line "\n")))
              (drop 2 (line-seq in)))))))

(defn sizetable
  "Summarises a size given in `bits` as a map with keys :bits, :bytes, :kbs
  and :mbs.

  :bits is returned unchanged. :bytes is bits/8 rounded up to a whole number
  (a double, because it comes from `math/ceil`). :kbs and :mbs are obtained by
  dividing by 1024 once and twice respectively."
  [bits]
  (let [unit        1024
        whole-bytes (math/ceil (/ bits 8)) ; partial bytes count as a full byte
        kilos       (/ whole-bytes unit)]
    (zipmap [:bits :bytes :kbs :mbs]
            [bits whole-bytes kilos (/ kilos unit)])))

;; The whole corpus as a single string; the file is read when this form is
;; evaluated.
(def full-text
  (-> "bbe-newlines-nochaps.txt" slurp))

;; Baseline size of the uncompressed text, costing every character at 8 bits.
;; NOTE(review): `count` is in characters, so this matches the on-disk byte size
;; only if the text is single-byte encoded — confirm.
(def base-size
  (-> full-text count (* 8) sizetable))

"Naive, just tokenize with spaces"
;; Case-sensitive experiment. Every whitespace run becomes an explicit SP token,
;; each punctuation mark is split out as its own token, and the form evaluates
;; to the number of distinct tokens.
(let [punctuation #"[,.;:!?\(\)\[\]'\*-]"
      pad         (fn [mark] (str " " mark " "))
      tokens      (as-> full-text text
                    ;(str/lower-case text)
                    (str/replace text #"\s+" " SP ")   ; whitespace run -> SP marker
                    (str/replace text punctuation pad) ; isolate each punctuation mark
                    (str/replace text #"\s+" " ")      ; collapse padding to one space
                    (str/split text #" ")
                    (remove str/blank? text))
      tally       (frequencies tokens) ;6689 unique tokens
      #_#_ranked (sort-by val > tally)
      #_#_dictionary (vec
                       (zipmap
                         (range (count ranked))
                         (map first ranked)))]
  (count tally)
  #_(sizetable (* 13 (count tokens))))


"Lower-case, split 's off into its own AS token, then tokenize"
;; Lower-cased experiment. Each "'s" is rewritten to an AS token before the
;; punctuation split (so its apostrophe is not tokenized separately), whitespace
;; runs become SP tokens, punctuation marks become their own tokens, and the
;; form evaluates to the number of distinct tokens. Only the "'s" sequence is
;; rewritten; any other final s stays attached to its word.
;; NOTE(review): #"'s" has no word boundary, so an apostrophe directly followed
;; by a word starting with s is rewritten too — confirm this is acceptable.
(let [split-punct (fn [text]
                    (str/replace text
                                 #"[,.;:!?\(\)\[\]'\*-]"
                                 (fn [mark] (str " " mark " "))))
      lowered     (str/lower-case full-text)
      possessives (str/replace lowered #"'s" " AS ")     ;; Apostrophe S
      marked      (str/replace possessives #"\s+" " SP ") ;; whitespace run -> SP
      squeezed    (str/replace (split-punct marked) #"\s+" " ") ;; single spaces
      tokens      (->> (str/split squeezed #" ")
                       (remove str/blank?))
      tally       (frequencies tokens) ;; 5998 unique tokens
      #_#_ranked (sort-by val > tally)
      #_#_dictionary (vec
                       (zipmap
                         (range (count ranked))
                         (map first ranked)))]
  (count tally)
  #_(sizetable (* 13 (count tokens))))

"Takes 13 bits for this dictionary"
;; 5998 is the distinct-token count noted for the lower-cased experiment above;
;; its binary form has 13 digits, i.e. 13 bits per dictionary index.
(Integer/toBinaryString 5998)
;; Same value laid out across two bytes; the leading "_-_" presumably marks the
;; three high bits that would be left unused.
"_-_10111 01101110"

"The text uses only 1000 unique words (not counting proper nouns?)->Possible to
get each word to 10 bits? Maybe two bytes?"
;; 2^10 = 1024 (returned as a double), so 10-bit indices would cover a
;; vocabulary of about 1000 words.
(math/pow 2 10)

"Dictionary compression is a dead end for me (I think), but it was worth looking
into."