(require '[clojure.string :as str])

(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])

(require '[clojure.data.priority-map :as pm])

(require '[clojure.math :as math])

(comment
  "Build the base file"
  ;; REPL-only recipe (a `comment` form is never evaluated on load): stitch
  ;; every .txt file found under ./base_files/ into one output file,
  ;; bbe-newlines-nochaps.txt, ordered by book number and then chapter number.
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Both parsers split the *whole stringified path* on "_" and read a fixed
  ;; field position.
  ;; NOTE(review): if the stringified path includes the base_files directory
  ;; component, its underscore also counts as a separator -- confirm the field
  ;; indices against the real file names.

  ;; Third "_"-separated field, parsed as an int.
  (defn get-book-num [filename]
    (-> (str filename)
        (str/split #"_")
        (get 2)
        (Integer/parseInt)))

  ;; Fifth "_"-separated field, parsed as an int.
  (defn get-chap-num [filename]
    (-> (str filename)
        (str/split #"_")
        (get 4)
        (Integer/parseInt)))

  (with-open [out (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [path (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [in (io/reader (fs/file path))]
        ;; The first two lines of every input file are skipped; each
        ;; remaining line is written back out with a trailing newline.
        (run! #(.write out (str % "\n"))
              (drop 2 (line-seq in)))))))

(defn sizetable
  "Express a size given in `bits` in several units.

  Returns a map with:
    :bits  - the input, unchanged
    :bytes - bits / 8 rounded up to a whole byte (a double, because
             clojure.math/ceil returns a double)
    :kbs   - :bytes divided by 1024
    :mbs   - :kbs divided by 1024"
  [bits]
  (let [per-1024   #(/ % 1024)
        byte-count (math/ceil (/ bits 8))
        kilo       (per-1024 byte-count)
        mega       (per-1024 kilo)]
    (zipmap [:bits :bytes :kbs :mbs]
            [bits byte-count kilo mega])))

;; The whole corpus as a single string. `slurp` decodes the file as UTF-8
;; unless told otherwise.
(def full-text (slurp "bbe-newlines-nochaps.txt"))

;; Uncompressed baseline size of the corpus, as a `sizetable` map.
;; The size is taken from the UTF-8 encoded byte length, not from
;; (count full-text): `count` on a string returns the number of UTF-16 chars,
;; which under-reports the size as soon as the text contains any non-ASCII
;; character. For pure-ASCII text both measures agree.
(def base-size
  (sizetable (* 8 (count (.getBytes ^String full-text "UTF-8")))))

"Naive: just tokenize on whitespace and punctuation"
;; Baseline tokenizer over the case-preserved text (lower-casing is not
;; applied in this variant). Evaluates to the number of distinct tokens;
;; the author's recorded result was 6689.
(let [;; every run of whitespace (newlines included) becomes one SP marker token
      marked    (str/replace full-text #"\s+" " SP ")
      ;; each listed punctuation character is padded so it splits off as its own token
      separated (str/replace marked #"[,.;:!?\(\)\[\]'\*-]" (fn [ch] (str " " ch " ")))
      ;; collapse the padding back to single spaces before splitting
      squeezed  (str/replace separated #"\s+" " ")
      toks      (remove str/blank? (str/split squeezed #" "))
      freqs     (frequencies toks)
      ;; Disabled sketch of the next step: rank tokens by frequency and index them.
      #_#_sorted-freqs (sort-by val > freqs)
      #_#_dictionary (vec
                      (zipmap
                       (range (count sorted-freqs))
                       (map first sorted-freqs)))]
  (count freqs)
  ;; Disabled: estimated size at 13 bits per token.
  #_(sizetable (* 13 (count toks))))


"Lower-case, split off 's as its own AS token, then tokenize (word-final s is not removed yet)"
;; Variant tokenizer: lower-case the text, split a word-final 's off as an
;; upper-case AS marker token, then tokenize as in the naive version.
;; Evaluates to the number of distinct tokens.
(let [toks (-> full-text
               (str/lower-case)
               ;; Only rewrite 's when it is a suffix: it must follow a word
               ;; character and end the word. A bare #"'s" would also hit an
               ;; opening quote before an s-word ('so -> " AS o") or an
               ;; apostrophe in the middle of a word (o'shea).
               (str/replace #"(?<=\w)'s\b" " AS ")
               ;; every run of whitespace (newlines included) becomes one SP marker token
               (str/replace #"\s+" " SP ")
               ;; pad each punctuation character so it becomes its own token
               (str/replace #"[,.;:!?\(\)\[\]'\*-]" #(str " " %1 " "))
               ;; collapse the padding back to single spaces before splitting
               (str/replace #"\s+" " ")
               (str/split #" ")
               (#(remove str/blank? %1)))
      ;; The author recorded 5998 unique tokens with the looser #"'s" pattern;
      ;; re-check the figure after this change.
      freqs (frequencies toks)
      ;; Disabled sketch of the next step: rank tokens by frequency and index them.
      #_#_sorted-freqs (sort-by val > freqs)
      #_#_dictionary (vec
                      (zipmap
                       (range (count sorted-freqs))
                       (map first sorted-freqs)))]
  (count freqs)
  ;; Disabled: estimated size at 13 bits per token.
  #_(sizetable (* 13 (count toks))))

"Takes 13 bits for this dictionary"
;; Binary rendering of 5998, the recorded dictionary size; it is 13 digits long.
(Integer/toString 5998 2)
"_-_10111 01101110"

"The text uses only 1000 unique words (not counting proper nouns?), so it may be
possible to get each word down to 10 bits. Or maybe two bytes?"
;; Number of distinct values a 10-bit code can represent (1024.0).
(Math/pow 2 10)

"Dictionary compression is a dead end for me (I think), but it was worth looking
into."