(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match]) ; NOTE(review): currently unused — kept for later work

;;; The full compressor of bible in basic english
;;; Mk2 - compressing with common n-grams, ditching LZSS (maybe)

;; Build the base txt file out of individual txt files.
;; One-shot REPL step: concatenates every per-chapter file (sorted by book
;; then chapter number parsed out of the filename) into a single corpus,
;; dropping the first two header lines of each file.
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  (defn get-book-num
    "Book number parsed from an underscore-delimited filename (3rd field)."
    [filename]
    (let [[_ _ book _ _] (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt book)))

  (defn get-chap-num
    "Chapter number parsed from an underscore-delimited filename (5th field)."
    [filename]
    (let [[_ _ _ _ chap] (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt chap)))

  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        ;; drop 2: skip the per-file header lines before the verse text
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

;;; The full text as a file
(def full-text (slurp "./bbe-newlines-nochaps.txt"))

;; Normalize the corpus into a single space-separated token stream:
;; lower-case everything, rewrite multi-char punctuation runs to sentinel
;; words (AS / DOTDOTDOT / STARSTARSTAR), then pad every remaining
;; punctuation mark with spaces so it splits into its own token.
(def optimized-string
  (-> full-text
      (str/lower-case)
      #_(str/replace #"\s+" " SPACE ") ;; Added in mk2 if it turns out it's better to break hapax into individual chars to kill dictionary bloat
      (str/replace #"'s" " AS ")
      (str/replace #"\.\.\." " DOTDOTDOT ")
      (str/replace #"\*\*\*" " STARSTARSTAR ")
      (str/replace #"—" "-")
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      (str/replace #"\s+" " ")))

(def optimized-tokens (str/split optimized-string #" "))

(comment
  ;; Some basic stats on our work so far
  (count full-text) ; total chars 4207465
  (count optimized-tokens) ; total tokens 962868
  (count (into #{} optimized-tokens)) ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nyble to represent?
  ;; We could maybe do some domain modeling and do like
  ;; "suffix-s" or "suffix-ly"s like with "'s" right now
  )

;;; First we'll dictionary-encode our tokens
;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj
;; Map from token -> 16-bit id, ids assigned in descending frequency order
;; (most common token gets id 0).
(def word-ids
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))
        token-reprs (into {}
                          (map-indexed
                           (fn [id [tok _freq]]
                             [tok (unchecked-short (bit-and 0xffff id))
                              #_[(unchecked-byte (bit-shift-right id 8))
                                 (unchecked-byte (bit-and 0x00FF id))]])
                           sorted-toks))]
    token-reprs))

(def dict-id-compressed-text (map word-ids optimized-tokens))

(comment
  (count dict-id-compressed-text) ; Whittled it down to 1925736 total bytes with 16 bit indices (it's 962868 shorts)
  (- (dec (math/pow 2 13)) (count (into #{} optimized-tokens)))
  ;; !!NOTE!! that we're leaving 2194 whole tokens on the table.
  )

;;; Then we apply an optimal-n-grams finder:
(comment
  ;; some scratch work on how big our n can be
  (defn get-most-common
    "Top-10 most frequent n-grams (overlapping windows of n tokens)."
    [n]
    (take 10 (->> optimized-tokens
                  (partition n 1)
                  (frequencies)
                  (sort-by second >))))

  (def ngrams-data (map get-most-common (range 2 11)))
  ngrams-data)

;;; We could do this with a bunch of tokens but we instead apply recursive
;;; n-grams find
(comment
  (defn make-ngram
    "Synthetic token naming the pair t1+t2, e.g. \"THE_LORD\"."
    [t1 t2]
    (str (str/upper-case t1) "_" (str/upper-case t2)))

  (defn replace-with-ngram
    "Replace every non-overlapping [t1 t2] pair in tokens with new-token.
     freqs holds NEGATED counts (priority-map, most frequent first), so a
     removed occurrence is inc and an added one is dec.
     Returns [new-token-vector updated-freqs]."
    [tokens [t1 t2] new-token infreqs]
    (loop [in tokens
           out (transient [])
           freqs infreqs]
      (cond
        (empty? in) [(persistent! out) freqs]

        (and (= (first in) t1) (= (second in) t2))
        (recur (drop 2 in)
               (conj! out new-token)
               (-> freqs
                   (assoc t1 (inc (freqs t1)))
                   (assoc t2 (inc (freqs t2)))
                   (assoc new-token (dec (get freqs new-token 0)))))

        :else (recur (rest in) (conj! out (first in)) freqs))))

  (defn find-recursive-ngrams
    "Repeatedly fuse the single most common adjacent token pair into a new
     synthetic token until the dictionary reaches 2^13 entries.
     Returns the rewritten token stream."
    [intokens]
    (loop [tokens intokens
           ;; negate counts so the priority-map orders most-frequent first
           freqs (->> intokens
                      (frequencies)
                      (map (fn [[k v]] [k (- v)]))
                      (into (pm/priority-map)))]
      (if (<= (inc (math/pow 2 13)) (count freqs))
        tokens
        (let [ngrams (partition 2 1 tokens) ;find ngrams
              [ngram _ct] (apply max-key second (frequencies ngrams)) ;find the most common ngram
              [t1 t2] ngram
              [nexttoks nextfreqs] (replace-with-ngram tokens ngram (make-ngram t1 t2) freqs)]
          (println "Working on: " ngram "with freqcount" (count freqs))
          (recur nexttoks nextfreqs)))))

  ;; Persist the (slow) result so later sessions can just slurp it back in.
  (spit "optimized-token-stream.txt"
        (apply str (interpose "\n" optimized-ngrams))))

#_(def optimized-ngrams (find-recursive-ngrams optimized-tokens))
(def optimized-ngrams (str/split (slurp "optimized-token-stream.txt") #"\n"))

(count optimized-tokens) ; 962868 tokens
(count optimized-ngrams) ; 608165 tokens

;;; TODO: Build dictionary