(require '[clojure.string :as str])
(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])
(require '[clojure.data.priority-map :as pm])
(require '[clojure.math :as math])
(require '[clojure.core.match :as match])

;;; The full compressor of the Bible in Basic English.

;; Build the base txt file out of the individual per-chapter txt files.
;; Dev-only: evaluate the forms inside `comment` by hand; the resulting
;; file is what `full-text` below reads.
(comment
  (def files (fs/glob "./base_files/" "**.txt"))

  ;; Filenames are underscore-separated with the book number in field 3
  ;; and the chapter number in field 5; parse them so the chapters
  ;; concatenate in canonical order.
  (defn get-book-num [filename]
    (let [[_ _ book _ _] (str/split (str filename) #"_")]
      (Integer/parseInt book)))

  (defn get-chap-num [filename]
    (let [[_ _ _ _ chap] (str/split (str filename) #"_")]
      (Integer/parseInt chap)))

  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        ;; The first two lines of each chapter file are headers -- skip them.
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

;;; The full text as a file
(def full-text (slurp "./bbe-newlines-nochaps.txt"))

;; Normalized text: lower-cased, with possessive 's / ellipses / *** rewritten
;; to sentinel words ("AS", "DOTDOTDOT", "STARSTARSTAR" -- uppercase on purpose,
;; so they can never collide with a real lower-cased word), em-dashes folded to
;; hyphens, punctuation split into standalone tokens, and all runs of
;; whitespace collapsed to a single space.
(def optimized-string
  (-> full-text
      (str/lower-case)
      (str/replace #"'s" " AS ")
      (str/replace #"\.\.\." " DOTDOTDOT ")
      (str/replace #"\*\*\*" " STARSTARSTAR ")
      (str/replace #"—" "-")
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      (str/replace #"\s+" " ")))

(def optimized-tokens (str/split optimized-string #" "))

(comment
  ;; Some basic stats on our work so far
  (count full-text)                                   ; total chars 4207465
  (count optimized-tokens)                            ; total tokens 962868
  (count (into #{} optimized-tokens))                 ; 5997 total unique tokens
  (apply max (map count (into #{} optimized-tokens))) ; max word is 17 chars long "straightforwardly" -> 1 nyble to represent?
  ;; We could maybe do some domain modeling and do like
  ;; "suffix-s" or "suffix-ly"s like with "'s" right now
  )

;;; First we'll dictionary-encode our tokens
;; For a less efficient (in the long term) encoding algorithm, see dictionary-packed.clj

;; Map of token -> 16-bit id. Ids are assigned in descending frequency order,
;; so the most common tokens get the smallest ids.
(def word-ids
  (let [sorted-toks (sort-by val > (frequencies optimized-tokens))]
    (into {}
          (map-indexed (fn [id [tok _freq]]
                         [tok (unchecked-short (bit-and 0xffff id))])
                       sorted-toks))))

(def dict-id-compressed-text (map word-ids optimized-tokens))

(comment
  (count dict-id-compressed-text)
  ;; Whittled it down to 1925736 total bytes with 16 bit indices (it's 962868 shorts)
  )

;;; TODO: Build dictionary

;;; Next, we'll run LZSS on our tok-id-list
(def WINDOW-SIZE 1024) ; The maximum distance we look back, only allowing 1k RAM
(def MIN-MATCH 3)      ; Minimum length to bother with a back-reference

;; MATCH-FLAG is the first id NOT used by any token: token ids run from 0 to
;; (count-1), so (count word-ids) is free to mark the start of a reference.
;; (Every dictionary entry appears at least once in the id stream, so this
;; equals the old `(count (frequencies dict-id-compressed-text))` without a
;; full extra pass over ~1M shorts.)
;; A reference is MATCH-FLAG, then 10 bits of offset (1k window) and
;; 8 bits of length (up to 255 toks).
(def MATCH-FLAG (count word-ids))

(defn- get-match-len
  "Length of the common prefix of data-vec starting at `pos` vs `match-pos`,
  capped at `max-len`. Caller guarantees pos + max-len <= (count data-vec)
  and match-pos < pos, so every `nth` stays in bounds."
  [data-vec pos match-pos max-len]
  (loop [len 0]
    (if (and (< len max-len)
             (= (nth data-vec (+ pos len))
                (nth data-vec (+ match-pos len))))
      (recur (inc len))
      len)))

;; (An earlier prototype `compress` that emitted "<offset|len>" strings was
;; reader-discarded dead code and has been removed; `compress-tokens` below
;; supersedes it.)

(defn compress-tokens
  "LZSS over a sequence of 16-bit token IDs.
  Returns a vector of maps: {:type :lit :val v} for a literal token, or
  {:type :match :dist d :len l} for a back-reference reaching `d` positions
  back and running `l` tokens. Candidate matches are found through a hash
  index from MIN-MATCH-length prefixes to the positions where they occurred."
  [data]
  (let [data-vec (vec data)
        data-len (count data-vec)]
    (loop [cursor 0
           index  {}            ; [t1 t2 t3] -> vector of start positions
           out    (transient [])]
      (if (>= cursor data-len)
        (persistent! out) ; Return clean vector of data structures
        ;; 1. Setup search parameters
        (let [max-search-len (min (- data-len cursor) 255) ; cap for the 8-bit length field
              triplet        (if (>= max-search-len MIN-MATCH)
                               (subvec data-vec cursor (+ cursor MIN-MATCH))
                               nil)
              match-indices  (get index triplet)
              ;; 2. Find the best (longest) match among in-window candidates
              best-match (when (and triplet match-indices)
                           (reduce (fn [best candidate-idx]
                                     ;; Check if candidate is within WINDOW-SIZE
                                     (if (> candidate-idx (- cursor WINDOW-SIZE))
                                       (let [len (get-match-len data-vec cursor
                                                                candidate-idx
                                                                max-search-len)]
                                         (if (> len (:len best 0))
                                           {:dist (- cursor candidate-idx) :len len}
                                           best))
                                       best)) ; too old, skip
                                   nil
                                   match-indices))]
          (if (and best-match (>= (:len best-match) MIN-MATCH))
            ;; CASE A: Match found
            (recur (+ cursor (:len best-match))
                   ;; NOTE: lazy indexing for speed -- only the triplet at
                   ;; `cursor` is recorded. Indexing the skipped positions too
                   ;; would improve compression at some speed cost.
                   (assoc index triplet (conj (get index triplet []) cursor))
                   (conj! out {:type :match
                               :dist (:dist best-match)
                               :len  (:len best-match)}))
            ;; CASE B: Literal
            (recur (inc cursor)
                   (if triplet
                     (assoc index triplet (conj (get index triplet []) cursor))
                     index)
                   (conj! out {:type :lit :val (nth data-vec cursor)}))))))))

(def lzss-compressed-dict-ids (compress-tokens dict-id-compressed-text))

;; Serialization idea: literal shorts are emitted as-is; a short equal to
;; MATCH-FLAG marks a back-reference, which is stored raw rather than being
;; Huffman encoded.