(require '[clojure.string :as str])

(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])

(require '[clojure.data.priority-map :as pm])

(require '[clojure.math :as math])

(comment
  "Build the base file"
  (def files (fs/glob "./base_files/" "**.txt"))

  (defn get-book-num [filename]
    (let [[_ _ book _ _]
          (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt book)))

  (defn get-chap-num [filename]
    (let [[_ _ _ _ chap]
          (str/split (str filename) #"_")
          #_#_chap (int _chap)]
      (Integer/parseInt chap)))

  (with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
    (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
      (with-open [reader (io/reader (fs/file f))]
        (doseq [line (drop 2 (line-seq reader))]
          (.write writer (str line "\n")))))))

(def full-text (slurp "bbe-newlines-nochaps.txt"))

;; This is "Stage1-optimized-tokens"
(def tokens
  (-> full-text 
      (str/lower-case)
      (str/replace #"\s+" " ")
      (str/replace #"'s" " AS ") ;; Apostrophe_S
      (str/replace #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
      (str/split #" ")
      (#(remove str/blank? %1))))

(def symbol-freqs (frequencies tokens))

(comment
  "Do some basic statistics and print a list of tokens"
  (spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))

  (sort-by val > symbol-freqs)  ; Greatest to lease frequency
  (reduce + (map val symbol-freqs))  ; Total tokens
  (count symbol-freqs)  ; Total unique tokens
  (reduce + (take 512 (map val symbol-freqs)))  ; Number of the top 100 common tokens

  (reduce + (map count symbol-freqs))  ; Total chars needed for dict vals

  (def two-grams (frequencies (partition 2 1 tokens)))
  (sort-by val > two-grams))


;;; Make the huffman tree for the symbols (13)
(defrecord Node [left right sym probability])

; Create a prioirity-queue of parentless nodes for each symbol
(defn symboltable->pq [symbolfreqs]
  (into (pm/priority-map-keyfn (juxt first second))
        (map #(vector
                (->Node nil nil (first %1) (second %1))
                [(second %1) (first %1)])
             symbolfreqs)))

(assert (= (count symbol-freqs) (count (symboltable->pq symbol-freqs))) "Priority queue has fewer symbols than symbol list")

;; From: https://michaeldipperstein.github.io/huffman.html#decode
;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability.
;; Step 2. Select the two parentless nodes with the lowest probabilities.
;; Step 3. Create a new node which is the parent of the two lowest probability nodes.
;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities.
;; Step 5. Repeat from Step 2 until there is only one parentless node left.

;; NOTE: This is an inefficient algorithm because we could use the 
;;       two-queue version on wikipedia
(defn build-huffman-tree-from-queue
  "Builds a huffman-tree out of a priority-queue"
  [queue]
  (if (= 1 (count queue))
    (first (peek queue))  ; Repeat until there is only one parentless node left
    (let [[lowest-node [lowest-prob _]] (peek queue)
          [second-node [second-prob _]] (peek (pop queue))  ; Step 2
          new-prob (+ lowest-prob second-prob) ; Step 4
          new-node (->Node second-node lowest-node nil new-prob) ; Step 3 - NOTE: unsure about node order
          next-queue (assoc (pop (pop queue)) new-node [new-prob nil])] 
      (recur next-queue))))

(defn build-huffman-tree
  "builds a huffman-tree out of a frequency table of symbols"
  [symbol-freq-table]
  (build-huffman-tree-from-queue (symboltable->pq symbol-freq-table)))

(def huffman-tree (build-huffman-tree symbol-freqs))

(assert (= (.probability huffman-tree)
           (reduce + (map val symbol-freqs)))
        "Probability of root node is not equal to the sum of all probabilities")

(defn huffman-tree-to-symbol-encodings 
  "Builds a list of symbol encodings out of a huffman-tree
   [tree] tree to symbol encodings
   [node encodings curr-encoding] recursive builder"
   ([tree]
    (huffman-tree-to-symbol-encodings tree {} ""))
  ([node encodings curr-encoding]
  (if (.sym node)
    (assoc encodings (.sym node) curr-encoding)
    (merge
      (huffman-tree-to-symbol-encodings (.left node) encodings (str "1" curr-encoding))
      (huffman-tree-to-symbol-encodings (.right node) encodings (str "0" curr-encoding))))))

(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))

(assert (= (huffman-tree-to-symbol-encodings huffman-tree {} "")
           (huffman-tree-to-symbol-encodings huffman-tree))
        "Multi-arity def does not work as intended")
(assert (= (count huffman-tree-syms)
           (count (symboltable->pq symbol-freqs))
           (count symbol-freqs))
        "Some function is adding or removing symbols")

;;; Build the canonical encodings

;; From: https://en.wikipedia.org/wiki/Canonical_Huffman_code
;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm:

;;     The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0').
;;     Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value.
;;     When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.


;; --- little class for Huffman Codewords
(defrecord HuffmanCodeword [sym code length])
(defn get-codeword-string 
  "Takes a [HuffmanCodeword] and returns a binary str of the code"
  [codeword]
  (let [base (Long/toBinaryString (.code codeword))
        basecount (count base)
        len (.length codeword)]
    (if (= basecount len)
      base
      (str (apply str (take (- len basecount) (repeat "0"))) base))))

;; --- little class for Huffman Codewords

(def sorted-huffman-tree-codewords 
  (->> huffman-tree-syms
       (sort-by (juxt (comp count val) key))
       (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1)))))))

(defn build-canonical-encodings
  "Build canonical huffman encodings from a huffman tree
   takes [symbols] a list of huffman codes derived from a code-tree"
  ([tree]
   (let [symbols (->> tree
                      (huffman-tree-to-symbol-encodings)
                      (sort-by (juxt (comp count val) key))

                      (map #(->HuffmanCodeword (first %1) (Long/parseUnsignedLong (second %1) 2) (int (count (second %1))))))
         first-sym (first symbols)
         seed-symbol (->HuffmanCodeword (.sym first-sym) 0 (.length first-sym))]
     (build-canonical-encodings [seed-symbol] (rest symbols))))
  ([codes symbols]
   (if (not-empty symbols)
     (let [prev-codeword (last codes)
           current-codeword (first symbols)
           next-sym (.sym current-codeword)
           next-base-code (unchecked-inc (.code prev-codeword))
           prev-len (.length prev-codeword)
           next-codeword 
           (if (= (.length current-codeword)
                  (.length prev-codeword))
             (->HuffmanCodeword 
               next-sym
               next-base-code
               prev-len)
             (->HuffmanCodeword 
               next-sym
               (bit-shift-left 
                 next-base-code 
                 (- prev-len
                    (- 63 (Long/numberOfLeadingZeros next-base-code)))) ;Not 100% confident about this
               (inc prev-len)))]
       (recur (conj codes next-codeword) (rest symbols)))
     codes)))

(def canonical-encodings
  (build-canonical-encodings huffman-tree))


(assert
  (= (map #(.length %1) sorted-huffman-tree-codewords)
     (map #(.length %1) sorted-huffman-tree-codewords))
  "Some of the codes changed length when canonicalizing")

(assert
  (= (count canonical-encodings)
     (count (set (map #(Long/toBinaryString (.code %1)) canonical-encodings))))
  "There appears to be duplicate canonical encodings")

(defn get-encoded-length [encodings tokenlist]
  (let [encodingtable (into {} (map #(vector (.sym %1) (.length %1)) encodings))
        totalbits (reduce + (map encodingtable tokenlist))
        totalbytes (math/ceil (/ totalbits 8))
        totalkb (/ totalbytes 1024)
        totalmb (/ totalkb 1024)]
    {:bits totalbits
     :bytes totalbytes
     :kb totalkb
     :mb totalmb}))


(comment
  (def base-file-size
    (let [totalbits (* 8 (count full-text))
          totalbytes (math/ceil (/ totalbits 8))
          totalkb (/ totalbytes 1024)
          totalmb (/ totalkb 1024)]
      {:bits totalbits
       :bytes totalbytes
       :kb totalkb
       :mb totalmb}))
  (def stage1-optimized-token-encoding-length
    (get-encoded-length canonical-encodings tokens)))

(comment
  "Some scratchwork figuring out how to make the shortest bit smaller"
  (defn get-highest-freq-percent [freqlist]
    (let [most-freq (first (sort-by val > freqlist))] ;the fn already sorts, but most invocations pre-sort for REPL purposes
    [(key most-freq)
     (double 
      (/ (second most-freq)
         (reduce + (map second symbol-freqs))))]))

  (get-highest-freq-percent symbol-freqs) ;most frequent symbol is 6.83%
  (get-highest-freq-percent (sort-by val > (frequencies (map first tokens)))) ;most frequent 1-letter prefix
  (get-highest-freq-percent (sort-by val > (frequencies (map last tokens)))) ;most frequent 1-letter suffix
  (get-highest-freq-percent (sort-by val > (frequencies (map count tokens)))) ;26% of the text is 3 letter tokens
  )

;;; NOTE FOR VIDEO:
;; First, consider some kind of bit-packing scheme 
;;  - 65 total chars
;;  - Shave it down to 63
;;  - Remove caps/reduce even furhter to try and get it to 31
;; Then introduce huffman encodings
;;  - Raw text
;;  - Text with some of the above optimizations
;;  - Tokenize at spaces and punctuation
;; Then consider tokenizing I started off by tokenizing the text with some optimizations in place with stage1-optimized-token-encoding-length

(comment
  "Huffman on the raw text (65 total symbols)"
  (let [tokenized (seq full-text)
        symbol-freqs (frequencies tokenized)
        huff-tree (build-huffman-tree symbol-freqs)
        canonical-codes (build-canonical-encodings huff-tree)]
    (get-encoded-length canonical-codes tokenized))
  "Huffman on the above with normalized space and without * (63 total symbols)"
  (let [tokenized (seq (-> full-text 
                           (str/replace #"\s+" " ")
                           (str/replace #"\*" " ")))
        symbol-freqs (frequencies tokenized)
        huff-tree (build-huffman-tree symbol-freqs)
        canonical-codes (build-canonical-encodings huff-tree)]
    (get-encoded-length canonical-codes tokenized))
  "Huffman on the above in all lowercase (38 total symbols)"
  (let [tokenized (seq (-> full-text 
                           (str/replace #"\s+" " ")
                           (str/replace #"[\*]" "")
                           (str/lower-case)))
        symbol-freqs (frequencies tokenized)
        huff-tree (build-huffman-tree symbol-freqs)
        canonical-codes (build-canonical-encodings huff-tree)]
    (get-encoded-length canonical-codes tokenized))
  "Huffman on the above with char changes —()?!:; (31 total symbols)"
  (let [tokenized (seq (-> full-text 
                           (str/replace #"—" "-") ; Replace emdash with hyphen
                           (str/replace #"[\*]" "") ; Get rid of chars
                           (str/replace #"\([^)]*\)" "") ; Get rid of everything between parens
                           (str/replace #"[?!]" ".") ; Replace all final punct with periods
                           (str/replace #"[;:]" ",") ; Replace all middle punct with commas
                           (str/replace #"\s+" " ") ; Normalize space
                           (str/lower-case)))
        symbol-freqs (frequencies tokenized)
        huff-tree (build-huffman-tree symbol-freqs)
        canonical-codes (build-canonical-encodings huff-tree)]
    (get-encoded-length canonical-codes tokenized))
  )