(require '[clojure.string :as str])

(require '[clojure.java.io :as io])
(require '[babashka.fs :as fs])

(require '[clojure.data.priority-map :as pm])

;; Disabled (#_): globs every path matching "**.txt" under ./base_files/.
#_(def files (fs/glob "./base_files/" "**.txt"))

;; Disabled (#_): splits the stringified filename on "_" and parses the
;; third piece as an integer book number.
#_(defn get-book-num [filename]
  (let [[_ _ book _ _]
        (str/split (str filename) #"_")
        #_#_chap (int _chap)]
    (Integer/parseInt book)))

;; Disabled (#_): splits the stringified filename on "_" and parses the
;; fifth piece as an integer chapter number.
;; NOTE(review): if the fifth piece still carries a file extension,
;; Integer/parseInt would throw -- confirm against the real file names.
#_(defn get-chap-num [filename]
  (let [[_ _ _ _ chap]
        (str/split (str filename) #"_")
        #_#_chap (int _chap)]
    (Integer/parseInt chap)))

;; Disabled (#_): concatenates all input files into "bbe-newlines-nochaps.txt".
;; Files are visited in (book, chapter) order; the first two lines of each
;; file are dropped and every remaining line is written followed by "\n".
#_(with-open [writer (io/writer "bbe-newlines-nochaps.txt")]
  (doseq [f (sort-by (juxt get-book-num get-chap-num) files)]
    (with-open [reader (io/reader (fs/file f))]
      (doseq [line (drop 2 (line-seq reader))]
        (.write writer (str line "\n"))))))

;; Entire corpus read into memory as one string (read at load time).
(def full-text (slurp "bbe-newlines-nochaps.txt"))

;; Lazy sequence of tokens extracted from `full-text`:
;;   1. lower-case everything,
;;   2. collapse each whitespace run into a single space,
;;   3. replace every "'s" with the standalone marker token APOSTROPHE_S,
;;   4. surround each listed punctuation character with spaces so that it
;;      becomes a token of its own,
;;   5. split on single spaces and drop the blank pieces.
(def tokens
  (let [lowered     (str/lower-case full-text)
        squeezed    (str/replace lowered #"\s+" " ")
        possessives (str/replace squeezed #"'s" " APOSTROPHE_S ")
        spaced      (str/replace possessives #"[,.;:!?()\[\]'\*-]" #(str " " %1 " "))
        pieces      (str/split spaced #" ")]
    (remove str/blank? pieces)))

;; Map of token -> number of occurrences in `tokens`.
(def symbol-freqs (frequencies tokens))

;; Disabled (#_) REPL helpers for inspecting the token statistics.
;; Writes every unique token to toks.txt, one per line.
#_(spit "toks.txt" (apply str (interpose "\n" (map key symbol-freqs))))


#_(sort-by val > symbol-freqs)  ; Greatest to least frequency
#_(reduce + (map val symbol-freqs))  ; Total tokens
#_(count symbol-freqs)  ; Total unique tokens
;; NOTE(review): symbol-freqs is an unsorted map, so this sums the counts of
;; whichever 512 entries come first, not those of the most frequent tokens.
#_(reduce + (take 512 (map val symbol-freqs)))  ; Sum of counts over the first 512 entries

;; NOTE(review): `count` of a map entry is always 2, so this yields twice the
;; number of unique tokens rather than a character total; counting the keys,
;; e.g. (map (comp count key) symbol-freqs), is probably what was intended.
#_(reduce + (map count symbol-freqs))  ; Total chars needed for dict vals

;; Frequencies of adjacent token pairs (sliding window of 2, step 1).
#_(def two-grams (frequencies (partition 2 1 tokens)))
#_(sort-by val > two-grams)


;;; Make the huffman tree for the symbols (13)
;; A Huffman tree node. Leaves are created with nil `left`/`right` and carry
;; the token in `sym`; internal nodes are created with a nil `sym`.
;; `probability` holds the node's weight (a raw occurrence count, not a
;; normalised probability).
(defrecord Node [left right sym probability])

;; Priority queue holding one parentless leaf node per symbol.
;; Each item is a leaf Node and its priority is the pair [frequency symbol],
;; so items are ordered by frequency first and by symbol second.
(def pq
  (into (pm/priority-map-keyfn (juxt first second))
        (for [[sym freq] symbol-freqs]
          [(->Node nil nil sym freq) [freq sym]])))

;; Sanity check: every symbol must have produced its own queue entry.
(assert (= (count symbol-freqs) (count pq)) "Priority queue has fewer symbols than symbol list")

;; From: https://michaeldipperstein.github.io/huffman.html#decode
;; Step 1. Create a parentless node for each symbol. Each node should include the symbol and its probability.
;; Step 2. Select the two parentless nodes with the lowest probabilities.
;; Step 3. Create a new node which is the parent of the two lowest probability nodes.
;; Step 4. Assign the new node a probability equal to the sum of its children's probabilities.
;; Step 5. Repeat from Step 2 until there is only one parentless node left.

;; NOTE: This is an inefficient algorithm because we could use the 
;;       two-queue version on wikipedia
(defn build-huffman-tree
  "Collapse `queue` (a priority map of Node -> [weight tie-breaker]) into a
  single Huffman tree and return its root Node.

  On every iteration the two lowest-priority nodes are removed and replaced
  by a new parent whose weight is the sum of theirs. The parent is queued
  with priority [weight nil]. Iteration stops once one node is left."
  [queue]
  (loop [pending queue]
    (if (= 1 (count pending))
      ;; Only the root remains: hand back the node itself, not the entry.
      (first (peek pending))
      (let [[node-a [weight-a _]] (peek pending)
            without-a             (pop pending)
            [node-b [weight-b _]] (peek without-a)
            combined              (+ weight-a weight-b)
            ;; The second-lowest node becomes the left child, the lowest the right.
            parent                (->Node node-b node-a nil combined)]
        (recur (assoc (pop without-a) parent [combined nil]))))))

;; Root node of the Huffman tree built from the symbol priority queue.
(def huffman-tree (build-huffman-tree pq))

;; Sanity check: the root's weight must equal the total number of tokens,
;; because every merge sums the weights of its two children.
(assert (= (.probability huffman-tree)
           (reduce + (map val symbol-freqs)))
        "Probability of root node is not equal to the sum of all probabilities")

(defn huffman-tree-to-symbol-encodings
  "Walk the Huffman tree rooted at `node` and return a map of
  symbol -> bit string (a string of \\0 and \\1 characters).

  Arguments:
    node          - current tree node; a leaf has a non-nil :sym, an internal
                    node has :left and :right children.
    encodings     - map the discovered symbol encodings are added to.
    curr-encoding - bits of the path taken from the root down to `node`.

  A left branch contributes \"1\" and a right branch \"0\". Each bit is
  appended to the path, so the first character of a code is the decision made
  at the root and the resulting codes are prefix-free. A tree that is a single
  leaf yields the empty string for its symbol."
  [node encodings curr-encoding]
  ;; Keyword lookup works for records and plain maps alike.
  (if-let [sym (:sym node)]
    (assoc encodings sym curr-encoding)
    (merge
      (huffman-tree-to-symbol-encodings (:left node) encodings (str curr-encoding "1"))
      (huffman-tree-to-symbol-encodings (:right node) encodings (str curr-encoding "0")))))

;; Map of symbol -> bit-string encoding derived from the Huffman tree.
(def huffman-tree-syms (huffman-tree-to-symbol-encodings huffman-tree {} ""))
;; Sanity check: every symbol must have received exactly one encoding.
(assert (= (count huffman-tree-syms)
           (count pq)
           (count symbol-freqs)))

;;; Build the canonical encodings

;; Each of the existing codes are replaced with a new one of the same length, using the following algorithm:

;;     The first symbol in the list gets assigned a codeword which is the same length as the symbol's original codeword but all zeros. This will often be a single zero ('0').
;;     Each subsequent symbol is assigned the next binary number in sequence, ensuring that following codes are always higher in value.
;;     When you reach a longer codeword, then after incrementing, append zeros until the length of the new codeword is equal to the length of the old codeword. This can be thought of as a left shift.

;; One codeword: `sym` is the token, `code` the codeword bits held as an
;; integer, and `length` the number of bits in the codeword.
(defrecord HuffmanCodeword [sym code length])

;; Tree-derived codewords ordered by code length first and symbol second,
;; which is the order the canonical-code construction expects.
;; Each bit string is parsed as an unsigned base-2 long and paired with its
;; length in bits.
(def sorted-huffman-tree-codewords
  (for [[sym bits] (sort-by (juxt (comp count val) key) huffman-tree-syms)]
    (->HuffmanCodeword sym
                       (Long/parseUnsignedLong bits 2)
                       (int (count bits)))))

(defn build-canonical-encodings
  "Build canonical huffman encodings from a sorted list of huffman tree codewords.

  `symbols` must be ordered by ascending codeword length (ties broken by
  symbol). Only the :sym and :length of each input codeword are used; the
  codes themselves are reassigned:

    * the first symbol gets the all-zero code of its original length;
    * every later symbol gets the previous code plus one, shifted left by the
      difference between its own length and the previous length (i.e. padded
      with zeros on the right until it has the symbol's original length).

  Returns a vector of HuffmanCodeword records in the same order as `symbols`,
  or an empty vector when `symbols` is empty.

  The two-argument arity is the accumulator form: `codes` must be a non-empty
  vector of the codewords assigned so far."
  ([symbols]
   (if-let [first-sym (first symbols)]
     (build-canonical-encodings
       [(->HuffmanCodeword (:sym first-sym) 0 (:length first-sym))]
       (rest symbols))
     []))
  ([codes symbols]
   (if (seq symbols)
     (let [prev-codeword    (peek codes) ; O(1) on a vector, unlike `last`
           current-codeword (first symbols)
           prev-len         (:length prev-codeword)
           next-len         (:length current-codeword)
           next-base-code   (unchecked-inc (:code prev-codeword))
           ;; Shifting by the length difference appends exactly the zeros
           ;; needed to reach the new length; the shift is 0 when the length
           ;; is unchanged, leaving the incremented code as is.
           next-code        (bit-shift-left next-base-code (- next-len prev-len))
           next-codeword    (->HuffmanCodeword (:sym current-codeword)
                                               next-code
                                               next-len)]
       (recur (conj codes next-codeword) (rest symbols)))
     codes)))


;; Canonical codewords for every symbol, in (length, symbol) order.
(def canonical-encodings
  (build-canonical-encodings sorted-huffman-tree-codewords))

;; Inspection helper: binary strings of the first 100 canonical codes.
;; Long/toBinaryString omits leading zeros, so the printed width can be
;; shorter than the codeword's recorded length.
(map #(Long/toBinaryString (.code %1)) (take 100 canonical-encodings))