Calculating compression gain - File compression in Clojure (1)
Notes
;; text_analysis.clj
;; Name of the input text file.
;; NOTE(review): the functions visible in this file each take their own `file`
;; parameter, which shadows this var; it is presumably meant to be passed in
;; by the caller (e.g. from the REPL) — confirm.
(def file "245-0.txt")
(defn words
  "Splits `text` into its whitespace-separated tokens.

  Returns a vector of the non-empty tokens, in order of appearance. Empty or
  whitespace-only text yields an empty vector, and leading whitespace never
  produces an empty-string token (splitting on whitespace would emit a
  leading \"\" in that case, which would then be counted as a word)."
  [text]
  ;; re-seq returns nil when nothing matches; vec turns that into [].
  (vec (re-seq #"\S+" text)))
(defn remove-punctions
  "Replaces every run of non-word characters in `text` with a single space.

  Returns the resulting string. The pattern uses the (?U) flag so that `\\W`
  follows Unicode character classes: accented and other non-ASCII letters
  count as word characters and are kept, instead of being treated as
  punctuation and splitting the word. Pure-ASCII input gives the same
  result as an unflagged `\\W+`."
  [text]
  (clojure.string/replace text #"(?U)\W+" " "))
(defn words-in-file
  "Reads the contents of `file` with slurp, passes the text through
  remove-punctions, and returns the result of applying words to it."
  [file]
  (let [raw-text   (slurp file)
        clean-text (remove-punctions raw-text)]
    (words clean-text)))
(defn word-frequencies-in-file
  "Returns a map from each word produced by words-in-file for `file` to the
  number of times that word occurs."
  [file]
  (frequencies (words-in-file file)))
(defn compression-gain
  "Computes the gain for every entry of `word-frequncies`.

  Each entry is read as: first element = the word, last element = its count.
  Returns a lazy sequence, in input order, of lists `(gain word)` where
  gain is the word's length multiplied by its count."
  [word-frequncies]
  (for [entry word-frequncies
        :let [token       (first entry)
              occurrences (last entry)]]
    (list (* (count token) occurrences) token)))
(defn file-compression-gain
  "Returns the `(gain word)` lists for the words of `file`, ordered from the
  highest gain to the lowest.

  The list is sorted ascending by gain and then reversed, so entries with
  equal gain appear in the reverse of their ascending-sort order."
  [file]
  (->> file
       word-frequencies-in-file
       compression-gain
       (sort-by first)
       reverse))