-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclojure-bigrams.clj
More file actions
37 lines (30 loc) · 838 Bytes
/
clojure-bigrams.clj
File metadata and controls
37 lines (30 loc) · 838 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
(ns clojure-bigrams.core
  (:gen-class)
  (:require [clojure.string]))
(defn remove-punctuation
  "Returns `string` with every character removed that is not an ASCII digit,
  an ASCII letter, or whitespace (as matched by the regex class `\\s`)."
  [string]
  (let [disallowed-char #"[^0-9a-zA-Z\s]"]
    (-> string
        (clojure.string/replace disallowed-char ""))))
(defn get-clean-text
  "Reads `file-in` (anything `slurp` accepts), strips punctuation with
  `remove-punctuation`, lower-cases the text and returns a vector of its
  whitespace-separated words.

  Returns an empty vector when the text contains no words."
  [file-in]
  (let [cleaned (-> (slurp file-in)
                    remove-punctuation
                    clojure.string/lower-case
                    ;; Trim first: splitting text that starts with whitespace
                    ;; on #"\s+" would otherwise produce a spurious "" as the
                    ;; first token, which then shows up in the bigrams.
                    clojure.string/trim)]
    (if (empty? cleaned)
      ;; Splitting "" yields [""], not []; report "no words" explicitly.
      []
      (clojure.string/split cleaned #"\s+"))))
(defn create-bigram-vector
  "Pairs each element of `string-vec-in` with the element that follows it.

  Returns a vector of two-element vectors, one per adjacent pair, in input
  order. An input with fewer than two elements yields an empty vector."
  [string-vec-in]
  (let [successors (rest string-vec-in)]
    ;; Mapping over both sequences stops at the shorter one, so the last
    ;; element never starts a pair.
    (mapv vector string-vec-in successors)))
(defn create-bigram-output-set
  "Counts how many times each item occurs in `vec-vec-in`.

  Despite the name, the result is a map from each distinct item to its
  occurrence count, not a set."
  [vec-vec-in]
  (reduce (fn [tallies bigram]
            ;; fnil supplies 0 the first time a bigram is seen.
            (update tallies bigram (fnil inc 0)))
          {}
          vec-vec-in))
(defn bigram-histogram
  "Prints one line per entry of `bigram-output-set-in` in the form
  `<first> <second> - <stars>`, where the number of `*` characters equals
  the entry's count.

  Each entry is read as [[first second] count]. Returns nil."
  [bigram-output-set-in]
  (run! (fn [entry]
          (let [bigram (get entry 0)
                tally  (get entry 1)
                bar    (apply str (repeat tally \*))]
            (println (format "%s %s - %s"
                             (get bigram 0)
                             (get bigram 1)
                             bar))))
        bigram-output-set-in))
(defn -main
  "Entry point: prints a bigram histogram for a text file, preceded and
  followed by a blank line.

  The input path is taken from the first command-line argument. When no
  argument is given it falls back to \"bg_input_text.txt\", so running the
  program without arguments behaves as it did before."
  [& args]
  (let [input-path (or (first args) "bg_input_text.txt")]
    (println)
    ;; Pipeline: words -> adjacent pairs -> pair counts -> printed bars.
    (-> input-path
        get-clean-text
        create-bigram-vector
        create-bigram-output-set
        bigram-histogram)
    (println)))