stephenmac7 · September 11, 2015 00:58 · Sep 11, 2015 · Jun 27, 2015 · Jun 27, 2015 · Jun 26, 2015
diff --git a/freq.rb b/freq.rb
@@ -34,17 +34,21 @@ def main(opt)
                                         # big sizes and ve uses pipes
 
   # Process the text and count lemmas, this might take a while
-  freq = Hash.new(0)
-  lines.each do |line|
-    ve_line = filter_blacklisted(Ve.in(:ja).words(line))
-    get_frequency_hash(ve_line, opt["--morpheme"], freq)
-  end
+  freq = calculate_frequency(lines, opt['--morpheme'])
 
   # Show count
   show_count(freq)
 end
 
+def calculate_frequency(lines, morpheme)
+  # Tallies every line into one Hash (default 0). each_with_object keeps the
+  # accumulator itself, so the result does not depend on the block's value.
+  lines.each_with_object(Hash.new(0)) do |line, freq|
+    get_frequency_hash(filter_blacklisted(Ve.in(:ja).words(line)), morpheme, freq)
+  end
+end
+
 def remove_rubies(text)
   # For Aozora Bunko text as input, rubies need to be removed
-  text.gsub(/《.*》/, "")
+  text.gsub(/《.*?》/, "") # lazy match: strip each ruby, keep the text between two
 end
@@ -79,8 +83,10 @@ def show_count(counts)
   end
 end
 
-begin
-  main Docopt::docopt(doc, version: '0.0.1')
-rescue Docopt::Exit => e
-  puts e.message
-end
+if __FILE__ == $PROGRAM_NAME # run the CLI only when this file is the script itself
+  begin
+    main(Docopt.docopt(doc, version: '0.0.1'))
+  rescue Docopt::Exit => e
+    puts e.message
+  end
+end
diff --git a/freq.rb b/freq.rb
@@ -1,4 +1,4 @@
-# Gem Depends: ve
+# Gem Depends: ve, docopt
 # System Depends: mecab, mecab-ipadic-utf-8
 require 'csv'
 require 've'

diff --git a/freq.rb b/freq.rb
@@ -1,45 +1,86 @@
-# Lemma Frequency Report
 # Gem Depends: ve
 # System Depends: mecab, mecab-ipadic-utf-8
-# Usage: ruby freq.rb [FILE]
+require 'csv'
 require 've'
+require 'docopt'
 
-def main
-  # Input from stdin or args, UTF-8 required
-  contents = ARGF.read
-  # I'm using aozora bunko text as input, so the rubies need to be removed
-  plain = contents.gsub(/《.*》/, "")
-  # Process the text, this might take a while
-  parsed = Ve.in(:ja).words(plain)
-  # Get frequency of words not in the blacklist
-  freq = count_lemmas(filter_blacklisted(parsed))
-  # Show it
+doc = <<DOCOPT
+Lemma Frequency Report.
+
+Usage:
+  #{__FILE__} [options] FILE ...
+  #{__FILE__} -h | --help
+  #{__FILE__} --version
+
+Options:
+  -h --help      Show this screen.
+  -m --morpheme  Target morphemes, instead of lexemes.
+  --version      Show version.
+DOCOPT
+
+def main(opt)
+  # Input from args, UTF-8 required
+  contents = ''
+  opt['FILE'].each do |f|
+    if f == '-'
+      f = '/dev/stdin'
+    end
+    contents << File.read(f)
+  end
+
+  # Pre-processing
+  lines = remove_rubies(contents).split # We need to give mecab bite-sized
+                                        # pieces, because pipes can't handle
+                                        # big sizes and ve uses pipes
+
+  # Process the text and count lemmas, this might take a while
+  freq = Hash.new(0)
+  lines.each do |line|
+    ve_line = filter_blacklisted(Ve.in(:ja).words(line))
+    get_frequency_hash(ve_line, opt["--morpheme"], freq)
+  end
+
+  # Show count
   show_count(freq)
 end
 
-def count_lemmas(words)
-  # Now we have a list of words, let's take the lemmas,
-  # which seem to be all we're interested in at the moment.
-  # And count, using a hash
-  lemma_counts = Hash.new(0)
+def remove_rubies(text)
+  # For Aozora Bunko text as input, rubies need to be removed
+  text.gsub(/《.*》/, "")
+end
+
+# For morpheme operations, it would be much faster to use mecab directly
+def get_frequency_hash(words, morpheme, freq = Hash.new(0))
   words.each do |word|
     unless word.lemma == "*" # if lemma could not be found, don't count
-      lemma_counts[word.lemma] += 1
+      if morpheme
+        word.tokens.each do |token|
+          index = [token[:lemma], token[:pos]]
+          freq[index] += 1
+        end
+      else
+        index = [word.lemma, word.part_of_speech.name]
+        freq[index] += 1
+      end
     end
   end
 
-  lemma_counts
+  freq
 end
 
 def filter_blacklisted(words)
   pos_blacklist = [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::ProperNoun]
   words.select { |word| not pos_blacklist.include? word.part_of_speech }
 end
 
-def show_count(lemma_counts)
-  lemma_counts.sort_by{|_,count| count}.reverse.each do |lemma,count|
-    puts "#{count}\t#{lemma}"
+def show_count(counts)
+  counts.sort_by{|_,count| count}.reverse.each do |ind,count|
+    print [count, ind.first, ind.last].to_csv
   end
 end
 
-main
+begin
+  main Docopt::docopt(doc, version: '0.0.1')
+rescue Docopt::Exit => e
+  puts e.message
+end
diff --git a/freq.rb b/freq.rb
@@ -0,0 +1,45 @@
+# Lemma Frequency Report
+# Gem Depends: ve
+# System Depends: mecab, mecab-ipadic-utf-8
+# Usage: ruby freq.rb [FILE]
+require 've'
+
+def main
+  # Input from stdin or args, UTF-8 required
+  contents = ARGF.read
+  # I'm using aozora bunko text as input, so the rubies need to be removed
+  plain = contents.gsub(/《.*》/, "")
+  # Process the text, this might take a while
+  parsed = Ve.in(:ja).words(plain)
+  # Get frequency of words not in the blacklist
+  freq = count_lemmas(filter_blacklisted(parsed))
+  # Show it
+  show_count(freq)
+end
+
+def count_lemmas(words)
+  # Now we have a list of words, let's take the lemmas,
+  # which seem to be all we're interested in at the moment.
+  # And count, using a hash
+  lemma_counts = Hash.new(0)
+  words.each do |word|
+    unless word.lemma == "*" # if lemma could not be found, don't count
+      lemma_counts[word.lemma] += 1
+    end
+  end
+
+  lemma_counts
+end
+
+def filter_blacklisted(words)
+  pos_blacklist = [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::ProperNoun]
+  words.select { |word| not pos_blacklist.include? word.part_of_speech }
+end
+
+def show_count(lemma_counts)
+  lemma_counts.sort_by{|_,count| count}.reverse.each do |lemma,count|
+    puts "#{count}\t#{lemma}"
+  end
+end
+
+main