Last active
          September 11, 2015 00:58 
        
      - 
      
- 
        Save stephenmac7/fa430fb3b3cfc033398a to your computer and use it in GitHub Desktop. 
Revisions
- 
        stephenmac7 revised this gist Sep 11, 2015 . 1 changed file with 16 additions and 10 deletions.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -34,16 +34,20 @@ def main(opt) # big sizes and ve uses pipes # Process the text and count lemmas, this might take a while freq = calculate_frequency(lines, opt['--morpheme']) # Show count show_count(freq) end def calculate_frequency(lines, morpheme) # Creates a hash with the frequency for all the lines lines.reduce(Hash.new(0)) do |freq,line| ve_line = filter_blacklisted(Ve.in(:ja).words(line)) get_frequency_hash(ve_line, morpheme, freq) end end def remove_rubies(text) # For Aozora Bunko text as input, rubies need to be removed text.gsub(/《.*》/, "") @@ -79,8 +83,10 @@ def show_count(counts) end end if __FILE__==$0 begin main Docopt::docopt(doc, version: '0.0.1') rescue Docopt::Exit => e puts e.message end end 
- 
        stephenmac7 revised this gist Jun 27, 2015 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewingThis file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,4 @@ # Gem Depends: ve, docopt # System Depends: mecab, mecab-ipadic-utf-8 require 'csv' require 've' 
- 
        # stephenmac7 revised this gist Jun 27, 2015: 1 changed file with 64 additions and 23 deletions (@@ -1,45 +1,86 @@).
# Lemma frequency report for Japanese text.
# Gem Depends: ve, docopt
# System Depends: mecab, mecab-ipadic-utf-8
require 'csv'
require 've'
require 'docopt'

# Usage text handed to docopt, which derives the accepted arguments from it.
doc = <<DOCOPT
Lemma Frequency Report.

Usage:
  #{__FILE__} [options] FILE ...
  #{__FILE__} -h | --help
  #{__FILE__} --version

Options:
  -h --help      Show this screen.
  -m --morpheme  Target morphemes, instead of lexemes.
  --version      Show version.
DOCOPT

# Reads every FILE argument, counts its lemmas (or morphemes) and prints the
# report.
#
# opt - the options Hash returned by Docopt::docopt. Reads 'FILE' (a list of
#       paths, where '-' means standard input) and '--morpheme'.
def main(opt)
  # Input from args, UTF-8 required. Files are joined with a newline so the
  # last word of one file cannot fuse with the first word of the next.
  contents = opt['FILE'].map { |path| path == '-' ? $stdin.read : File.read(path) }.join("\n")

  # Pre-processing: split on whitespace. We need to give mecab bite-sized
  # pieces, because pipes can't handle big sizes and ve uses pipes.
  lines = remove_rubies(contents).split

  # Process the text and count lemmas, this might take a while.
  freq = Hash.new(0)
  lines.each do |line|
    ve_line = filter_blacklisted(Ve.in(:ja).words(line))
    get_frequency_hash(ve_line, opt['--morpheme'], freq)
  end

  show_count(freq)
end

# Removes ruby annotations (every 《...》 span) from Aozora Bunko text.
#
# The quantifier is non-greedy on purpose: with a greedy `.*`, a line holding
# two annotations would lose everything from the first 《 to the last 》,
# including the real text in between.
#
# text - the String to clean.
#
# Returns a new String without the annotations.
def remove_rubies(text)
  text.gsub(/《.*?》/, '')
end

# Adds the given words to a frequency table keyed by [lemma, part of speech].
# For morpheme operations, it would be much faster to use mecab directly.
#
# words    - the words to count (as produced by Ve's `words`).
# morpheme - when truthy, count each token of a word separately instead of
#            the word itself.
# freq     - the table to update; a fresh zero-default Hash when omitted.
#
# Returns freq.
def get_frequency_hash(words, morpheme, freq = Hash.new(0))
  words.each do |word|
    # If the lemma could not be found, don't count the word.
    next if word.lemma == '*'

    if morpheme
      word.tokens.each { |token| freq[[token[:lemma], token[:pos]]] += 1 }
    else
      freq[[word.lemma, word.part_of_speech.name]] += 1
    end
  end
  freq
end

# Drops the words whose part of speech is a symbol or a proper noun.
#
# Returns a new Array with the remaining words.
def filter_blacklisted(words)
  pos_blacklist = [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::ProperNoun]
  words.reject { |word| pos_blacklist.include?(word.part_of_speech) }
end

# Prints one CSV row (count, lemma, part of speech) per entry, highest count
# first.
#
# counts - Hash mapping a [lemma, part of speech] pair to its count.
def show_count(counts)
  counts.sort_by { |_, count| count }.reverse_each do |ind, count|
    print [count, ind.first, ind.last].to_csv
  end
end

begin
  main Docopt::docopt(doc, version: '0.0.1')
rescue Docopt::Exit => e
  # docopt signals --help, --version and usage errors through this exception.
  puts e.message
end
- 
        # stephenmac7 created this gist Jun 26, 2015 (@@ -0,0 +1,45 @@).
# Lemma Frequency Report
# Gem Depends: ve
# System Depends: mecab, mecab-ipadic-utf-8
# Usage: ruby freq.rb [FILE]
require 've'

# Reads the input, counts the lemmas of the words that are not blacklisted and
# prints the counts.
def main
  # Input from stdin or args, UTF-8 required.
  contents = ARGF.read

  # I'm using Aozora Bunko text as input, so the rubies (《...》) need to be
  # removed. The match is non-greedy: a greedy `.*` would also delete the real
  # text lying between two annotations on the same line.
  plain = contents.gsub(/《.*?》/, '')

  # Process the text, this might take a while.
  parsed = Ve.in(:ja).words(plain)

  # Get frequency of words not in the blacklist.
  freq = count_lemmas(filter_blacklisted(parsed))

  show_count(freq)
end

# Counts how often each lemma occurs in the given words.
#
# words - the words to count (as produced by Ve's `words`).
#
# Returns a Hash mapping lemma to count; missing keys default to 0.
def count_lemmas(words)
  words.each_with_object(Hash.new(0)) do |word, lemma_counts|
    # If the lemma could not be found, don't count the word.
    next if word.lemma == '*'

    lemma_counts[word.lemma] += 1
  end
end

# Drops the words whose part of speech is a symbol or a proper noun.
#
# Returns a new Array with the remaining words.
def filter_blacklisted(words)
  pos_blacklist = [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::ProperNoun]
  words.reject { |word| pos_blacklist.include?(word.part_of_speech) }
end

# Prints one "count<TAB>lemma" line per entry, highest count first.
#
# lemma_counts - Hash mapping lemma to count.
def show_count(lemma_counts)
  lemma_counts.sort_by { |_, count| count }.reverse_each do |lemma, count|
    puts "#{count}\t#{lemma}"
  end
end

main