
@stephenmac7
Last active September 11, 2015 00:58

Revisions

  1. stephenmac7 revised this gist Sep 11, 2015. 1 changed file with 16 additions and 10 deletions.
    freq.rb: 26 changes (16 additions & 10 deletions)
    @@ -34,16 +34,20 @@ def main(opt)
                                            # big sizes and ve uses pipes

       # Process the text and count lemmas, this might take a while
    -  freq = Hash.new(0)
    -  lines.each do |line|
    -    ve_line = filter_blacklisted(Ve.in(:ja).words(line))
    -    get_frequency_hash(ve_line, opt["--morpheme"], freq)
    -  end
    +  freq = calculate_frequency(lines, opt['--morpheme'])

       # Show count
       show_count(freq)
     end

    +def calculate_frequency(lines, morpheme)
    +  # Creates a hash with the frequency for all the lines
    +  lines.reduce(Hash.new(0)) do |freq,line|
    +    ve_line = filter_blacklisted(Ve.in(:ja).words(line))
    +    get_frequency_hash(ve_line, morpheme, freq)
    +  end
    +end
    +
     def remove_rubies(text)
       # For Aozora Bunko text as input, rubies need to be removed
       text.gsub(/《.*》/, "")
    @@ -79,8 +83,10 @@ def show_count(counts)
       end
     end

    -begin
    -  main Docopt::docopt(doc, version: '0.0.1')
    -rescue Docopt::Exit => e
    -  puts e.message
    -end
    +if __FILE__==$0
    +  begin
    +    main Docopt::docopt(doc, version: '0.0.1')
    +  rescue Docopt::Exit => e
    +    puts e.message
    +  end
    +end
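    This revision swaps the explicit accumulator loop for a single reduce over Hash.new(0); that works because get_frequency_hash returns the hash it was given, so each block's value becomes the accumulator for the next line. It also wraps the entry point in an if __FILE__==$0 guard so the file can be required without running main. A minimal sketch of the reduce pattern, with a hypothetical count_words helper standing in for the filter_blacklisted + Ve.in(:ja).words calls:

    # Sketch only: count_words is a stand-in for the Ve/mecab lookup in the gist.
    def count_words(line, freq)
      line.split.each { |word| freq[word] += 1 }
      freq # returning the hash lets reduce thread it through as the accumulator
    end

    lines = ["a b a", "b c"]
    freq = lines.reduce(Hash.new(0)) { |acc, line| count_words(line, acc) }
    # freq => {"a"=>2, "b"=>2, "c"=>1}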
  2. stephenmac7 revised this gist Jun 27, 2015. 1 changed file with 1 addition and 1 deletion.
    freq.rb: 2 changes (1 addition & 1 deletion)
    @@ -1,4 +1,4 @@
    -# Gem Depends: ve
    +# Gem Depends: ve, docopt
     # System Depends: mecab, mecab-ipadic-utf-8
     require 'csv'
     require 've'
  3. stephenmac7 revised this gist Jun 27, 2015. 1 changed file with 64 additions and 23 deletions.
    freq.rb: 87 changes (64 additions & 23 deletions)
    @@ -1,45 +1,86 @@
    -# Lemma Frequency Report
     # Gem Depends: ve
     # System Depends: mecab, mecab-ipadic-utf-8
    -# Usage: ruby freq.rb [FILE]
    +require 'csv'
     require 've'
    +require 'docopt'

    -def main
    -  # Input from stdin or args, UTF-8 required
    -  contents = ARGF.read
    -  # I'm using aozora bunko text as input, so the rubies need to be removed
    -  plain = contents.gsub(/《.*》/, "")
    -  # Process the text, this might take a while
    -  parsed = Ve.in(:ja).words(plain)
    -  # Get frequency of words not in the blacklist
    -  freq = count_lemmas(filter_blacklisted(parsed))
    -  # Show it
    +doc = <<DOCOPT
    +Lemma Frequency Report.
    +
    +Usage:
    +  #{__FILE__} [options] FILE ...
    +  #{__FILE__} -h | --help
    +  #{__FILE__} --version
    +
    +Options:
    +  -h --help      Show this screen.
    +  -m --morpheme  Target morphemes, instead of lexemes.
    +  --version      Show version.
    +DOCOPT
    +
    +def main(opt)
    +  # Input from args, UTF-8 required
    +  contents = ''
    +  opt['FILE'].each do |f|
    +    if f == '-'
    +      f = '/dev/stdin'
    +    end
    +    contents << File.read(f)
    +  end
    +
    +  # Pre-processing
    +  lines = remove_rubies(contents).split # We need to give mecab bite-sized
    +                                        # pieces, because pipes can't handle
    +                                        # big sizes and ve uses pipes
    +
    +  # Process the text and count lemmas, this might take a while
    +  freq = Hash.new(0)
    +  lines.each do |line|
    +    ve_line = filter_blacklisted(Ve.in(:ja).words(line))
    +    get_frequency_hash(ve_line, opt["--morpheme"], freq)
    +  end
    +
    +  # Show count
       show_count(freq)
     end

    -def count_lemmas(words)
    -  # Now we have a list of words, let's take the lemmas,
    -  # which seem to be all we're interested in at the moment.
    -  # And count, using a hash
    -  lemma_counts = Hash.new(0)
    +def remove_rubies(text)
    +  # For Aozora Bunko text as input, rubies need to be removed
    +  text.gsub(/《.*》/, "")
    +end
    +
    +# For morpheme operations, it would be much faster to use mecab directly
    +def get_frequency_hash(words, morpheme, freq = Hash.new(0))
       words.each do |word|
         unless word.lemma == "*" # if lemma could not be found, don't count
    -      lemma_counts[word.lemma] += 1
    +      if morpheme
    +        word.tokens.each do |token|
    +          index = [token[:lemma], token[:pos]]
    +          freq[index] += 1
    +        end
    +      else
    +        index = [word.lemma, word.part_of_speech.name]
    +        freq[index] += 1
    +      end
         end
       end

    -  lemma_counts
    +  freq
     end

     def filter_blacklisted(words)
       pos_blacklist = [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::ProperNoun]
       words.select { |word| not pos_blacklist.include? word.part_of_speech }
     end

    -def show_count(lemma_counts)
    -  lemma_counts.sort_by{|_,count| count}.reverse.each do |lemma,count|
    -    puts "#{count}\t#{lemma}"
    +def show_count(counts)
    +  counts.sort_by{|_,count| count}.reverse.each do |ind,count|
    +    print [count, ind.first, ind.last].to_csv
       end
     end

    -main
    +begin
    +  main Docopt::docopt(doc, version: '0.0.1')
    +rescue Docopt::Exit => e
    +  puts e.message
    +end
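    Two things worth noting in this rewrite: the input is split into whitespace-separated chunks before being handed to Ve, because Ve talks to mecab through a pipe and, as the comment says, the pipe cannot handle big payloads; and the output switches from tab-separated puts lines to CSV rows. Array#to_csv (from the csv standard library) already appends the row separator, which is why show_count uses print rather than puts. A small illustration of the CSV output step, with made-up counts and labels:

    require 'csv'

    # Made-up sample data; the keys mirror the [lemma, part_of_speech] pairs
    # built by get_frequency_hash.
    freq = { ["食べる", "verb"] => 3, ["本", "noun"] => 1 }

    freq.sort_by { |_, count| count }.reverse.each do |ind, count|
      print [count, ind.first, ind.last].to_csv # "3,食べる,verb\n" then "1,本,noun\n"
    end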
  4. stephenmac7 created this gist Jun 26, 2015.
    freq.rb: 45 changes (45 additions & 0 deletions)
    @@ -0,0 +1,45 @@
    # Lemma Frequency Report
    # Gem Depends: ve
    # System Depends: mecab, mecab-ipadic-utf-8
    # Usage: ruby freq.rb [FILE]
    require 've'

    def main
      # Input from stdin or args, UTF-8 required
      contents = ARGF.read
      # I'm using aozora bunko text as input, so the rubies need to be removed
      plain = contents.gsub(/《.*》/, "")
      # Process the text, this might take a while
      parsed = Ve.in(:ja).words(plain)
      # Get frequency of words not in the blacklist
      freq = count_lemmas(filter_blacklisted(parsed))
      # Show it
      show_count(freq)
    end

    def count_lemmas(words)
      # Now we have a list of words, let's take the lemmas,
      # which seem to be all we're interested in at the moment.
      # And count, using a hash
      lemma_counts = Hash.new(0)
      words.each do |word|
        unless word.lemma == "*" # if lemma could not be found, don't count
          lemma_counts[word.lemma] += 1
        end
      end

      lemma_counts
    end

    def filter_blacklisted(words)
      pos_blacklist = [Ve::PartOfSpeech::Symbol, Ve::PartOfSpeech::ProperNoun]
      words.select { |word| not pos_blacklist.include? word.part_of_speech }
    end

    def show_count(lemma_counts)
      lemma_counts.sort_by{|_,count| count}.reverse.each do |lemma,count|
        puts "#{count}\t#{lemma}"
      end
    end

    main
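    The original version reads everything from ARGF, strips Aozora Bunko ruby annotations (reading glosses wrapped in 《…》) before parsing, and counts lemmas in a Hash.new(0). A quick illustration of the ruby-stripping gsub on made-up sample lines; note that the greedy .* also swallows any text between the first 《 and the last 》 when a line carries more than one gloss, so /《.*?》/ would be the non-greedy variant:

    text  = "吾輩《わがはい》は猫である" # made-up Aozora-style sample
    plain = text.gsub(/《.*》/, "")
    # plain => "吾輩は猫である"

    # With two glosses on one line, the greedy match also removes the text between them:
    "吾輩《わがはい》は猫《ねこ》である".gsub(/《.*》/, "")  # => "吾輩である"
    "吾輩《わがはい》は猫《ねこ》である".gsub(/《.*?》/, "") # => "吾輩は猫である"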