pvdb · February 16, 2025 17:40 · Feb 13, 2025 · Feb 13, 2025 · Feb 13, 2025 · Feb 13, 2025
diff --git a/gloc b/gloc
@@ -4,9 +4,22 @@
 #
 # INSTALLATION
 #
-# ln -s ${PWD}/gloc $(brew --prefix)/bin/
-# sudo ln -s ${PWD}/gloc /usr/local/bin/
+#   ln -s ${PWD}/gloc $(brew --prefix)/bin/
+#   sudo ln -s ${PWD}/gloc /usr/local/bin/
 #
+# DEPENDENCIES
+#
+#   brew install cmark-gfm
+#
+
+if $help
+  require 'tempfile'
+
+  readme = File.join(__dir__, 'README.md')
+  html = Tempfile.new(['gloc.', '.html'])
+  `cmark-gfm #{readme} > #{html.path}`
+  exec "open #{html.path}"
+end
 
 #
 # rubocop:disable Layout/IndentationWidth

diff --git a/gloc b/gloc
@@ -220,9 +220,21 @@ unless $visual
 end
 
 if $visual
-  require 'rainbow'
   require 'io/console'
 
+  class String
+    def colorize(color_code)
+      "\e[#{color_code}m#{self}\e[0m"
+    end
+
+    # rubocop:disable Style/SingleLineMethods
+    def red()    colorize('31'); end
+    def green()  colorize('32'); end
+    def yellow() colorize('33'); end
+    def blue()   colorize('34'); end
+    # rubocop:enable Style/SingleLineMethods
+  end
+
   max_line_count   = file_stats.values.map(&:line_count).max
   longest_filename = file_stats.keys.map(&:first).map(&:length).max
   _, console_width = IO.console.winsize
@@ -238,9 +250,9 @@ if $visual
     puts format(
       " %-#{longest_filename}<file>s | %<code>s%<comment>s%<blank>s",
       file: file,
-      code:    Rainbow('+' * code_width).green,
-      comment: Rainbow('-' * comment_width).red,
-      blank:   Rainbow('_' * blank_width).blue
+      code:    ('+' * code_width).green,
+      comment: ('-' * comment_width).red,
+      blank:   ('_' * blank_width).blue
     )
   end
 end

diff --git a/gloc b/gloc
@@ -77,6 +77,7 @@ COMMENTS = {
   '*.html' => %r{\A\s*(<!--.*-->)\s*\Z},
   '*.css'  => %r{\A\s*(/\*.*\*/)\s*\Z},
   '*.js'   => %r{\A\s*(//.*|/\*.*\*/)\s*\Z},
+  '*.php'  => %r{\A\s*(//.*|#.*)\s*\Z},
 }.freeze
 
 STATS_FOR_FILE = Hash.new do |stats_for_file, (file, blank_re, comment_re)|

diff --git a/README.md b/README.md
@@ -0,0 +1,124 @@
+# yet another code counter: `gloc`
+
+`gloc` is [yet another code counter](https://github.com/search?q=%22code+counter%22), with some differences and enhancements compared to similar utilities.
+
+Most importantly, it simply groups files based on their file extension, unlike other code counters that group files based on the - oftentimes incorrect - language of their content _(as determined by whatever "language detection" heuristics these tools may use)_.
+
+Further differences and enhancements:
+
+* it doesn't ignore files just because it doesn't recognise them _(i.e. cannot correctly determine or guess their language)_
+* in a git repo, it processes `$( git ls-files )` by default
+* in a non-git directory, it processes `$( find . -type f )` by default
+* it generates human-friendly, `(c)loc`-alike output
+* it is Unix pipeline friendly, by design:
+   * it reads the list of filenames to process from `stdin` if `[ ! -t 0 ]`
+   * it writes machine-parsable JSON output to `stdout` if `[ ! -t 1 ]`
+
+## Example
+
+For the popular Ruby on Rails framework, `gloc` generates the following `(c)loc`-alike output:
+
+    --------------------------------------------------------------------------------
+     Language             Files        Lines        Blank      Comment         Code
+    --------------------------------------------------------------------------------
+     *.rb                 2,149      304,495       47,846       42,651      213,998
+     *.md                    74       49,604       14,204            0       35,400
+     *.js                    39        9,717        1,452          564        7,701
+     *.yml                  150        3,367          278            0        3,089
+     *.erb                  408        2,183          254            0        1,929
+     *                       81        2,255          392            0        1,863
+     *.css                   24        1,640          214           32        1,394
+     *.coffee                24        1,190          197            0          993
+     *.rake                  16          864          137            0          727
+     *.rdoc                  11          985          352            0          633
+     *.tt                    28          515           88            0          427
+     *.lock                   1          437           11            0          426
+     *.yaml                   1          231            1            0          230
+     *.gemspec               11          306           79            0          227
+     *.html                  28          225           15            3          207
+     *.json                   3           65            0            0           65
+     *.builder               19           62            2            0           60
+     *.y                      1           50            4            0           46
+     *.sql                    1           49            6            0           43
+     *.zoo                    2            8            0            0            8
+     *.ru                     2            8            2            0            6
+     *.txt                    6            6            0            0            6
+     *.ruby                   2            4            0            0            4
+     *.erb~                   4            4            0            0            4
+     *.raw                    2            2            0            0            2
+     *.styles                 1            1            0            0            1
+     *.log                    1            1            0            0            1
+     *.dtd                    1            1            0            0            1
+     *.mab                    1            1            0            0            1
+     *.javascript             1            1            0            0            1
+    --------------------------------------------------------------------------------
+     Total                3,092      378,277       65,534       43,250      269,493
+    --------------------------------------------------------------------------------
+
+## What It Is Not!
+
+For various reasons, none of these existing utilities to count lines of code are fit for _(my)_ purpose:
+
+* [cgag/loc](https://github.com/cgag/loc)
+* [AlDanial/cloc](https://github.com/AlDanial/cloc)
+* [Aaronepower/tokei](https://github.com/Aaronepower/tokei)
+* [SLOCCount](http://www.dwheeler.com/sloccount/)
+
+And it **definitely** isn't:
+
+* [LocMetrics](http://www.locmetrics.com/)
+
+## Usage
+
+The simplest way to use `gloc` is to simply run:
+
+    gloc
+
+It should behave pretty much as you'd expect!
+
+### in a git repo
+
+In a git repo, running `gloc` will process all files known to git, so is roughly equivalent to:
+
+    git ls-files | gloc
+
+### in a non-git directory
+
+In a non-git directory, running `gloc` will process all files in the directory, so is roughly equivalent to:
+
+    find . -type f | gloc
+
+## Sorting
+
+The results are sorted by "lines of code" by default _(with "lines of code" defined as lines that aren't blank or comment-only)_ but the following options are supported to sort the results differently:
+
+    gloc -files     # sort by number of files
+    gloc -lines     # sort by the total number of lines
+    gloc -blank     # sort by the number of blank lines
+    gloc -comment   # sort by the number of comment lines
+    gloc -code      # sort by lines of code (default)
+
+## Processing
+
+When `gloc`'s STDOUT isn't a TTY, it outputs the LoC stats in JSON format, for further parsing and processing.
+
+This also means you can pretty-print the LoC stats as follows:
+
+    gloc | jq
+
+... which uses [the `jq` utility](https://stedolan.github.io/jq/) for processing the JSON output.
+
+To "force" the typical TTY output even when STDOUT isn't a TTY, you can use the `-tty` option as follows:
+
+    gloc -tty | pbcopy
+
+... which copies the non-JSON version of the LoC stats to the clipboard.
+
+## Known Issues and Possible Enhancements
+
+* identify comment-only lines for a lot more languages
+* support more file encodings (not just `UTF-8` and `ISO-8859-1`)
+* (?) parse shebang lines for scripts without a file extension
+* (?) installation via Homebrew
+* (?) convert script to Perl/Go/Rust/... for performance
+
diff --git a/gloc b/gloc
@@ -0,0 +1,262 @@
+#!/usr/bin/env -S ruby -s
+# frozen_string_literal: true
+
+#
+# INSTALLATION
+#
+# ln -s ${PWD}/gloc $(brew --prefix)/bin/
+# sudo ln -s ${PWD}/gloc /usr/local/bin/
+#
+
+#
+# rubocop:disable Layout/IndentationWidth
+# rubocop:disable Layout/HashAlignment
+# rubocop:disable Layout/ElseAlignment
+# rubocop:disable Layout/EndAlignment
+#
+# rubocop:disable Style/TrailingCommaInHashLiteral
+# rubocop:disable Style/TrailingCommaInArguments
+# rubocop:disable Style/EmptyCaseCondition
+# rubocop:disable Style/BlockDelimiters
+# rubocop:disable Style/Documentation
+# rubocop:disable Style/RegexpLiteral
+# rubocop:disable Style/GlobalVars
+#
+
+require 'English'
+require 'ostruct'
+
+source_files = if $stdin.tty? || $tty
+  if `git rev-parse --is-inside-work-tree`.chomp == 'true'
+    # we're inside a git repo so
+    # get list of files from git
+    `git ls-files -z #{ARGV.join(' ')}`.split("\0")
+  else
+    # we are not inside a git repo:
+    # find all files in current dir
+    `find #{ARGV.empty? ? Dir.pwd : ARGV.join(' ')} -print0`.split("\0")
+  end
+else
+  # assume we're running it in a pipeline
+  # and read list of filenames from $stdin
+  $stdin.read.split($RS).map(&:chomp)
+end
+
+# exclude binary files from stats
+# (files with NUL in file header)
+#
+# much slower alternative:
+#
+# `egrep -q '\\x00' #{file}` ; $? == 0
+#
+# note: git itself uses the first
+# 8,000 characters of a file, but
+# looking at the first 16 is fine
+# for our purposes... for now :-)
+# see buffer_is_binary() function
+# in the "git" source repository!
+source_files.delete_if { |file|
+  (
+    File.extname(file) == '.pdf'      || # skip bl**dy PDF documents
+    File.basename(file) =~ /\A\..*\z/ || # skip hidden ".*" files
+    !File.exist?(file)                || # skip non-existent paths
+    !File.file?(file)                 || # skip directories
+    !File.size?(file)                 || # skip empty files
+    !File.read(file, 16)["\0"].nil?      # skip binary files
+  ) && ($verbose && warn("SKIPPING #{file}..."); true)
+}
+
+BLANKS = Hash.new(%r{\A\s*\Z}.freeze) # whitespace-only line; same default for every key (TODO: ext-specific regex for blanks?)
+
+COMMENTS = {
+  # per-extension regex matching a line that holds nothing but a comment
+  # FIXME: multi-line comments are not recognised (only one-line forms match)
+  '*.rb'   => %r{\A\s*(#.*)\s*\Z},
+  '*.sh'   => %r{\A\s*(#.*)\s*\Z},
+  '*.xml'  => %r{\A\s*(<!--.*-->)\s*\Z},
+  '*.html' => %r{\A\s*(<!--.*-->)\s*\Z},
+  '*.css'  => %r{\A\s*(/\*.*\*/)\s*\Z},
+  '*.js'   => %r{\A\s*(//.*|/\*.*\*/)\s*\Z},
+}.freeze
+
+STATS_FOR_FILE = Hash.new do |stats_for_file, (file, blank_re, comment_re)|
+  file_content = File.read(file, encoding: 'UTF-8')
+  unless file_content.valid_encoding?
+    file_content = File.read(file, encoding: 'ISO-8859-1')
+    # FIXME: what about file encodings other than these two???
+  end
+
+  lines = file_content.each_line
+
+  stats_for_file[[file, blank_re, comment_re]] = OpenStruct.new(
+    line_count:    line_count = lines.count,
+    blank_count:   blank_count = lines.grep(blank_re).count,
+    comment_count: comment_count = lines.grep(comment_re).count,
+    code_count:    (line_count - blank_count - comment_count),
+  )
+end
+
+STATS_FOR = Hash.new do |stats_for_ext, ext|
+  stats_for_ext[ext] = OpenStruct.new(
+    file_count:    0,
+    line_count:    0,
+    blank_count:   0,
+    comment_count: 0,
+    code_count:    0,
+  )
+end
+
+source_files.each do |file|
+  ext = File.extname(file).prepend('*') # e.g. '*.rb' or '*' if no ext!
+
+  blank_regex   = BLANKS[ext]
+  comment_regex = COMMENTS[ext]
+
+  stats_for_file = STATS_FOR_FILE[[file, blank_regex, comment_regex]]
+  stats_for_ext  = STATS_FOR[ext]
+
+  stats_for_ext.file_count    += 1
+  stats_for_ext.line_count    += stats_for_file.line_count
+  stats_for_ext.blank_count   += stats_for_file.blank_count
+  stats_for_ext.comment_count += stats_for_file.comment_count
+  stats_for_ext.code_count    += stats_for_file.code_count
+end
+
+sort_metric = case
+              when $files   then :file_count
+              when $lines   then :line_count
+              when $blank   then :blank_count
+              when $comment then :comment_count
+              when $code    then :code_count
+              else :code_count
+              end
+
+file_stats = STATS_FOR_FILE.sort_by { |_, stats|
+  stats.send(sort_metric)
+}.reverse.to_h
+
+source_stats = STATS_FOR.sort_by { |_, stats|
+  stats.send(sort_metric)
+}.reverse.to_h
+
+source_stats['TOTAL'] = OpenStruct.new(
+  file_count:    source_stats.values.map(&:file_count).reduce(:+)    || 0,
+  line_count:    source_stats.values.map(&:line_count).reduce(:+)    || 0,
+  blank_count:   source_stats.values.map(&:blank_count).reduce(:+)   || 0,
+  comment_count: source_stats.values.map(&:comment_count).reduce(:+) || 0,
+  code_count:    source_stats.values.map(&:code_count).reduce(:+)    || 0,
+)
+
+#
+# JSON formatting for non-TTY output
+#
+
+unless $stdout.tty? || $tty || $visual
+  require 'json'
+
+  class OpenStruct
+    def to_json(*args)
+      to_h.to_json(args)
+    end
+  end
+
+  puts source_stats.to_json
+
+  exit
+end
+
+unless $visual
+  class String
+    def commify
+      gsub(/(\d)(?=(\d{3})+(\..*)?$)/, '\1,')
+    end
+  end
+
+  class Numeric
+    def commify
+      to_s.commify
+    end
+  end
+
+  #
+  # fancy formatting for TTY output
+  #
+
+  source_stats.each_value do |stats_for_ext|
+    stats_for_ext.file_count    = stats_for_ext.file_count.commify
+    stats_for_ext.line_count    = stats_for_ext.line_count.commify
+    stats_for_ext.blank_count   = stats_for_ext.blank_count.commify
+    stats_for_ext.comment_count = stats_for_ext.comment_count.commify
+    stats_for_ext.code_count    = stats_for_ext.code_count.commify
+  end
+
+  DIVIDER  = ('-' * 80) # because loc uses 80 columns
+  TEMPLATE = ' %-13s %12s %12s %12s %12s %12s'
+
+  puts format(
+    "#{DIVIDER}\n#{TEMPLATE}\n#{DIVIDER}",
+    'Language', 'Files', 'Lines', 'Blank', 'Comment', 'Code'
+  )
+
+  source_stats.each do |file_ext, stats|
+    next if file_ext == 'TOTAL'
+
+    puts format(
+      TEMPLATE,
+      file_ext,
+      stats.file_count,
+      stats.line_count,
+      stats.blank_count,
+      stats.comment_count,
+      stats.code_count,
+    )
+  end
+
+  puts format(
+    "#{DIVIDER}\n#{TEMPLATE}\n#{DIVIDER}",
+    'Total', *source_stats.fetch('TOTAL').to_h.values
+  )
+end
+
+if $visual
+  require 'rainbow'
+  require 'io/console'
+
+  max_line_count   = file_stats.values.map(&:line_count).max
+  longest_filename = file_stats.keys.map(&:first).map(&:length).max
+  _, console_width = IO.console.winsize
+  available_width  = Float(console_width - longest_filename - 5) # Float, so the width ratios below aren't integer-divided
+
+  abort 'Terminal not wide enough... aborting!' if available_width.negative?
+
+  file_stats.each_pair do |(file, _, _), stats| # one bar per file: '+' code, '-' comment, '_' blank
+    code_width    = (available_width * stats.code_count / max_line_count) # scaled against the largest per-file line_count
+    comment_width = (available_width * stats.comment_count / max_line_count) # (widths are Floats here)
+    blank_width   = (available_width * stats.blank_count / max_line_count) # NOTE(review): String#* presumably truncates them - confirm
+
+    puts format(
+      " %-#{longest_filename}<file>s | %<code>s%<comment>s%<blank>s",
+      file: file,
+      code:    Rainbow('+' * code_width).green,
+      comment: Rainbow('-' * comment_width).red,
+      blank:   Rainbow('_' * blank_width).blue
+    )
+  end
+end
+
+#
+# rubocop:enable Style/GlobalVars
+# rubocop:enable Style/RegexpLiteral
+# rubocop:enable Style/Documentation
+# rubocop:enable Style/BlockDelimiters
+# rubocop:enable Style/EmptyCaseCondition
+# rubocop:enable Style/TrailingCommaInArguments
+# rubocop:enable Style/TrailingCommaInHashLiteral
+#
+# rubocop:enable Layout/EndAlignment
+# rubocop:enable Layout/ElseAlignment
+# rubocop:enable Layout/HashAlignment
+# rubocop:enable Layout/IndentationWidth
+#
+
+# That's all Folks!