Skip to content

Instantly share code, notes, and snippets.

@mk-qi
Forked from eqhmcow/apache-logtop-README
Last active August 29, 2015 14:23
Show Gist options
  • Save mk-qi/dd43d55ff0c2e700231d to your computer and use it in GitHub Desktop.
Save mk-qi/dd43d55ff0c2e700231d to your computer and use it in GitHub Desktop.

Revisions

  1. @eqhmcow eqhmcow renamed this gist Mar 22, 2013. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. @eqhmcow eqhmcow revised this gist Jul 5, 2011. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions logstat.pl
    Original file line number Diff line number Diff line change
    @@ -65,10 +65,10 @@ sub init_time
    $referrer, $ua) = (m/
    ^(\S+)\s # vhost
    \S+\s # IP
    \S+\s # remote logname
    (?:\S+\s)+ # remote user
    \S+\s+ # remote logname
    (?:\S+\s+)+ # remote user
    \[([^]]+)\]\s # date
    "(\S+)\s? # method
    "(\S*)\s? # method
    (?:((?:[^"]*(?:\\")?)*)\s # URL
    ([^"]*)"\s| # protocol
    ((?:[^"]*(?:\\")?)*)"\s) # or, possibly URL with no protocol
  3. @eqhmcow eqhmcow revised this gist Jun 24, 2011. 3 changed files with 102 additions and 30 deletions.
    32 changes: 19 additions & 13 deletions README
    Original file line number Diff line number Diff line change
    @@ -2,7 +2,8 @@ DESCRIPTION

    logstat.pl and logtop.pl

    These scripts show current and average request-per-second counts based on apache access logs in real-time.
    These scripts show current and average request-per-second counts based on
    apache access logs in real-time.

    You can see the total requests-per-second as well as a breakdown by:
    * vhost
    @@ -24,14 +25,16 @@ The output shows:
    * and the request's vhost, URL, user agent or referrer.

    See also the apachetop tool - http://freshmeat.net/projects/apachetop/ and
    http://packages.debian.org/search?keywords=apachetop . This script is not based on or intended to
    mimic apachetop, but the functionality is broadly similar.
    http://packages.debian.org/search?keywords=apachetop . This script is not
    based on or intended to mimic apachetop, but the functionality is broadly
    similar.

    SYNOPSIS

    To use:

    In one screen, run the log parser. It writes out a stats file which the other scripts use.
    In one screen, run the log parser. It writes out a stats file which the
    other scripts use.

    $ cd /dev/shm # use shared memory
    $ tail -F /var/log/httpd/access.log | ~/logstat.pl # centralized logging helps here
    @@ -68,7 +71,7 @@ etc.

    EXAMPLE

    Every 5.0s: ~/logtop.pl Fri Jun 24 03:31:59 2011
    Every 5.0s: ~/logtop.pl Fri Jun 24 03:31:59 2011

    Fri Jun 24 03:31:58 2011 hps: 2, average: 5.35, 4.45, 2.37, 0 seconds ago, total
    vhosts sorted by five
    @@ -79,17 +82,20 @@ Fri Jun 24 03:31:34 2011 hps: 2, average: 0.99, 0.33, 0.12, 24 seconds ag

    NOTES

    * The script's tally sub is properly abstracted so this script could be quite easily modified to tally
    and report averages on anything you can count (not just httpd logs)
    * The script's tally sub is properly abstracted so this script could be
    quite easily modified to tally and report averages on anything you can count
    (not just httpd logs)

    * The log parsing regex matches the "v-combined" log format (combined with the virtual host at the front)
    * The log parsing regex matches the "v-combined" log format (combined with
    the virtual host at the front)

    * The logtop script accepts "match" and/or "skip" arguments to only show events that match (or do
    not match) a given regex.
    * The logtop script accepts "match" and/or "skip" arguments to only show
    events that match (or do not match) a given regex.

    TODO

    * Persist historical data and reload on restart (this can be done asynchronously with another client script)
    * Persist historical data and reload on restart (this can be done
    asynchronously with another client script)

    * Persist position in log file (using ftell) to recover from where we left off on restart (this should
    account for log file rotation)
    * Persist position in log file (using ftell) to recover from where we left
    off on restart (this should account for log file rotation)
    83 changes: 69 additions & 14 deletions logstat.pl
    Original file line number Diff line number Diff line change
    @@ -6,10 +6,12 @@
    use Time::HiRes 'time';
    use List::Util qw/sum/;
    use Storable;
    use Digest::MD5 'md5_hex';
    #use Data::Dumper;

    my %data; # internal (server-only) data
    my %shared; # data shared with clients
    my %keys; # keys to data shared with clients

    sub init
    {
    @@ -48,6 +50,15 @@ sub init_time
    # TODO: read previous data from persistent store on restart
    # (have a separate client that persists data)

    my $stotal = 0; # FIXME debug

    # update this when adding new categories
    my @data = qw/vhosts urls ua referrer/;

    foreach my $key (@data) {
    mkdir $key;
    }

    while (<>) {
    my $nt = time();
    my ($vhost, $date, $method, $url, $protocol, $alt_url, $code, $bytes,
    @@ -72,24 +83,29 @@ sub init_time

    # vhost counts
    $data{vhosts}{$vhost} ||= $shared{vhosts}{$vhost} ||= {};
    tally($data{vhosts}{$vhost}, $shared{vhosts}{$vhost}, $nt);
    tally($data{vhosts}{$vhost}, $shared{vhosts}{$vhost}, $nt,
    'vhosts', $vhost);

    # url counts
    $data{urls}{"$vhost $url"} ||= $shared{urls}{"$vhost $url"} ||= {};
    tally($data{urls}{"$vhost $url"}, $shared{urls}{"$vhost $url"}, $nt);
    my $vhost_url = "$vhost $url";
    $data{urls}{$vhost_url} ||= $shared{urls}{$vhost_url} ||= {};
    tally($data{urls}{$vhost_url}, $shared{urls}{$vhost_url}, $nt,
    'urls', $vhost_url);

    # user agent counts
    $data{ua}{$ua} ||= $shared{ua}{$ua} ||= {};
    tally($data{ua}{$ua}, $shared{ua}{$ua}, $nt);
    tally($data{ua}{$ua}, $shared{ua}{$ua}, $nt, 'ua', $ua);

    # referrer counts
    $data{referrer}{$referrer} ||= $shared{referrer}{$referrer} ||= {};
    tally($data{referrer}{$referrer}, $shared{referrer}{$referrer}, $nt);
    tally($data{referrer}{$referrer}, $shared{referrer}{$referrer}, $nt,
    'referrer', $referrer);

    # TODO: status codes? bytes? (bytes would require some additional work to
    # be useful)

    # total hit count
    print '.'; # FIXME debug
    next unless tally(\%data, \%shared, $nt);

    # show the last line we parsed; this can be used to confirm we're parsing
    @@ -100,29 +116,49 @@ sub init_time
    # due to a slow network) the date may be seconds or even minutes in the
    # past, even though the line was actually written to the log much more
    # recently than that (i.e. just now)
    print "last line parsed: [$vhost] [$date] [$method] [$url] [$protocol] [$alt_url] [$code] [$bytes] [$referrer] [$ua]\n\n";
    print "\n\nlast line parsed: [$vhost] [$date] [$method] [$url] " .
    "[$protocol] [$alt_url] [$code] [$bytes] [$referrer] [$ua]\n\n";

    # show the last second's total hits-per-second and the historical average
    # and last prune data
    # and the result of the last prune
    my $pt = $prune_time - $nt;
    show(\%shared, sprintf(" total\nnext prune in %.0f seconds; last $last_prune_took\n\n", $pt));
    show(\%shared, sprintf(
    " total\nnext prune in %.0f seconds; " .
    "last $last_prune_took\n", $pt));

    # store the data for clients to read.
    # store the total and keys to the categorized data for clients to read.
    # NOTE: change the working directory to /dev/shm before starting these
    # scripts to use shared memory; this can be considerably faster than
    # writing the shared data to local disk
    # NOTE: rename is used to atomically update the data file so clients do
    # not try to read the data while we're writing it
    store(\%shared, 'logstats.data.tmp');
    rename('logstats.data.tmp', 'logstats.data');
    my $nst = time(); # FIXME debug
    $keys{total} = {
    hps => $shared{hps},
    one => $shared{one},
    five => $shared{five},
    fifteen => $shared{fifteen},
    time => $shared{time},
    };
    my $entry_count = 0;
    foreach my $key (@data) {
    my $count = keys %{ $keys{$key} };
    $entry_count += $count;
    print "$count $key; ";
    }
    store(\%keys, 'logstats.keys.tmp');
    rename('logstats.keys.tmp', 'logstats.keys');
    $stotal += time() - $nst; # FIXME debug
    printf "$entry_count total entries; store took %.3f seconds\n\n", $stotal;
    $stotal = 0;

    # prune old data to prevent using too much memory
    # NOTE: this does block (for a hopefully short amount of time); we
    # should catch back up quickly, though
    next unless $pt <= 0;
    $pt = time();
    print "pruning data...\n";
    my $entry_count = 0;
    $entry_count = 0;
    my $prune_count = 0;
    $prune_time = $nt + $prune_every;
    foreach my $k (keys %data) {
    @@ -133,6 +169,8 @@ sub init_time
    $entry_count++;
    next unless (ref($val) || '') eq 'HASH'
    and $nt - $val->{time} > $prune_every;
    unlink("$k/" . $keys{$k}{$key});
    delete $keys{$k}{$key};
    delete $v->{$key};
    delete $shared{$k}{$key};
    $prune_count++;
    @@ -147,11 +185,15 @@ sub init_time
    sub tally
    {
    my $nt = $_[2] || time();
    init($_[0], $_[1], $nt) unless defined $_[0]->{count};
    my $init = defined $_[0]->{count};
    init($_[0], $_[1], $nt) unless $init;

    # count the event
    $_[0]->{count}++;

    # save data if this is the first time we've seen this event
    save($_[0], $_[1], $_[3], $_[4]) if $_[3] && !$init;

    # add $count to per-minute count and reset $count every second
    my $diff = $nt - $_[1]->{time};
    return 0 unless $diff >= 1;
    @@ -183,6 +225,7 @@ sub tally
    $shared->{one} = $data->{counts}[1] / 60;
    $shared->{five} = sum(@{$data->{counts}}[1..5]) / 5 / 60;
    $shared->{fifteen} = sum(@{$data->{counts}}[1..15]) / 15 / 60;
    save($data, $shared, $_[3], $_[4]) if $_[3];
    return 1;
    }

    @@ -198,14 +241,26 @@ sub tally
    no warnings 'uninitialized';
    $shared->{five} = sum($count, @{$data->{counts}}[1..5]) / 6 / 60;
    $shared->{fifteen} = sum($count, @{$data->{counts}}[1..15]) / 16 / 60;
    save($data, $shared, $_[3], $_[4]) if $_[3];
    return 1;
    }

    sub show
    {
    my $shared = $_[0];
    print scalar localtime($shared->{time});
    printf " hps: %6.0f, average: %.2f, %.2f, %.2f, ", $shared->{hps}, $shared->{one}, $shared->{five}, $shared->{fifteen};
    printf " hps: %6.0f, average: %.2f, %.2f, %.2f, ",
    $shared->{hps}, $shared->{one}, $shared->{five}, $shared->{fifteen};
    print $_[1] || "\n";
    return;
    }

    sub save
    {
    my $nst = time(); # FIXME debug
    my $file = $_[2] . '/' . ($keys{$_[2]}{$_[3]} ||= md5_hex($_[3]));
    store($_[1], "$file.tmp");
    rename("$file.tmp", $file);
    $stotal += time() - $nst; # FIXME debug
    return;
    }
    17 changes: 14 additions & 3 deletions logtop.pl
    Original file line number Diff line number Diff line change
    @@ -23,13 +23,24 @@
    $match_re = qr/$match/i if $match;
    $skip_re = qr/$skip/i if $skip;

    my $data = retrieve("logstats.data");
    # load data
    my $rt = time();
    my $datakeys = retrieve('logstats.keys');

    my $now = time();
    my $data = $datakeys->{total};
    foreach my $key (keys %{ $datakeys->{$type} }) {
    my $md5 = $datakeys->{$type}{$key};
    my $d;
    # file may have been pruned; skip if we can't read it
    eval { $d = retrieve("$type/$md5") };
    next if $@;
    $data->{$type}{$key} = $d;
    }

    my $now = time();
    show($data, " total\n", $now - $data->{time});

    print "$type sorted by $sort\n";
    printf "loaded data in %.3f seconds - $type sorted by $sort\n", $now - $rt;

    my @stale;
    foreach my $event (sort
  4. @eqhmcow eqhmcow revised this gist Jun 24, 2011. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions logstat.pl
    Original file line number Diff line number Diff line change
    @@ -54,8 +54,8 @@ sub init_time
    $referrer, $ua) = (m/
    ^(\S+)\s # vhost
    \S+\s # IP
    \S+\s
    \S+\s
    \S+\s # remote logname
    (?:\S+\s)+ # remote user
    \[([^]]+)\]\s # date
    "(\S+)\s? # method
    (?:((?:[^"]*(?:\\")?)*)\s # URL
  5. @eqhmcow eqhmcow revised this gist Jun 24, 2011. 4 changed files with 178 additions and 108 deletions.
    81 changes: 60 additions & 21 deletions README
    Original file line number Diff line number Diff line change
    @@ -1,15 +1,27 @@
    DESCRIPTION

    These scripts show total, per-vhost and per-URL request-per-second counts based on apache
    access logs in real-time.
    logstat.pl and logtop.pl

    These scripts show current and average request-per-second counts based on apache access logs in real-time.

    You can see the total requests-per-second as well as a breakdown by:
    * vhost
    * URL
    * user agent
    * or referrer.

    You can sort by:
    * number of requests in the last second
    * time of last stats update
    * average requests-per-second over the last one, five or fifteen minutes

    The output shows:
    * date of last stats update
    * last second's hits per second (hps)
    * one, five and fifteen minute hps average
    * seconds since last request
    * asterisk marks request hit in the last 5 seconds
    * vhost or URL requested
    * the date of the last stats update
    * the last second's hits per second (hps)
    * the one, five and fifteen minute hps average
    * the seconds since last request
    * an asterisk to mark requests hit in the last 5 seconds
    * and the request's vhost, URL, user agent or referrer.

    See also the apachetop tool - http://freshmeat.net/projects/apachetop/ and
    http://packages.debian.org/search?keywords=apachetop . This script is not based on or intended to
    @@ -24,33 +36,60 @@ In one screen, run the log parser. It writes out a stats file which the other sc
    $ cd /dev/shm # use shared memory
    $ tail -F /var/log/httpd/access.log | ~/logstat.pl # centralized logging helps here

    In another screen, view vhost stats:
    In another screen, view your stats:

    $ cd /dev/shm # use shared memory

    # defaults to vhosts sorted by five minute average
    $ watch -n 5 -- ~/logtop.pl

    In a third screen, view URL stats:
    or

    $ cd /dev/shm # use shared memory
    $ watch -n 5 -- '~/logtop-urls.pl --skip '\''\.(jpg|png|gif|js|css)$|^\S+\s(/robots\.txt$|/favicon\.ico$)'\'''
    # URLs sorted by five minute average, ignoring images, JS and CSS
    $ watch -n 5 -- '~/logtop.pl urls \
    --skip '\''\.(jpg|png|gif|js|css)$|^\S+\s(/robots\.txt$|/favicon\.ico$)'\'''

    EXAMPLE
    or

    # user agents sorted by requests in the last second
    $ watch -n 5 -- ~/logtop.pl ua --sort hps

    or

    Every 5.0s: ./logtop.pl Wed Jun 22 09:55:54 2011
    # referrers sorted by one minute average
    $ watch -n 5 -- ~/logtop.pl referrer --sort one

    Wed Jun 22 09:55:54 2011 hps: 9, average: 12.87, 13.19, 13.33, 0 seconds ago, total
    or

    Wed Jun 22 09:55:48 2011 hps: 17, average: 5.66, 1.90, 0.86, 6 seconds ago, example.com
    Wed Jun 22 09:55:45 2011 hps: 6, average: 1.17, 1.43, 1.48, 9 seconds ago, example.org
    Wed Jun 22 09:55:50 2011 hps: 3, average: 0.94, 1.33, 1.14, 4 seconds ago, * example.net
    # URLs sorted by last stats update
    $ watch -n 5 -- ~/logtop.pl urls --sort time

    etc.

    EXAMPLE

    Every 5.0s: ~/logtop.pl Fri Jun 24 03:31:59 2011

    Fri Jun 24 03:31:58 2011 hps: 2, average: 5.35, 4.45, 2.37, 0 seconds ago, total
    vhosts sorted by five
    Fri Jun 24 03:31:52 2011 hps: 0, average: 0.41, 0.59, 0.35, 6 seconds ago, example.com
    Fri Jun 24 03:31:54 2011 hps: 1, average: 1.17, 0.58, 0.23, 4 seconds ago, * example.net
    Fri Jun 24 03:31:34 2011 hps: 2, average: 0.99, 0.33, 0.12, 24 seconds ago, example.org
    ...

    NOTES

    * The script's tally sub is properly abstracted so this script could be modified to tally and report
    averages on anything you can count (not just httpd logs)
    * The script's tally sub is properly abstracted so this script could be quite easily modified to tally
    and report averages on anything you can count (not just httpd logs)

    * The log parsing regex matches the "v-combined" log format (combined with the virtual host at the front)

    * The logtop-urls script accepts "match" and/or "skip" arguments to only show URLs that match (or do
    * The logtop script accepts "match" and/or "skip" arguments to only show events that match (or do
    not match) a given regex.

    TODO

    * Persist historical data and reload on restart (this can be done asynchronously with another client script)

    * Persist position in log file (using ftell) to recover from where we left off on restart (this should
    account for log file rotation)
    96 changes: 81 additions & 15 deletions logstat.pl
    Original file line number Diff line number Diff line change
    @@ -17,10 +17,12 @@ sub init
    my $shared = $_[1];
    $data->{count} = 0;
    $data->{counts} = [0];
    $shared->{hps} = 0;
    $shared->{one} = 0;
    $shared->{five} = 0;
    $shared->{fifteen} = 0;
    init_time($data, $shared, $_[2]);
    return;
    }

    sub init_time
    @@ -36,8 +38,16 @@ sub init_time
    }
    return if $mode == 1;
    $data->{minute} = $shared->{time} + 60;
    return;
    }

    my $prune_every = 60 * 20; # prune every 20 minutes
    my $prune_time = time() + $prune_every;
    my $last_prune_took = "prune hasn't happened yet";

    # TODO: read previous data from persistent store on restart
    # (have a separate client that persists data)

    while (<>) {
    my $nt = time();
    my ($vhost, $date, $method, $url, $protocol, $alt_url, $code, $bytes,
    @@ -59,41 +69,96 @@ sub init_time
    die "Couldn't match $_" unless $vhost;
    $alt_url ||= '';
    $url ||= $alt_url;

    # vhost counts
    $data{vhosts}{$vhost} ||= {};
    $shared{vhosts}{$vhost} ||= {};
    $data{vhosts}{$vhost} ||= $shared{vhosts}{$vhost} ||= {};
    tally($data{vhosts}{$vhost}, $shared{vhosts}{$vhost}, $nt);

    # url counts
    $data{urls}{"$vhost $url"} ||= {};
    $shared{urls}{"$vhost $url"} ||= {};
    $data{urls}{"$vhost $url"} ||= $shared{urls}{"$vhost $url"} ||= {};
    tally($data{urls}{"$vhost $url"}, $shared{urls}{"$vhost $url"}, $nt);

    # TODO: user agents? referrers? status codes? bytes?
    # user agent counts
    $data{ua}{$ua} ||= $shared{ua}{$ua} ||= {};
    tally($data{ua}{$ua}, $shared{ua}{$ua}, $nt);

    # referrer counts
    $data{referrer}{$referrer} ||= $shared{referrer}{$referrer} ||= {};
    tally($data{referrer}{$referrer}, $shared{referrer}{$referrer}, $nt);

    # TODO: status codes? bytes? (bytes would require some additional work to
    # be useful)

    # total hit count
    if (tally(\%data, \%shared, $nt)) {
    print "last line parsed: [$vhost] [$date] [$method] [$url] [$protocol] [$alt_url] [$code] [$bytes] [$referrer] [$ua]\n\n";
    show(\%shared, " total\n\n");
    store(\%shared, 'logstats.data.tmp');
    rename('logstats.data.tmp', 'logstats.data');
    next unless tally(\%data, \%shared, $nt);

    # show the last line we parsed; this can be used to confirm we're parsing
    # data in real-time (and that we're parsing the log correctly)
    # NOTE: the date from the apache log is the date the HTTP request
    # started, but apache writes the request to the log when the request
    # finishes. This means if the request took a long time to serve (e.g.
    # due to a slow network) the date may be seconds or even minutes in the
    # past, even though the line was actually written to the log much more
    # recently than that (i.e. just now)
    print "last line parsed: [$vhost] [$date] [$method] [$url] [$protocol] [$alt_url] [$code] [$bytes] [$referrer] [$ua]\n\n";

    # show the last second's total hits-per-second and the historical average
    # and last prune data
    my $pt = $prune_time - $nt;
    show(\%shared, sprintf(" total\nnext prune in %.0f seconds; last $last_prune_took\n\n", $pt));

    # store the data for clients to read.
    # NOTE: change the working directory to /dev/shm before starting these
    # scripts to use shared memory; this can be considerably faster than
    # writing the shared data to local disk
    # NOTE: rename is used to atomically update the data file so clients do
    # not try to read the data while we're writing it
    store(\%shared, 'logstats.data.tmp');
    rename('logstats.data.tmp', 'logstats.data');

    # prune old data to prevent using too much memory
    # NOTE: this does block (for a hopefully short amount of time); we
    # should catch back up quickly, though
    next unless $pt <= 0;
    $pt = time();
    print "pruning data...\n";
    my $entry_count = 0;
    my $prune_count = 0;
    $prune_time = $nt + $prune_every;
    foreach my $k (keys %data) {
    my $v = $data{$k};
    next unless (ref($v) || '') eq 'HASH';
    foreach my $key (keys %$v) {
    my $val = $shared{$k}{$key};
    $entry_count++;
    next unless (ref($val) || '') eq 'HASH'
    and $nt - $val->{time} > $prune_every;
    delete $v->{$key};
    delete $shared{$k}{$key};
    $prune_count++;
    }
    }
    my $elapsed = time() - $pt;
    $last_prune_took =
    sprintf "pruned $prune_count stale out of $entry_count total entries in %.3f seconds.", $elapsed;
    print "$last_prune_took\n\n";
    }

    sub tally
    {
    my $data = $_[0];
    my $shared = $_[1];
    my $nt = $_[2] || time();
    init($data, $shared, $nt) unless defined $data->{count};
    init($_[0], $_[1], $nt) unless defined $_[0]->{count};

    # count the event
    $data->{count}++;
    $_[0]->{count}++;

    # add $count to per-minute count and reset $count every second
    my $diff = $nt - $shared->{time};
    my $diff = $nt - $_[1]->{time};
    return 0 unless $diff >= 1;

    my $data = $_[0];
    my $shared = $_[1];

    init_time($data, $shared, $nt, 1);
    $shared->{hps} = $data->{count} / $diff;
    $data->{count} = 0;
    @@ -142,4 +207,5 @@ sub show
    print scalar localtime($shared->{time});
    printf " hps: %6.0f, average: %.2f, %.2f, %.2f, ", $shared->{hps}, $shared->{one}, $shared->{five}, $shared->{fifteen};
    print $_[1] || "\n";
    return;
    }
    60 changes: 0 additions & 60 deletions logtop-urls.pl
    Original file line number Diff line number Diff line change
    @@ -1,60 +0,0 @@
    #!/usr/bin/perl -w

    use strict;
    use warnings;

    use Time::HiRes 'time';
    use Storable;

    use Getopt::Long;
    my ($match, $skip, $match_re, $skip_re);
    GetOptions(
    'match=s' => \$match,
    'skip=s' => \$skip,
    );

    $match_re = qr/$match/i if $match;
    $skip_re = qr/$skip/i if $skip;

    my $data = retrieve("logstats.data");

    my $now = time;

    show($data, " total\n\n", $now - $data->{time});

    my @stale;
    foreach my $url (sort { $data->{urls}{$b}{five} <=> $data->{urls}{$a}{five} } keys %{ $data->{urls} }) {
    next if $skip and $url =~ m/$skip_re/;
    next if $match and $url !~ m/$match_re/;
    my $d = $data->{urls}{$url};
    my $t = $now - $d->{time};
    if ($t > 60) {
    push @stale, $url;
    next;
    }
    my $text = ' ';
    if ($now - $d->{time} < 6) {
    $text = " * ";
    }
    $text .= " $url";

    show($d, "$text\n", $t);
    }

    print "\nstale:\n" if @stale;

    my $i = 0;
    foreach my $url (@stale) {
    my $d = $data->{urls}{$url};
    my $t = $now - $d->{time};
    show($d, " $url\n", $t);
    last if ++$i > 100;
    }

    sub show
    {
    my $data = $_[0];
    print scalar localtime($data->{time});
    printf " hps: %6.0f, average: %.2f, %.2f, %.2f, %2.0f seconds ago, ", $data->{hps} || 0, $data->{one}, $data->{five}, $data->{fifteen}, $_[2];
    print $_[1] || "\n";
    }
    49 changes: 37 additions & 12 deletions logtop.pl
    Original file line number Diff line number Diff line change
    @@ -6,41 +6,66 @@
    use Time::HiRes 'time';
    use Storable;

    use Getopt::Long;
    my ($match, $skip, $match_re, $skip_re, $sort);
    GetOptions(
    'match=s' => \$match,
    'skip=s' => \$skip,
    'sort=s' => \$sort,
    );

    # time, hps, one, five or fifteen
    $sort ||= 'five';

    # vhosts, urls, ua or referrer
    my $type = shift() || 'vhosts';

    $match_re = qr/$match/i if $match;
    $skip_re = qr/$skip/i if $skip;

    my $data = retrieve("logstats.data");

    my $now = time;
    my $now = time();

    show($data, " total\n", $now - $data->{time});

    show($data, " total\n\n", $now - $data->{time});
    print "$type sorted by $sort\n";

    my @stale;
    foreach my $vhost (sort { $data->{vhosts}{$b}{five} <=> $data->{vhosts}{$a}{five} } keys %{ $data->{vhosts} }) {
    my $d = $data->{vhosts}{$vhost};
    foreach my $event (sort
    { $data->{$type}{$b}{$sort} <=> $data->{$type}{$a}{$sort} }
    keys %{ $data->{$type} }) {
    next if $skip and $event =~ m/$skip_re/;
    next if $match and $event !~ m/$match_re/;
    my $d = $data->{$type}{$event};
    my $t = $now - $d->{time};
    if ($t > 60) {
    push @stale, $vhost;
    push @stale, $event;
    next;
    }
    my $text = ' ';
    if ($now - $d->{time} < 6) {
    $text = " * ";
    }
    $text .= " $vhost";

    show($d, "$text\n", $t);
    show($d, "$text $event\n", $t);
    }

    print "\nstale:\n" if @stale;

    foreach my $vhost (@stale) {
    my $d = $data->{vhosts}{$vhost};
    my $i = 0;
    foreach my $event (@stale) {
    my $d = $data->{$type}{$event};
    my $t = $now - $d->{time};
    show($d, " $vhost\n", $t);
    show($d, " $event\n", $t);
    last if ++$i > 100;
    }

    sub show
    {
    my $data = $_[0];
    print scalar localtime($data->{time});
    printf " hps: %6.0f, average: %.2f, %.2f, %.2f, %2.0f seconds ago, ", $data->{hps} || 0, $data->{one}, $data->{five}, $data->{fifteen}, $_[2];
    my $s = "s";
    $s = " " if sprintf("%.0f", $_[2]) eq '1';
    printf " hps: %6.0f, average: %.2f, %.2f, %.2f, %2.0f second$s ago, ", $data->{hps}, $data->{one}, $data->{five}, $data->{fifteen}, $_[2];
    print $_[1] || "\n";
    }
  6. @eqhmcow eqhmcow revised this gist Jun 24, 2011. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion README
    Original file line number Diff line number Diff line change
    @@ -32,7 +32,7 @@ $ watch -n 5 -- ~/logtop.pl
    In a third screen, view URL stats:

    $ cd /dev/shm # use shared memory
    $ watch -n 5 -- './logtop-urls.pl --skip '\''\.(jpg|png|gif|js|css)$|^\S+\s(/robots\.txt$|/favicon\.ico$)'\'''
    $ watch -n 5 -- '~/logtop-urls.pl --skip '\''\.(jpg|png|gif|js|css)$|^\S+\s(/robots\.txt$|/favicon\.ico$)'\'''

    EXAMPLE

  7. @eqhmcow eqhmcow revised this gist Jun 24, 2011. 2 changed files with 106 additions and 90 deletions.
    22 changes: 15 additions & 7 deletions README
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,7 @@
    DESCRIPTION

    These scripts show total, per-vhost and per-URL request-per-second counts based on apache access logs in real-time.
    These scripts show total, per-vhost and per-URL request-per-second counts based on apache
    access logs in real-time.

    The output shows:
    * date of last stats update
    @@ -10,23 +11,28 @@ The output shows:
    * asterisk marks request hit in the last 5 seconds
    * vhost or URL requested

    See also the apachetop tool - http://freshmeat.net/projects/apachetop/ and http://packages.debian.org/search?keywords=apachetop . This script is not based on or intended to mimic apachetop, but the functionality is broadly similar.
    See also the apachetop tool - http://freshmeat.net/projects/apachetop/ and
    http://packages.debian.org/search?keywords=apachetop . This script is not based on or intended to
    mimic apachetop, but the functionality is broadly similar.

    SYNOPSIS

    To use:

    In one screen, run the log parser. It writes out a stats file which the other scripts use.

    $ tail -F access.log | ./logstat.pl # centralized logging helps here
    $ cd /dev/shm # use shared memory
    $ tail -F /var/log/httpd/access.log | ~/logstat.pl # centralized logging helps here

    In another screen, view vhost stats:

    $ watch -n 5 -- ./logtop.pl
    $ cd /dev/shm # use shared memory
    $ watch -n 5 -- ~/logtop.pl

    In a third screen, view URL stats:

    watch -n 5 -- './logtop-urls.pl --skip '\''\.(jpg|png|gif|js|css)$|^\S+\s(/robots\.txt$|/favicon\.ico$)'\'''
    $ cd /dev/shm # use shared memory
    $ watch -n 5 -- './logtop-urls.pl --skip '\''\.(jpg|png|gif|js|css)$|^\S+\s(/robots\.txt$|/favicon\.ico$)'\'''

    EXAMPLE

    @@ -41,8 +47,10 @@ Wed Jun 22 09:55:50 2011 hps: 3, average: 0.94, 1.33, 1.14, 4 seconds ag

    NOTES

    * The script's tally sub is properly abstracted so this script could be modified to tally and report averages on anything you can count (not just httpd logs)
    * The script's tally sub is properly abstracted so this script could be modified to tally and report
    averages on anything you can count (not just httpd logs)

    * The log parsing regex matches the "v-combined" log format (combined with the virtual host at the front)

    * The logtop-urls script accepts "match" and/or "skip" arguments to only show URLs that match (or do not match) a given regex.
    * The logtop-urls script accepts "match" and/or "skip" arguments to only show URLs that match (or do
    not match) a given regex.
    174 changes: 91 additions & 83 deletions logstat.pl
    Original file line number Diff line number Diff line change
    @@ -8,17 +8,19 @@
    use Storable;
    #use Data::Dumper;

    my %data;
    my %data; # internal (server-only) data
    my %shared; # data shared with clients

    sub init
    sub init
    {
    my $data = $_[0];
    my $shared = $_[1];
    $data->{count} = 0;
    $data->{counts} = [0];
    $data->{one} = 0;
    $data->{five} = 0;
    $data->{fifteen} = 0;
    init_time($data);
    $shared->{one} = 0;
    $shared->{five} = 0;
    $shared->{fifteen} = 0;
    init_time($data, $shared, $_[2]);
    }

    sub init_time
    @@ -27,111 +29,117 @@ sub init_time
    # mode 1 == init time and second only
    # mode 2 == init minute only
    my $data = $_[0];
    my $mode = $_[2] || 0;
    my $shared = $_[1];
    my $mode = $_[3] || 0;
    unless ($mode == 2) {
    $data->{time} = $_[1] || time();
    $shared->{time} = $_[2] || time();
    }
    return if $mode == 1;
    $data->{minute} = $data->{time} + 60;
    $data->{minute} = $shared->{time} + 60;
    }

    while (<>) {
    my $nt = time();
    my ($vhost, $method, $url, $protocol, $alt_url, $code, $bytes, $referrer, $ua) = (m/
    my ($vhost, $date, $method, $url, $protocol, $alt_url, $code, $bytes,
    $referrer, $ua) = (m/
    ^(\S+)\s # vhost
    \S+\s # IP
    \S+\s
    \S+\s
    \[[^]]+\]\s # date
    "(\S+)\s? # method
    (?:((?:[^"]*(?:\\")?)*)\s # URL
    ([^"]*)"\s| # protocol
    ((?:[^"]*(?:\\")?)*)"\s) # or, possibly URL with no protocol
    (\S+)\s # status code
    (\S+)\s # bytes
    "((?:[^"]*(?:\\")?)*)"\s # referrer
    "(.*)"$ # user agent
    /x);
    \S+\s
    \S+\s
    \[([^]]+)\]\s # date
    "(\S+)\s? # method
    (?:((?:[^"]*(?:\\")?)*)\s # URL
    ([^"]*)"\s| # protocol
    ((?:[^"]*(?:\\")?)*)"\s) # or, possibly URL with no protocol
    (\S+)\s # status code
    (\S+)\s # bytes
    "((?:[^"]*(?:\\")?)*)"\s # referrer
    "(.*)"$ # user agent
    /x);
    die "Couldn't match $_" unless $vhost;
    # print "$vhost $method $url $protocol $alt_url $code $bytes $referrer $ua\n";
    $alt_url ||= '';
    $url ||= $alt_url;
    # vhost counts
    $data{vhosts}{$vhost} ||= {};
    if (tally($data{vhosts}{$vhost}, $nt)) {
    show($data{vhosts}{$vhost}, " $vhost\n");
    }
    $shared{vhosts}{$vhost} ||= {};
    tally($data{vhosts}{$vhost}, $shared{vhosts}{$vhost}, $nt);

    # url counts
    $data{urls}{"$vhost $url"} ||= {};
    if (tally($data{urls}{"$vhost $url"}, $nt)) {
    show($data{urls}{"$vhost $url"}, " $vhost $url\n");
    }

    $shared{urls}{"$vhost $url"} ||= {};
    tally($data{urls}{"$vhost $url"}, $shared{urls}{"$vhost $url"}, $nt);

    # TODO: user agents? referrers? status codes? bytes?

    # total hit count
    if (tally(\%data, $nt)) {
    print "\n";
    show(\%data, " total *\n\n");
    store(\%data, "logstats.data.tmp");
    rename("logstats.data.tmp", "logstats.data");
    if (tally(\%data, \%shared, $nt)) {
    print "last line parsed: [$vhost] [$date] [$method] [$url] [$protocol] [$alt_url] [$code] [$bytes] [$referrer] [$ua]\n\n";
    show(\%shared, " total\n\n");
    store(\%shared, 'logstats.data.tmp');
    rename('logstats.data.tmp', 'logstats.data');
    }
    }

    sub tally
    {
    my $data = $_[0];
    # reset $count every second
    init($data) unless defined $data->{count};
    my $shared = $_[1];
    my $nt = $_[2] || time();
    init($data, $shared, $nt) unless defined $data->{count};

    # count the event
    $data->{count}++;
    my $nt = $_[1] || time();
    my $diff = $nt - $data->{time};
    my $gimme_a_sec = 0;
    if ($diff >= 1) {
    $gimme_a_sec = 1;
    init_time($data, $nt, 1);
    $data->{hps} = $data->{count} / $diff;
    $data->{count} = 0;
    # keep per-minute count
    $data->{counts}[0] += $data->{hps};
    # update per-minute counter
    $diff = $nt - $data->{minute};
    if ($diff >= 0) {
    init_time($data, $nt, 2);
    # log "0" counts if this is an infrequent stat
    my $count = $data->{counts}[0];
    $data->{counts}[0] = 0;
    while ($diff >= 60) {
    unshift @{$data->{counts}}, 0;
    $diff -= 60;
    }
    $data->{counts}[0] = $count;
    unshift @{$data->{counts}}, 0;
    no warnings qw/uninitialized misc/;
    splice @{$data->{counts}}, 16;
    my @count = @{$data->{counts}};
    $data->{one} = $count[1] / 60;
    $data->{five} = sum(@count[1..5]) / 5 / 60;
    $data->{fifteen} = sum(@count[1..15]) / 15 / 60;
    } else {
    # extrapolate running average
    $diff += 60;
    my $count = $data->{counts}[0];
    $count *= 60 / $diff;
    my @count = @{$data->{counts}};
    defined($count[1]) or $count[1] = $count;
    $data->{one} = sum($count, $count[1]) / 2 / 60;
    no warnings 'uninitialized';
    $data->{five} = sum($count, @count[1..5]) / 6 / 60;
    $data->{fifteen} = sum($count, @count[1..15]) / 16 / 60;
    }

    # add $count to per-minute count and reset $count every second
    my $diff = $nt - $shared->{time};
    return 0 unless $diff >= 1;

    init_time($data, $shared, $nt, 1);
    $shared->{hps} = $data->{count} / $diff;
    $data->{count} = 0;
    $data->{counts}[0] += $shared->{hps};

    # add per-minute count to 15-minute historical data set and reset
    # per-minute count every minute
    $diff = $nt - $data->{minute};
    if ($diff >= 0) {
    init_time($data, $shared, $nt, 2);

    # log "0" counts to historical data set if this is an infrequent stat
    splice(@{$data->{counts}}, 1, 0, (0) x int($diff / 60));

    # FIXME: reduce value of per-minute count by remainder if it's been
    # over a minute (that is, add a fractional "0" count)

    # compute historical average
    unshift @{$data->{counts}}, 0;
    no warnings qw/uninitialized misc/;
    splice @{$data->{counts}}, 16;
    $shared->{one} = $data->{counts}[1] / 60;
    $shared->{five} = sum(@{$data->{counts}}[1..5]) / 5 / 60;
    $shared->{fifteen} = sum(@{$data->{counts}}[1..15]) / 15 / 60;
    return 1;
    }
    return $gimme_a_sec;

    # if it hasn't been a minute yet, extrapolate a running historical average
    my $count = $data->{counts}[0];
    # this gets more accurate as the minute progresses,
    # but it does tend to over-estimate infrequent events
    $count *= 60 / ($diff + 60);
    # egregious HACK - but this only matters for the first minute(s)
    # after we start up
    defined($data->{counts}[1]) or $data->{counts}[1] = $count;
    $shared->{one} = sum($count, $data->{counts}[1]) / 2 / 60;
    no warnings 'uninitialized';
    $shared->{five} = sum($count, @{$data->{counts}}[1..5]) / 6 / 60;
    $shared->{fifteen} = sum($count, @{$data->{counts}}[1..15]) / 16 / 60;
    return 1;
    }

    sub show
    {
    my $data = $_[0];
    print scalar localtime($data->{time});
    printf " hps: %6.0f, average: %.2f, %.2f, %.2f, ", $data->{hps}, $data->{one}, $data->{five}, $data->{fifteen};
    my $shared = $_[0];
    print scalar localtime($shared->{time});
    printf " hps: %6.0f, average: %.2f, %.2f, %.2f, ", $shared->{hps}, $shared->{one}, $shared->{five}, $shared->{fifteen};
    print $_[1] || "\n";
    }
  8. @eqhmcow eqhmcow revised this gist Jun 23, 2011. 1 changed file with 12 additions and 0 deletions.
    12 changes: 12 additions & 0 deletions logtop-urls.pl
    Original file line number Diff line number Diff line change
    @@ -6,6 +6,16 @@
    use Time::HiRes 'time';
    use Storable;

    use Getopt::Long;
    my ($match, $skip, $match_re, $skip_re);
    GetOptions(
    'match=s' => \$match,
    'skip=s' => \$skip,
    );

    $match_re = qr/$match/i if $match;
    $skip_re = qr/$skip/i if $skip;

    my $data = retrieve("logstats.data");

    my $now = time;
    @@ -14,6 +24,8 @@

    my @stale;
    foreach my $url (sort { $data->{urls}{$b}{five} <=> $data->{urls}{$a}{five} } keys %{ $data->{urls} }) {
    next if $skip and $url =~ m/$skip_re/;
    next if $match and $url !~ m/$match_re/;
    my $d = $data->{urls}{$url};
    my $t = $now - $d->{time};
    if ($t > 60) {
  9. @eqhmcow eqhmcow revised this gist Jun 23, 2011. 4 changed files with 88 additions and 26 deletions.
    24 changes: 14 additions & 10 deletions README
    Original file line number Diff line number Diff line change
    @@ -1,38 +1,42 @@
    DESCRIPTION

    These scripts show total and per-vhost request-per-second counts based on apache access logs in real-time.
    These scripts show total, per-vhost and per-URL request-per-second counts based on apache access logs in real-time.

    The output shows:
    * date of last stats update
    * last second's hits per second (hps)
    * one, five and fifteen minute hps average
    * seconds since last request
    * vhost
    * asterisk marks vhosts with hits in the last 5 seconds
    * asterisk marks request hit in the last 5 seconds
    * vhost or URL requested

    See also the apachetop tool - http://freshmeat.net/projects/apachetop/ and http://packages.debian.org/search?keywords=apachetop . This script is not based on or intended to mimic apachetop, but the functionality is broadly similar.

    SYNOPSIS

    To use:

    In one screen:
    In one screen, run the log parser. It writes out a stats file which the other scripts use.

    $ tail -F access.log | ./logstat.pl # centralized logging helps here

    In another screen:
    In another screen, view vhost stats:

    $ watch -n 5 -- ./logtop.pl

    In a third screen, view URL stats:

    watch -n 5 -- './logtop-urls.pl --skip '\''\.(jpg|png|gif|js|css)$|^\S+\s(/robots\.txt$|/favicon\.ico$)'\'''

    EXAMPLE

    Every 5.0s: ./logtop.pl Wed Jun 22 09:55:54 2011

    Wed Jun 22 09:55:54 2011 hps: 9, average: 12.87, 13.19, 13.33, 0 seconds ago, total *
    Wed Jun 22 09:55:54 2011 hps: 9, average: 12.87, 13.19, 13.33, 0 seconds ago, total

    Wed Jun 22 09:55:48 2011 hps: 17, average: 5.66, 1.90, 0.86, 6 seconds ago, example.com
    Wed Jun 22 09:55:45 2011 hps: 6, average: 1.17, 1.43, 1.48, 9 seconds ago, example.org
    Wed Jun 22 09:55:50 2011 hps: 3, average: 0.94, 1.33, 1.14, 4 seconds ago, example.net *
    Wed Jun 22 09:55:48 2011 hps: 17, average: 5.66, 1.90, 0.86, 6 seconds ago, example.com
    Wed Jun 22 09:55:45 2011 hps: 6, average: 1.17, 1.43, 1.48, 9 seconds ago, example.org
    Wed Jun 22 09:55:50 2011 hps: 3, average: 0.94, 1.33, 1.14, 4 seconds ago, * example.net
    ...

    NOTES
    @@ -41,4 +45,4 @@ NOTES

    * The log parsing regex matches the "v-combined" log format (combined with the virtual host at the front)

    * This currently only breaks down requests by vhost (not by URL, etc); though as noted above it's easy to add more counters
    * The logtop-urls script accepts "match" and/or "skip" arguments to only show URLs that match (or do not match) a given regex.
    33 changes: 21 additions & 12 deletions logstat.pl
    Original file line number Diff line number Diff line change
    @@ -37,27 +37,36 @@ sub init_time

    while (<>) {
    my $nt = time();
    my ($vhost, $method, $url, $code, $bytes, $referrer, $ua) = (m/
    ^(\S+)\s # vhost
    \S+\s # IP
    my ($vhost, $method, $url, $protocol, $alt_url, $code, $bytes, $referrer, $ua) = (m/
    ^(\S+)\s # vhost
    \S+\s # IP
    \S+\s
    \S+\s
    \[[^]]+\]\s # date
    "(\S+)\s # method
    ((?:[^"]*(?:\\")?)*)\s? # URL
    [^"]*"\s # protocol
    (\S+)\s # status code
    (\S+)\s # bytes
    "((?:[^"]*(?:\\")?)*)"\s # referrer
    "(.*)"$ # user agent
    \[[^]]+\]\s # date
    "(\S+)\s? # method
    (?:((?:[^"]*(?:\\")?)*)\s # URL
    ([^"]*)"\s| # protocol
    ((?:[^"]*(?:\\")?)*)"\s) # or, possibly URL with no protocol
    (\S+)\s # status code
    (\S+)\s # bytes
    "((?:[^"]*(?:\\")?)*)"\s # referrer
    "(.*)"$ # user agent
    /x);
    die "Couldn't match $_" unless $vhost;
    # print "$vhost $method $url $protocol $alt_url $code $bytes $referrer $ua\n";
    $url ||= $alt_url;
    # vhost counts
    $data{vhosts}{$vhost} ||= {};
    if (tally($data{vhosts}{$vhost}, $nt)) {
    show($data{vhosts}{$vhost}, " $vhost\n");
    }
    # TODO: urls? user agents? referrers? status codes?
    # url counts
    $data{urls}{"$vhost $url"} ||= {};
    if (tally($data{urls}{"$vhost $url"}, $nt)) {
    show($data{urls}{"$vhost $url"}, " $vhost $url\n");
    }

    # TODO: user agents? referrers? status codes? bytes?

    # total hit count
    if (tally(\%data, $nt)) {
    48 changes: 48 additions & 0 deletions logtop-urls.pl
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,48 @@
    #!/usr/bin/perl -w

    use strict;
    use warnings;

    use Time::HiRes 'time';
    use Storable;

    my $data = retrieve("logstats.data");

    my $now = time;

    show($data, " total\n\n", $now - $data->{time});

    my @stale;
    foreach my $url (sort { $data->{urls}{$b}{five} <=> $data->{urls}{$a}{five} } keys %{ $data->{urls} }) {
    my $d = $data->{urls}{$url};
    my $t = $now - $d->{time};
    if ($t > 60) {
    push @stale, $url;
    next;
    }
    my $text = ' ';
    if ($now - $d->{time} < 6) {
    $text = " * ";
    }
    $text .= " $url";

    show($d, "$text\n", $t);
    }

    print "\nstale:\n" if @stale;

    my $i = 0;
    foreach my $url (@stale) {
    my $d = $data->{urls}{$url};
    my $t = $now - $d->{time};
    show($d, " $url\n", $t);
    last if ++$i > 100;
    }

    sub show
    {
    my $data = $_[0];
    print scalar localtime($data->{time});
    printf " hps: %6.0f, average: %.2f, %.2f, %.2f, %2.0f seconds ago, ", $data->{hps} || 0, $data->{one}, $data->{five}, $data->{fifteen}, $_[2];
    print $_[1] || "\n";
    }
    9 changes: 5 additions & 4 deletions logtop.pl
    Original file line number Diff line number Diff line change
    @@ -10,7 +10,7 @@

    my $now = time;

    show($data, " total *\n\n", $now - $data->{time});
    show($data, " total\n\n", $now - $data->{time});

    my @stale;
    foreach my $vhost (sort { $data->{vhosts}{$b}{five} <=> $data->{vhosts}{$a}{five} } keys %{ $data->{vhosts} }) {
    @@ -20,10 +20,11 @@
    push @stale, $vhost;
    next;
    }
    my $text = " $vhost";
    if ($now - $d->{time} <= 5) {
    $text .= " *";
    my $text = ' ';
    if ($now - $d->{time} < 6) {
    $text = " * ";
    }
    $text .= " $vhost";

    show($d, "$text\n", $t);
    }
  10. @eqhmcow eqhmcow revised this gist Jun 22, 2011. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion logstat.pl
    Original file line number Diff line number Diff line change
    @@ -44,7 +44,7 @@ sub init_time
    \S+\s
    \[[^]]+\]\s # date
    "(\S+)\s # method
    ((?:[^"]*(?:\\")?)*)\s # URL
    ((?:[^"]*(?:\\")?)*)\s? # URL
    [^"]*"\s # protocol
    (\S+)\s # status code
    (\S+)\s # bytes
  11. @eqhmcow eqhmcow revised this gist Jun 22, 2011. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion README
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,6 @@
    DESCRIPTION

    These scripts show total and per-vhost request-per-second counts based on apache access logs.
    These scripts show total and per-vhost request-per-second counts based on apache access logs in real-time.

    The output shows:
    * date of last stats update
  12. @eqhmcow eqhmcow revised this gist Jun 22, 2011. 2 changed files with 27 additions and 11 deletions.
    18 changes: 17 additions & 1 deletion README
    Original file line number Diff line number Diff line change
    @@ -1,5 +1,21 @@
    DESCRIPTION

    These scripts show total and per-vhost request-per-second counts based on apache access logs.

    The output shows:
    * date of last stats update
    * last second's hits per second (hps)
    * one, five and fifteen minute hps average
    * seconds since last request
    * vhost
    * asterisk marks vhosts with hits in the last 5 seconds

    See also the apachetop tool - http://freshmeat.net/projects/apachetop/ and http://packages.debian.org/search?keywords=apachetop . This script is not based on or intended to mimic apachetop, but the functionality is broadly similar.

    SYNOPSIS

    To use:

    In one screen:

    $ tail -F access.log | ./logstat.pl # centralized logging helps here
    @@ -21,7 +37,7 @@ Wed Jun 22 09:55:50 2011 hps: 3, average: 0.94, 1.33, 1.14, 4 seconds ag

    NOTES

    * The tally sub is properly abstracted so this can be used to tally and report averages on anything you can count (not just httpd logs)
    * The script's tally sub is properly abstracted so this script could be modified to tally and report averages on anything you can count (not just httpd logs)

    * The log parsing regex matches the "v-combined" log format (combined with the virtual host at the front)

    20 changes: 10 additions & 10 deletions logstat.pl
    Original file line number Diff line number Diff line change
    @@ -38,18 +38,18 @@ sub init_time
    while (<>) {
    my $nt = time();
    my ($vhost, $method, $url, $code, $bytes, $referrer, $ua) = (m/
    ^(\S+)\s # vhost
    \S+\s # IP
    ^(\S+)\s # vhost
    \S+\s # IP
    \S+\s
    \S+\s
    \[[^]]+\]\s # date
    "(\S+)\s # method
    (\S*)\s # URL
    [^"]*"\s # protocol
    (\S+)\s # status code
    (\S+)\s # bytes
    "([^"]*)"\s # referrer
    "(.*)"$ # user agent
    \[[^]]+\]\s # date
    "(\S+)\s # method
    ((?:[^"]*(?:\\")?)*)\s # URL
    [^"]*"\s # protocol
    (\S+)\s # status code
    (\S+)\s # bytes
    "((?:[^"]*(?:\\")?)*)"\s # referrer
    "(.*)"$ # user agent
    /x);
    die "Couldn't match $_" unless $vhost;
    # vhost counts
  13. @invalid-email-address Anonymous created this gist Jun 22, 2011.
    28 changes: 28 additions & 0 deletions README
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,28 @@
    SYNOPSIS

    In one screen:

    $ tail -F access.log | ./logstat.pl # centralized logging helps here

    In another screen:

    $ watch -n 5 -- ./logtop.pl

    EXAMPLE

    Every 5.0s: ./logtop.pl Wed Jun 22 09:55:54 2011

    Wed Jun 22 09:55:54 2011 hps: 9, average: 12.87, 13.19, 13.33, 0 seconds ago, total *

    Wed Jun 22 09:55:48 2011 hps: 17, average: 5.66, 1.90, 0.86, 6 seconds ago, example.com
    Wed Jun 22 09:55:45 2011 hps: 6, average: 1.17, 1.43, 1.48, 9 seconds ago, example.org
    Wed Jun 22 09:55:50 2011 hps: 3, average: 0.94, 1.33, 1.14, 4 seconds ago, example.net *
    ...

    NOTES

    * The tally sub is properly abstracted so this can be used to tally and report averages on anything you can count (not just httpd logs)

    * The log parsing regex matches the "v-combined" log format (combined with the virtual host at the front)

    * This currently only breaks down requests by vhost (not by URL, etc); though as noted above it's easy to add more counters
    128 changes: 128 additions & 0 deletions logstat.pl
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,128 @@
    #!/usr/bin/perl -w

    use strict;
    use warnings;

    use Time::HiRes 'time';
    use List::Util qw/sum/;
    use Storable;
    #use Data::Dumper;

    my %data;

    sub init
    {
    my $data = $_[0];
    $data->{count} = 0;
    $data->{counts} = [0];
    $data->{one} = 0;
    $data->{five} = 0;
    $data->{fifteen} = 0;
    init_time($data);
    }

    sub init_time
    {
    # mode 0 == init all
    # mode 1 == init time and second only
    # mode 2 == init minute only
    my $data = $_[0];
    my $mode = $_[2] || 0;
    unless ($mode == 2) {
    $data->{time} = $_[1] || time();
    }
    return if $mode == 1;
    $data->{minute} = $data->{time} + 60;
    }

    while (<>) {
    my $nt = time();
    my ($vhost, $method, $url, $code, $bytes, $referrer, $ua) = (m/
    ^(\S+)\s # vhost
    \S+\s # IP
    \S+\s
    \S+\s
    \[[^]]+\]\s # date
    "(\S+)\s # method
    (\S*)\s # URL
    [^"]*"\s # protocol
    (\S+)\s # status code
    (\S+)\s # bytes
    "([^"]*)"\s # referrer
    "(.*)"$ # user agent
    /x);
    die "Couldn't match $_" unless $vhost;
    # vhost counts
    $data{vhosts}{$vhost} ||= {};
    if (tally($data{vhosts}{$vhost}, $nt)) {
    show($data{vhosts}{$vhost}, " $vhost\n");
    }
    # TODO: urls? user agents? referrers? status codes?

    # total hit count
    if (tally(\%data, $nt)) {
    print "\n";
    show(\%data, " total *\n\n");
    store(\%data, "logstats.data.tmp");
    rename("logstats.data.tmp", "logstats.data");
    }
    }

    sub tally
    {
    my $data = $_[0];
    # reset $count every second
    init($data) unless defined $data->{count};
    $data->{count}++;
    my $nt = $_[1] || time();
    my $diff = $nt - $data->{time};
    my $gimme_a_sec = 0;
    if ($diff >= 1) {
    $gimme_a_sec = 1;
    init_time($data, $nt, 1);
    $data->{hps} = $data->{count} / $diff;
    $data->{count} = 0;
    # keep per-minute count
    $data->{counts}[0] += $data->{hps};
    # update per-minute counter
    $diff = $nt - $data->{minute};
    if ($diff >= 0) {
    init_time($data, $nt, 2);
    # log "0" counts if this is an infrequent stat
    my $count = $data->{counts}[0];
    $data->{counts}[0] = 0;
    while ($diff >= 60) {
    unshift @{$data->{counts}}, 0;
    $diff -= 60;
    }
    $data->{counts}[0] = $count;
    unshift @{$data->{counts}}, 0;
    no warnings qw/uninitialized misc/;
    splice @{$data->{counts}}, 16;
    my @count = @{$data->{counts}};
    $data->{one} = $count[1] / 60;
    $data->{five} = sum(@count[1..5]) / 5 / 60;
    $data->{fifteen} = sum(@count[1..15]) / 15 / 60;
    } else {
    # extrapolate running average
    $diff += 60;
    my $count = $data->{counts}[0];
    $count *= 60 / $diff;
    my @count = @{$data->{counts}};
    defined($count[1]) or $count[1] = $count;
    $data->{one} = sum($count, $count[1]) / 2 / 60;
    no warnings 'uninitialized';
    $data->{five} = sum($count, @count[1..5]) / 6 / 60;
    $data->{fifteen} = sum($count, @count[1..15]) / 16 / 60;
    }
    }
    return $gimme_a_sec;
    }

    sub show
    {
    my $data = $_[0];
    print scalar localtime($data->{time});
    printf " hps: %6.0f, average: %.2f, %.2f, %.2f, ", $data->{hps}, $data->{one}, $data->{five}, $data->{fifteen};
    print $_[1] || "\n";
    }
    45 changes: 45 additions & 0 deletions logtop.pl
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,45 @@
    #!/usr/bin/perl -w

    use strict;
    use warnings;

    use Time::HiRes 'time';
    use Storable;

    my $data = retrieve("logstats.data");

    my $now = time;

    show($data, " total *\n\n", $now - $data->{time});

    my @stale;
    foreach my $vhost (sort { $data->{vhosts}{$b}{five} <=> $data->{vhosts}{$a}{five} } keys %{ $data->{vhosts} }) {
    my $d = $data->{vhosts}{$vhost};
    my $t = $now - $d->{time};
    if ($t > 60) {
    push @stale, $vhost;
    next;
    }
    my $text = " $vhost";
    if ($now - $d->{time} <= 5) {
    $text .= " *";
    }

    show($d, "$text\n", $t);
    }

    print "\nstale:\n" if @stale;

    foreach my $vhost (@stale) {
    my $d = $data->{vhosts}{$vhost};
    my $t = $now - $d->{time};
    show($d, " $vhost\n", $t);
    }

    sub show
    {
    my $data = $_[0];
    print scalar localtime($data->{time});
    printf " hps: %6.0f, average: %.2f, %.2f, %.2f, %2.0f seconds ago, ", $data->{hps} || 0, $data->{one}, $data->{five}, $data->{fifteen}, $_[2];
    print $_[1] || "\n";
    }