nigelhorne · November 8, 2017 15:06 · Nov 8, 2017 · Dec 7, 2012 · Dec 7, 2012
diff --git a/find_duplicate_code.pl b/find_duplicate_code.pl
@@ -141,7 +141,7 @@ sub add_line_numbers {
 
     my $line = 1;
     foreach (@$contents) {
-        next if /^\s*\$\^H{/;    # skip those damned strict lines
+        next if /^\s*\$\^H\{/;    # skip those damned strict lines
         if (/^#line\s+([0-9]+)/) {
             $line = $1;
             next;
@@ -206,7 +206,7 @@ sub prefilter {
         #
         # $skip{sub_begin} filters this out
 
-        if (/^sub BEGIN {/) {
+        if (/^sub BEGIN \{/) {
             $skip{sub_begin} = 1;
             $skip++;
         }
@@ -231,13 +231,13 @@ sub prefilter {
         #     }
         #
         # $skip{strict_begin} filters this out
-        if (/^(\s*)BEGIN {/) {
+        if (/^(\s*)BEGIN \{/) {
             my $padding = $1;
             my $next    = $contents->[ $i + 1 ];
-            if ( $next =~ /^\s+\$\^H{/ ) {
+            if ( $next =~ /^\s+\$\^H\{/ ) {
                 my $index = $i + 2;
               STRICT_SEARCH: while ( $index++ ) {
-                    next if $contents->[$index] =~ /^\s+\$\^H{/;   # $^H{'...'}
+                    next if $contents->[$index] =~ /^\s+\$\^H\{/;   # $^H{'...'}
                     last STRICT_SEARCH
                       if not /^$padding}/
                     ;    # the block did not terminate as we expect
@@ -297,4 +297,4 @@ =head1 OPTIONS
 
     if ( $ignore and $duplicate_code =~ /$ignore/ ) {
         # don't report this chunk of code as a duplicate
-    }
+    }
diff --git a/find_duplicate_code → find_duplicate_code.pl b/find_duplicate_code → find_duplicate_code.pl
diff --git a/find_duplicate_code b/find_duplicate_code
@@ -0,0 +1,300 @@
+#!/usr/bin/env perl
+
+use 5.12.0;
+use autodie;
+use Carp;
+use utf8::all;
+use File::Spec::Functions qw(catfile catdir);
+use File::Find::Rule;
+use Getopt::Long;
+use Capture::Tiny qw(capture);
+use File::Slurp;
+use Term::ANSIColor ':constants';
+
+local $Term::ANSIColor::AUTORESET = 1;
+
+GetOptions(
+    'window=i'  => \( my $window = 5 ),
+    'dir=s'     => \( my $dir    = 'lib' ),
+    'ignore=s@' => \my @ignore,
+) or die "Bad options";
+my $IGNORE = join '|' => @ignore;
+
+my $CACHE_DIR = catdir( $ENV{HOME}, '.find_cnp' );
+
+if ( -d $CACHE_DIR ) {
+    my @cached = File::Find::Rule->file->in($CACHE_DIR);
+    unlink $_ for @cached;
+}
+else {
+    mkdir $CACHE_DIR;
+}
+
+unless ( -d $dir ) {
+    croak("Cannot find dir $dir");
+}
+
+my @files     = File::Find::Rule->file->name('*.pm')->in($dir);
+my $num_files = @files;
+
+for my $i ( 0 .. $#files - 1 ) {
+    my $next = $i + 1;
+    print WHITE "Processing $next out of $num_files files ";
+    for my $j ( $next .. $#files ) {
+        print '.';
+        my ( $first, $second ) = @files[ $i, $j ];
+        search_for_dups( $first, $second, $window );
+    }
+    print "\n";
+}
+
+sub search_for_dups {
+    my ( $first, $second, $window ) = @_;
+
+    my $code1 = get_text($first);
+    my $code2 = get_text($second);
+
+    my %in_second = map { $_->{key} => 1 } @$code2;
+
+    my $matches_found = 0;
+    my $last_found    = 0;
+    foreach my $i ( 0 .. $#$code1 ) {
+        if ( $in_second{ $code1->[$i]{key} } ) {
+            if ( $i == $last_found + 1 ) {
+                $matches_found++;
+            }
+            $last_found = $i;
+        }
+    }
+    if ( $matches_found < $window ) {
+        return;
+    }
+
+    # brute force is bad!
+  LINE: foreach ( my $i = 0; $i < @$code1 - $window; $i++ ) {
+        next LINE unless $in_second{ $code1->[$i]{key} };
+
+        my @code1 = @{$code1}[ $i .. $#$code1 ];
+        foreach my $j ( 0 .. $#$code2 - $window ) {
+            my @code2   = @{$code2}[ $j .. $#$code2 ];
+            my $matches = 0;
+            my $longest = 0;
+          WINDOW: foreach my $k ( 0 .. $#code1 ) {
+                if ( $code1[$k]{key} eq $code2[$k]{key} ) {
+                    $matches++;
+                    my $length1 = length( $code1[$k]{code} );
+                    if ( $length1 > $longest ) {
+                        $longest = $length1;
+                    }
+                    my $length2 = length( $code2[$k]{code} );
+                    if ( $length1 > $longest ) {
+                        $longest = $length1;
+                    }
+                }
+                else {
+                    last WINDOW;
+                }
+            }
+            if ( $matches >= $window ) {
+                my $line1         = 0 + $code1[0]{line};
+                my $line2         = 0 + $code2[0]{line};
+                my $code_to_print = '';
+                for ( 0 .. $matches - 1 ) {
+                    my ( $line1, $line2 ) =
+                      map { chomp; $_ } ( $code1[$_]{code}, $code2[$_]{code} );
+                    $code_to_print
+                      .= $line1 . ( ' ' x ( $longest - length($line1) ) );
+                    $code_to_print .= " | $line2\n";
+                }
+                $i += $window;
+                if ( $IGNORE and $code_to_print =~ /$IGNORE/ ) {
+                    next LINE;
+                }
+                say BOLD RED
+                  "\nPossible match ($first near line $line1) ($second near line $line2)\n",
+                  "Begining at:";
+                print WHITE $code_to_print;
+            }
+        }
+    }
+}
+
+sub get_text {
+    my $file     = shift;
+    my $filename = $file;
+    $filename =~ s/\W/_/g;
+    $filename = catfile( $CACHE_DIR, $filename );
+    my @contents;
+    if ( -f $filename ) {
+        @contents = split /(\n)/ => read_file($filename);
+    }
+    else {
+        ( undef, undef, @contents ) = capture {qx($^X -MO=Deparse,-l $file)};
+        write_file( $filename, @contents );
+    }
+    return add_line_numbers( \@contents );
+}
+
+sub add_line_numbers {
+    my $contents = prefilter(shift);
+    my @contents;
+
+    my $line = 1;
+    foreach (@$contents) {
+        next if /^\s*\$\^H{/;    # skip those damned strict lines
+        if (/^#line\s+([0-9]+)/) {
+            $line = $1;
+            next;
+        }
+        push @contents => {
+            line => $line,
+            key  => munge_line($_),
+            code => $_,
+        };
+        $line++;
+    }
+    return postfilter( \@contents );
+}
+
+sub postfilter {
+    my $contents = shift;
+
+    my @contents;
+  INDEX: for ( my $i = 0; $i < @$contents; $i++ ) {
+        if ( $contents->[$i]{code} =~ /^(\s*)BEGIN\s*{/ ) {    #    BEGIN {
+            my $padding =~ $1;
+            if ( $contents->[ $i + 1 ]{code} =~ /^$padding}/ ) {
+                $DB::single = 1;
+                $i++;
+                next INDEX;
+            }
+        }
+        push @contents => $contents->[$i];
+    }
+
+    #my $lines = join '' => map { $_->{code}} @contents;
+    #say BLUE $lines;
+    #<STDIN>;
+
+    return \@contents;
+}
+
+sub prefilter {
+    my $contents = shift;
+
+    my @contents;
+    my %skip = (
+        sub_begin => 0,
+    );
+    my $skip = 0;
+
+  LINE: for ( my $i = 0; $i < @$contents; $i++ ) {
+        local $_ = $contents->[$i];
+        next if /^\s*(?:use|require)\b/;    # use/require
+        next if /^\s*$/;                    # blank lines
+        next if /^#(?!line\s+[0-9]+)/;  # comments which aren't line directives
+
+
+        # Modules which import things create code like this:
+        #
+        #     sub BEGIN {
+        #         require strict;
+        #         do {
+        #             'strict'->import('refs')
+        #         };
+        #     }
+        #
+        # $skip{sub_begin} filters this out
+
+        if (/^sub BEGIN {/) {
+            $skip{sub_begin} = 1;
+            $skip++;
+        }
+        elsif ( $skip{sub_begin} and /^}/ ) {
+            $skip{sub_begin} = 0;
+            $skip--;
+            next;
+        }
+
+        # Modules which use strict often have blocks like this:
+        #
+        #     BEGIN {
+        #         $^H{'indirect'} = q(31664984);
+        #         $^H{'feature_unicode'} = q(1);
+        #         $^H{'autodie'} = q(Fatal :lexical :all);
+        #         $^H{'feature_say'} = q(1);
+        #         $^H{'guard Fatal'} = q(ARRAY(0x2300d20));
+        #         $^H{'feature_state'} = q(1);
+        #         $^H{'autovivification'} = q(52);
+        #         $^H{'utf8::all'} = q(1);
+        #         $^H{'feature_switch'} = q(1);
+        #     }
+        #
+        # $skip{strict_begin} filters this out
+        if (/^(\s*)BEGIN {/) {
+            my $padding = $1;
+            my $next    = $contents->[ $i + 1 ];
+            if ( $next =~ /^\s+\$\^H{/ ) {
+                my $index = $i + 2;
+              STRICT_SEARCH: while ( $index++ ) {
+                    next if $contents->[$index] =~ /^\s+\$\^H{/;   # $^H{'...'}
+                    last STRICT_SEARCH
+                      if not /^$padding}/
+                    ;    # the block did not terminate as we expect
+                    $i = $index;
+                    next LINE;
+                }
+            }
+        }
+
+        push @contents => $_ unless $skip;
+    }
+    return \@contents;
+}
+
+sub munge_line {
+    local $_ = shift;
+    chomp;
+    s/\s//g;
+    return $_;
+}
+
+__END__
+
+=head1 NAME
+
+find_duplicate_code
+
+=head1 SYNOPSIS
+
+ find_duplicate_code --window 7 --dir lib/ --ignore 'catch {'
+
+=head1 DESCRIPTION
+
+This program searches for cut-n-paste code. It does not (at the present time)
+try to account for cases where people may have changed variable names, but it
+applies a heuristic process for finding duplicate code and works moderately
+well. It's also slow.
+
+For every file in the target directory, C<--dir> (defaults to C<lib/>), it
+runs the code through L<B::Deparse>, caches it, and then walks through that
+code comparing C<--window> number of lines (default 5). If that many lines
+matches, as determined by stripping all whitespace and doing an C<eq>, then we
+have duplicated code.
+
+Blank lines are skipped, as are C<use> and C<require>. This may be
+configurable in the future.
+
+=head1 OPTIONS
+
+ --window,-w   Minimum number of lines to needed for a match (default 5)
+ --dir,-d      Directory of .pm files to search through (default 'lib/')
+ --ignore,-i   Duplicates to ignore (may be repeated).
+
+The C<--ignore> switch may be useful if you are repeatedly getting "duplicate"
+sections of code that you aren't interested in refactoring right now. The
+values of C<--ignore> are joined with a pipe and the check looks like this:
+
+    if ( $ignore and $duplicate_code =~ /$ignore/ ) {
+        # don't report this chunk of code as a duplicate
+    }
No results found