Skip to content

Instantly share code, notes, and snippets.

@nigelhorne
Forked from Ovid/find_duplicate_code.pl
Created November 8, 2017 15:06
Show Gist options
  • Select an option

  • Save nigelhorne/96f518124303c21138a1ba3d154984d2 to your computer and use it in GitHub Desktop.

Select an option

Save nigelhorne/96f518124303c21138a1ba3d154984d2 to your computer and use it in GitHub Desktop.

Revisions

  1. nigelhorne revised this gist Nov 8, 2017. 1 changed file with 6 additions and 6 deletions.
    12 changes: 6 additions & 6 deletions find_duplicate_code.pl
    Original file line number Diff line number Diff line change
    @@ -141,7 +141,7 @@ sub add_line_numbers {

    my $line = 1;
    foreach (@$contents) {
    next if /^\s*\$\^H{/; # skip those damned strict lines
    next if /^\s*\$\^H\{/; # skip those damned strict lines
    if (/^#line\s+([0-9]+)/) {
    $line = $1;
    next;
    @@ -206,7 +206,7 @@ sub prefilter {
    #
    # $skip{sub_begin} filters this out

    if (/^sub BEGIN {/) {
    if (/^sub BEGIN \{/) {
    $skip{sub_begin} = 1;
    $skip++;
    }
    @@ -231,13 +231,13 @@ sub prefilter {
    # }
    #
    # $skip{strict_begin} filters this out
    if (/^(\s*)BEGIN {/) {
    if (/^(\s*)BEGIN \{/) {
    my $padding = $1;
    my $next = $contents->[ $i + 1 ];
    if ( $next =~ /^\s+\$\^H{/ ) {
    if ( $next =~ /^\s+\$\^H\{/ ) {
    my $index = $i + 2;
    STRICT_SEARCH: while ( $index++ ) {
    next if $contents->[$index] =~ /^\s+\$\^H{/; # $^H{'...'}
    next if $contents->[$index] =~ /^\s+\$\^H\{/; # $^H{'...'}
    last STRICT_SEARCH
    if not /^$padding}/
    ; # the block did not terminate as we expect
    @@ -297,4 +297,4 @@ =head1 OPTIONS
    if ( $ignore and $duplicate_code =~ /$ignore/ ) {
    # don't report this chunk of code as a duplicate
    }
    }
  2. @Ovid Ovid renamed this gist Dec 7, 2012. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. @Ovid Ovid created this gist Dec 7, 2012.
    300 changes: 300 additions & 0 deletions find_duplicate_code
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,300 @@
    #!/usr/bin/env perl

    use 5.12.0;
    use autodie;
    use Carp;
    use utf8::all;
    use File::Spec::Functions qw(catfile catdir);
    use File::Find::Rule;
    use Getopt::Long;
    use Capture::Tiny qw(capture);
    use File::Slurp;
    use Term::ANSIColor ':constants';

    local $Term::ANSIColor::AUTORESET = 1;

    GetOptions(
    'window=i' => \( my $window = 5 ),
    'dir=s' => \( my $dir = 'lib' ),
    'ignore=s@' => \my @ignore,
    ) or die "Bad options";
    my $IGNORE = join '|' => @ignore;

    my $CACHE_DIR = catdir( $ENV{HOME}, '.find_cnp' );

    if ( -d $CACHE_DIR ) {
    my @cached = File::Find::Rule->file->in($CACHE_DIR);
    unlink $_ for @cached;
    }
    else {
    mkdir $CACHE_DIR;
    }

    unless ( -d $dir ) {
    croak("Cannot find dir $dir");
    }

    my @files = File::Find::Rule->file->name('*.pm')->in($dir);
    my $num_files = @files;

    for my $i ( 0 .. $#files - 1 ) {
    my $next = $i + 1;
    print WHITE "Processing $next out of $num_files files ";
    for my $j ( $next .. $#files ) {
    print '.';
    my ( $first, $second ) = @files[ $i, $j ];
    search_for_dups( $first, $second, $window );
    }
    print "\n";
    }

    sub search_for_dups {
    my ( $first, $second, $window ) = @_;

    my $code1 = get_text($first);
    my $code2 = get_text($second);

    my %in_second = map { $_->{key} => 1 } @$code2;

    my $matches_found = 0;
    my $last_found = 0;
    foreach my $i ( 0 .. $#$code1 ) {
    if ( $in_second{ $code1->[$i]{key} } ) {
    if ( $i == $last_found + 1 ) {
    $matches_found++;
    }
    $last_found = $i;
    }
    }
    if ( $matches_found < $window ) {
    return;
    }

    # brute force is bad!
    LINE: foreach ( my $i = 0; $i < @$code1 - $window; $i++ ) {
    next LINE unless $in_second{ $code1->[$i]{key} };

    my @code1 = @{$code1}[ $i .. $#$code1 ];
    foreach my $j ( 0 .. $#$code2 - $window ) {
    my @code2 = @{$code2}[ $j .. $#$code2 ];
    my $matches = 0;
    my $longest = 0;
    WINDOW: foreach my $k ( 0 .. $#code1 ) {
    if ( $code1[$k]{key} eq $code2[$k]{key} ) {
    $matches++;
    my $length1 = length( $code1[$k]{code} );
    if ( $length1 > $longest ) {
    $longest = $length1;
    }
    my $length2 = length( $code2[$k]{code} );
    if ( $length1 > $longest ) {
    $longest = $length1;
    }
    }
    else {
    last WINDOW;
    }
    }
    if ( $matches >= $window ) {
    my $line1 = 0 + $code1[0]{line};
    my $line2 = 0 + $code2[0]{line};
    my $code_to_print = '';
    for ( 0 .. $matches - 1 ) {
    my ( $line1, $line2 ) =
    map { chomp; $_ } ( $code1[$_]{code}, $code2[$_]{code} );
    $code_to_print
    .= $line1 . ( ' ' x ( $longest - length($line1) ) );
    $code_to_print .= " | $line2\n";
    }
    $i += $window;
    if ( $IGNORE and $code_to_print =~ /$IGNORE/ ) {
    next LINE;
    }
    say BOLD RED
    "\nPossible match ($first near line $line1) ($second near line $line2)\n",
    "Begining at:";
    print WHITE $code_to_print;
    }
    }
    }
    }

    sub get_text {
    my $file = shift;
    my $filename = $file;
    $filename =~ s/\W/_/g;
    $filename = catfile( $CACHE_DIR, $filename );
    my @contents;
    if ( -f $filename ) {
    @contents = split /(\n)/ => read_file($filename);
    }
    else {
    ( undef, undef, @contents ) = capture {qx($^X -MO=Deparse,-l $file)};
    write_file( $filename, @contents );
    }
    return add_line_numbers( \@contents );
    }

    sub add_line_numbers {
    my $contents = prefilter(shift);
    my @contents;

    my $line = 1;
    foreach (@$contents) {
    next if /^\s*\$\^H{/; # skip those damned strict lines
    if (/^#line\s+([0-9]+)/) {
    $line = $1;
    next;
    }
    push @contents => {
    line => $line,
    key => munge_line($_),
    code => $_,
    };
    $line++;
    }
    return postfilter( \@contents );
    }

    sub postfilter {
    my $contents = shift;

    my @contents;
    INDEX: for ( my $i = 0; $i < @$contents; $i++ ) {
    if ( $contents->[$i]{code} =~ /^(\s*)BEGIN\s*{/ ) { # BEGIN {
    my $padding =~ $1;
    if ( $contents->[ $i + 1 ]{code} =~ /^$padding}/ ) {
    $DB::single = 1;
    $i++;
    next INDEX;
    }
    }
    push @contents => $contents->[$i];
    }

    #my $lines = join '' => map { $_->{code}} @contents;
    #say BLUE $lines;
    #<STDIN>;

    return \@contents;
    }

    sub prefilter {
    my $contents = shift;

    my @contents;
    my %skip = (
    sub_begin => 0,
    );
    my $skip = 0;

    LINE: for ( my $i = 0; $i < @$contents; $i++ ) {
    local $_ = $contents->[$i];
    next if /^\s*(?:use|require)\b/; # use/require
    next if /^\s*$/; # blank lines
    next if /^#(?!line\s+[0-9]+)/; # comments which aren't line directives


    # Modules which import things create code like this:
    #
    # sub BEGIN {
    # require strict;
    # do {
    # 'strict'->import('refs')
    # };
    # }
    #
    # $skip{sub_begin} filters this out

    if (/^sub BEGIN {/) {
    $skip{sub_begin} = 1;
    $skip++;
    }
    elsif ( $skip{sub_begin} and /^}/ ) {
    $skip{sub_begin} = 0;
    $skip--;
    next;
    }

    # Modules which use strict often have blocks like this:
    #
    # BEGIN {
    # $^H{'indirect'} = q(31664984);
    # $^H{'feature_unicode'} = q(1);
    # $^H{'autodie'} = q(Fatal :lexical :all);
    # $^H{'feature_say'} = q(1);
    # $^H{'guard Fatal'} = q(ARRAY(0x2300d20));
    # $^H{'feature_state'} = q(1);
    # $^H{'autovivification'} = q(52);
    # $^H{'utf8::all'} = q(1);
    # $^H{'feature_switch'} = q(1);
    # }
    #
    # $skip{strict_begin} filters this out
    if (/^(\s*)BEGIN {/) {
    my $padding = $1;
    my $next = $contents->[ $i + 1 ];
    if ( $next =~ /^\s+\$\^H{/ ) {
    my $index = $i + 2;
    STRICT_SEARCH: while ( $index++ ) {
    next if $contents->[$index] =~ /^\s+\$\^H{/; # $^H{'...'}
    last STRICT_SEARCH
    if not /^$padding}/
    ; # the block did not terminate as we expect
    $i = $index;
    next LINE;
    }
    }
    }

    push @contents => $_ unless $skip;
    }
    return \@contents;
    }

    sub munge_line {
    local $_ = shift;
    chomp;
    s/\s//g;
    return $_;
    }

    __END__
    =head1 NAME
    find_duplicate_code
    =head1 SYNOPSIS
    find_duplicate_code --window 7 --dir lib/ --ignore 'catch {'
    =head1 DESCRIPTION
    This program searches for cut-n-paste code. It does not (at the present time)
    try to account for cases where people may have changed variable names, but it
    applies a heuristic process for finding duplicate code and works moderately
    well. It's also slow.
    For every file in the target directory, C<--dir> (defaults to C<lib/>), it
    runs the code through L<B::Deparse>, caches it, and then walks through that
    code comparing C<--window> number of lines (default 5). If that many lines
    matches, as determined by stripping all whitespace and doing an C<eq>, then we
    have duplicated code.
    Blank lines are skipped, as are C<use> and C<require>. This may be
    configurable in the future.
    =head1 OPTIONS
    --window,-w Minimum number of lines to needed for a match (default 5)
    --dir,-d Directory of .pm files to search through (default 'lib/')
    --ignore,-i Duplicates to ignore (may be repeated).
    The C<--ignore> switch may be useful if you are repeatedly getting "duplicate"
    sections of code that you aren't interested in refactoring right now. The
    values of C<--ignore> are joined with a pipe and the check looks like this:
    if ( $ignore and $duplicate_code =~ /$ignore/ ) {
    # don't report this chunk of code as a duplicate
    }