Skip to content

Instantly share code, notes, and snippets.

@dnmfarrell
Created January 14, 2019 01:15
Show Gist options
  • Select an option

  • Save dnmfarrell/5dde6d3957bf9ae037e170cdb44f75a5 to your computer and use it in GitHub Desktop.

Select an option

Save dnmfarrell/5dde6d3957bf9ae037e170cdb44f75a5 to your computer and use it in GitHub Desktop.

Revisions

  1. dnmfarrell created this gist Jan 14, 2019.
    114 changes: 114 additions & 0 deletions spider.pl
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,114 @@
    #!/usr/bin/env perl
    use strict;
    use warnings;
    use HTTP::Tiny;
    use Getopt::Long 'GetOptions';
    use Encode qw(encode decode);
    use Parallel::ForkManager;
    use PerlIO::gzip;
    use Selenium::Remote::Driver;
    use Time::HiRes 'sleep';

    # Command-line options and their defaults:
    #   --processes N   size handed to Parallel::ForkManager (default 2)
    #   --output DIR    directory the per-domain .gz files are written to (default 'data')
    #   --depth N       link depth passed to spider_site (default 1)
    #   --trace         enable the extra diagnostics printed with warn (default off)
    my $max_processes = 2;
    my $output_dir    = 'data';
    my $depth         = 1;
    my $trace         = 0;

    my @option_spec = (
        'processes=i' => \$max_processes,
        'output=s'    => \$output_dir,
        'depth=i'     => \$depth,
        'trace'       => \$trace,
    );
    GetOptions(@option_spec)
        or die 'unrecognized options';

    # Fail fast, before forking any workers, if the Selenium server is not up.
    ping_selenium_server();

    my $pm = Parallel::ForkManager->new($max_processes);

    # Read one domain per input line (files named on the command line, or
    # STDIN) and spider each one in its own child process. Each child writes
    # the collected page text to "$output_dir/<domain>.gz".
    DOMAIN:
    while (my $d = <<>>) {
        chomp $d;
        # a blank line would otherwise fork a worker that writes ".gz" and
        # fetches "http://"
        next DOMAIN unless $d =~ /\S/;

        # start() returns the child's pid (true) in the parent and 0 in the
        # child, so only the child continues past this line
        $pm->start and next DOMAIN;

        # slashes cannot appear in a file name, so flatten them
        my $path = $d =~ s{/}{__}gr;
        print "$d -> $path\n";
        my $file = "$output_dir/$path.gz";
        open my $FH, '>:raw:gzip', $file or die "unable to open $file: $!";

        my $driver = Selenium::Remote::Driver->new(
            browser_name       => 'chrome',
            extra_capabilities => {
                chromeOptions => {
                    # a plain list rather than qw(): the comma in the window
                    # size makes qw() emit a "separate words with commas" warning
                    args => [
                        'window-size=1920,1080',
                        'no-sandbox',                     # allow to be run as root
                        'headless',                       # no GUI
                        # intended to load websites with invalid ssl certs
                        # NOTE(review): confirm this flag has that effect
                        'allow-running-insecure-content',
                        'disable-infobars',               # disable "chrome is being controlled by software" notification
                    ],
                },
            },
        );
        # spider_site keeps its set of already-fetched URLs on the driver object
        $driver->{visited} = {};
        my $url = "http://$d";

        # Trap crawl failures so the browser session is always released and
        # the output file is always closed, then re-raise the original error.
        my $ok  = eval { spider_site($driver, $url, $depth, $FH); 1 };
        my $err = $@;

        # quitting a session that already broke may itself throw; do not let
        # that mask the crawl error
        eval { $driver->quit(); 1 }
            or warn "unable to quit selenium session for $d: $@";

        # write errors on buffered, compressed output only surface at close
        close $FH or die "unable to close $file: $!";

        die $err unless $ok;
        $pm->finish;
    }
    $pm->wait_all_children;
    # Fetch $url with the Selenium driver, append the page body text to $FH
    # (UTF-8 encoded), then recursively visit the links on the page that point
    # at the same host or one of its subdomains.
    #
    #   $driver - driver object; $driver->{visited} is the set of URLs already
    #             fetched during this crawl and is updated here
    #   $url    - URL to fetch
    #   $depth  - how many more levels of links to follow; at 0 only $url
    #             itself is fetched
    #   $FH     - output filehandle the page text is printed to
    #
    # Reads the file-level $trace flag to decide whether to warn diagnostics.
    # Errors thrown by the driver are not caught here.
    sub spider_site {
        my ($driver, $url, $depth, $FH) = @_;
        warn "fetching $url\n";
        $driver->get($url);
        $driver->{visited}{$url}++;

        # NOTE(review): URI is never loaded explicitly in this file; presumably
        # it is pulled in by one of the other modules - confirm.
        my $host = URI->new($url)->host;
        # some sites have links to the parent domain without www
        $host =~ s/^www\.//;

        my $text = $driver->get_body;
        print {$FH} encode('UTF-8', $text);

        if ($depth <= 0) {
            warn "have reached maximum depth\n" if $trace;
            return;
        }

        my @links = $driver->find_elements('a', 'tag_name');
        warn sprintf "found %s links\n", scalar @links if $trace;

        # Match the host itself or a true subdomain of it. The leading
        # (?:\A|\.) stops "notexample.com" passing as part of "example.com",
        # and /i is used because host names are case-insensitive.
        my $same_site = qr/(?:\A|\.)\Q$host\E\z/i;

        my @urls = ();
        for my $link (@links) {
            # get_attribute can throw; treat that the same as a missing href
            my $link_url = eval { $link->get_attribute('href') };
            next unless $link_url;
            my $link_uri = URI->new($link_url);
            next unless $link_uri->can('host'); # not all URIs have a domain
            # host may be undefined when the URI has no authority part
            my $link_host = $link_uri->host // '';
            # only visit links to subdomains of our starting URL
            if ($link_host =~ $same_site) {
                push @urls, $link_url;
                warn sprintf "included %s\n", $link_url if $trace;
            }
            else {
                warn sprintf "skipped %s (%s) different subdomain to %s (%s)\n", $link_url, $link_host, $url, $host if $trace;
            }
        }

        # All hrefs were collected above because the recursive call navigates
        # the driver away from this page. The visited check is made at visit
        # time, so duplicates in @urls are fetched only once.
        for my $u (@urls) {
            if ($driver->{visited}{$u}) {
                warn sprintf "already visited, ignoring %s\n", $u if $trace;
            }
            else {
                # random pause of up to a second between page fetches
                sleep rand 1;
                spider_site($driver, $u, $depth - 1, $FH);
            }
        }
        return;
    }
    # Poll the Selenium server's status endpoint until it answers successfully.
    #
    #   $url          - status URL to poll
    #                   (default 'http://127.0.0.1:4444/wd/hub/status')
    #   $max_attempts - number of requests to make before giving up (default 60)
    #   $delay        - seconds to wait between attempts (default 0.1)
    #
    # Returns 1 as soon as a request succeeds; dies with
    # 'unable to find selenium server' if every attempt fails.
    sub ping_selenium_server {
        my ($url, $max_attempts, $delay) = @_;
        $url          //= 'http://127.0.0.1:4444/wd/hub/status';
        $max_attempts //= 60;
        $delay        //= 0.1;

        my $ua = HTTP::Tiny->new;
        for my $attempt (1 .. $max_attempts) {
            # return immediately on success, so a success on the final attempt
            # is not reported as a failure and no time is wasted sleeping
            return 1 if $ua->get($url)->{success};
            # no point pausing after the last attempt
            sleep $delay if $delay > 0 && $attempt < $max_attempts;
        }
        die 'unable to find selenium server';
    }