Created
January 14, 2019 01:15
-
-
Save dnmfarrell/5dde6d3957bf9ae037e170cdb44f75a5 to your computer and use it in GitHub Desktop.
Revisions
-
dnmfarrell created this gist
Jan 14, 2019. There are no files selected for viewing.
#!/usr/bin/env perl
#
# Spider a list of domains with headless Chrome driven through a local
# Selenium server. Domains are read one per line from the files named on the
# command line (or STDIN). For each domain a child process is forked that
# writes the UTF-8 encoded body text of every visited page into
# "<output>/<domain>.gz".
#
# Options:
#   --processes N   maximum number of concurrent child processes (default 2)
#   --output DIR    directory the .gz files are written to (default "data")
#   --depth N       how many levels of same-site links to follow (default 1)
#   --trace         emit extra diagnostics on STDERR
use 5.022;    # the <<>> operator used below needs Perl 5.22
use strict;
use warnings;
use HTTP::Tiny;
use Getopt::Long 'GetOptions';
use Encode qw(encode decode);
use Parallel::ForkManager;
use PerlIO::gzip;
use Selenium::Remote::Driver;
use Time::HiRes 'sleep';
use URI;      # used by spider_site; do not rely on another module loading it

GetOptions(
    'processes=i' => \(my $max_processes = 2),
    'output=s'    => \(my $output_dir    = 'data'),
    'depth=i'     => \(my $depth         = 1),
    'trace'       => \(my $trace         = 0),
) or die 'unrecognized options';

ping_selenium_server();

my $pm = Parallel::ForkManager->new($max_processes);

DOMAIN:
while (my $domain = <<>>) {
    # Clean the line up *before* forking so that blank lines never cost a
    # child process, an empty ".gz" file and a request for "http://".
    $domain =~ s/\A\s+|\s+\z//g;
    next DOMAIN unless length $domain;

    # start() returns the child's pid in the parent and 0 in the child.
    $pm->start and next DOMAIN;

    # Everything from here to finish() runs in the child only.
    my $path = $domain =~ s{/}{__}gr;
    my $file = "$output_dir/$path.gz";
    print "$domain -> $path\n";
    open my $FH, '>:raw:gzip', $file or die "open $file: $!";

    my $driver = Selenium::Remote::Driver->new(
        browser_name       => 'chrome',
        extra_capabilities => {
            chromeOptions => {
                args => [
                    'window-size=1920,1080',
                    'no-sandbox',                        # allow to be run as root
                    'headless',                          # no GUI
                    'allow-running-insecure-content',    # load websites with invalid ssl certs
                    'disable-infobars',                  # disable "chrome is being controlled by software" notification
                ],
            },
        },
    );

    # spider_site keeps its set of already fetched URLs on the driver object.
    $driver->{visited} = {};

    # Trap failures so the browser session is ended and the output is
    # flushed even when a page load dies; the error is re-thrown afterwards.
    my $ok    = eval { spider_site($driver, "http://$domain", $depth, $FH); 1 };
    my $error = $@ || 'unknown error';

    eval { $driver->quit(); 1 } or warn "could not quit driver for $domain: $@";
    close $FH or die "close $file: $!";    # buffered gzip write errors surface here
    die $error unless $ok;

    $pm->finish;
}
$pm->wait_all_children;

# spider_site($driver, $url, $depth, $FH)
#
# Loads $url in $driver, appends the UTF-8 encoded body text of the page to
# $FH and, while $depth is greater than zero, recurses into every not yet
# visited link whose host is the starting host or one of its subdomains.
#
#   $driver  Selenium::Remote::Driver object; $driver->{visited} must be a
#            hash ref, it records the URLs fetched so far
#   $url     absolute URL to fetch
#   $depth   number of further link levels to follow
#   $FH      output file handle
#
# Exceptions thrown by the driver while loading or reading a page propagate
# to the caller.
sub spider_site {
    my ($driver, $url, $depth, $FH) = @_;

    warn "fetching $url\n";
    $driver->get($url);
    $driver->{visited}{$url}++;

    my $host = lc(URI->new($url)->host // '');

    # some sites have links to the parent domain without www
    $host =~ s/^www\.//;

    my $text = $driver->get_body;
    print {$FH} encode('UTF-8', $text);

    if ($depth <= 0) {
        warn "have reached maximum depth\n" if $trace;
        return;
    }

    my @links = $driver->find_elements('a', 'tag_name');
    warn sprintf("found %s links\n", scalar @links) if $trace;

    my @urls;
    LINK:
    for my $link (@links) {
        # A failing get_attribute call is treated like a missing href.
        my $link_url = eval { $link->get_attribute('href') };
        next LINK unless $link_url;

        my $link_uri = URI->new($link_url);
        next LINK unless $link_uri->can('host');    # not all URIs have a domain

        my $link_host = $link_uri->host;
        next LINK unless defined $link_host && length $link_host;

        # only visit links to subdomains of our starting URL; the match is
        # anchored on a label boundary so that "notexample.com" is not taken
        # for a subdomain of "example.com"
        if (lc($link_host) =~ /(?:\A|\.)\Q$host\E\z/) {
            push @urls, $link_url;
            warn sprintf("included %s\n", $link_url) if $trace;
        }
        else {
            warn sprintf("skipped %s (%s) different subdomain to %s (%s)\n",
                $link_url, $link_host, $url, $host)
                if $trace;
        }
    }

    for my $next_url (@urls) {
        # Checked here rather than while collecting: the recursion below
        # keeps adding to the visited set.
        if ($driver->{visited}{$next_url}) {
            warn sprintf("already visited, ignoring %s\n", $next_url) if $trace;
            next;
        }
        sleep(rand 1);    # random pause of up to one second between requests
        spider_site($driver, $next_url, $depth - 1, $FH);
    }
    return;
}

# ping_selenium_server()
#
# Polls the status URL of the Selenium server on 127.0.0.1:4444 until a
# request succeeds: up to 61 requests with a 0.1 s pause after each failure.
# Returns as soon as one succeeds; dies with 'unable to find selenium server'
# when none does.
sub ping_selenium_server {
    my $ua           = HTTP::Tiny->new;
    my $max_attempts = 61;

    for my $attempt (1 .. $max_attempts) {
        my $res = $ua->get('http://127.0.0.1:4444/wd/hub/status');
        return if $res->{success};
        sleep 0.1;
    }
    die 'unable to find selenium server';
}