#!/usr/bin/env perl
use Modern::Perl;
use Mojo::UserAgent;
use Mojo::IOLoop;
use Mojo::URL;

# FIFO queue, seeded with the crawl root
my $seed_url = 'http://www.google.co.uk/';
my @urls     = ($seed_url);

# User agent following up to 5 redirects
my $ua = Mojo::UserAgent->new(max_redirects => 5);

# Track URLs already queued and the number of active connections
my %seen   = ($seed_url => 1);
my $active = 0;

Mojo::IOLoop->recurring(0 => sub {

  # Keep up to 4 parallel crawlers sharing the same user agent
  for ($active + 1 .. 4) {

    # Dequeue or halt if there are no active crawlers anymore
    return ($active or Mojo::IOLoop->stop) unless my $url = shift @urls;

    # Fetch non-blocking just by adding a callback and marking as active
    ++$active;
    $ua->get($url => sub {
      my ($ua, $tx) = @_;

      # Deactivate and skip link extraction if the transaction failed
      if ($tx->error) {
        warn "Failed to fetch $url\n";
        --$active;
        return;
      }

      # Extract and enqueue URLs
      say $url;
      for my $e ($tx->res->dom('a[href]')->each) {
        my $link = Mojo::URL->new($e->{href})->to_abs($tx->req->url);
        $link = clean_url($link);
        next unless is_valid_url($link);
        next if $seen{$link}++;
        push @urls, $link;
        say " -> $urls[-1]";
      }

      # Pages without a <title> element are silently skipped here
      if (my $title = $tx->res->dom->at('title')) {
        say '    title: ' . $title->text;
      }

      # Deactivate
      --$active;
    });
  }
});

# Start event loop if necessary
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;

sub is_valid_url {
  my $url = shift;

  # Skip asset and document files
  return undef
    if $url =~ m{\.(?:css|js|png|pdf|jpe?g|docx?|xlsx?|pptx?|gif)$}i;

  # Don't crawl external sites (\Q...\E escapes the dots in the seed URL)
  return undef unless $url =~ m{^\Q$seed_url\E}i;

  return 1;
}

sub clean_url {
  my $url = shift;

  # Remove URL fragment
  $url->fragment(undef);

  # Remove session id and Google Analytics tracking params via
  # Mojo::Parameters, which keeps the rest of the query string intact
  my $params = $url->query;
  $params->remove($_)
    for grep { /^(?:j?session(?:id)?|utm_(?:medium|source|campaign|content))$/i }
    @{$params->names};

  return $url;
}
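
# ---------------------------------------------------------------------
# Usage sketch (the file name crawler.pl below is illustrative, not part
# of the original script). The crawler prints each page as it is
# fetched, followed by the links it queued from that page and the page
# title, until the queue drains and the event loop stops:
#
#   $ perl crawler.pl
#   http://www.google.co.uk/
#    -> ...queued links...
#       title: ...
#
# To crawl a different site, change $seed_url above; is_valid_url()
# restricts the crawl to URLs beginning with that prefix, so links to
# other hosts are never queued.
# ---------------------------------------------------------------------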