require "bundler/inline" gemfile do source "https://rubygems.org" gem "minitest" gem "async" gem "ferrum" gem "state_machines" gem "breaker_machines" gem "concurrent-ruby" gem "nokogiri" gem "pandoc-ruby" gem "webmock" gem "timecop" end require "minitest/autorun" require "minitest/pride" require "webmock/minitest" require "timecop" require_relative "async_scraper" class ScrapedPageTest < Minitest::Test def setup @url = "https://example.com" @page = ScrapedPage.new(@url) end def test_initialize assert_equal @url, @page.url assert_nil @page.html assert_nil @page.scraped_at assert_nil @page.error_message assert_equal 0, @page.retry_count assert_nil @page.last_error_at assert_equal "pending", @page.state end def test_state_machine_starts_in_pending_state assert @page.pending? assert @page.ready_for_processing? end def test_transitions_from_pending_to_processing @page.start_processing assert @page.processing? refute @page.ready_for_processing? end def test_transitions_from_processing_to_completed @page.start_processing @page.complete assert @page.completed? assert @page.success? refute @page.ready_for_processing? end def test_transitions_from_processing_to_failed @page.start_processing @page.mark_failed assert @page.failed? assert @page.can_retry? assert @page.ready_for_processing? end def test_transitions_from_failed_to_retrying @page.start_processing @page.mark_failed @page.retry_page assert @page.retrying? assert @page.ready_for_processing? end def test_transitions_to_permanently_failed_after_giving_up @page.start_processing @page.mark_failed @page.give_up assert @page.permanently_failed? refute @page.success? refute @page.ready_for_processing? end def test_record_error error = StandardError.new("Test error") Timecop.freeze do @page.record_error(error) assert_equal "Test error", @page.error_message assert_equal Time.now, @page.last_error_at assert_equal 1, @page.retry_count end end def test_record_success html = "Test" Timecop.freeze do @page.record_success(html) assert_equal html, @page.html assert_equal Time.now, @page.scraped_at assert_nil @page.error_message end end def test_can_retry_when_count_less_than_3 @page.start_processing @page.mark_failed assert @page.can_retry? end def test_prevents_retry_when_count_reaches_3 @page.start_processing @page.mark_failed @page.instance_variable_set(:@retry_count, 3) refute @page.can_retry? end def test_respects_backoff_delay @page.start_processing @page.mark_failed Timecop.freeze do @page.record_error(StandardError.new("Error")) refute @page.can_retry? # After first error, retry_count is 1, so backoff is 30 * (2**1) = 60 seconds Timecop.travel(61) # Past the 60 second backoff assert @page.can_retry? end end def test_implements_exponential_backoff @page.start_processing @page.mark_failed assert_equal 30, @page.send(:backoff_delay) @page.instance_variable_set(:@retry_count, 1) assert_equal 60, @page.send(:backoff_delay) @page.instance_variable_set(:@retry_count, 2) assert_equal 120, @page.send(:backoff_delay) end def test_recreates_page_from_hash_data data = { "url" => @url, "html" => "test", "scraped_at" => Time.now, "state" => "completed", "error_message" => nil, "retry_count" => 0, "last_error_at" => nil } restored_page = ScrapedPage.from_h(data) assert_equal @url, restored_page.url assert_equal "test", restored_page.html assert restored_page.completed? end end class AsyncScraperTest < Minitest::Test def setup @urls = ["https://example.com", "https://example.com/page2"] @scraper = AsyncScraper.new(*@urls, max_concurrent: 2, timeout: 5) end def test_sets_configuration_correctly assert_equal @urls, @scraper.instance_variable_get(:@initial_urls) assert_equal 2, @scraper.instance_variable_get(:@max_concurrent) assert_equal 5, @scraper.instance_variable_get(:@timeout) assert_equal "idle", @scraper.state end def test_raises_error_for_empty_urls assert_raises(ArgumentError) { AsyncScraper.new } end def test_raises_error_for_invalid_mode assert_raises(ArgumentError) { AsyncScraper.new("https://example.com", mode: :invalid) } end def test_accepts_spider_mode spider_scraper = AsyncScraper.new("https://example.com", mode: :spider) assert_equal :spider, spider_scraper.instance_variable_get(:@mode) end def test_sets_default_values default_scraper = AsyncScraper.new("https://example.com") assert_equal :scrape, default_scraper.instance_variable_get(:@mode) assert_equal 5, default_scraper.instance_variable_get(:@max_concurrent) assert_equal 10, default_scraper.instance_variable_get(:@timeout) assert_equal AsyncScraper::MAX_CONTENT_SIZE, default_scraper.instance_variable_get(:@max_content_size) end def test_starts_in_idle_state assert @scraper.idle? assert @scraper.can_start? end def test_transitions_to_running_when_started @scraper.start assert @scraper.running? assert @scraper.active? assert @scraper.can_process_pages? end def test_can_pause_and_resume @scraper.start @scraper.pause assert @scraper.paused? assert @scraper.active? refute @scraper.can_process_pages? @scraper.resume assert @scraper.running? assert @scraper.can_process_pages? end def test_can_stop_from_any_active_state @scraper.start @scraper.stop assert @scraper.stopped? refute @scraper.active? refute @scraper.can_process_pages? end def test_handles_error_state @scraper.start @scraper.error_occurred assert @scraper.error? refute @scraper.active? refute @scraper.can_process_pages? end def test_can_reset_to_idle @scraper.start @scraper.stop @scraper.reset assert @scraper.idle? end def test_returns_false_when_no_max_pages_set refute @scraper.send(:max_pages_reached?) end def test_returns_true_when_max_pages_reached limited_scraper = AsyncScraper.new("https://example.com", max_pages: 2) limited_scraper.instance_variable_get(:@pages_scraped).increment limited_scraper.instance_variable_get(:@pages_scraped).increment assert limited_scraper.send(:max_pages_reached?) end def test_extracts_and_queues_same_host_links_only html = <<~HTML Page 1 Page 2 Other Site HTML current_url = "https://example.com" @scraper.instance_variable_set(:@url_queue, Async::Queue.new) @scraper.send(:extract_and_queue_links, html, current_url) discovered_urls = @scraper.instance_variable_get(:@discovered_urls) assert_includes discovered_urls, "https://example.com/page1" refute_includes discovered_urls, "https://other-site.com/page" end def test_filters_out_non_content_urls html = <<~HTML CSS File Image HTML current_url = "https://example.com" @scraper.instance_variable_set(:@url_queue, Async::Queue.new) @scraper.send(:extract_and_queue_links, html, current_url) discovered_urls = @scraper.instance_variable_get(:@discovered_urls) refute_includes discovered_urls, "https://example.com/style.css" refute_includes discovered_urls, "https://example.com/image.jpg" end def test_ignores_special_link_types html = <<~HTML Section Link JS Link Email HTML current_url = "https://example.com" @scraper.instance_variable_set(:@url_queue, Async::Queue.new) @scraper.send(:extract_and_queue_links, html, current_url) discovered_urls = @scraper.instance_variable_get(:@discovered_urls) refute_includes discovered_urls, "#section" refute_includes discovered_urls, "javascript:void(0)" refute_includes discovered_urls, "mailto:test@example.com" end end class FileStateContainerTest < Minitest::Test def setup @file_path = "/tmp/test_scraper_state.json" @container = FileStateContainer.new(@file_path) @test_state = {"test" => "data", "number" => 42} end def teardown File.delete(@file_path) if File.exist?(@file_path) end def test_writes_and_reads_state_correctly @container.write_state(@test_state) assert File.exist?(@file_path) read_state = @container.read_state assert_equal @test_state, read_state end def test_returns_nil_when_file_doesnt_exist assert_nil @container.read_state end def test_handles_corrupted_json_gracefully File.write(@file_path, "invalid json") assert_nil @container.read_state end end class AsyncMarkdownScraperTest < Minitest::Test def setup @urls = ["https://example.com"] @markdown_scraper = AsyncMarkdownScraper.new(*@urls, max_concurrent: 1) end def test_creates_underlying_async_scraper assert_instance_of AsyncScraper, @markdown_scraper.scraper end def test_passes_options_to_async_scraper scraper = AsyncMarkdownScraper.new("https://example.com", mode: :spider, max_pages: 5) underlying_scraper = scraper.scraper assert_equal :spider, underlying_scraper.instance_variable_get(:@mode) assert_equal 5, underlying_scraper.instance_variable_get(:@max_pages) end def test_extracts_frontmatter_correctly html = <<~HTML Test Page Title

Content

HTML url = "https://example.com/test" frontmatter = @markdown_scraper.send(:extract_meta_frontmatter, html, url) assert_includes frontmatter, "title: \"Test Page Title\"" assert_includes frontmatter, "description: \"Test page description\"" assert_includes frontmatter, "- \"test\"" assert_includes frontmatter, "- \"page\"" assert_includes frontmatter, "- \"example\"" assert_includes frontmatter, "author: \"Test Author\"" assert_includes frontmatter, "language: \"en\"" assert_includes frontmatter, "canonical_url: \"https://example.com/canonical\"" assert_includes frontmatter, "robots: \"index, follow\"" assert_includes frontmatter, "url: \"https://example.com/test\"" assert_includes frontmatter, "scraped_at:" assert frontmatter.start_with?("---\n") assert_includes frontmatter, "\n---\n" end def test_handles_missing_meta_tags_gracefully minimal_html = "Minimal" url = "https://example.com/test" frontmatter = @markdown_scraper.send(:extract_meta_frontmatter, minimal_html, url) assert_includes frontmatter, "title: \"Minimal\"" assert_includes frontmatter, "url: \"https://example.com/test\"" refute_includes frontmatter, "description:" refute_includes frontmatter, "keywords:" end def test_escapes_quotes_in_content html_with_quotes = "Title with \"quotes\"" url = "https://example.com/test" frontmatter = @markdown_scraper.send(:extract_meta_frontmatter, html_with_quotes, url) assert_includes frontmatter, "title: \"Title with \\\"quotes\\\"\"" end def test_removes_unwanted_elements_and_converts_to_markdown html = <<~HTML Test

Main Title

Paragraph content

HTML markdown = @markdown_scraper.send(:html_to_clean_markdown, html) assert_includes markdown, "# Main Title" assert_includes markdown, "Paragraph content" refute_includes markdown, "alert(\"remove me\")" refute_includes markdown, "color: red" refute_includes markdown, "Navigation" refute_includes markdown, "Footer content" end end