import asyncio
from urllib.parse import urlparse
from pathlib import Path

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.deep_crawling.filters import FilterChain


def url_to_file_path(url, base_dir="crawled_docs"):
    """Convert a URL to a safe file path with a folder structure."""
    parsed = urlparse(url)

    # Create base directory structure
    domain = parsed.netloc.replace(":", "_")
    path_parts = parsed.path.strip("/").split("/") if parsed.path.strip("/") else []

    # Handle empty path (root page)
    if not path_parts or path_parts == [""]:
        path_parts = ["index"]

    # Create directory path
    dir_path = Path(base_dir) / domain
    if len(path_parts) > 1:
        dir_path = dir_path / Path(*path_parts[:-1])

    # Create filename from last part of path
    filename = path_parts[-1] if path_parts else "index"

    # Remove invalid characters and ensure .md extension
    filename = "".join(c for c in filename if c.isalnum() or c in ".-_")
    if not filename.endswith(".md"):
        filename += ".md"

    return dir_path, filename


class FileExistsFilter:
    """Filter that skips URLs whose corresponding files already exist."""

    def apply(self, url):
        """Return False if the file for this URL already exists."""
        try:
            dir_path, filename = url_to_file_path(url)
            file_path = dir_path / filename
            exists = file_path.exists()
            if exists:
                print(f"Skipping (file exists): {url} -> {file_path}")
            return not exists
        except Exception as e:
            print(f"Error checking file existence for {url}: {e}")
            return True  # Include by default if we can't check


async def main():
    # Create a scorer
    scorer = KeywordRelevanceScorer(
        keywords=["zia", "zpa", "data-protection", "security"],
        weight=0.7
    )

    # Create file existence filter
    file_filter = FileExistsFilter()
    filter_chain = FilterChain([file_filter])

    # Configure the strategy
    strategy = BestFirstCrawlingStrategy(
        max_depth=3,
        include_external=False,
        url_scorer=scorer,
        max_pages=100,              # Maximum number of pages to crawl (optional)
        filter_chain=filter_chain   # Add the file existence filter
    )

    # Configure markdown generator
    md_generator = DefaultMarkdownGenerator()

    config = CrawlerRunConfig(
        markdown_generator=md_generator,
        deep_crawl_strategy=strategy,
        stream=True
    )

    async with AsyncWebCrawler() as crawler:
        crawled_count = 0
        processed_urls = []

        async for result in await crawler.arun("https://help.zscaler.com/business-insights", config=config):
            print(f"Processing: {result.url}")
            processed_urls.append(result.url)

            if result.success and result.markdown:
                # Get file path structure from URL
                dir_path, filename = url_to_file_path(result.url)
                file_path = dir_path / filename

                # Create directory if it doesn't exist
                dir_path.mkdir(parents=True, exist_ok=True)

                # Save markdown to file
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(f"# {result.url}\n\n")
                    f.write(result.markdown)

                print(f"Saved: {file_path} (markdown length: {len(result.markdown)})")
                crawled_count += 1
            else:
                print(f"Failed to crawl: {result.error_message if hasattr(result, 'error_message') else 'No markdown content'}")

        print(f"Successfully crawled and saved {crawled_count} pages to markdown files")
        print("Files saved in: ./crawled_docs/")

        # Show some processed URLs
        print("\nProcessed URLs (showing first 3):")
        for i, url in enumerate(processed_urls[:3]):
            print(f"  {i+1}. {url}")


if __name__ == "__main__":
    asyncio.run(main())