Crawl4AI Markdown scraper for documentation dataset generation (Zscaler)
import asyncio
import os
from urllib.parse import urlparse
from pathlib import Path

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.deep_crawling.filters import FilterChain


def url_to_file_path(url, base_dir="crawled_docs"):
    """Convert URL to a safe file path with folder structure"""
    parsed = urlparse(url)

    # Create base directory structure
    domain = parsed.netloc.replace(":", "_")
    path_parts = parsed.path.strip("/").split("/") if parsed.path.strip("/") else []

    # Handle empty path (root page)
    if not path_parts or path_parts == [""]:
        path_parts = ["index"]

    # Create directory path
    dir_path = Path(base_dir) / domain
    if len(path_parts) > 1:
        dir_path = dir_path / Path(*path_parts[:-1])

    # Create filename from last part of path
    filename = path_parts[-1] if path_parts else "index"

    # Remove invalid characters and ensure .md extension
    filename = "".join(c for c in filename if c.isalnum() or c in ".-_")
    if not filename.endswith(".md"):
        filename += ".md"

    return dir_path, filename


class FileExistsFilter:
    """Filter to skip URLs whose corresponding files already exist"""

    def apply(self, url):
        """Return False if the file for this URL already exists"""
        try:
            dir_path, filename = url_to_file_path(url)
            file_path = dir_path / filename
            exists = file_path.exists()
            if exists:
                print(f"Skipping (file exists): {url} -> {file_path}")
            return not exists
        except Exception as e:
            print(f"Error checking file existence for {url}: {e}")
            return True  # Include by default if we can't check


async def main():
    # Create a scorer
    scorer = KeywordRelevanceScorer(
        keywords=["zia", "zpa", "data-protection", "security"],
        weight=0.7
    )

    # Create file existence filter
    file_filter = FileExistsFilter()
    filter_chain = FilterChain([file_filter])

    # Configure the strategy
    strategy = BestFirstCrawlingStrategy(
        max_depth=3,
        include_external=False,
        url_scorer=scorer,
        max_pages=100,             # Maximum number of pages to crawl (optional)
        filter_chain=filter_chain  # Add the file existence filter
    )

    # Configure markdown generator
    md_generator = DefaultMarkdownGenerator()

    config = CrawlerRunConfig(
        markdown_generator=md_generator,
        deep_crawl_strategy=strategy,
        stream=True
    )

    async with AsyncWebCrawler() as crawler:
        crawled_count = 0
        processed_urls = []

        async for result in await crawler.arun("https://help.zscaler.com/business-insights", config=config):
            print(f"Processing: {result.url}")
            processed_urls.append(result.url)

            if result.success and result.markdown:
                # Get file path structure from URL
                dir_path, filename = url_to_file_path(result.url)
                file_path = dir_path / filename

                # Create directory if it doesn't exist
                dir_path.mkdir(parents=True, exist_ok=True)

                # Save markdown to file
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(f"# {result.url}\n\n")
                    f.write(result.markdown)

                print(f"Saved: {file_path} (markdown length: {len(result.markdown)})")
                crawled_count += 1
            else:
                print(f"Failed to crawl: {result.error_message if hasattr(result, 'error_message') else 'No markdown content'}")

        print(f"Successfully crawled and saved {crawled_count} pages to markdown files")
        print("Files saved in: ./crawled_docs/")

        # Show some processed URLs
        print("\nProcessed URLs (showing first 3):")
        for i, url in enumerate(processed_urls[:3]):
            print(f"  {i+1}. {url}")


if __name__ == "__main__":
    asyncio.run(main())
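
For reference, a minimal sketch of how url_to_file_path maps a crawled URL onto the on-disk layout. The URL below is hypothetical (it only reuses the help.zscaler.com domain from the seed page); actual paths depend on what the crawl discovers, and the snippet assumes the function above is available in the same module.

# Hypothetical example of the URL-to-path mapping used by the scraper.
# "https://help.zscaler.com/zia/about-policies" is an assumed URL for illustration only.
dir_path, filename = url_to_file_path("https://help.zscaler.com/zia/about-policies")
print(dir_path / filename)  # crawled_docs/help.zscaler.com/zia/about-policies.md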