@norandom
Created July 8, 2025 10:51
Crawl4AI Markdown scraper for documentation dataset generation (Zscaler)
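
# Usage sketch (assumptions, not part of the original gist: Crawl4AI installed via
# `pip install crawl4ai` plus its one-time `crawl4ai-setup` browser install; the
# script name crawl_zscaler_docs.py is illustrative):
#
#   python crawl_zscaler_docs.py
#
# Pages are written under ./crawled_docs/<domain>/<path>.md, mirroring the site layout.
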
import asyncio
from urllib.parse import urlparse
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.deep_crawling.filters import FilterChain


def url_to_file_path(url, base_dir="crawled_docs"):
    """Convert URL to a safe file path with folder structure"""
    parsed = urlparse(url)

    # Create base directory structure
    domain = parsed.netloc.replace(":", "_")
    path_parts = parsed.path.strip("/").split("/") if parsed.path.strip("/") else []

    # Handle empty path (root page)
    if not path_parts or path_parts == [""]:
        path_parts = ["index"]

    # Create directory path
    dir_path = Path(base_dir) / domain
    if len(path_parts) > 1:
        dir_path = dir_path / Path(*path_parts[:-1])

    # Create filename from last part of path
    filename = path_parts[-1] if path_parts else "index"

    # Remove invalid characters and ensure .md extension
    filename = "".join(c for c in filename if c.isalnum() or c in ".-_")
    if not filename.endswith(".md"):
        filename += ".md"

    return dir_path, filename
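
# Illustrative example (the URL path below is hypothetical, not a verified Zscaler page):
#   url_to_file_path("https://help.zscaler.com/zia/about-policies")
#   -> (Path("crawled_docs/help.zscaler.com/zia"), "about-policies.md")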


class FileExistsFilter:
    """Filter to skip URLs whose corresponding files already exist"""

    def apply(self, url):
        """Return False if the file for this URL already exists"""
        try:
            dir_path, filename = url_to_file_path(url)
            file_path = dir_path / filename
            exists = file_path.exists()
            if exists:
                print(f"Skipping (file exists): {url} -> {file_path}")
            return not exists
        except Exception as e:
            print(f"Error checking file existence for {url}: {e}")
            return True  # Include by default if we can't check
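
# Note (assumption about crawl4ai's FilterChain behavior): each filter's apply(url) is
# consulted before a candidate URL is fetched, so returning False here skips already-saved
# pages and lets an interrupted crawl resume without re-downloading anything.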


async def main():
    # Create a scorer
    scorer = KeywordRelevanceScorer(
        keywords=["zia", "zpa", "data-protection", "security"],
        weight=0.7
    )

    # Create file existence filter
    file_filter = FileExistsFilter()
    filter_chain = FilterChain([file_filter])

    # Configure the strategy
    strategy = BestFirstCrawlingStrategy(
        max_depth=3,
        include_external=False,
        url_scorer=scorer,
        max_pages=100,  # Maximum number of pages to crawl (optional)
        filter_chain=filter_chain  # Add the file existence filter
    )

    # Configure markdown generator
    md_generator = DefaultMarkdownGenerator()
    config = CrawlerRunConfig(
        markdown_generator=md_generator,
        deep_crawl_strategy=strategy,
        stream=True
    )

    async with AsyncWebCrawler() as crawler:
        crawled_count = 0
        processed_urls = []

        async for result in await crawler.arun("https://help.zscaler.com/business-insights", config=config):
            print(f"Processing: {result.url}")
            processed_urls.append(result.url)

            if result.success and result.markdown:
                # Get file path structure from URL
                dir_path, filename = url_to_file_path(result.url)
                file_path = dir_path / filename

                # Create directory if it doesn't exist
                dir_path.mkdir(parents=True, exist_ok=True)

                # Save markdown to file
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(f"# {result.url}\n\n")
                    f.write(result.markdown)

                print(f"Saved: {file_path} (markdown length: {len(result.markdown)})")
                crawled_count += 1
            else:
                print(f"Failed to crawl: {result.error_message if hasattr(result, 'error_message') else 'No markdown content'}")

        print(f"Successfully crawled and saved {crawled_count} pages to markdown files")
        print("Files saved in: ./crawled_docs/")

        # Show some processed URLs
        print("\nProcessed URLs (showing first 3):")
        for i, url in enumerate(processed_urls[:3]):
            print(f" {i+1}. {url}")


if __name__ == "__main__":
    asyncio.run(main())