Crawl4AI Markdown scraper for documentation dataset generation (Zscaler)
import asyncio
import os
from urllib.parse import urlparse
from pathlib import Path

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.deep_crawling.filters import FilterChain


def url_to_file_path(url, base_dir="crawled_docs"):
    """Convert URL to a safe file path with folder structure"""
    parsed = urlparse(url)

    # Create base directory structure
    domain = parsed.netloc.replace(":", "_")
    path_parts = parsed.path.strip("/").split("/") if parsed.path.strip("/") else []

    # Handle empty path (root page)
    if not path_parts or path_parts == [""]:
        path_parts = ["index"]

    # Create directory path
    dir_path = Path(base_dir) / domain
    if len(path_parts) > 1:
        dir_path = dir_path / Path(*path_parts[:-1])

    # Create filename from last part of path
    filename = path_parts[-1] if path_parts else "index"

    # Remove invalid characters and ensure .md extension
    filename = "".join(c for c in filename if c.isalnum() or c in ".-_")
    if not filename.endswith(".md"):
        filename += ".md"

    return dir_path, filename


class FileExistsFilter:
    """Filter to skip URLs whose corresponding files already exist"""

    def apply(self, url):
        """Return False if the file for this URL already exists"""
        try:
            dir_path, filename = url_to_file_path(url)
            file_path = dir_path / filename
            exists = file_path.exists()
            if exists:
                print(f"Skipping (file exists): {url} -> {file_path}")
            return not exists
        except Exception as e:
            print(f"Error checking file existence for {url}: {e}")
            return True  # Include by default if we can't check


async def main():
    # Create a scorer
    scorer = KeywordRelevanceScorer(
        keywords=["zia", "zpa", "data-protection", "security"],
        weight=0.7
    )

    # Create file existence filter
    file_filter = FileExistsFilter()
    filter_chain = FilterChain([file_filter])

    # Configure the strategy
    strategy = BestFirstCrawlingStrategy(
        max_depth=3,
        include_external=False,
        url_scorer=scorer,
        max_pages=100,             # Maximum number of pages to crawl (optional)
        filter_chain=filter_chain  # Add the file existence filter
    )

    # Configure markdown generator
    md_generator = DefaultMarkdownGenerator()

    config = CrawlerRunConfig(
        markdown_generator=md_generator,
        deep_crawl_strategy=strategy,
        stream=True
    )

    async with AsyncWebCrawler() as crawler:
        crawled_count = 0
        processed_urls = []

        async for result in await crawler.arun("https://help.zscaler.com/business-insights", config=config):
            print(f"Processing: {result.url}")
            processed_urls.append(result.url)

            if result.success and result.markdown:
                # Get file path structure from URL
                dir_path, filename = url_to_file_path(result.url)
                file_path = dir_path / filename

                # Create directory if it doesn't exist
                dir_path.mkdir(parents=True, exist_ok=True)

                # Save markdown to file
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write(f"# {result.url}\n\n")
                    f.write(result.markdown)

                print(f"Saved: {file_path} (markdown length: {len(result.markdown)})")
                crawled_count += 1
            else:
                print(f"Failed to crawl: {result.error_message if hasattr(result, 'error_message') else 'No markdown content'}")

        print(f"Successfully crawled and saved {crawled_count} pages to markdown files")
        print("Files saved in: ./crawled_docs/")

        # Show some processed URLs
        print("\nProcessed URLs (showing first 3):")
        for i, url in enumerate(processed_urls[:3]):
            print(f"  {i+1}. {url}")


if __name__ == "__main__":
    asyncio.run(main())
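
For reference, a minimal sketch of how url_to_file_path maps a crawled URL onto the on-disk layout. The URL below is hypothetical (it only reuses the help.zscaler.com domain from the seed page); actual paths depend on what the crawl discovers, and the snippet assumes the function above is available in the same module.

# Hypothetical example of the URL-to-path mapping used by the scraper.
# "https://help.zscaler.com/zia/about-policies" is an assumed URL for illustration only.
dir_path, filename = url_to_file_path("https://help.zscaler.com/zia/about-policies")
print(dir_path / filename)  # crawled_docs/help.zscaler.com/zia/about-policies.md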