import asyncio
from urllib.parse import urlparse
from pathlib import Path

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.deep_crawling.filters import FilterChain


def url_to_file_path(url, base_dir="crawled_docs"):
    """Convert a URL to a safe file path with a folder structure."""
    parsed = urlparse(url)

    # Create base directory structure
    domain = parsed.netloc.replace(":", "_")
    path_parts = parsed.path.strip("/").split("/") if parsed.path.strip("/") else []

    # Handle empty path (root page)
    if not path_parts or path_parts == [""]:
        path_parts = ["index"]

    # Create directory path
    dir_path = Path(base_dir) / domain
    if len(path_parts) > 1:
        dir_path = dir_path / Path(*path_parts[:-1])

    # Create filename from last part of path
    filename = path_parts[-1] if path_parts else "index"

    # Remove invalid characters and ensure .md extension
    filename = "".join(c for c in filename if c.isalnum() or c in ".-_")
    if not filename.endswith(".md"):
        filename += ".md"

    return dir_path, filename


class FileExistsFilter:
    """Filter that skips URLs whose corresponding files already exist."""

    def apply(self, url):
        """Return False if the file for this URL already exists."""
        try:
            dir_path, filename = url_to_file_path(url)
            file_path = dir_path / filename
            exists = file_path.exists()
            if exists:
                print(f"Skipping (file exists): {url} -> {file_path}")
            return not exists
        except Exception as e:
            print(f"Error checking file existence for {url}: {e}")
            return True  # Include by default if we can't check


async def main():
    # Create a scorer
    scorer = KeywordRelevanceScorer(
        keywords=["zia", "zpa", "data-protection", "security"],
        weight=0.7
    )

    # Create file existence filter
    file_filter = FileExistsFilter()
    filter_chain = FilterChain([file_filter])

    # Configure the strategy
    strategy = BestFirstCrawlingStrategy(
        max_depth=3,
        include_external=False,
        url_scorer=scorer,
        max_pages=100,              # Maximum number of pages to crawl (optional)
        filter_chain=filter_chain   # Add the file existence filter
    )

    # Configure markdown generator
    md_generator = DefaultMarkdownGenerator()

    config = CrawlerRunConfig(
        markdown_generator=md_generator,
        deep_crawl_strategy=strategy,
        stream=True
    )

    async with AsyncWebCrawler() as crawler:
        crawled_count = 0
        processed_urls = []

        async for result in await crawler.arun("https://help.zscaler.com/business-insights", config=config):
            print(f"Processing: {result.url}")
            processed_urls.append(result.url)

            if result.success and result.markdown:
                # Get file path structure from URL
                dir_path, filename = url_to_file_path(result.url)
                file_path = dir_path / filename

                # Create directory if it doesn't exist
                dir_path.mkdir(parents=True, exist_ok=True)

                # Save markdown to file
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(f"# {result.url}\n\n")
                    f.write(result.markdown)

                print(f"Saved: {file_path} (markdown length: {len(result.markdown)})")
                crawled_count += 1
            else:
                print(f"Failed to crawl: {result.error_message if hasattr(result, 'error_message') else 'No markdown content'}")

        print(f"Successfully crawled and saved {crawled_count} pages to markdown files")
        print("Files saved in: ./crawled_docs/")

        # Show some processed URLs
        print("\nProcessed URLs (showing first 3):")
        for i, url in enumerate(processed_urls[:3]):
            print(f"  {i+1}. {url}")


if __name__ == "__main__":
    asyncio.run(main())