#!/usr/bin/env python3
'''
Greetings bug-slaying brothers of the pythonian blood.

This script takes my httpx output after slamming in a bunch of subdomains
and organizes it so its a bit easier to read and work with.

The HTTPX payload I use first is:
httpx -sc -cl -title -bp -server -td -ip -cname -asn -cdn -vhost -fhr | anew httpx-quicc

This script will organize the data by status code and then from smallest to
largest for each status code

output looks like:

#200s
https://goodurl.com/ [18]
https://based.goodurl.com [1049]

#301s
https://redir.based.goodurl.com [223]

and so on...
'''
import sys
import re
from collections import defaultdict

# ANSI color codes made parsing this total hell until I removed them.
# Compiled once at module level instead of on every call.
ANSI_RE = re.compile(r'\x1b\[[0-9;]*m')


def strip_ansi(text):
    """Remove ANSI SGR color escape sequences (e.g. "\\x1b[32m") from text."""
    return ANSI_RE.sub('', text)


def parse_httpx_line(line):
    """Parse one line of httpx output.

    Returns a dict with keys 'url', 'status' (primary status code as a
    string), 'content_length' (int) and 'raw_status' (full first-bracket
    contents, e.g. "301, 302" for redirect chains), or None when the line
    cannot be parsed.
    """
    # Strip colors BEFORE the scheme check: a leading ANSI escape would
    # otherwise make startswith() reject a perfectly valid URL line.
    clean_line = strip_ansi(line).strip()
    if not clean_line or not clean_line.startswith(('http://', 'https://')):
        return None
    try:
        parts = clean_line.split(' ', 1)
        if len(parts) < 2:
            return None
        url, rest = parts

        # bracket extraction ceremony: httpx wraps each field in [...];
        # first is status code, second content length (-sc -cl flag order)
        brackets = re.findall(r'\[([^\]]*)\]', rest)
        if len(brackets) < 2:
            return None

        status_raw = brackets[0]
        try:
            content_length = int(brackets[1])
        except ValueError:
            return None

        # Redirect chains show up as "[301, 302]"; keep only the first hop.
        if ',' in status_raw:
            primary_status = status_raw.split(',')[0].strip()
        else:
            primary_status = status_raw

        # some of httpx's returned status codes are very creative and abstract
        if not primary_status.isdigit():
            return None

        return {
            'url': url,
            'status': primary_status,
            'content_length': content_length,
            'raw_status': status_raw,
        }
    except Exception as e:
        # clean_line is always bound here (assigned before the try block),
        # so this handler can never raise a masking NameError.
        print(f"We in trouble: {clean_line[:50]}... - {e}", file=sys.stderr)
        return None


def main():
    """Read httpx output (file argument or stdin), group by status code,
    and print each group sorted by ascending content length."""
    try:
        # you can pipe your httpx output or pass a filename as an arg
        if len(sys.argv) > 1:
            with open(sys.argv[1], 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
        else:
            lines = sys.stdin.readlines()

        status_groups = defaultdict(list)
        for line in lines:
            parsed = parse_httpx_line(line)
            if parsed:
                status_groups[parsed['status']].append(parsed)

        if not status_groups:
            print("Your data be looking sus and fried. try again. please.", file=sys.stderr)
            return

        # Numeric sort of status codes, then ascending content length inside
        # each group.
        for status in sorted(status_groups, key=int):
            entries = sorted(status_groups[status], key=lambda x: x['content_length'])
            print(f"#{status}s")
            for entry in entries:
                print(f"{entry['url']} [{entry['content_length']}]")
            print()
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()