Skip to content

Instantly share code, notes, and snippets.

@shayanb
Created February 19, 2025 10:07
Show Gist options
  • Save shayanb/1ca4483d29c65ff47fa644417fe6920f to your computer and use it in GitHub Desktop.
Save shayanb/1ca4483d29c65ff47fa644417fe6920f to your computer and use it in GitHub Desktop.

Revisions

  1. shayanb created this gist Feb 19, 2025.
    177 changes: 177 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,177 @@
    import json
    import requests
    from pathlib import Path
    import re
    from datetime import datetime
    import base62

    def process_forum_topics():
    """Process all markdown files in forum_topics directory to fix image links"""
    topics_dir = Path('forum_topics')

    # Create images directory if it doesn't exist
    images_dir = Path('images')
    images_dir.mkdir(exist_ok=True)

    # Create attachments directory if it doesn't exist
    attachments_dir = Path('attachments')
    attachments_dir.mkdir(exist_ok=True)

    # Process each markdown file
    for md_file in topics_dir.glob('*.md'):
    # Read the file content
    with open(md_file, 'r', encoding='utf-8') as f:
    content = f.read()

    # Fix image and attachment links
    updated_content = fix_image_links(content)

    # Write back the updated content
    with open(md_file, 'w', encoding='utf-8') as f:
    f.write(updated_content)
    def fix_image_links(markdown_text):
    """Replace Discourse upload links with relative paths to images directory"""
    # Match both image and attachment formats:
    # ![title|dimensions](attachments/hash.ext)
    # ![title|dimensions](upload://hash.ext)
    # [title|attachment](attachments/hash.ext)
    # [title|attachment](upload://hash.ext)
    # ![[title|dimensions](images/hash.ext)]
    patterns = [
    # Match ![title|dimensions](attachments/hash.ext) and ![title|dimensions](upload://hash.ext)
    r'!\[([^\]]+)\]\((?:attachments|upload:\/\/)\/([a-zA-Z0-9]+)(?:\.(\w+))?\)',
    # Match [title|attachment](attachments/hash.ext) and [title|attachment](upload://hash.ext)
    r'\[([^\]]+)\]\((?:attachments|upload:\/\/)\/([a-zA-Z0-9]+)(?:\.(\w+))?\)',
    # Match ![title|dimensions](images/hash.ext)
    r'!\[([^\]]+)\]\(images\/([a-zA-Z0-9]+)(?:\.(\w+))?\)',
    # Match [title|attachment](images/hash.ext)
    r'\[([^\]]+)\]\(images\/([a-zA-Z0-9]+)(?:\.(\w+))?\)',
    # Match ![[title|dimensions](images/hash.ext)]
    r'!\[\[([^\]]+)\]\(images\/([a-zA-Z0-9]+)(?:\.(\w+))?\)\]',
    # Match ![[Screenshot title|dimensions](images/hash.jpeg)]
    r'!\[\[Screenshot[^\]]+\]\(images\/([a-zA-Z0-9]+)(?:\.(\w+))?\)\]'
    ]

    for pattern in patterns:
    markdown_text = re.sub(pattern, replace_match, markdown_text)

    return markdown_text

    def replace_match(match):
    # Handle different pattern matches
    if len(match.groups()) == 3:
    title = match.group(1)
    hash_or_path = match.group(2)
    ext = match.group(3) or ''
    else:
    # For Screenshot pattern
    hash_or_path = match.group(1)
    ext = match.group(2) or ''
    title = f'Screenshot'

    # If it's already a hex hash, use it directly
    if len(hash_or_path) == 40 and all(c in '0123456789abcdef' for c in hash_or_path.lower()):
    hex_hash = hash_or_path
    else:
    # Convert base62 hash to hex
    hex_hash = hex(base62.decode(hash_or_path))[2:].zfill(40)

    # Check if it's an image by looking for image markdown syntax
    is_image = '!' in match.group(0)

    if is_image:
    # For images, look in images directory
    return f'![{title}](images/{hex_hash}.{ext})'
    else:
    # For attachments, look in attachments directory
    return f'[{title}](attachments/{hex_hash}.{ext})'

    def clean_markdown(text):
    """Clean and format markdown content"""
    # Split into posts by the separator
    posts = text.split('-' * 25)
    cleaned_posts = []

    for post in posts:
    # Remove extra whitespace and newlines
    post = post.strip()
    if post:
    cleaned_posts.append(post)

    return cleaned_posts

    def parse_post_metadata(post):
    """Extract metadata (author, date) from post text"""
    # Look for patterns like "username - February 12, 2023 2:34pm"
    metadata_pattern = r'([^\n-]+)\s*-\s*([^\n]+)$'

    lines = post.split('\n')
    metadata_line = lines[0]
    content = '\n'.join(lines[1:])

    match = re.search(metadata_pattern, metadata_line)
    if match:
    author = match.group(1).strip()
    date_str = match.group(2).strip()
    return {
    'author': author,
    'date': date_str,
    'content': content.strip()
    }
    return None

    def format_github_markdown(topic):
    """Format a topic and its replies into a nice GitHub markdown"""
    response = requests.get(topic['markdownurl'])
    raw_content = response.text

    posts = clean_markdown(raw_content)

    # Create markdown content
    md_content = f"# {topic['title']}\n\n"
    md_content += f"*Original topic from {topic['date_time']}*\n\n"

    for i, post in enumerate(posts):
    metadata = parse_post_metadata(post)
    if metadata:
    if i == 0:
    md_content += f"### Original Post\n"
    else:
    md_content += f"### Reply #{i}\n"

    md_content += f"**Author:** {metadata['author']} \n"
    md_content += f"**Date:** {metadata['date']} \n\n"
    md_content += f"{metadata['content']}\n\n"
    md_content += "---\n\n"

    return md_content

    def main():
    # Create output directory
    output_dir = Path('forum_topics')
    output_dir.mkdir(exist_ok=True)

    # Read JSON file
    with open('iranunchained_posts.json', 'r') as f:
    topics = json.load(f)

    # Process each topic
    for topic in topics:
    # Create a safe filename from the title
    safe_title = re.sub(r'[^\w\-]', '_', topic['title'].lower())
    output_file = output_dir / f"{safe_title}.md"

    # Format and save the markdown
    try:
    markdown_content = format_github_markdown(topic)
    output_file.write_text(markdown_content)
    print(f"Processed: {topic['title']}")
    except Exception as e:
    print(f"Error processing {topic['title']}: {str(e)}")

    if __name__ == "__main__":
    main()

    # uncomment the function below to fix the image names
    # todo: it does not handle all attachments/extensions
    #process_forum_topics()