Skip to content

Instantly share code, notes, and snippets.

@gitcnd
Last active November 9, 2024 23:09
Show Gist options
  • Save gitcnd/43069fa310ff0ed3f870b06a13ef94ea to your computer and use it in GitHub Desktop.
Save gitcnd/43069fa310ff0ed3f870b06a13ef94ea to your computer and use it in GitHub Desktop.

Revisions

  1. gitcnd revised this gist Nov 9, 2024. 1 changed file with 15 additions and 38 deletions.
    53 changes: 15 additions & 38 deletions chatgpt_to_text.py
    Original file line number Diff line number Diff line change
    @@ -1,25 +1,21 @@
    #!/usr/bin/env python3

    __version__ = '1.20241107'
    __version__ = '1.20241110'

    """
    chatgpt_to_text.py program reads "conversations.json" ChatGPT export files,
    and turns them into text files suitable for uploading into a RAG AI system.
    Usage:
    mkdir txt
    cd txt
    ../chatgpt_to_text.py ../conversations.json > out.txt
    ../chatgpt_to_text.py ../conversations.json > everything_in_one_file.txt
    # Creates lots of files
    """

    import json
    import sys
    import os
    import html
    import time

    YELLOW = "\033[33m"
    RESET = "\033[0m"
    @@ -38,10 +34,14 @@ def load_conversations(filename):

    def unique_filename(base_name):
    """Generate a unique filename by appending -1, -2, etc. if a file already exists."""
    filename = f"{base_name}.txt"
    # filter out disallowed characters
    sanitized_title = "".join(char if char in set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-") else "_" for char in base_name)
    if not len(sanitized_title): sanitized_title = "new_chat"

    filename = f"{sanitized_title}.txt"
    counter = 1
    while os.path.exists(filename):
    filename = f"{base_name}-{counter}.txt"
    filename = f"{sanitized_title}-{counter}.txt"
    counter += 1
    return filename

    @@ -88,12 +88,9 @@ def extract_messages(conversation):
    def print_and_save_conversations(conversations):
    for conversation in conversations:
    title = conversation.get("title", "Untitled Conversation")
    conversation_id = conversation.get("conversation_id", "")
    if conversation_id != "": conversation_id = 'https://chatgpt.com/c/' + conversation_id + "\n"
    print(f"{YELLOW}Title: {title}{RESET}")
    print("-" * (len("Title: ") + len(title)))
    if conversation_id != "": print(conversation_id)
    else: print()
    print()

    messages = extract_messages(conversation)

    @@ -104,46 +101,26 @@ def print_and_save_conversations(conversations):
    print("\n") # Add extra space between conversations

    # Save messages to a file
    sanitized_title = title.replace(" ", "_").replace("/", "-") # Make title filename-safe
    filename = unique_filename(sanitized_title)
    #sanitized_title = title.replace(" ", "_").replace("/", "-") # Make title filename-safe
    #filename = unique_filename(sanitized_title)
    filename = unique_filename(title)

    with open(filename, 'w') as file:
    file.write(f"Title: {title}\n")
    file.write("-" * (len("Title: ") + len(title)) + "\n"+ conversation_id +"\n")
    file.write("-" * (len("Title: ") + len(title)) + "\n\n")
    for message in messages:
    file.write(message + "\n")

    print(f"Saved conversation to {filename}\n")
    update_time = float(conversation.get("update_time",0.0))
    if update_time >0:
    os.utime(filename, (update_time, update_time))


    # # For Windows, to set creation time as well:
    # try:
    # import pywin32
    # import pywintypes
    # import win32file
    # create_time = float(conversation['create_time'])
    #
    # # Set creation time
    # win32file.SetFileTime(
    # filename,
    # pywintypes.Time(create_time), # Creation time
    # None, # Access time
    # pywintypes.Time(update_time) # Modification time
    # )
    # except ImportError:
    # print("pywin32 is not installed; creation time will not be set.")


    # Ensure a filename argument is provided
    if len(sys.argv) < 2:
    print("Error: Please provide the path to the JSON file as the first argument.")
    print("Error: Please provide the path to the JSON file (e.g. conversations.json) as the first argument.")
    sys.exit(1)

    # Load the JSON and print conversations in plain text
    filename = sys.argv[1]
    conversations = load_conversations(filename)
    print_and_save_conversations(conversations)

    # the end.
  2. gitcnd created this gist Nov 7, 2024.
    149 changes: 149 additions & 0 deletions chatgpt_to_text.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,149 @@
    #!/usr/bin/env python3

    __version__ = '1.20241107'

    """
    chatgpt_to_text.py program reads "conversations.json" ChatGPT export files,
    and turns them into text files suitable for uploading into a RAG AI system.
    Usage:
    mkdir txt
    cd txt
    ../chatgpt_to_text.py ../conversations.json > out.txt
    # Creates one .txt file per conversation in the current directory
    """

    import json
    import sys
    import os
    import html
    import time

    YELLOW = "\033[33m"
    RESET = "\033[0m"

    def load_conversations(filename):
        """Load and return the parsed JSON content of an export file.

        Args:
            filename: Path to the JSON file to read (e.g. ``conversations.json``).

        Returns:
            Whatever ``json.load`` produces for the file's content.

        Prints an error message and exits the process with status 1 when
        ``filename`` is empty or does not name an existing regular file.
        """
        if not filename:
            print("Error: No input file provided.")
            sys.exit(1)
        if not os.path.isfile(filename):
            print(f"Error: File '{filename}' does not exist.")
            sys.exit(1)

        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, which would fail or mis-decode non-ASCII text on
        # systems whose default is not UTF-8.
        with open(filename, 'r', encoding='utf-8') as file:
            return json.load(file)


    def unique_filename(base_name):
        """Return "<base_name>.txt", or the first free "<base_name>-N.txt" (N = 1, 2, ...).

        A name counts as taken when ``os.path.exists`` reports that the path exists.
        """
        candidate = f"{base_name}.txt"
        attempt = 0
        # Keep bumping the numeric suffix until the path is not taken
        while os.path.exists(candidate):
            attempt += 1
            candidate = f"{base_name}-{attempt}.txt"
        return candidate


    def _format_message(message):
        """Render one message dict as a display line, or return None to skip it.

        A message is rendered only when its ``content`` is a dict whose
        ``content_type`` is ``"text"`` and whose ``parts`` is a non-empty list
        starting with a string. The text comes from that first part, stripped
        and HTML-unescaped; messages whose text ends up empty are skipped.
        """
        if not message or not isinstance(message, dict):
            return None

        content = message.get("content", {})
        if not isinstance(content, dict) or content.get("content_type") != "text":
            return None

        parts = content.get("parts")
        # A first part that is not a string cannot be stripped as text; skip
        # the message instead of raising.
        if not isinstance(parts, list) or not parts or not isinstance(parts[0], str):
            return None

        # "author" / "metadata" may be absent or null; treat both as empty.
        author_info = message.get("author")
        author = author_info.get("role") if isinstance(author_info, dict) else None
        metadata = message.get("metadata")
        if not isinstance(metadata, dict):
            metadata = {}

        circ = '🔴'  # red circle
        if author == "assistant":
            author = "ChatGPT"
            circ = '🔵'  # blue circle
        elif author == "system" and metadata.get("is_user_system_message"):
            author = "Custom user info"
        elif not author:
            author = "unknown"  # no role available to label the line with

        text = html.unescape(parts[0].strip())  # Convert HTML entities to plain text
        if not text:  # Only keep non-empty messages
            return None
        return f"{circ} {author}: {text}\n"


    def extract_messages(conversation):
        """Return the conversation's text messages as display lines, oldest first.

        Starts at ``conversation["current_node"]`` and follows each node's
        ``parent`` link through ``conversation["mapping"]``, collecting one
        formatted line per text message, then reverses the list so the result
        runs from the earliest ancestor to the current node.

        Args:
            conversation: Dict for one conversation from the export.

        Returns:
            A list of strings of the form ``"<circle> <author>: <text>"``, each
            ending with a newline. Empty when there is no ``current_node`` or
            no usable ``mapping``.
        """
        mapping = conversation.get("mapping")
        if not isinstance(mapping, dict):
            return []

        messages = []
        visited = set()  # node ids already walked, so a cyclic parent chain terminates
        current_node = conversation.get("current_node")

        while current_node and current_node not in visited:
            visited.add(current_node)
            node = mapping.get(current_node)

            # Stop when the chain points at a missing or malformed node
            if not node or not isinstance(node, dict):
                break

            line = _format_message(node.get("message"))
            if line is not None:
                messages.append(line)

            # Move to the parent node
            current_node = node.get("parent")

        return messages[::-1]  # Collected newest-first; reverse to get correct order


    def print_and_save_conversations(conversations):
        """Print each conversation to stdout and save it to its own text file.

        For every conversation this prints a highlighted title, an underline,
        the conversation URL (when a ``conversation_id`` is present) and the
        messages returned by ``extract_messages``. The same content is then
        written to a file named after the title (made unique by
        ``unique_filename``). When the conversation has a positive
        ``update_time``, the file's access and modification times are set to it.

        Args:
            conversations: Iterable of conversation dicts from the export.
        """
        for conversation in conversations:
            # .get(key, default) returns None when the key is present with a
            # JSON null, which would crash len()/replace()/float() below, so
            # fall back with "or" instead.
            title = conversation.get("title") or "Untitled Conversation"
            conversation_id = conversation.get("conversation_id") or ""
            if conversation_id != "":
                conversation_id = 'https://chatgpt.com/c/' + conversation_id + "\n"
            underline = "-" * (len("Title: ") + len(title))

            print(f"{YELLOW}Title: {title}{RESET}")
            print(underline)
            if conversation_id != "":
                print(conversation_id)
            else:
                print()

            messages = extract_messages(conversation)

            # Print messages to the screen
            for message in messages:
                print(message)

            print("\n")  # Add extra space between conversations

            # Save messages to a file
            sanitized_title = title.replace(" ", "_").replace("/", "-")  # Make title filename-safe
            filename = unique_filename(sanitized_title)

            # Messages carry emoji markers, so write UTF-8 explicitly rather
            # than relying on the platform's locale encoding.
            with open(filename, 'w', encoding='utf-8') as file:
                file.write(f"Title: {title}\n")
                file.write(underline + "\n" + conversation_id + "\n")
                for message in messages:
                    file.write(message + "\n")

            print(f"Saved conversation to {filename}\n")

            # Stamp the file with the conversation's last-update time, if any
            update_time = float(conversation.get("update_time") or 0.0)
            if update_time > 0:
                os.utime(filename, (update_time, update_time))


    # # For Windows, to set creation time as well:
    # try:
    # import pywin32
    # import pywintypes
    # import win32file
    # create_time = float(conversation['create_time'])
    #
    # # Set creation time
    # win32file.SetFileTime(
    # filename,
    # pywintypes.Time(create_time), # Creation time
    # None, # Access time
    # pywintypes.Time(update_time) # Modification time
    # )
    # except ImportError:
    # print("pywin32 is not installed; creation time will not be set.")


    # The export file's path must be given as the first command-line argument
    if not sys.argv[1:]:
        print("Error: Please provide the path to the JSON file as the first argument.")
        sys.exit(1)

    # Read the export, then print and save every conversation as plain text
    filename = sys.argv[1]
    conversations = load_conversations(filename)
    print_and_save_conversations(conversations)