gitcnd · November 9, 2024 23:09 · Nov 9, 2024 · Nov 7, 2024
diff --git a/chatgpt_to_text.py b/chatgpt_to_text.py
@@ -1,25 +1,21 @@
 #!/usr/bin/env python3
 
-__version__ = '1.20241107'
+__version__ = '1.20241110'
 
 """
 chatgpt_to_text.py program reads "conversations.json" ChatGPT export files,
 and turns them into text files suitable for uploading into a RAG AI system.
-
 Usage:
-
     mkdir txt
     cd txt
-    ../chatgpt_to_text.py ../conversations.json > out.txt
+    ../chatgpt_to_text.py ../conversations.json > everything_in_one_file.txt
     # Creates lots of files
-
 """
 
 import json
 import sys
 import os
 import html
-import time
 
 YELLOW = "\033[33m"
 RESET = "\033[0m"
@@ -38,10 +34,14 @@ def load_conversations(filename):
 
 def unique_filename(base_name):
-    """Generate a unique filename by appending -1, -2, etc. if a file already exists."""
-    filename = f"{base_name}.txt"
+    """Return an unused "<name>.txt", where <name> is base_name sanitized; appends -1, -2, etc. if taken."""
+    # Build the allow-list once; any character outside [A-Za-z0-9_-] is replaced with "_".
+    allowed = set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-")
+    # A missing (None) or empty title falls back to "new_chat" rather than yielding ".txt".
+    sanitized_title = "".join(char if char in allowed else "_" for char in (base_name or "")) or "new_chat"
+    filename = f"{sanitized_title}.txt"
     counter = 1
     while os.path.exists(filename):
-        filename = f"{base_name}-{counter}.txt"
+        filename = f"{sanitized_title}-{counter}.txt"
         counter += 1
     return filename
 
@@ -88,12 +88,9 @@ def extract_messages(conversation):
 def print_and_save_conversations(conversations):
     for conversation in conversations:
         title = conversation.get("title", "Untitled Conversation")
-        conversation_id = conversation.get("conversation_id", "")
-        if conversation_id != "": conversation_id = 'https://chatgpt.com/c/' + conversation_id + "\n"
         print(f"{YELLOW}Title: {title}{RESET}")
         print("-" * (len("Title: ") + len(title)))
-        if conversation_id != "": print(conversation_id)
-        else: print()
+        print()
 
         messages = extract_messages(conversation)
 
@@ -104,46 +101,26 @@ def print_and_save_conversations(conversations):
         print("\n")  # Add extra space between conversations
 
         # Save messages to a file
-        sanitized_title = title.replace(" ", "_").replace("/", "-")  # Make title filename-safe
-        filename = unique_filename(sanitized_title)
+        # unique_filename() now sanitizes the title itself: characters outside [A-Za-z0-9_-] become "_",
+        # so spaces and "/" no longer need to be replaced here before building the filename.
+        filename = unique_filename(title)
 
         with open(filename, 'w') as file:
             file.write(f"Title: {title}\n")
-            file.write("-" * (len("Title: ") + len(title)) + "\n"+ conversation_id +"\n")
+            file.write("-" * (len("Title: ") + len(title)) + "\n\n")
             for message in messages:
                 file.write(message + "\n")
 
         print(f"Saved conversation to {filename}\n")
-        update_time = float(conversation.get("update_time",0.0))
-        if update_time >0:
-            os.utime(filename, (update_time, update_time))
-
-
-        # # For Windows, to set creation time as well:
-        # try:
-        #     import pywin32
-        #     import pywintypes
-        #     import win32file
-        #     create_time = float(conversation['create_time'])
-        # 
-        #     # Set creation time
-        #     win32file.SetFileTime(
-        #         filename,
-        #         pywintypes.Time(create_time),   # Creation time
-        #         None,                           # Access time
-        #         pywintypes.Time(update_time)    # Modification time
-        #     )
-        # except ImportError:
-        #     print("pywin32 is not installed; creation time will not be set.")
 
 
 # Ensure a filename argument is provided
 if len(sys.argv) < 2:
-    print("Error: Please provide the path to the JSON file as the first argument.")
+    print("Error: Please provide the path to the JSON file (e.g. conversations.json) as the first argument.")
     sys.exit(1)
 
-# Load the JSON and print conversations in plain text
 filename = sys.argv[1]
 conversations = load_conversations(filename)
 print_and_save_conversations(conversations)
 
+# the end.
diff --git a/chatgpt_to_text.py b/chatgpt_to_text.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+
+__version__ = '1.20241107'
+
+"""
+chatgpt_to_text.py program reads "conversations.json" ChatGPT export files,
+and turns them into text files suitable for uploading into a RAG AI system.
+
+Usage:
+
+    mkdir txt
+    cd txt
+    ../chatgpt_to_text.py ../conversations.json > out.txt
+    # Creates lots of files
+
+"""
+
+import json
+import sys
+import os
+import html
+import time
+
+YELLOW = "\033[33m"
+RESET = "\033[0m"
+
+def load_conversations(filename):
+    """Return the parsed JSON from `filename`; print an error and exit with status 1 if it is empty or not a file."""
+    if not filename:
+        print("Error: No input file provided.")
+        sys.exit(1)
+    if not os.path.isfile(filename):
+        print(f"Error: File '{filename}' does not exist.")
+        sys.exit(1)
+    with open(filename, 'r', encoding='utf-8') as file:  # JSON is UTF-8; don't depend on the locale encoding
+        return json.load(file)
+
+
+def unique_filename(base_name):
+    """Generate a unique filename by appending -1, -2, etc. if a file already exists."""
+    filename = f"{base_name}.txt"
+    counter = 1
+    while os.path.exists(filename):
+        filename = f"{base_name}-{counter}.txt"
+        counter += 1
+    return filename
+
+
+def extract_messages(conversation):
+    """Return the conversation's text messages as "<circle> <author>: <text>" strings, root message first.
+
+    Starts at conversation["current_node"] and follows each node's "parent" id through
+    conversation["mapping"]; only messages whose content_type is "text" are kept.
+    """
+    messages = []
+    mapping = conversation.get("mapping") or {}  # tolerate a missing/null mapping
+    current_node = conversation.get("current_node")
+
+    while current_node:
+        node = mapping.get(current_node)
+        # Stop at a dangling parent id or a malformed node.
+        if not node or not isinstance(node, dict):
+            break
+
+        message = node.get("message")
+        message_content = message.get("content", {}) if isinstance(message, dict) else None
+        if (
+            isinstance(message_content, dict) and
+            message_content.get("content_type") == "text" and
+            message_content.get("parts")
+        ):
+            author = (message.get("author") or {}).get("role", "unknown")
+            circ = '🔴'  # red circle for every author except the assistant
+            if author == "assistant":
+                author, circ = "ChatGPT", '🔵'  # blue circle
+            elif author == "system" and (message.get("metadata") or {}).get("is_user_system_message"):
+                author = "Custom user info"
+            first_part = message_content["parts"][0]
+            # Only the first part is used; a non-string part is skipped instead of crashing on .strip().
+            text = html.unescape(first_part.strip()) if isinstance(first_part, str) else ""
+            if text:  # Only add non-empty messages
+                messages.append(f"{circ} {author}: {text}\n")
+
+        current_node = node.get("parent")  # move to the parent node
+    return messages[::-1]  # Reverse to get correct order
+
+
+def print_and_save_conversations(conversations):
+    for conversation in conversations:
+        title = conversation.get("title", "Untitled Conversation")
+        conversation_id = conversation.get("conversation_id", "")
+        if conversation_id != "": conversation_id = 'https://chatgpt.com/c/' + conversation_id + "\n"
+        print(f"{YELLOW}Title: {title}{RESET}")
+        print("-" * (len("Title: ") + len(title)))
+        if conversation_id != "": print(conversation_id)
+        else: print()
+
+        messages = extract_messages(conversation)
+
+        # Print messages to the screen
+        for message in messages:
+            print(message)
+
+        print("\n")  # Add extra space between conversations
+
+        # Save messages to a file
+        sanitized_title = title.replace(" ", "_").replace("/", "-")  # Make title filename-safe
+        filename = unique_filename(sanitized_title)
+
+        with open(filename, 'w') as file:
+            file.write(f"Title: {title}\n")
+            file.write("-" * (len("Title: ") + len(title)) + "\n"+ conversation_id +"\n")
+            for message in messages:
+                file.write(message + "\n")
+
+        print(f"Saved conversation to {filename}\n")
+        update_time = float(conversation.get("update_time",0.0))
+        if update_time >0:
+            os.utime(filename, (update_time, update_time))
+
+
+        # # For Windows, to set creation time as well:
+        # try:
+        #     import pywin32
+        #     import pywintypes
+        #     import win32file
+        #     create_time = float(conversation['create_time'])
+        # 
+        #     # Set creation time
+        #     win32file.SetFileTime(
+        #         filename,
+        #         pywintypes.Time(create_time),   # Creation time
+        #         None,                           # Access time
+        #         pywintypes.Time(update_time)    # Modification time
+        #     )
+        # except ImportError:
+        #     print("pywin32 is not installed; creation time will not be set.")
+
+
+# Ensure a filename argument is provided
+if len(sys.argv) < 2:
+    print("Error: Please provide the path to the JSON file as the first argument.")
+    sys.exit(1)
+
+# Load the JSON and print conversations in plain text
+filename = sys.argv[1]
+conversations = load_conversations(filename)
+print_and_save_conversations(conversations)
+