Last active
November 9, 2024 23:09
-
-
Save gitcnd/43069fa310ff0ed3f870b06a13ef94ea to your computer and use it in GitHub Desktop.
Revisions
-
gitcnd revised this gist
Nov 9, 2024. 1 changed file with 15 additions and 38 deletions. There are no files selected for viewing.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,25 +1,21 @@ #!/usr/bin/env python3 __version__ = '1.20241110' """ chatgpt_to_text.py program reads "conversations.json" ChatGPT export files, and turns them into text files suitable for uploading into a RAG AI system. Usage: mkdir txt cd txt ../chatgpt_to_text.py ../conversations.json > everything_in_one_file.txt # Creates lots of files """ import json import sys import os import html YELLOW = "\033[33m" RESET = "\033[0m" @@ -38,10 +34,14 @@ def load_conversations(filename): def unique_filename(base_name): """Generate a unique filename by appending -1, -2, etc. if a file already exists.""" # filter out disallowed characters sanitized_title = "".join(char if char in set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_-") else "_" for char in base_name) if not len(sanitized_title): sanitized_title = "new_chat" filename = f"{sanitized_title}.txt" counter = 1 while os.path.exists(filename): filename = f"{sanitized_title}-{counter}.txt" counter += 1 return filename @@ -88,12 +88,9 @@ def extract_messages(conversation): def print_and_save_conversations(conversations): for conversation in conversations: title = conversation.get("title", "Untitled Conversation") print(f"{YELLOW}Title: {title}{RESET}") print("-" * (len("Title: ") + len(title))) print() messages = extract_messages(conversation) @@ -104,46 +101,26 @@ def print_and_save_conversations(conversations): print("\n") # Add extra space between conversations # Save messages to a file #sanitized_title = title.replace(" ", "_").replace("/", "-") # Make title filename-safe #filename = unique_filename(sanitized_title) filename = unique_filename(title) with open(filename, 'w') as file: 
file.write(f"Title: {title}\n") file.write("-" * (len("Title: ") + len(title)) + "\n\n") for message in messages: file.write(message + "\n") print(f"Saved conversation to {filename}\n") # Ensure a filename argument is provided if len(sys.argv) < 2: print("Error: Please provide the path to the JSON file (e.g. conversations.json) as the first argument.") sys.exit(1) filename = sys.argv[1] conversations = load_conversations(filename) print_and_save_conversations(conversations) # the end. -
gitcnd created this gist
Nov 7, 2024. There are no files selected for viewing.
#!/usr/bin/env python3
"""
chatgpt_to_text.py

Read a ChatGPT "conversations.json" export file and turn it into text files
suitable for uploading into a RAG AI system.

Every conversation is printed to stdout and also saved to its own ".txt"
file in the current working directory.

Usage:
    mkdir txt
    cd txt
    ../chatgpt_to_text.py ../conversations.json > out.txt  # Creates lots of files
"""

__version__ = '1.20241107'

import json
import sys
import os
import html
import time  # noqa: F401 -- not used below; kept so the import list is unchanged

YELLOW = "\033[33m"
RESET = "\033[0m"

# Characters that cannot appear in a file name on at least one common
# platform (path separators, Windows-reserved punctuation, control chars).
# Each one is mapped to "_" by unique_filename().
_UNSAFE_FILENAME_CHARS = str.maketrans(
    {char: "_" for char in '<>:"/\\|?*' + "".join(map(chr, range(32)))}
)


def load_conversations(filename):
    """Load and return the parsed JSON content of *filename*.

    Prints an error to stderr and exits with status 1 when *filename* is
    empty or is not an existing file.  JSON decoding errors propagate to
    the caller.
    """
    # Errors go to stderr: the documented usage redirects stdout to a file,
    # so a message printed to stdout would never be seen by the user.
    if not filename:
        print("Error: No input file provided.", file=sys.stderr)
        sys.exit(1)

    if not os.path.isfile(filename):
        print(f"Error: File '{filename}' does not exist.", file=sys.stderr)
        sys.exit(1)

    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # cannot decode arbitrary non-ASCII text.
    with open(filename, 'r', encoding="utf-8") as file:
        return json.load(file)


def unique_filename(base_name):
    """Return a ".txt" file name for *base_name* that does not exist yet.

    Characters that are unsafe in file names are replaced with "_", an empty
    name falls back to "new_chat", and "-1", "-2", ... is appended until the
    name does not collide with an existing path in the current directory.
    """
    safe_name = str(base_name).translate(_UNSAFE_FILENAME_CHARS)
    if not safe_name:
        safe_name = "new_chat"

    filename = f"{safe_name}.txt"
    counter = 1
    while os.path.exists(filename):
        filename = f"{safe_name}-{counter}.txt"
        counter += 1
    return filename


def _format_message(message):
    """Return the printable line for one message dict, or None to skip it.

    Only messages whose content has content_type "text" and a non-empty
    "parts" list are rendered.
    """
    if not isinstance(message, dict):
        return None

    content = message.get("content", {})
    if not isinstance(content, dict):
        return None
    if content.get("content_type") != "text":
        return None
    parts = content.get("parts")
    if not parts or not isinstance(parts, (list, tuple)):
        return None

    # Tolerate a missing/null author record instead of raising KeyError.
    author = (message.get("author") or {}).get("role") or "unknown"
    circ = '🔴'  # red circle
    if author == "assistant":
        author = "ChatGPT"
        circ = '🔵'  # blue circle
    elif author == "system" and (message.get("metadata") or {}).get("is_user_system_message"):
        author = "Custom user info"

    # Keep every string part (not just the first) and ignore non-string
    # parts, which have no .strip() and cannot be rendered as text.
    text = "\n".join(part for part in parts if isinstance(part, str)).strip()
    text = html.unescape(text)  # Convert HTML entities to plain text
    if not text:  # Only keep non-empty messages
        return None
    return f"{circ} {author}: {text}\n"


def extract_messages(conversation):
    """Return the conversation's text messages, oldest first.

    Starts at the node named by "current_node" and follows each node's
    "parent" link through "mapping", then reverses the collected lines.
    Each returned string has the form "<circle> <author>: <text>\\n".
    """
    mapping = conversation.get("mapping")
    if not isinstance(mapping, dict):
        mapping = {}

    messages = []
    visited = set()  # guards against a parent cycle looping forever
    current_node = conversation.get("current_node")

    while current_node and current_node not in visited:
        visited.add(current_node)
        node = mapping.get(current_node)

        # Stop when the node is missing or is not a dictionary.
        if not node or not isinstance(node, dict):
            break

        line = _format_message(node.get("message"))
        if line:
            messages.append(line)

        # Move to the parent node.
        current_node = node.get("parent")

    return messages[::-1]  # Reverse to get chronological order


def print_and_save_conversations(conversations):
    """Print every conversation to stdout and save each to its own file.

    Files are created in the current working directory, named after the
    conversation title.  When the conversation has a positive "update_time",
    the file's access and modification times are set to it.
    """
    for conversation in conversations:
        # "or" also covers an explicit null title / id in the export.
        title = str(conversation.get("title") or "Untitled Conversation")
        conversation_id = conversation.get("conversation_id") or ""
        url_line = f"https://chatgpt.com/c/{conversation_id}\n" if conversation_id else ""
        underline = "-" * (len("Title: ") + len(title))

        print(f"{YELLOW}Title: {title}{RESET}")
        print(underline)
        print(url_line)  # the URL followed by a blank line, or just a blank line

        messages = extract_messages(conversation)

        # Print messages to the screen
        for message in messages:
            print(message)
        print("\n")  # Add extra space between conversations

        # Save messages to a file
        sanitized_title = title.replace(" ", "_").replace("/", "-")
        filename = unique_filename(sanitized_title)
        # Explicit UTF-8 so the emoji markers can be written on any platform.
        with open(filename, 'w', encoding="utf-8") as file:
            file.write(f"Title: {title}\n")
            file.write(underline + "\n" + url_line + "\n")
            for message in messages:
                file.write(message + "\n")

        print(f"Saved conversation to {filename}\n")

        try:
            update_time = float(conversation.get("update_time") or 0.0)
        except (TypeError, ValueError):
            update_time = 0.0
        if update_time > 0:
            os.utime(filename, (update_time, update_time))
        # NOTE: the file creation time is not set from "create_time".
        # Doing so on Windows would need a platform-specific API
        # (e.g. pywin32) -- not implemented here.


def main(argv=None):
    """Command-line entry point: convert the export file named in argv[1]."""
    argv = sys.argv if argv is None else argv

    # Ensure a filename argument is provided
    if len(argv) < 2:
        print("Error: Please provide the path to the JSON file as the first argument.",
              file=sys.stderr)
        sys.exit(1)

    # stdout is normally redirected to a file; force UTF-8 so the emoji
    # markers do not raise UnicodeEncodeError under a legacy locale encoding.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")

    # Load the JSON and print conversations in plain text
    conversations = load_conversations(argv[1])
    print_and_save_conversations(conversations)


if __name__ == "__main__":
    main()