#!/usr/bin/env ruby
# Downloads recent topics from a Discourse site, extracts key concepts from
# each topic with an LLM, normalizes the concept list, reclassifies topics
# against the normalized concepts, and builds graph data for visualization.

require "fileutils"
require "open-uri"
require "json"

TOPICS_DIR = File.expand_path("./topics", __dir__)
LLM_MODEL = "Gemini Flash 2.0"

Dir.chdir("/home/sam/Source/discourse")
require File.expand_path(
  "/home/sam/Source/discourse/config/environment",
  __FILE__
)

# Sends a single prompt to the configured LLM via the Discourse AI plugin.
def ask_llm(system_message, user_message)
  llm_model = LlmModel.find_by(display_name: LLM_MODEL)
  llm = llm_model.to_llm

  messages = [{ type: :user, content: user_message }]
  prompt =
    DiscourseAi::Completions::Prompt.new(system_message, messages: messages)

  llm.generate(prompt, user: Discourse.system_user)
end

def download_topic(site, topic_id)
  # Construct the URL for the topic JSON
  url = "https://#{site}/t/#{topic_id}.json"

  # Create TOPICS_DIR if it doesn't exist
  FileUtils.mkdir_p(TOPICS_DIR)

  # Define the output file path
  output_file = File.join(TOPICS_DIR, "#{topic_id}.json")

  # Download and save the topic JSON
  begin
    response = URI.open(url).read
    File.write(output_file, response)
  rescue OpenURI::HTTPError => e
    puts "Error downloading topic #{topic_id}: #{e.message}"
  rescue StandardError => e
    puts "Unexpected error: #{e.message}"
  end
end

# Pages through /latest.json until `count` topic ids have been collected.
def list_topic_ids(site, count)
  all_ids = []
  page = 0

  while all_ids.size < count
    url = "https://#{site}/latest.json?no_definitions=true&page=#{page}"
    begin
      response = URI.open(url).read
      data = JSON.parse(response)
      topics = data["topic_list"]["topics"]
      break if topics.empty?

      all_ids.concat(topics.map { |t| t["id"] })
      page += 1
    rescue OpenURI::HTTPError => e
      puts "Error fetching topic list: #{e.message}"
      break
    rescue StandardError => e
      puts "Unexpected error: #{e.message}"
      break
    end
  end

  all_ids.uniq.first(count)
end

def download_topics(site, count, only_new: true)
  topic_ids = list_topic_ids(site, count)

  topic_ids.each do |topic_id|
    output_file = File.join(TOPICS_DIR, "#{topic_id}.json")
    next if File.exist?(output_file) && only_new
    download_topic(site, topic_id)
  end
end

# Asks the LLM to extract key concepts from a downloaded topic. When
# `existing_concepts` is given, the LLM is asked to prefer those names.
def extract_concepts(topic_id, existing_concepts = nil)
  topic_file = File.join(TOPICS_DIR, "#{topic_id}.json")
  return [] unless File.exist?(topic_file)

  topic_data = JSON.parse(File.read(topic_file))
  posts = topic_data["post_stream"]["posts"]

  system_message = <<~PROMPT
    You are a concept extraction assistant. Extract key concepts from the given text.
    - Concepts should be 1-3 words
    - Return 3-5 key concepts that best represent the discussion
    - Format output as JSON array of [concept, [post_numbers]]

    Example:
    [
      ["Discourse update", [1,2]],
      ["Docker update", [3,4]]
    ]

    IMPORTANT: Only ever reply with valid JSON, do not return any other text
    IMPORTANT: Do not wrap the result with ```json or any other formatting
  PROMPT

  context =
    if existing_concepts
      "Existing concepts: #{existing_concepts.join(", ")}. Please prefer these concepts when applicable.\n\n"
    else
      ""
    end

  # Add metadata section
  metadata = []
  metadata << "Title: #{topic_data["title"]}" if topic_data["title"]
  if topic_data["category_name"]
    metadata << "Category: #{topic_data["category_name"]}"
  end
  if topic_data["tags"]&.any?
    metadata << "Tags: #{topic_data["tags"].join(", ")}"
  end

  content = [
    metadata.join("\n"),
    posts
      .map { |post| "Post ##{post["post_number"]}: #{post["cooked"]}" }
      .join("\n\n")
  ].join("\n\n")

  user_message = context + content

  retries = 0
  max_retries = 5

  begin
    result = ask_llm(system_message, user_message)
  rescue => e
    if e.message.include?("RESOURCE_EXHAUSTED") && retries < max_retries
      retries += 1
      puts "Rate limited, waiting 20 seconds... (attempt #{retries}/#{max_retries})"
      sleep 20
      retry
    else
      puts "Error after #{retries} retries: #{e.message}"
      return []
    end
  end

  # gemini likes to wrap responses in ``` fences despite the prompt
  result = result.gsub(/^```.*/, "").strip

  begin
    JSON.parse(result)
  rescue JSON::ParserError
    puts "Error parsing LLM response for topic #{topic_id}"
    puts result
    []
  end
end

# First pass: extract concepts for every downloaded topic.
def extract_all_concepts
  first_pass_dir = File.expand_path("./first_pass", __dir__)
  FileUtils.mkdir_p(first_pass_dir)

  Dir
    .glob(File.join(TOPICS_DIR, "*.json"))
    .each do |topic_file|
      topic_id = File.basename(topic_file, ".json")
      output_file = File.join(first_pass_dir, "#{topic_id}.json")

      # Skip if already processed
      next if File.exist?(output_file)

      puts "Processing topic #{topic_id}..."
      concepts = extract_concepts(topic_id)
      p concepts

      # Save concepts to file
      File.write(output_file, JSON.pretty_generate(concepts))
    end
end

# Builds a "concept: count" listing from the first-pass extractions,
# sorted by frequency in descending order.
def list_all_concepts
  first_pass_dir = File.expand_path("./first_pass", __dir__)
  concept_counts = Hash.new(0)

  Dir
    .glob(File.join(first_pass_dir, "*.json"))
    .each do |file|
      begin
        concepts = JSON.parse(File.read(file))
        concepts.each { |concept, _posts| concept_counts[concept] += 1 }
      rescue JSON::ParserError => e
        puts "Error parsing #{file}: #{e.message}"
      end
    end

  concepts = +""
  # Sort by count in descending order and format as "concept: count" lines
  concept_counts
    .sort_by { |_, count| -count }
    .each { |concept, count| concepts << "#{concept}: #{count}\n" }
  concepts
end

# Asks the LLM to merge and normalize the raw concept list down to
# `max_concepts` entries. Results are cached in normalized_concepts.json.
def normalize_concepts(max_concepts = 100)
  normalized_file = File.expand_path("./normalized_concepts.json", __dir__)

  # Return cached results if they exist
  return JSON.parse(File.read(normalized_file)) if File.exist?(normalized_file)

  concepts = list_all_concepts

  system_message = <<~PROMPT
    You are a concept normalization assistant. Given a list of concepts and their frequencies, normalize them according to these rules:
    1. Use consistent title case for all concepts
    2. Merge similar or duplicate concepts (e.g. "docker update" and "updating docker")
    3. Generalize overly specific concepts while maintaining meaning
    4. Return only the top #{max_concepts} most relevant concepts

    Format output as JSON array of [normalized_concept, count]:
    [
      ["Docker Updates", 45],
      ["Performance Optimization", 32]
    ]

    IMPORTANT: Only reply with valid JSON, no other text
    IMPORTANT: Maintain relative usage counts when merging concepts
  PROMPT

  user_message = "Here are the concepts to normalize:\n#{concepts}"

  retries = 0
  max_retries = 5

  begin
    result = ask_llm(system_message, user_message)
    result = result.gsub(/^```.*/, "").strip
    normalized = JSON.parse(result)

    # Cache the results
    File.write(normalized_file, JSON.pretty_generate(normalized))

    normalized
  rescue => e
    if e.message.include?("RESOURCE_EXHAUSTED") && retries < max_retries
      retries += 1
      puts "Rate limited, waiting 20 seconds... (attempt #{retries}/#{max_retries})"
      sleep 20
      retry
    else
      puts "Error after #{retries} retries: #{e.message}"
      []
    end
  end
end

# Second pass: re-extract concepts for every topic, steering the LLM
# toward the normalized concept names.
def reclassify_topics
  normalized_concepts = normalize_concepts(100)
  concept_names = normalized_concepts.map { |concept, _| concept }

  final_dir = File.expand_path("./final_classifications", __dir__)
  FileUtils.mkdir_p(final_dir)

  Dir
    .glob(File.join(TOPICS_DIR, "*.json"))
    .each do |topic_file|
      topic_id = File.basename(topic_file, ".json")
      output_file = File.join(final_dir, "#{topic_id}.json")

      # Skip if already processed
      next if File.exist?(output_file)

      puts "Reclassifying topic #{topic_id}..."
      concepts = extract_concepts(topic_id, concept_names)

      # Save final classification
      File.write(output_file, JSON.pretty_generate(concepts))
    end
end

# Aggregates the final classifications into per-concept counts and a
# concept -> topic ids index, saved to classification_summary.json.
def summarize_classifications
  final_dir = File.expand_path("./final_classifications", __dir__)
  concept_counts = Hash.new(0)
  topic_concepts = Hash.new { |h, k| h[k] = [] }

  Dir
    .glob(File.join(final_dir, "*.json"))
    .each do |file|
      topic_id = File.basename(file, ".json")
      begin
        concepts = JSON.parse(File.read(file))
        concepts.each do |concept, _posts|
          concept_counts[concept] += 1
          topic_concepts[concept] << topic_id
        end
      rescue JSON::ParserError => e
        puts "Error parsing #{file}: #{e.message}"
      end
    end

  summary = {
    concept_counts: concept_counts.sort_by { |_, count| -count }.to_h,
    topic_concepts: topic_concepts
  }

  File.write(
    File.expand_path("./classification_summary.json", __dir__),
    JSON.pretty_generate(summary)
  )

  summary
end

# Combines topic metadata with the final classifications into graph_data.json.
def generate_graph_data
  final_dir = File.expand_path("./final_classifications", __dir__)
  topics_data = []

  Dir
    .glob(File.join(TOPICS_DIR, "*.json"))
    .each do |topic_file|
      topic_id = File.basename(topic_file, ".json")

      # Read topic data
      topic_json = JSON.parse(File.read(topic_file))

      # Read classification data
      classification_file = File.join(final_dir, "#{topic_id}.json")
      next unless File.exist?(classification_file)
      concepts = JSON.parse(File.read(classification_file))

      # Create topic entry
      topic_entry = {
        id: topic_id,
        slug: topic_json["slug"],
        title: topic_json["title"],
        concepts: concepts.to_h # Convert the array of [concept, posts] to a hash
      }

      topics_data << topic_entry
    end

  graph_data = { topics: topics_data }

  # Save to file
  output_file = File.expand_path("./graph_data.json", __dir__)
  File.write(output_file, JSON.pretty_generate(graph_data))

  graph_data
end

# Example usage (run the pipeline steps in order):
# download_topics("meta.discourse.org", 500)
# extract_all_concepts
# list_all_concepts
# normalize_concepts(100)
# reclassify_topics
generate_graph_data
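
# For reference, a sketch of the graph_data.json file written by
# generate_graph_data above. The shape follows the topic_entry structure in
# that method; the topic id, slug, title, and post numbers here are
# illustrative placeholders, not real output.
#
#   {
#     "topics": [
#       {
#         "id": "12345",
#         "slug": "example-topic-slug",
#         "title": "Example topic title",
#         "concepts": {
#           "Docker Updates": [1, 2],
#           "Performance Optimization": [4]
#         }
#       }
#     ]
#   }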