@SamSaffron
Created February 19, 2025 04:11

classify

#!/usr/bin/env ruby
require "fileutils"
require "json"
require "open-uri"

TOPICS_DIR = File.expand_path("./topics", __dir__)
LLM_MODEL = "Gemini Flash 2.0"

Dir.chdir("/home/sam/Source/discourse")

# Load the full Discourse Rails environment so ActiveRecord models and the
# Discourse AI plugin are available.
require "/home/sam/Source/discourse/config/environment"
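
# Pipeline overview (mirrors the example usage at the bottom of this script):
#   1. download_topics      - fetch recent topics from a Discourse site as JSON
#   2. extract_all_concepts - first pass: ask the LLM for key concepts per topic
#   3. list_all_concepts    - aggregate concept frequencies across the first pass
#   4. normalize_concepts   - LLM-merge similar concepts into a top-N list
#   5. reclassify_topics    - second pass, preferring the normalized concepts
#   6. generate_graph_data  - combine topics and classifications for graphing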

def ask_llm(system_message, user_message)
  llm_model = LlmModel.find_by(display_name: LLM_MODEL)
  raise "LLM model #{LLM_MODEL.inspect} is not configured" if llm_model.nil?

  llm = llm_model.to_llm

  messages = [{ type: :user, content: user_message }]

  prompt =
    DiscourseAi::Completions::Prompt.new(system_message, messages: messages)

  llm.generate(prompt, user: Discourse.system_user)
end
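
# Note: LlmModel and DiscourseAi::Completions::Prompt come from the Discourse
# AI plugin loaded with the Rails environment above; an LLM whose display name
# matches LLM_MODEL must already be configured on the site.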

def download_topic(site, topic_id)
  # Construct the URL for the topic JSON
  url = "https://#{site}/t/#{topic_id}.json"

  # Create TOPICS_DIR if it doesn't exist
  FileUtils.mkdir_p(TOPICS_DIR)

  # Define the output file path
  output_file = File.join(TOPICS_DIR, "#{topic_id}.json")

  # Download and save the topic JSON
  begin
    response = URI.open(url).read
    File.write(output_file, response)
  rescue OpenURI::HTTPError => e
    puts "Error downloading topic #{topic_id}: #{e.message}"
  rescue StandardError => e
    puts "Unexpected error: #{e.message}"
  end
end

def list_topic_ids(site, count)
  all_ids = []
  page = 0

  while all_ids.size < count
    url = "https://#{site}/latest.json?no_definitions=true&page=#{page}"
    begin
      response = URI.open(url).read
      data = JSON.parse(response)
      topics = data["topic_list"]["topics"]
      break if topics.empty?

      all_ids.concat(topics.map { |t| t["id"] })
      page += 1
    rescue OpenURI::HTTPError => e
      puts "Error fetching topic list: #{e.message}"
      break
    rescue StandardError => e
      puts "Unexpected error: #{e.message}"
      break
    end
  end

  all_ids.uniq.first(count)
end

def download_topics(site, count, only_new: true)
  topic_ids = list_topic_ids(site, count)
  topic_ids.each do |topic_id|
    output_file = File.join(TOPICS_DIR, "#{topic_id}.json")
    next if File.exist?(output_file) && only_new

    download_topic(site, topic_id)
  end
end

def extract_concepts(topic_id, existing_concepts = nil)
  topic_file = File.join(TOPICS_DIR, "#{topic_id}.json")
  return [] unless File.exist?(topic_file)

  topic_data = JSON.parse(File.read(topic_file))
  posts = topic_data["post_stream"]["posts"]

  system_message = <<~PROMPT
    You are a concept extraction assistant. Extract key concepts from the given text.
    - Concepts should be 1-3 words
    - Return 3-5 key concepts that best represent the discussion
    - Format output as JSON array of [concept, [post_numbers]]
    Example:
    [
      ["Discourse update", [1,2]],
      ["Docker update", [3,4]]
    ]
    IMPORTANT: Only ever reply with valid JSON, do not return any other text
    IMPORTANT: Do not wrap the result with ```json or any other formatting
  PROMPT

  context =
    if existing_concepts
      "Existing concepts: #{existing_concepts.join(", ")}. Please prefer these concepts when applicable.\n\n"
    else
      ""
    end

  # Build a metadata section from the topic title, category, and tags
  metadata = []
  metadata << "Title: #{topic_data["title"]}" if topic_data["title"]
  if topic_data["category_name"]
    metadata << "Category: #{topic_data["category_name"]}"
  end
  if topic_data["tags"]&.any?
    metadata << "Tags: #{topic_data["tags"].join(", ")}"
  end

  content = [
    metadata.join("\n"),
    posts
      .map { |post| "Post ##{post["post_number"]}: #{post["cooked"]}" }
      .join("\n\n")
  ].join("\n\n")

  user_message = context + content

  retries = 0
  max_retries = 5

  begin
    result = ask_llm(system_message, user_message)
  rescue => e
    if e.message.include?("RESOURCE_EXHAUSTED") && retries < max_retries
      retries += 1
      puts "Rate limited, waiting 20 seconds... (attempt #{retries}/#{max_retries})"
      sleep 20
      retry
    else
      puts "Error after #{retries} retries: #{e.message}"
      return []
    end
  end

  # Gemini tends to wrap responses in ``` fences despite the prompt; strip them
  result = result.gsub(/^```.*/, "").strip

  begin
    JSON.parse(result)
  rescue JSON::ParserError
    puts "Error parsing LLM response for topic #{topic_id}"
    puts result
    []
  end
end
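
# extract_concepts returns the parsed JSON, i.e. an array of
# [concept, [post_numbers]] pairs (or [] on failure), e.g. (illustrative values):
#   [["Docker update", [1, 3]], ["Backup restore", [2]]]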

def extract_all_concepts
  first_pass_dir = File.expand_path("./first_pass", __dir__)
  FileUtils.mkdir_p(first_pass_dir)

  Dir
    .glob(File.join(TOPICS_DIR, "*.json"))
    .each do |topic_file|
      topic_id = File.basename(topic_file, ".json")
      output_file = File.join(first_pass_dir, "#{topic_id}.json")

      # Skip if already processed
      next if File.exist?(output_file)

      puts "Processing topic #{topic_id}..."
      concepts = extract_concepts(topic_id)
      p concepts

      # Save concepts to file
      File.write(output_file, JSON.pretty_generate(concepts))
    end
end

def list_all_concepts
  first_pass_dir = File.expand_path("./first_pass", __dir__)
  concept_counts = Hash.new(0)

  Dir
    .glob(File.join(first_pass_dir, "*.json"))
    .each do |file|
      begin
        concepts = JSON.parse(File.read(file))
        concepts.each { |concept, _posts| concept_counts[concept] += 1 }
      rescue JSON::ParserError => e
        puts "Error parsing #{file}: #{e.message}"
      end
    end

  result = +""
  # Sort by count in descending order and render as "concept: count" lines
  concept_counts
    .sort_by { |_, count| -count }
    .each { |concept, count| result << "#{concept}: #{count}\n" }

  result
end
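
# Produces a newline-separated "concept: count" listing sorted by frequency,
# e.g. (illustrative values):
#   Docker Updates: 45
#   Performance Optimization: 32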

def normalize_concepts(max_concepts = 100)
  normalized_file = File.expand_path("./normalized_concepts.json", __dir__)

  # Return cached results if they exist
  return JSON.parse(File.read(normalized_file)) if File.exist?(normalized_file)

  concepts = list_all_concepts

  system_message = <<~PROMPT
    You are a concept normalization assistant. Given a list of concepts and their frequencies,
    normalize them according to these rules:
    1. Use consistent title case for all concepts
    2. Merge similar or duplicate concepts (e.g. "docker update" and "updating docker")
    3. Generalize overly specific concepts while maintaining meaning
    4. Return only the top #{max_concepts} most relevant concepts
    Format output as JSON array of [normalized_concept, count]:
    [
      ["Docker Updates", 45],
      ["Performance Optimization", 32]
    ]
    IMPORTANT: Only reply with valid JSON, no other text
    IMPORTANT: Maintain relative usage counts when merging concepts
  PROMPT

  user_message = "Here are the concepts to normalize:\n#{concepts}"

  retries = 0
  max_retries = 5

  begin
    result = ask_llm(system_message, user_message)
    result = result.gsub(/^```.*/, "").strip

    normalized = JSON.parse(result)
    # Cache the results
    File.write(normalized_file, JSON.pretty_generate(normalized))
    normalized
  rescue => e
    if e.message.include?("RESOURCE_EXHAUSTED") && retries < max_retries
      retries += 1
      puts "Rate limited, waiting 20 seconds... (attempt #{retries}/#{max_retries})"
      sleep 20
      retry
    else
      puts "Error after #{retries} retries: #{e.message}"
      []
    end
  end
end
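
# Results are cached in normalized_concepts.json; delete that file to force a
# fresh normalization pass.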

def reclassify_topics
  normalized_concepts = normalize_concepts(100)
  concept_names = normalized_concepts.map { |concept, _| concept }
  final_dir = File.expand_path("./final_classifications", __dir__)
  FileUtils.mkdir_p(final_dir)

  Dir
    .glob(File.join(TOPICS_DIR, "*.json"))
    .each do |topic_file|
      topic_id = File.basename(topic_file, ".json")
      output_file = File.join(final_dir, "#{topic_id}.json")

      # Skip if already processed
      next if File.exist?(output_file)

      puts "Reclassifying topic #{topic_id}..."
      concepts = extract_concepts(topic_id, concept_names)

      # Save final classification
      File.write(output_file, JSON.pretty_generate(concepts))
    end
end

def summarize_classifications
  final_dir = File.expand_path("./final_classifications", __dir__)
  concept_counts = Hash.new(0)
  topic_concepts = Hash.new { |h, k| h[k] = [] }

  Dir
    .glob(File.join(final_dir, "*.json"))
    .each do |file|
      topic_id = File.basename(file, ".json")
      begin
        concepts = JSON.parse(File.read(file))
        concepts.each do |concept, _posts|
          concept_counts[concept] += 1
          topic_concepts[concept] << topic_id
        end
      rescue JSON::ParserError => e
        puts "Error parsing #{file}: #{e.message}"
      end
    end

  summary = {
    concept_counts: concept_counts.sort_by { |_, count| -count }.to_h,
    topic_concepts: topic_concepts
  }

  File.write(
    File.expand_path("./classification_summary.json", __dir__),
    JSON.pretty_generate(summary)
  )

  summary
end

def generate_graph_data
  final_dir = File.expand_path("./final_classifications", __dir__)
  topics_data = []

  Dir
    .glob(File.join(TOPICS_DIR, "*.json"))
    .each do |topic_file|
      topic_id = File.basename(topic_file, ".json")

      # Read topic data
      topic_json = JSON.parse(File.read(topic_file))

      # Read classification data
      classification_file = File.join(final_dir, "#{topic_id}.json")
      next unless File.exist?(classification_file)

      concepts = JSON.parse(File.read(classification_file))

      # Create topic entry
      topic_entry = {
        id: topic_id,
        slug: topic_json["slug"],
        title: topic_json["title"],
        concepts: concepts.to_h # Convert the array of [concept, posts] pairs to a hash
      }

      topics_data << topic_entry
    end

  graph_data = { topics: topics_data }

  # Save to file
  output_file = File.expand_path("./graph_data.json", __dir__)
  File.write(output_file, JSON.pretty_generate(graph_data))

  graph_data
end
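
# graph_data.json ends up shaped roughly like this (illustrative values):
#   {
#     "topics": [
#       { "id": "123", "slug": "a-topic", "title": "A topic",
#         "concepts": { "Docker Updates": [1, 2] } }
#     ]
#   }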

# Example usage:
# download_topics("meta.discourse.org", 500)
# extract_all_concepts
# list_all_concepts
# normalize_concepts(100)
# reclassify_topics
generate_graph_data