@SamSaffron
Created February 19, 2025 04:11

classify

#!/usr/bin/env ruby
require "fileutils"
require "json"
require "open-uri"

TOPICS_DIR = File.expand_path("./topics", __dir__)
LLM_MODEL = "Gemini Flash 2.0"

Dir.chdir("/home/sam/Source/discourse")

# Load the full Discourse Rails environment so ActiveRecord models and the
# Discourse AI plugin are available.
require "/home/sam/Source/discourse/config/environment"
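
# Pipeline overview (mirrors the example usage at the bottom of this script):
#   1. download_topics      - fetch recent topics from a Discourse site as JSON
#   2. extract_all_concepts - first pass: ask the LLM for key concepts per topic
#   3. list_all_concepts    - aggregate concept frequencies across the first pass
#   4. normalize_concepts   - LLM-merge similar concepts into a top-N list
#   5. reclassify_topics    - second pass, preferring the normalized concepts
#   6. generate_graph_data  - combine topics and classifications for graphing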

def ask_llm(system_message, user_message)
  llm_model = LlmModel.find_by(display_name: LLM_MODEL)
  raise "LLM model #{LLM_MODEL.inspect} is not configured" if llm_model.nil?

  llm = llm_model.to_llm

  messages = [{ type: :user, content: user_message }]

  prompt =
    DiscourseAi::Completions::Prompt.new(system_message, messages: messages)

  llm.generate(prompt, user: Discourse.system_user)
end
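
# Note: LlmModel and DiscourseAi::Completions::Prompt come from the Discourse
# AI plugin loaded with the Rails environment above; an LLM whose display name
# matches LLM_MODEL must already be configured on the site.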

def download_topic(site, topic_id)
  # Construct the URL for the topic JSON
  url = "https://#{site}/t/#{topic_id}.json"

  # Create TOPICS_DIR if it doesn't exist
  FileUtils.mkdir_p(TOPICS_DIR)

  # Define the output file path
  output_file = File.join(TOPICS_DIR, "#{topic_id}.json")

  # Download and save the topic JSON
  begin
    response = URI.open(url).read
    File.write(output_file, response)
  rescue OpenURI::HTTPError => e
    puts "Error downloading topic #{topic_id}: #{e.message}"
  rescue StandardError => e
    puts "Unexpected error: #{e.message}"
  end
end

def list_topic_ids(site, count)
  all_ids = []
  page = 0

  while all_ids.size < count
    url = "https://#{site}/latest.json?no_definitions=true&page=#{page}"
    begin
      response = URI.open(url).read
      data = JSON.parse(response)
      topics = data["topic_list"]["topics"]
      break if topics.empty?

      all_ids.concat(topics.map { |t| t["id"] })
      page += 1
    rescue OpenURI::HTTPError => e
      puts "Error fetching topic list: #{e.message}"
      break
    rescue StandardError => e
      puts "Unexpected error: #{e.message}"
      break
    end
  end

  all_ids.uniq.first(count)
end

def download_topics(site, count, only_new: true)
  topic_ids = list_topic_ids(site, count)
  topic_ids.each do |topic_id|
    output_file = File.join(TOPICS_DIR, "#{topic_id}.json")
    next if File.exist?(output_file) && only_new

    download_topic(site, topic_id)
  end
end

def extract_concepts(topic_id, existing_concepts = nil)
  topic_file = File.join(TOPICS_DIR, "#{topic_id}.json")
  return [] unless File.exist?(topic_file)

  topic_data = JSON.parse(File.read(topic_file))
  posts = topic_data["post_stream"]["posts"]

  system_message = <<~PROMPT
    You are a concept extraction assistant. Extract key concepts from the given text.
    - Concepts should be 1-3 words
    - Return 3-5 key concepts that best represent the discussion
    - Format output as JSON array of [concept, [post_numbers]]
    Example:
    [
      ["Discourse update", [1,2]],
      ["Docker update", [3,4]]
    ]
    IMPORTANT: Only ever reply with valid JSON, do not return any other text
    IMPORTANT: Do not wrap the result with ```json or any other formatting
  PROMPT

  context =
    if existing_concepts
      "Existing concepts: #{existing_concepts.join(", ")}. Please prefer these concepts when applicable.\n\n"
    else
      ""
    end

  # Build a metadata section from the topic title, category, and tags
  metadata = []
  metadata << "Title: #{topic_data["title"]}" if topic_data["title"]
  if topic_data["category_name"]
    metadata << "Category: #{topic_data["category_name"]}"
  end
  if topic_data["tags"]&.any?
    metadata << "Tags: #{topic_data["tags"].join(", ")}"
  end

  content = [
    metadata.join("\n"),
    posts
      .map { |post| "Post ##{post["post_number"]}: #{post["cooked"]}" }
      .join("\n\n")
  ].join("\n\n")

  user_message = context + content

  retries = 0
  max_retries = 5

  begin
    result = ask_llm(system_message, user_message)
  rescue => e
    if e.message.include?("RESOURCE_EXHAUSTED") && retries < max_retries
      retries += 1
      puts "Rate limited, waiting 20 seconds... (attempt #{retries}/#{max_retries})"
      sleep 20
      retry
    else
      puts "Error after #{retries} retries: #{e.message}"
      return []
    end
  end

  # Gemini tends to wrap responses in ``` fences despite the prompt; strip them
  result = result.gsub(/^```.*/, "").strip

  begin
    JSON.parse(result)
  rescue JSON::ParserError
    puts "Error parsing LLM response for topic #{topic_id}"
    puts result
    []
  end
end
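
# extract_concepts returns the parsed JSON, i.e. an array of
# [concept, [post_numbers]] pairs (or [] on failure), e.g. (illustrative values):
#   [["Docker update", [1, 3]], ["Backup restore", [2]]]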

def extract_all_concepts
  first_pass_dir = File.expand_path("./first_pass", __dir__)
  FileUtils.mkdir_p(first_pass_dir)

  Dir
    .glob(File.join(TOPICS_DIR, "*.json"))
    .each do |topic_file|
      topic_id = File.basename(topic_file, ".json")
      output_file = File.join(first_pass_dir, "#{topic_id}.json")

      # Skip if already processed
      next if File.exist?(output_file)

      puts "Processing topic #{topic_id}..."
      concepts = extract_concepts(topic_id)
      p concepts

      # Save concepts to file
      File.write(output_file, JSON.pretty_generate(concepts))
    end
end

def list_all_concepts
  first_pass_dir = File.expand_path("./first_pass", __dir__)
  concept_counts = Hash.new(0)

  Dir
    .glob(File.join(first_pass_dir, "*.json"))
    .each do |file|
      begin
        concepts = JSON.parse(File.read(file))
        concepts.each { |concept, _posts| concept_counts[concept] += 1 }
      rescue JSON::ParserError => e
        puts "Error parsing #{file}: #{e.message}"
      end
    end

  result = +""
  # Sort by count in descending order and render as "concept: count" lines
  concept_counts
    .sort_by { |_, count| -count }
    .each { |concept, count| result << "#{concept}: #{count}\n" }

  result
end
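
# Produces a newline-separated "concept: count" listing sorted by frequency,
# e.g. (illustrative values):
#   Docker Updates: 45
#   Performance Optimization: 32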

def normalize_concepts(max_concepts = 100)
  normalized_file = File.expand_path("./normalized_concepts.json", __dir__)

  # Return cached results if they exist
  return JSON.parse(File.read(normalized_file)) if File.exist?(normalized_file)

  concepts = list_all_concepts

  system_message = <<~PROMPT
    You are a concept normalization assistant. Given a list of concepts and their frequencies,
    normalize them according to these rules:
    1. Use consistent title case for all concepts
    2. Merge similar or duplicate concepts (e.g. "docker update" and "updating docker")
    3. Generalize overly specific concepts while maintaining meaning
    4. Return only the top #{max_concepts} most relevant concepts
    Format output as JSON array of [normalized_concept, count]:
    [
      ["Docker Updates", 45],
      ["Performance Optimization", 32]
    ]
    IMPORTANT: Only reply with valid JSON, no other text
    IMPORTANT: Maintain relative usage counts when merging concepts
  PROMPT

  user_message = "Here are the concepts to normalize:\n#{concepts}"

  retries = 0
  max_retries = 5

  begin
    result = ask_llm(system_message, user_message)
    result = result.gsub(/^```.*/, "").strip

    normalized = JSON.parse(result)
    # Cache the results
    File.write(normalized_file, JSON.pretty_generate(normalized))
    normalized
  rescue => e
    if e.message.include?("RESOURCE_EXHAUSTED") && retries < max_retries
      retries += 1
      puts "Rate limited, waiting 20 seconds... (attempt #{retries}/#{max_retries})"
      sleep 20
      retry
    else
      puts "Error after #{retries} retries: #{e.message}"
      []
    end
  end
end
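
# Results are cached in normalized_concepts.json; delete that file to force a
# fresh normalization pass.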

def reclassify_topics
  normalized_concepts = normalize_concepts(100)
  concept_names = normalized_concepts.map { |concept, _| concept }
  final_dir = File.expand_path("./final_classifications", __dir__)
  FileUtils.mkdir_p(final_dir)

  Dir
    .glob(File.join(TOPICS_DIR, "*.json"))
    .each do |topic_file|
      topic_id = File.basename(topic_file, ".json")
      output_file = File.join(final_dir, "#{topic_id}.json")

      # Skip if already processed
      next if File.exist?(output_file)

      puts "Reclassifying topic #{topic_id}..."
      concepts = extract_concepts(topic_id, concept_names)

      # Save final classification
      File.write(output_file, JSON.pretty_generate(concepts))
    end
end

def summarize_classifications
  final_dir = File.expand_path("./final_classifications", __dir__)
  concept_counts = Hash.new(0)
  topic_concepts = Hash.new { |h, k| h[k] = [] }

  Dir
    .glob(File.join(final_dir, "*.json"))
    .each do |file|
      topic_id = File.basename(file, ".json")
      begin
        concepts = JSON.parse(File.read(file))
        concepts.each do |concept, _posts|
          concept_counts[concept] += 1
          topic_concepts[concept] << topic_id
        end
      rescue JSON::ParserError => e
        puts "Error parsing #{file}: #{e.message}"
      end
    end

  summary = {
    concept_counts: concept_counts.sort_by { |_, count| -count }.to_h,
    topic_concepts: topic_concepts
  }

  File.write(
    File.expand_path("./classification_summary.json", __dir__),
    JSON.pretty_generate(summary)
  )

  summary
end

def generate_graph_data
  final_dir = File.expand_path("./final_classifications", __dir__)
  topics_data = []

  Dir
    .glob(File.join(TOPICS_DIR, "*.json"))
    .each do |topic_file|
      topic_id = File.basename(topic_file, ".json")

      # Read topic data
      topic_json = JSON.parse(File.read(topic_file))

      # Read classification data
      classification_file = File.join(final_dir, "#{topic_id}.json")
      next unless File.exist?(classification_file)

      concepts = JSON.parse(File.read(classification_file))

      # Create topic entry
      topic_entry = {
        id: topic_id,
        slug: topic_json["slug"],
        title: topic_json["title"],
        concepts: concepts.to_h # Convert the array of [concept, posts] pairs to a hash
      }

      topics_data << topic_entry
    end

  graph_data = { topics: topics_data }

  # Save to file
  output_file = File.expand_path("./graph_data.json", __dir__)
  File.write(output_file, JSON.pretty_generate(graph_data))

  graph_data
end
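
# graph_data.json ends up shaped roughly like this (illustrative values):
#   {
#     "topics": [
#       { "id": "123", "slug": "a-topic", "title": "A topic",
#         "concepts": { "Docker Updates": [1, 2] } }
#     ]
#   }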

# Example usage:
# download_topics("meta.discourse.org", 500)
# extract_all_concepts
# list_all_concepts
# normalize_concepts(100)
# reclassify_topics
generate_graph_data