Created
November 3, 2025 09:44
-
-
Save baptistejamin/6a458ea9b62a37caa5e59eb49970bb45 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Benchmark script to measure performance impact of logprobs with different topK settings
# Usage: ./scripts/benchmark_logprobs.sh [model_name]

# Strict mode: abort on command failure, unset variables, and pipeline failures.
set -euo pipefail

# Model under test; override with the first positional argument.
MODEL="${1:-gemma3:1b}"
# Fixed benchmark parameters — identical across all configurations so the
# only variable between runs is the logprobs/top_logprobs settings.
readonly PROMPT="Write a short story about a robot."
readonly NUM_PREDICT=100   # tokens to generate per request
readonly RUNS=5            # samples per configuration

echo "=== Logprobs Performance Benchmark ==="
echo "Model: $MODEL"
echo "Prompt: $PROMPT"
echo "Tokens to generate: $NUM_PREDICT"
echo "Runs per configuration: $RUNS"
echo ""
#######################################
# Run one benchmark configuration and print average timing stats.
# Globals:   MODEL, PROMPT, NUM_PREDICT, RUNS (read)
# Arguments: $1 - logprobs flag ("true"/"false") for the request body
#            $2 - top_logprobs value (integer)
#            $3 - human-readable label for this configuration
# Outputs:   per-configuration stats to stdout
#######################################
benchmark() {
  local logprobs=$1
  local top_logprobs=$2
  local label=$3

  echo "Testing: $label"

  local total_time=0
  local total_tokens=0
  local ok_runs=0   # only successful runs count toward the averages
  local i result eval_duration eval_count

  for ((i = 1; i <= RUNS; i++)); do
    # '|| result=""' keeps a transient curl failure from killing the whole
    # script under `set -e`; the run is simply recorded as failed below.
    result=$(curl -s http://localhost:11434/api/generate -d "{
      \"model\": \"$MODEL\",
      \"prompt\": \"$PROMPT\",
      \"stream\": false,
      \"options\": {
        \"num_predict\": $NUM_PREDICT,
        \"temperature\": 0,
        \"seed\": 42
      },
      \"logprobs\": $logprobs,
      \"top_logprobs\": $top_logprobs
    }") || result=""

    # Extract timing info; per the API, eval_duration is in nanoseconds.
    eval_duration=$(printf '%s' "$result" | jq -r '.eval_duration // 0' 2>/dev/null) || eval_duration=0
    eval_count=$(printf '%s' "$result" | jq -r '.eval_count // 0' 2>/dev/null) || eval_count=0

    if [ -n "$eval_duration" ] && [ -n "$eval_count" ] \
        && [ "$eval_duration" != "0" ] && [ "$eval_count" != "0" ]; then
      total_time=$((total_time + eval_duration))
      total_tokens=$((total_tokens + eval_count))
      ok_runs=$((ok_runs + 1))
    fi

    # Small delay between runs so back-to-back requests don't interfere
    sleep 0.5
  done

  if [ "$ok_runs" -gt 0 ] && [ "$total_tokens" -gt 0 ]; then
    # Average over the runs that actually succeeded (the original divided
    # by RUNS, which skewed the averages low whenever a run failed).
    local avg_time_ns=$((total_time / ok_runs))
    local avg_time_ms=$((avg_time_ns / 1000000))
    local avg_tokens=$((total_tokens / ok_runs))
    local tokens_per_sec=$((avg_tokens * 1000000000 / avg_time_ns))
    local ms_per_token=$((avg_time_ms / avg_tokens))
    printf '  Average time: %d ms\n' "$avg_time_ms"
    printf '  Tokens: %d\n' "$avg_tokens"
    printf '  Tokens/sec: %d\n' "$tokens_per_sec"
    printf '  ms/token: %d\n' "$ms_per_token"
  else
    echo "  Failed to get valid results"
  fi
  echo ""
}
# Fail fast with a clear message if the Ollama server is unreachable.
# (Without this check, a down server made the model-existence test below
# fail silently and the script wrongly fell through to `ollama pull`.)
if ! curl -s http://localhost:11434/api/tags > /dev/null 2>&1; then
  echo "Error: cannot reach Ollama at http://localhost:11434 — is the server running?" >&2
  exit 1
fi

# Pull the model if it is not already available locally.
if ! curl -s http://localhost:11434/api/tags \
    | jq -e ".models[] | select(.name == \"$MODEL\")" > /dev/null 2>&1; then
  echo "Model $MODEL not found. Pulling..."
  ollama pull "$MODEL"
fi

# One throwaway generation so model load time doesn't pollute the first run.
echo "Warming up model..."
curl -s http://localhost:11434/api/generate -d "{
  \"model\": \"$MODEL\",
  \"prompt\": \"test\",
  \"stream\": false,
  \"options\": {\"num_predict\": 1}
}" > /dev/null

echo "Starting benchmarks..."
echo ""

# Baseline: No logprobs
benchmark false 0 "Baseline (no logprobs)"
# With logprobs, no top_k
benchmark true 0 "Logprobs only (no top_k)"
# With logprobs and different top_k values
benchmark true 1 "Logprobs + top_k=1"
benchmark true 5 "Logprobs + top_k=5"
benchmark true 10 "Logprobs + top_k=10"
benchmark true 20 "Logprobs + top_k=20"

echo "=== Benchmark Complete ==="
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment