# 1. https://docs.beam.cloud/v2/getting-started/quickstart#installation
# 2. beam deploy kokoro_beam.py:generate_speech
from beam import endpoint, env, Image, Output
if env.is_remote():
    # These imports only need to resolve inside the remote Beam container,
    # where the kokoro package is installed via the image below
    from kokoro import KPipeline
    import subprocess
    import uuid
def load_model():
    # "a" selects the American English pipeline; preload the supported voices
    # so the first request doesn't pay the voice-loading cost
    pipeline = KPipeline("a", device="cuda:0")
    pipeline.load_single_voice("af_alloy")
    pipeline.load_single_voice("am_onyx")
    pipeline.load_single_voice("af_heart")
    return pipeline
kokoro_image = (
    Image(python_version="python3.11")
    .add_python_packages(["kokoro"])
    .add_commands(["apt update && apt install espeak-ng ffmpeg -y"])
)
@endpoint(
    name="kokoro-tts",
    on_start=load_model,
    #####################################################################
    # Minimum resources needed to run Kokoro: 1 CPU, 1 worker, 4Gi RAM
    #####################################################################
    # cpu=1,
    # workers=1,
    # memory="4Gi",
    #####################################################################
    cpu=10,
    workers=10,
    memory="24Gi",
    #####################################################################
    gpu=["RTX4090", "A10G", "A100-40"],
    gpu_count=1,
    image=kokoro_image,
)
def generate_speech(context, **inputs):
    pipeline = context.on_start_value
    text = inputs.pop("text", None)
    voice = inputs.pop("voice", "af_alloy")
    if not text:
        return {"error": "Please provide text to generate speech"}
    generator = pipeline(text, voice=voice)
    mp3_file = f"/tmp/kokoro_tts_out_{uuid.uuid4()}.mp3"
    # Stream raw PCM into ffmpeg over a pipe (no intermediate WAV files)
    ffmpeg_cmd = [
        "ffmpeg",
        "-y",  # Overwrite the output file if it exists
        "-f",
        "s16le",  # Input is raw PCM, 16-bit little-endian
        "-ar",
        "24000",  # Sample rate
        "-ac",
        "1",  # Mono audio
        "-i",
        "pipe:0",  # Read from stdin
        "-codec:a",
        "libmp3lame",  # MP3 codec
        "-b:a",
        "48k",  # Bitrate
        "-write_xing",
        "0",  # Disable the Xing header (it carries duration info)
        "-write_id3v2",
        "1",  # Write an ID3v2 header so the file is recognized as MP3
        mp3_file,
    ]
    try:
        with subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE) as ffmpeg_proc:
            for result in generator:
                # Convert the tensor to bytes, scaled to 16-bit PCM
                audio_bytes = (
                    (result.audio.cpu().numpy() * 32767)
                    .clip(-32768, 32767)
                    .astype("int16")
                    .tobytes()
                )
                ffmpeg_proc.stdin.write(audio_bytes)
            ffmpeg_proc.stdin.close()
            ffmpeg_proc.wait()  # Ensure ffmpeg finishes encoding
        # Popen never raises CalledProcessError, so check the exit code explicitly
        if ffmpeg_proc.returncode != 0:
            return {"error": "Failed to convert audio to MP3"}
    except OSError:  # covers BrokenPipeError if ffmpeg exits early
        return {"error": "Failed to convert audio to MP3"}
    output_file = Output(path=mp3_file)
    output_file.save()
    public_url = output_file.public_url(expires=3600)
    del pipeline
    return {"output_url": public_url}
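# Example request once deployed (a sketch: the endpoint URL and auth token below are
# placeholders, use the values printed by `beam deploy` / shown in the Beam dashboard):
#
#   curl -X POST <YOUR_ENDPOINT_URL> \
#     -H "Authorization: Bearer <BEAM_AUTH_TOKEN>" \
#     -H "Content-Type: application/json" \
#     -d '{"text": "Hello from Kokoro", "voice": "af_heart"}'
#
# On success the response is {"output_url": "..."}, a temporary link to the MP3
# that expires after 3600 seconds.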