import asyncio
import base64
import json
import os
import pyaudio
from websockets.asyncio.client import connect


class SimpleGeminiVoice:
    def __init__(self):
        self.audio_queue = asyncio.Queue()
        self.api_key = os.environ.get("GEMINI_API_KEY")
        self.model = "gemini-2.0-flash-exp"
        self.uri = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}"
        # Audio settings
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.CHUNK = 512
        self.RATE = 16000

    async def start(self):
        # Initialize websocket
        self.ws = await connect(
            self.uri, additional_headers={"Content-Type": "application/json"}
        )
        await self.ws.send(json.dumps({"setup": {"model": f"models/{self.model}"}}))
        await self.ws.recv(decode=False)
        print("Connected to Gemini, You can start talking now")
        # Start audio streaming
        async with asyncio.TaskGroup() as tg:
            tg.create_task(self.capture_audio())
            tg.create_task(self.stream_audio())
            tg.create_task(self.play_response())

    async def capture_audio(self):
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=self.RATE,
            input=True,
            frames_per_buffer=self.CHUNK,
        )
        while True:
            data = await asyncio.to_thread(stream.read, self.CHUNK)
            await self.ws.send(
                json.dumps(
                    {
                        "realtime_input": {
                            "media_chunks": [
                                {
                                    "data": base64.b64encode(data).decode(),
                                    "mime_type": "audio/pcm",
                                }
                            ]
                        }
                    }
                )
            )

    async def stream_audio(self):
        async for msg in self.ws:
            response = json.loads(msg)
            try:
                audio_data = response["serverContent"]["modelTurn"]["parts"][0][
                    "inlineData"
                ]["data"]
                self.audio_queue.put_nowait(base64.b64decode(audio_data))
            except KeyError:
                pass
            try:
                turn_complete = response["serverContent"]["turnComplete"]
            except KeyError:
                pass
            else:
                if turn_complete:
                    # If you interrupt the model, it sends an end_of_turn. For interruptions to work, we need to empty out the audio queue
                    print("\nEnd of turn")
                    while not self.audio_queue.empty():
                        self.audio_queue.get_nowait()

    async def play_response(self):
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT, channels=self.CHANNELS, rate=24000, output=True
        )
        while True:
            data = await self.audio_queue.get()
            await asyncio.to_thread(stream.write, data)


if __name__ == "__main__":
    client = SimpleGeminiVoice()
    asyncio.run(client.start())
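The script above uses an asyncio.Queue and a dedicated playback task so that, when the server reports turnComplete after an interruption, any audio that has not been played yet can be flushed. The shorter variant below drops the queue and plays model audio directly in the receive loop, so interruptions will not cut playback short.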
import asyncio
import base64
import json
import os
import pyaudio
from websockets.asyncio.client import connect


class SimpleGeminiVoice:
    def __init__(self):
        self.api_key = os.environ.get("GEMINI_API_KEY")
        self.model = "gemini-2.0-flash-exp"
        self.uri = f"wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent?key={self.api_key}"
        # Audio settings
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.CHUNK = 512

    async def start(self):
        # Initialize websocket
        self.ws = await connect(
            self.uri, additional_headers={"Content-Type": "application/json"}
        )
        await self.ws.send(json.dumps({"setup": {"model": f"models/{self.model}"}}))
        await self.ws.recv(decode=False)
        print("Connected to Gemini, You can start talking now")
        # Start audio streaming
        async with asyncio.TaskGroup() as tg:
            tg.create_task(self.send_user_audio())
            tg.create_task(self.recv_model_audio())

    async def send_user_audio(self):
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT,
            channels=self.CHANNELS,
            rate=16000,  # must be an int, not a string
            input=True,
            frames_per_buffer=self.CHUNK,
        )
        while True:
            data = await asyncio.to_thread(stream.read, self.CHUNK)
            await self.ws.send(
                json.dumps(
                    {
                        "realtime_input": {
                            "media_chunks": [
                                {
                                    "data": base64.b64encode(data).decode(),
                                    "mime_type": "audio/pcm",
                                }
                            ]
                        }
                    }
                )
            )

    async def recv_model_audio(self):
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=self.FORMAT, channels=self.CHANNELS, rate=24000, output=True
        )
        async for msg in self.ws:
            response = json.loads(msg)
            try:
                audio_data = response["serverContent"]["modelTurn"]["parts"][0][
                    "inlineData"
                ]["data"]
                await asyncio.to_thread(stream.write, base64.b64decode(audio_data))
            except KeyError:
                pass


if __name__ == "__main__":
    client = SimpleGeminiVoice()
    asyncio.run(client.start())
I've created a playground repository you can run and fork easily: https://github.com/saharmor/gemini-multimodal-playground
You can change the system prompt and voice and enable/disable interruptions. cc @avinashgawali @Youmyluck @boozuk
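In the raw-websocket scripts above, the system prompt and voice are normally set through the initial setup message rather than per turn. Below is a minimal sketch of what that expanded setup might look like; the field names (generation_config, speech_config, system_instruction) and the voice name "Puck" are taken from the Live API docs as I understand them and may change, so verify against the playground repo or the official documentation.

# Sketch only: expanded setup message with a system prompt and voice selection.
# This dict would replace the bare {"setup": {"model": ...}} payload in start().
setup = {
    "setup": {
        "model": "models/gemini-2.0-flash-exp",
        "generation_config": {
            "response_modalities": ["AUDIO"],
            "speech_config": {
                "voice_config": {"prebuilt_voice_config": {"voice_name": "Puck"}}
            },
        },
        "system_instruction": {
            "parts": [{"text": "You are a concise, friendly voice assistant."}]
        },
    }
}
# await self.ws.send(json.dumps(setup))  # sent once, right after connecting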
Thank you
And it doesn't speak Russian. It understands Russian, but answers in Chinese or something like that instead of Russian.
I think Russian will be added a bit later, as was the case with LearnLM. At first it also pretended not to know Russian.
It's just that everywhere they write that it has already been added, so I thought maybe it's some Google policy.
Not yet. We're waiting for 2025, mid-January/early February.
Perhaps we can use VAD to filter the noise from the audio and only send voice? @philschmid
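A rough sketch of that idea using the webrtcvad package (not part of the gist, so this is an assumption about how you might wire it in): webrtcvad expects 16-bit mono PCM in 10/20/30 ms frames, so the capture chunk would need to be 480 samples (30 ms at 16 kHz) instead of 512.

import webrtcvad

RATE = 16000
VAD_FRAME = 480  # samples per 30 ms frame at 16 kHz
vad = webrtcvad.Vad(2)  # aggressiveness 0-3; higher drops more non-speech

def is_speech(frame: bytes) -> bool:
    """Return True if this 30 ms frame likely contains voice."""
    return vad.is_speech(frame, RATE)

# In capture_audio() / send_user_audio() the send loop would then be gated:
#     data = await asyncio.to_thread(stream.read, VAD_FRAME)
#     if is_speech(data):
#         await self.ws.send(...)  # only forward frames classified as speech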
@jlia0 this is already implemented in the standalone script here: https://github.com/saharmor/gemini-multimodal-playground
thanks so much 🫶
I would like to use it in a voice channel through discord.py, but I can't seem to get it to work.