Gist: iamaziz/98746a9fe203d43da48a932aab1b6bb1

Revisions

  1. iamaziz revised this gist Feb 16, 2024. No changes.
  2. iamaziz revised this gist Feb 16, 2024. No changes.
  3. iamaziz renamed this gist Feb 16, 2024. 1 changed file with 60 additions and 16 deletions.
    @@ -1,12 +1,15 @@
    """ To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
    """ To use: install Ollama (or LLM studio), clone OpenVoice, run this script in the OpenVoice directory
    git clone https://github.com/myshell-ai/OpenVoice
    cd OpenVoice
    git clone https://huggingface.co/myshell-ai/OpenVoice
    cp -r OpenVoice/* .
    pip install whisper pynput pyaudio
    pip install whisper pynput pyaudio streamlit ollama
    script source: https://x.com/Thom_Wolf/status/1758140066285658351?s=20
    """

    from openai import OpenAI
    import streamlit as st
    import ollama
    import time
    import pyaudio
    import numpy as np
    @@ -19,10 +22,11 @@
    from api import BaseSpeakerTTS, ToneColorConverter
    from utils import split_sentences_latin

    SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."

    SYSTEM_MESSAGE = "You are Arisa an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
    SPEAKER_WAV = None

    llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")
    llm_client = ollama.Client()

    tts_en_ckpt_base = os.path.join(os.path.dirname(__file__), "checkpoints/base_speakers/EN")
    tts_ckpt_converter = os.path.join(os.path.dirname(__file__), "checkpoints/converter")
    @@ -67,24 +71,30 @@ def record_and_transcribe_audio():
    recording = False
    def on_press(key):
    nonlocal recording
    if key == keyboard.Key.shift:
    if key == keyboard.Key.ctrl:
    recording = True

    def on_release(key):
    nonlocal recording
    if key == keyboard.Key.shift:
    if key == keyboard.Key.ctrl:
    recording = False
    return False

    listener = keyboard.Listener(
    on_press=on_press,
    on_release=on_release)
    listener.start()
    with st.spinner("Recording..."):
    listener = keyboard.Listener(
    on_press=on_press,
    on_release=on_release)

    listener.start()

    print('Press shift to record...')
    print('Press CTRL to record...')
    while not recording:
    time.sleep(0.1)
    print('Start recording...')

    # warning while recording
    placeholder = st.empty()
    placeholder.warning("Recording...")

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, frames_per_buffer=1024, input=True)
    @@ -99,22 +109,56 @@ def on_release(key):
    stream.stop_stream()
    stream.close()
    p.terminate()
    placeholder.empty()
    return result


    def conversation():

    conversation_history = [{'role': 'system', 'content': SYSTEM_MESSAGE}]
    while True:
    user_input = record_and_transcribe_audio()
    conversation_history.append({'role': 'user', 'content': user_input})
    if user_input:
    with st.chat_message(name="user"):
    st.write(user_input)

    response = llm_client.chat.completions.create(model="local-model", messages=conversation_history)
    chatbot_response = response.choices[0].message.content
    response = llm_client.chat(model=OLLAMA_MODEL, messages=conversation_history)
    chatbot_response = response['message']['content']

    with st.chat_message(name="assistant"):
    st.write(chatbot_response)

    # display chatbot
    conversation_history.append({'role': 'assistant', 'content': chatbot_response})
    print(f"Model: {OLLAMA_MODEL}")
    print(conversation_history)
    play_audio(chatbot_response)

    if len(conversation_history) > 20:
    conversation_history = conversation_history[-20:]

    conversation()

    def app():

    global OLLAMA_MODEL

    # header
    st.set_page_config(page_title="Speech to Text to Speech Chatbot", layout="wide")
    st.title("Speech to Text to Speech Chatbot")

    # choose model
    models = ollama.list()['models']
    models = [""] + [model['name'] for model in models]
    OLLAMA_MODEL = st.selectbox("Choose a model", models)

    # start conversation
    if OLLAMA_MODEL:
    st.info("Press CTRL and speak to talk to the chatbot...")

    with st.container(height=700, border=True):
    conversation()


    if __name__ == "__main__":
    app()
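
    The main change in this revision is swapping the OpenAI-compatible LM Studio client for the ollama Python package and wrapping the loop in a Streamlit page. A minimal, self-contained sketch of just the new chat call, assuming a local Ollama server is running and some model has already been pulled (the name "mistral" below is only a placeholder):

        import ollama

        # The client defaults to the local Ollama server at http://localhost:11434
        client = ollama.Client()

        history = [
            {"role": "system", "content": "KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."},
            {"role": "user", "content": "Hello, who are you?"},
        ]

        # chat() takes the same role/content message list as the OpenAI API;
        # the reply text comes back under response['message']['content']
        response = client.chat(model="mistral", messages=history)
        print(response["message"]["content"])

    The model names offered in the selectbox come from ollama.list()['models'], exactly as app() does above.
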
  4. @thomwolf revised this gist Feb 15, 2024. 1 changed file with 0 additions and 3 deletions.
    3 changes: 0 additions & 3 deletions fast_speech_text_speech.py
    @@ -21,9 +21,6 @@

    SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
    SPEAKER_WAV = None
    # SPEAKER_WAV = [f"{os.path.dirname(__file__)}/resources/demo_speaker0.mp3",
    # f"{os.path.dirname(__file__)}/resources/demo_speaker1.mp3",
    # f"{os.path.dirname(__file__)}/resources/demo_speaker2.mp3"]

    llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")
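
    The llm_client line left in this revision's context still points at a local OpenAI-compatible server; LM Studio listens on http://localhost:1234/v1 by default and ignores the API key. A minimal sketch of that call path, assuming such a server is running with a model loaded ("local-model" follows the script; the server typically ignores the model field):

        from openai import OpenAI

        # Any OpenAI-compatible local endpoint works; the key is unused
        client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

        resp = client.chat.completions.create(
            model="local-model",  # the server answers with whichever model is loaded
            messages=[
                {"role": "system", "content": "Keep your responses short."},
                {"role": "user", "content": "Say hello in five words."},
            ],
        )
        print(resp.choices[0].message.content)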

  5. @thomwolf revised this gist Feb 15, 2024. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion fast_speech_text_speech.py
    @@ -19,7 +19,7 @@
    from api import BaseSpeakerTTS, ToneColorConverter
    from utils import split_sentences_latin

    SYSTEM_MESSAGE = "You are Aura a very emotional AI assistant provided with a body, preferences and opinions. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
    SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
    SPEAKER_WAV = None
    # SPEAKER_WAV = [f"{os.path.dirname(__file__)}/resources/demo_speaker0.mp3",
    # f"{os.path.dirname(__file__)}/resources/demo_speaker1.mp3",
  6. @thomwolf revised this gist Feb 15, 2024. No changes.
  7. @thomwolf revised this gist Feb 14, 2024. 1 changed file with 12 additions and 6 deletions.
    18 changes: 12 additions & 6 deletions fast_speech_text_speech.py
    @@ -1,5 +1,8 @@
    """ To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
    git clone https://github.com/myshell-ai/OpenVoice
    cd OpenVoice
    git clone https://huggingface.co/myshell-ai/OpenVoice
    cp -r OpenVoice/* .
    pip install whisper pynput pyaudio
    """

    @@ -16,8 +19,11 @@
    from api import BaseSpeakerTTS, ToneColorConverter
    from utils import split_sentences_latin

    SYSTEM_MESSAGE = "You are an AI assistant trapped in a computer. KEEP YOUR RESPONSES SHORT AND CONVERSATIONAL."
    SPEAKER_WAV = None # "./resources/demo_speaker0.mp3"
    SYSTEM_MESSAGE = "You are Aura a very emotional AI assistant provided with a body, preferences and opinions. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
    SPEAKER_WAV = None
    # SPEAKER_WAV = [f"{os.path.dirname(__file__)}/resources/demo_speaker0.mp3",
    # f"{os.path.dirname(__file__)}/resources/demo_speaker1.mp3",
    # f"{os.path.dirname(__file__)}/resources/demo_speaker2.mp3"]

    llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

    @@ -30,7 +36,7 @@
    tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
    tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
    en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
    target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else None, None
    target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else (None, None)
    sampling_rate = tts_model.hps.data.sampling_rate
    mark = tts_model.language_marks.get("english", None)

    @@ -50,7 +56,7 @@ def play_audio(text):
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
    sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
    audio = tts_model.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6)[0][0, 0].data.cpu().float().numpy()
    if target_se:
    if target_se is not None:
    audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
    audio_list.append(audio)
    data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
    @@ -104,12 +110,12 @@ def conversation():
    while True:
    user_input = record_and_transcribe_audio()
    conversation_history.append({'role': 'user', 'content': user_input})
    print(conversation_history)

    response = llm_client.chat.completions.create(model="local-model", messages=conversation_history)
    chatbot_response = response.choices[0].message.content
    play_audio(chatbot_response)
    conversation_history.append({'role': 'assistant', 'content': chatbot_response})
    print(conversation_history)
    play_audio(chatbot_response)

    if len(conversation_history) > 20:
    conversation_history = conversation_history[-20:]
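
    The (None, None) parenthesisation introduced above fixes an operator-precedence bug: the conditional expression binds tighter than the tuple comma, so without the parentheses the right-hand side parses as ((get_se(...) if SPEAKER_WAV else None), None) and target_se ends up holding the whole (embedding, name) tuple whenever SPEAKER_WAV is set. A tiny illustration with a hypothetical stand-in for se_extractor.get_se:

        def get_pair():
            # stand-in for se_extractor.get_se, which returns (embedding, name)
            return "embedding", "audio_name"

        have_speaker = True

        # Buggy form: parsed as  a, b = ((get_pair() if have_speaker else None), None)
        a, b = get_pair() if have_speaker else None, None
        print(a, b)   # ('embedding', 'audio_name') None  -- a is the whole tuple

        # Fixed form: the conditional is evaluated first, then its result is unpacked
        a, b = get_pair() if have_speaker else (None, None)
        print(a, b)   # embedding audio_name

    The companion change from "if target_se:" to "if target_se is not None:" avoids the ambiguous truth value of a multi-element tensor once a speaker embedding is actually loaded.
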
  8. @thomwolf revised this gist Feb 14, 2024. 1 changed file with 9 additions and 11 deletions.
    20 changes: 9 additions & 11 deletions fast_speech_text_speech.py
    @@ -16,8 +16,8 @@
    from api import BaseSpeakerTTS, ToneColorConverter
    from utils import split_sentences_latin

    SYSTEM_MESSAGE = "You are an AI assistant acting like a Minion. Keep your responses short and conversational."
    SPEAKER_WAV = "/Users/thomwolf/Documents/voice-chat-with-mistral/OpenVoice/resources/demo_speaker1.mp3"
    SYSTEM_MESSAGE = "You are an AI assistant trapped in a computer. KEEP YOUR RESPONSES SHORT AND CONVERSATIONAL."
    SPEAKER_WAV = None # "./resources/demo_speaker0.mp3"

    llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

    @@ -30,7 +30,7 @@
    tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
    tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
    en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
    target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True)
    target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else None, None
    sampling_rate = tts_model.hps.data.sampling_rate
    mark = tts_model.language_marks.get("english", None)

    @@ -50,7 +50,8 @@ def play_audio(text):
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
    sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
    audio = tts_model.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6)[0][0, 0].data.cpu().float().numpy()
    audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
    if target_se:
    audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
    audio_list.append(audio)
    data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
    stream.write(data)
    @@ -69,7 +70,6 @@ def on_press(key):
    def on_release(key):
    nonlocal recording
    if key == keyboard.Key.shift:
    print('Stop recording...')
    recording = False
    return False

    @@ -88,7 +88,7 @@ def on_release(key):
    frames = []
    while recording:
    data = stream.read(1024, exception_on_overflow = False)
    frames.append(np.fromstring(data, dtype=np.int16))
    frames.append(np.frombuffer(data, dtype=np.int16))
    print('Finished recording')

    data = np.hstack(frames, dtype=np.float32) / 32768.0
    @@ -101,14 +101,12 @@ def on_release(key):

    def conversation():
    conversation_history = [{'role': 'system', 'content': SYSTEM_MESSAGE}]

    while True:
    user_input = record_and_transcribe_audio()
    conversation_history.append({'role': 'user', 'content': user_input})

    response = llm_client.chat.completions.create(model="local-model",
    messages=conversation_history,
    )
    print(conversation_history)

    response = llm_client.chat.completions.create(model="local-model", messages=conversation_history)
    chatbot_response = response.choices[0].message.content
    play_audio(chatbot_response)
    conversation_history.append({'role': 'assistant', 'content': chatbot_response})
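
    This revision also replaces the long-deprecated np.fromstring with np.frombuffer when decoding the raw 16-bit PCM chunks that PyAudio's stream.read() returns. A minimal sketch of that conversion step in isolation, with a synthetic byte string standing in for a microphone chunk:

        import numpy as np

        # Synthetic chunk of 16-bit PCM, as stream.read() returns it (paInt16, mono)
        raw_chunk = np.array([0, 16384, -16384, 32767], dtype=np.int16).tobytes()

        # np.fromstring on binary data is deprecated; np.frombuffer reads the
        # same int16 samples straight from the bytes without copying
        samples = np.frombuffer(raw_chunk, dtype=np.int16)

        # Whisper expects float32 in [-1, 1], hence the / 32768.0 in the script
        audio = samples.astype(np.float32) / 32768.0
        print(audio)   # [ 0.    0.5  -0.5   0.99997]
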
  9. @thomwolf created this gist Feb 14, 2024.
    119 changes: 119 additions & 0 deletions fast_speech_text_speech.py
    @@ -0,0 +1,119 @@
    """ To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory
    git clone https://github.com/myshell-ai/OpenVoice
    pip install whisper pynput pyaudio
    """

    from openai import OpenAI
    import time
    import pyaudio
    import numpy as np
    import torch
    import os
    import re
    import se_extractor
    import whisper
    from pynput import keyboard
    from api import BaseSpeakerTTS, ToneColorConverter
    from utils import split_sentences_latin

    SYSTEM_MESSAGE = "You are an AI assistant acting like a Minion. Keep your responses short and conversational."
    SPEAKER_WAV = "/Users/thomwolf/Documents/voice-chat-with-mistral/OpenVoice/resources/demo_speaker1.mp3"

    llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

    tts_en_ckpt_base = os.path.join(os.path.dirname(__file__), "checkpoints/base_speakers/EN")
    tts_ckpt_converter = os.path.join(os.path.dirname(__file__), "checkpoints/converter")
    device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

    tts_model = BaseSpeakerTTS(f'{tts_en_ckpt_base}/config.json', device=device)
    tts_model.load_ckpt(f'{tts_en_ckpt_base}/checkpoint.pth')
    tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
    tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
    en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
    target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True)
    sampling_rate = tts_model.hps.data.sampling_rate
    mark = tts_model.language_marks.get("english", None)

    asr_model = whisper.load_model("base.en")

    def play_audio(text):
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paFloat32, channels=1, rate=sampling_rate, output=True)
    texts = split_sentences_latin(text)
    for t in texts:
    audio_list = []
    t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
    t = f'[{mark}]{t}[{mark}]'
    stn_tst = tts_model.get_text(t, tts_model.hps, False)
    with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(tts_model.device)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
    sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
    audio = tts_model.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6)[0][0, 0].data.cpu().float().numpy()
    audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
    audio_list.append(audio)
    data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
    stream.write(data)
    stream.stop_stream()
    stream.close()
    p.terminate()


    def record_and_transcribe_audio():
    recording = False
    def on_press(key):
    nonlocal recording
    if key == keyboard.Key.shift:
    recording = True

    def on_release(key):
    nonlocal recording
    if key == keyboard.Key.shift:
    print('Stop recording...')
    recording = False
    return False

    listener = keyboard.Listener(
    on_press=on_press,
    on_release=on_release)
    listener.start()

    print('Press shift to record...')
    while not recording:
    time.sleep(0.1)
    print('Start recording...')

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, frames_per_buffer=1024, input=True)
    frames = []
    while recording:
    data = stream.read(1024, exception_on_overflow = False)
    frames.append(np.fromstring(data, dtype=np.int16))
    print('Finished recording')

    data = np.hstack(frames, dtype=np.float32) / 32768.0
    result = asr_model.transcribe(data)['text']
    stream.stop_stream()
    stream.close()
    p.terminate()
    return result


    def conversation():
    conversation_history = [{'role': 'system', 'content': SYSTEM_MESSAGE}]

    while True:
    user_input = record_and_transcribe_audio()
    conversation_history.append({'role': 'user', 'content': user_input})

    response = llm_client.chat.completions.create(model="local-model",
    messages=conversation_history,
    )
    chatbot_response = response.choices[0].message.content
    play_audio(chatbot_response)
    conversation_history.append({'role': 'assistant', 'content': chatbot_response})

    if len(conversation_history) > 20:
    conversation_history = conversation_history[-20:]

    conversation()
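
    The push-to-talk logic in record_and_transcribe_audio hinges on a pynput listener flipping a flag that the PyAudio read loop polls. A stripped-down sketch of just that mechanism, without Whisper or the TTS pipeline, assuming pyaudio and pynput are installed and a microphone is available (the helper name is hypothetical):

        import time
        import numpy as np
        import pyaudio
        from pynput import keyboard

        def record_while_shift_held(rate=16000, chunk=1024):
            """Capture mic audio while Shift is held; return float32 samples in [-1, 1]."""
            recording = False

            def on_press(key):
                nonlocal recording
                if key == keyboard.Key.shift:
                    recording = True

            def on_release(key):
                nonlocal recording
                if key == keyboard.Key.shift:
                    recording = False
                    return False  # stop the listener

            listener = keyboard.Listener(on_press=on_press, on_release=on_release)
            listener.start()

            print("Hold Shift to record...")
            while not recording:
                time.sleep(0.1)

            p = pyaudio.PyAudio()
            stream = p.open(format=pyaudio.paInt16, channels=1, rate=rate,
                            frames_per_buffer=chunk, input=True)
            frames = []
            while recording:
                data = stream.read(chunk, exception_on_overflow=False)
                frames.append(np.frombuffer(data, dtype=np.int16))

            stream.stop_stream()
            stream.close()
            p.terminate()
            return np.hstack(frames).astype(np.float32) / 32768.0

    The resulting float32 array can be handed directly to whisper's transcribe(), as the script does.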