from datetime import datetime, timedelta

from AppKit import NSRunLoop
from PyObjCTools import AppHelper
from Speech import SFSpeechRecognizer, SFSpeechAudioBufferRecognitionRequest
import AVFoundation

# https://developer.apple.com/documentation/speech/recognizing_speech_in_live_audio
# https://github.com/SKaplanOfficial/PyXA/blob/11c7d4db4623b91415bd962d41ff3747a2808163/PyXA/Additions/Speech.py#L85

# Install a Mach interrupt handler so ctrl-c can still break out of the run loop.
AppHelper.installMachInterrupt()


class STT(object):
    def __init__(self, live_panel=None):
        # Ask for speech-recognition authorization. The handler argument is a
        # block on the ObjC side, so pass a callable rather than None; we
        # don't need the granted status here.
        SFSpeechRecognizer.requestAuthorization_(lambda status: None)

        self.live_panel = live_panel

        # AVAudioSession is iOS-only; on macOS, AVAudioEngine's input node
        # reads from the default input device directly.
        self.audio_engine = AVFoundation.AVAudioEngine.alloc().init()
        self.input_node = self.audio_engine.inputNode()
        self.recording_format = self.input_node.outputFormatForBus_(0)

        # Tap the microphone input so every PCM buffer is forwarded to the
        # recognition request. The engine is started in run(), after _setup()
        # has created self.recognition_request, so the tap can never fire
        # before the request exists.
        self.input_node.installTapOnBus_bufferSize_format_block_(
            0, 1024, self.recording_format, self._process_buffer_callback
        )

    def _process_buffer_callback(self, buffer, when):
        self.recognition_request.appendAudioPCMBuffer_(buffer)

    def _process_speech_detection(self, result, error):
        if error is not None:
            print(f"Error! {error}")
            return
        if result is None:
            return
        # Partial results stream in continuously; show the best transcription
        # so far in the live panel.
        best = result.bestTranscription()
        if self.live_panel is not None:
            self.live_panel.update(best.formattedString())

    def _setup(self):
        self.recognizer = SFSpeechRecognizer.alloc().init()
        self.recognition_request = SFSpeechAudioBufferRecognitionRequest.alloc().init()
        self.recognition_request.setShouldReportPartialResults_(True)
        self.recognition_request.setAddsPunctuation_(True)
        self.recognition_task = self.recognizer.recognitionTaskWithRequest_resultHandler_(
            self.recognition_request, self._process_speech_detection
        )

    def run(self):
        # Create the recognition request before starting the engine, then
        # spin the run loop so the audio tap and recognition callbacks fire.
        self._setup()
        self.audio_engine.prepare()
        self.audio_engine.startAndReturnError_(None)
        # PyObjC bridges datetime.datetime to NSDate, so this runs the loop
        # for 30 seconds.
        NSRunLoop.currentRunLoop().runUntilDate_(
            datetime.now() + timedelta(seconds=30)
        )


if __name__ == "__main__":
    from rich.live import Live

    # Render the live partial transcriptions in the terminal via rich.
    live = Live()
    live.start()
    stt = STT(live_panel=live)
    stt.run()
    live.stop()