"""Transcribe a podcast with the Watson Speech-to-Text HTTP endpoint.

Running this script:

1. downloads the podcast MP3 (skipped if AUDIO_FILENAME already exists),
2. splits it into WAV segments of at most AUDIO_SEGMENT_SECONDS seconds,
3. posts each segment to WATSON_ENDPOINT and caches the response text as
   ``<segment>.json`` (skipped if that file already exists),
4. writes the best transcript of every result to ``raw.txt`` and one row per
   word (word, confidence, start, end) to ``words.csv``.

All files are read from and written to the current working directory.
"""
import csv
import json
from glob import glob
from math import ceil
from os.path import basename, exists, splitext

import requests
from pydub import AudioSegment

WATSON_USERNAME = "YOUR-USERNAME-HERE"
WATSON_PASSWORD = "YOUR-PASSWORD"
WATSON_ENDPOINT = 'https://stream.watsonplatform.net/speech-to-text/api/v1/recognize'
WATSON_DEFAULT_PARAMS = {
    'continuous': True,
    'timestamps': True,
    'word_confidence': True,
}
WATSON_DEFAULT_HEADERS = {
    'content-type': 'audio/wav'
}

# via: http://www.propublica.org/podcast/item/how-a-reporter-pierced-the-hype-behind-theranos/
DOWNLOAD_URL = 'https://api.soundcloud.com/tracks/247345268/download?client_id=cUa40O3Jg3Emvp6Tv4U6ymYYO50NUGpJ'
AUDIO_FILENAME = 'podcast.mp3'
AUDIO_SEGMENT_SECONDS = 300

# --- Step 1: download the audio once ---------------------------------------
if not exists(AUDIO_FILENAME):
    print("Downloading from", DOWNLOAD_URL)
    resp = requests.get(DOWNLOAD_URL)
    # Fail here rather than saving an HTTP error body as if it were audio.
    resp.raise_for_status()
    with open(AUDIO_FILENAME, 'wb') as w:
        w.write(resp.content)
    print("Wrote audio file to", AUDIO_FILENAME)

# --- Step 2: split the MP3 into WAV segments --------------------------------
audio = AudioSegment.from_mp3(AUDIO_FILENAME)
xs = 0
while xs < audio.duration_seconds:
    # The final segment is clipped to the (rounded-up) end of the audio.
    ys = min(xs + AUDIO_SEGMENT_SECONDS, ceil(audio.duration_seconds))
    # Zero-padded start/end seconds keep the file names in chronological
    # order when they are sorted as strings.
    fname = '{:05d}-{:05d}.wav'.format(xs, ys)
    # Slice bounds are given in milliseconds.
    audio[xs * 1000:ys * 1000].export(fname, format='wav')
    print("Saved", fname)
    xs = ys

# --- Step 3: transcribe each WAV with Watson --------------------------------
# NOTE(review): this picks up every *.wav in the working directory, not only
# the segments exported above.
for fname in sorted(glob("*.wav")):
    tname = splitext(basename(fname))[0] + '.json'
    if exists(tname):
        print("Already transcribed", tname)
        continue

    print("Transcribing", fname)
    with open(fname, 'rb') as r:
        watson_response = requests.post(
            WATSON_ENDPOINT,
            data=r,
            auth=(WATSON_USERNAME, WATSON_PASSWORD),
            params=WATSON_DEFAULT_PARAMS,
            headers=WATSON_DEFAULT_HEADERS,
            stream=False
        )
    # Do not cache an error response: a saved error body would make this
    # segment look "already transcribed" on every later run.
    watson_response.raise_for_status()
    with open(tname, 'w', encoding='utf-8') as w:
        w.write(watson_response.text)
    print("Wrote transcript to", tname)

# --- Step 4: write the raw transcript and the per-word CSV ------------------
# newline='' is what the csv module requires of the file object it writes to;
# the context manager closes both outputs even if a transcript is malformed.
with open("raw.txt", "w", encoding="utf-8") as rawfile, \
        open("words.csv", "w", encoding="utf-8", newline="") as wordsfile:
    csvfile = csv.writer(wordsfile)
    csvfile.writerow(['word', 'confidence', 'start', 'end'])

    # NOTE(review): every *.json in the working directory is treated as a
    # cached transcript.
    for fname in sorted(glob("*.json")):
        with open(fname, 'r', encoding='utf-8') as f:
            results = json.load(f).get('results')
        if results is None:
            # e.g. an error body cached by an earlier run of this script
            print("No results in", fname)
            continue

        # Each result is one line of the transcript.
        for result in results:
            alternatives = result.get('alternatives')
            if not alternatives:
                continue
            # A result may carry several alternatives; use the first one.
            lineobj = alternatives[0]
            word_timestamps = lineobj['timestamps']
            if not word_timestamps:
                continue

            rawfile.write(lineobj['transcript'] + "\n")
            word_confidences = lineobj['word_confidence']
            # Timestamps are (word, start, end); confidences are indexed so
            # that item [1] is the score, written as a rounded percentage.
            for wordts, wordconf in zip(word_timestamps, word_confidences):
                txt, tstart, tend = wordts
                confidence = round(100 * wordconf[1])
                csvfile.writerow([txt, confidence, tstart, tend])