Skip to content

Instantly share code, notes, and snippets.

@dannguyen
Last active November 23, 2020 13:32
Show Gist options
  • Select an option

  • Save dannguyen/71d49ff62e9f9eb51ac6 to your computer and use it in GitHub Desktop.

Select an option

Save dannguyen/71d49ff62e9f9eb51ac6 to your computer and use it in GitHub Desktop.
Transcribing a ProPublica podcast with Python and the Watson Speech to Text API
from pydub import AudioSegment
from glob import glob
from math import ceil
from os.path import basename, splitext, exists
import json
import requests
# --- Watson Speech to Text settings -----------------------------------------
# Placeholder credentials; they are sent as the `auth` pair on each request.
WATSON_USERNAME = "YOUR-USERNAME-HERE"
WATSON_PASSWORD = "YOUR-PASSWORD"
WATSON_ENDPOINT = 'https://stream.watsonplatform.net/speech-to-text/api/v1/recognize'

# Query-string parameters attached to every recognize request.
WATSON_DEFAULT_PARAMS = dict(
    continuous=True,
    timestamps=True,
    word_confidence=True,
)

# Each uploaded body is a WAV file.
WATSON_DEFAULT_HEADERS = {'content-type': 'audio/wav'}

# --- Source audio -------------------------------------------------------------
# via: http://www.propublica.org/podcast/item/how-a-reporter-pierced-the-hype-behind-theranos/
DOWNLOAD_URL = 'https://api.soundcloud.com/tracks/247345268/download?client_id=cUa40O3Jg3Emvp6Tv4U6ymYYO50NUGpJ'
AUDIO_FILENAME = 'podcast.mp3'  # local copy of the downloaded podcast
AUDIO_SEGMENT_SECONDS = 300     # length of each exported WAV segment, in seconds
# Download the podcast once; later runs reuse the local copy.
if not exists(AUDIO_FILENAME):
    print("Downloading from", DOWNLOAD_URL)
    resp = requests.get(DOWNLOAD_URL)
    # Abort on an HTTP error *before* creating the file: otherwise an error
    # page would be saved as the MP3 and the exists() check above would skip
    # the download on every subsequent run.
    resp.raise_for_status()
    with open(AUDIO_FILENAME, 'wb') as w:
        w.write(resp.content)
    print("Wrote audio file to", AUDIO_FILENAME)
# Split the MP3 into consecutive WAV files of at most AUDIO_SEGMENT_SECONDS
# each, named "<start>-<end>.wav" with both offsets zero-padded to 5 digits.
audio = AudioSegment.from_mp3(AUDIO_FILENAME)
total_seconds = ceil(audio.duration_seconds)
for start in range(0, total_seconds, AUDIO_SEGMENT_SECONDS):
    # The final segment is clipped to the (rounded-up) end of the audio.
    end = min(start + AUDIO_SEGMENT_SECONDS, total_seconds)
    fname = '{:0>5}-{:0>5}.wav'.format(start, end)
    # AudioSegment slices are expressed in milliseconds.
    segment = audio[start * 1000:end * 1000]
    segment.export(fname, format='wav')
    print("Saved", fname)
## Transcribe each WAV to Watson
# Every "<name>.wav" in the working directory is uploaded and the raw response
# body is saved next to it as "<name>.json". Segments that already have a JSON
# file are skipped, so an interrupted run can be resumed.
for fname in sorted(glob("*.wav")):
    tname = splitext(basename(fname))[0] + '.json'
    if exists(tname):
        print("Already transcribed", tname)
        continue

    print("Transcribing", fname)
    with open(fname, 'rb') as r:
        watson_response = requests.post(
            WATSON_ENDPOINT,
            data=r,
            auth=(WATSON_USERNAME, WATSON_PASSWORD),
            params=WATSON_DEFAULT_PARAMS,
            headers=WATSON_DEFAULT_HEADERS,
            stream=False
        )
    # Fail loudly on an HTTP error instead of saving the error body as the
    # transcript: a saved error would be treated as "Already transcribed" on
    # every later run and the segment would never be retried.
    watson_response.raise_for_status()
    with open(tname, 'w') as w:
        w.write(watson_response.text)
    print("Wrote transcript to", tname)
# Print out the data
def extract_transcripts(payload):
    """Return the best-alternative transcript for each result in *payload*.

    *payload* is a decoded Watson response: either the full JSON object, whose
    lines live under the ``"results"`` key, or a bare list of result dicts.
    Results with no alternatives are skipped. An object without a
    ``"results"`` key yields an empty list.
    """
    if isinstance(payload, dict):
        results = payload.get('results', [])
    else:
        results = payload
    # Each result is one line and may carry several alternatives; keep only
    # the first one.
    return [
        result['alternatives'][0]['transcript']
        for result in results
        if result.get('alternatives')
    ]


# Sorted so the transcripts are printed in filename order.
for fname in sorted(glob("*.json")):
    with open(fname, 'r') as f:
        payload = json.load(f)
    for transcript in extract_transcripts(payload):
        print(transcript)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment