From bb5d7db5d79c92516e5784ce4ecf7daca70e426d Mon Sep 17 00:00:00 2001
From: Anthony Wu <462072+anthonywu@users.noreply.github.com>
Date: Thu, 3 Oct 2024 00:34:47 -1000
Subject: [PATCH] add support for audio and input name from stdin

---
 whisper/README.md            | 20 ++++++++++++--
 whisper/mlx_whisper/audio.py | 16 ++++++-----
 whisper/mlx_whisper/cli.py   | 38 ++++++++++++++++++++------
 whisper/test_cli.sh          | 53 ++++++++++++++++++++++++++++++++++++
 4 files changed, 108 insertions(+), 19 deletions(-)
 create mode 100755 whisper/test_cli.sh

diff --git a/whisper/README.md b/whisper/README.md
index ac6e95f6..06ffc3ca 100644
--- a/whisper/README.md
+++ b/whisper/README.md
@@ -25,8 +25,8 @@ pip install mlx-whisper
 
 At its simplest:
 
-```
-mlx_whisper audio_file.mp3
+```sh
+mlx_whisper audio_file.mp3  # output name will re-use basename of audio file path
 ```
 
 This will make a text file `audio_file.txt` with the results.
@@ -35,6 +35,20 @@ Use `-f` to specify the output format and `--model` to specify the model. There
 are many other supported command line options. To see them all, run
 `mlx_whisper -h`.
 
+Alternatively, you can pipe in the audio content of other programs via stdin,
+useful when `mlx_whisper` acts as a composable command line utility.
+
+```sh
+# hypothetical demo of audio content via stdin
+# default output file name will be content.*
+some-process | mlx_whisper
+
+# hypothetical demo of media content via stdin
+# use --input-name to name your output artifacts
+some-downloader https://some.url/media?id=lecture42 | mlx_whisper --input-name mlx-demo
+```
+
+
 #### API
 
 Transcribe audio with:
@@ -103,7 +117,7 @@ python convert.py --help
 ```
 
 By default, the conversion script will make the directory `mlx_models`
-and save the converted `weights.npz` and `config.json` there. 
+and save the converted `weights.npz` and `config.json` there.
 
 Each time it is run, `convert.py` will overwrite any model in the provided
 path. To save different models, make sure to set `--mlx-path` to a unique
diff --git a/whisper/mlx_whisper/audio.py b/whisper/mlx_whisper/audio.py
index e04309c1..f8bd8e69 100644
--- a/whisper/mlx_whisper/audio.py
+++ b/whisper/mlx_whisper/audio.py
@@ -3,7 +3,7 @@
 import os
 from functools import lru_cache
 from subprocess import CalledProcessError, run
-from typing import Union
+from typing import Optional, Union
 
 import mlx.core as mx
 import numpy as np
@@ -21,7 +21,7 @@ FRAMES_PER_SECOND = SAMPLE_RATE // HOP_LENGTH  # 10ms per audio frame
 TOKENS_PER_SECOND = SAMPLE_RATE // N_SAMPLES_PER_TOKEN  # 20ms per audio token
 
 
-def load_audio(file: str, sr: int = SAMPLE_RATE):
+def load_audio(file: str = Optional[str], sr: int = SAMPLE_RATE, from_stdin=False):
     """
     Open an audio file and read as mono waveform, resampling as necessary
 
@@ -40,18 +40,20 @@ def load_audio(file: str, sr: int = SAMPLE_RATE):
 
     # This launches a subprocess to decode audio while down-mixing
     # and resampling as necessary.  Requires the ffmpeg CLI in PATH.
+    if from_stdin:
+        cmd = ["ffmpeg", "-i", "pipe:0"]
+    else:
+        cmd = ["ffmpeg", "-nostdin", "-i", file]
+
     # fmt: off
-    cmd = [
-        "ffmpeg",
-        "-nostdin",
+    cmd.extend([
         "-threads", "0",
-        "-i", file,
         "-f", "s16le",
         "-ac", "1",
         "-acodec", "pcm_s16le",
         "-ar", str(sr),
         "-"
-    ]
+    ])
     # fmt: on
     try:
         out = run(cmd, capture_output=True, check=True).stdout
diff --git a/whisper/mlx_whisper/cli.py b/whisper/mlx_whisper/cli.py
index c2813338..98c7fd80 100644
--- a/whisper/mlx_whisper/cli.py
+++ b/whisper/mlx_whisper/cli.py
@@ -2,15 +2,17 @@
 
 import argparse
 import os
+import sys
 import traceback
 import warnings
 
+from . import audio
 from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE
 from .transcribe import transcribe
 from .writers import get_writer
 
 
-def build_parser():
+def build_parser(is_audio_from_stdin=False):
     def optional_int(string):
         return None if string == "None" else int(string)
 
@@ -27,15 +29,22 @@ def build_parser():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
-    parser.add_argument(
-        "audio", nargs="+", type=str, help="Audio file(s) to transcribe"
-    )
+
+    if not is_audio_from_stdin:
+        parser.add_argument("audio", nargs="+", help="Audio file(s) to transcribe")
+
     parser.add_argument(
         "--model",
         default="mlx-community/whisper-tiny",
         type=str,
         help="The model directory or hugging face repo",
     )
+    parser.add_argument(
+        "--input-name",
+        type=str,
+        default="content",
+        help="logical name of audio content received via stdin",
+    )
     parser.add_argument(
         "--output-dir",
         "-o",
@@ -192,7 +201,8 @@ def build_parser():
 
 
 def main():
-    parser = build_parser()
+    is_audio_from_stdin = not os.isatty(sys.stdin.fileno())
+    parser = build_parser(is_audio_from_stdin=is_audio_from_stdin)
     args = vars(parser.parse_args())
     if args["verbose"] is True:
         print(f"Args: {args}")
@@ -219,17 +229,27 @@ def main():
         warnings.warn("--max-line-count has no effect without --max-line-width")
     if writer_args["max_words_per_line"] and writer_args["max_line_width"]:
         warnings.warn("--max-words-per-line has no effect with --max-line-width")
-    for audio_path in args.pop("audio"):
+
+    if is_audio_from_stdin:
+        audio_list = [audio.load_audio(from_stdin=True)]
+        input_name = args.pop("input_name")
+    else:
+        audio_list = args.pop("audio")
+        args.pop("input_name")
+
+    for audio_obj in audio_list:
         try:
             result = transcribe(
-                audio_path,
+                audio_obj,
                 path_or_hf_repo=path_or_hf_repo,
                 **args,
             )
-            writer(result, audio_path, **writer_args)
+            if not is_audio_from_stdin:
+                input_name = audio_obj
+            writer(result, input_name, **writer_args)
         except Exception as e:
             traceback.print_exc()
-            print(f"Skipping {audio_path} due to {type(e).__name__}: {str(e)}")
+            print(f"Skipping {audio_obj} due to {type(e).__name__}: {str(e)}")
 
 
 if __name__ == "__main__":
diff --git a/whisper/test_cli.sh b/whisper/test_cli.sh
new file mode 100755
index 00000000..78a12a3f
--- /dev/null
+++ b/whisper/test_cli.sh
@@ -0,0 +1,53 @@
+#!/bin/zsh -e
+
+set -o err_exit
+
+TEST_AUDIO="mlx_whisper/assets/ls_test.flac"
+
+# when not receiving stdin, check audio arg is required
+TEST_1="mlx_whisper requires audio position arg when not provided with stdin"
+if mlx_whisper 2>&1 | grep "the following arguments are required: audio" > /dev/null; then
+    echo "[PASS] $TEST_1"
+else
+    echo "[FAIL] $TEST_1"
+fi
+
+TEST_2="mlx_whisper does not require audio position arg when provided with stdin"
+if ! (/bin/cat "$TEST_AUDIO" | mlx_whisper --help | /usr/bin/grep "Audio file(s) to transcribe") > /dev/null; then
+    echo "[PASS] $TEST_2"
+else
+    echo "[FAIL] $TEST_2"
+fi
+
+TEST_3="mlx_whisper accepts optional --input-name arg"
+if (mlx_whisper --help | /usr/bin/grep "\-\-input-name") > /dev/null; then
+    echo "[PASS] $TEST_3"
+else
+    echo "[FAIL] $TEST_3"
+fi
+
+TEST_OUTPUT_DIR=$(mktemp -d -t mlx_whisper_cli_test)
+
+# the control output - cli called with audio position arg
+# expected output file name is ls_test.json
+mlx_whisper "$TEST_AUDIO" --output-dir "$TEST_OUTPUT_DIR" --output-format all --temperature 0 --verbose=False
+
+
+TEST_STDIN_1="mlx_whisper produces identical output whether provided audio arg or stdin of same content"
+# method stdin - output file is content.json (default --input-name is content when not provided)
+/bin/cat "$TEST_AUDIO" | mlx_whisper --output-dir "$TEST_OUTPUT_DIR" --output-format json --temperature 0 --verbose=False
+if diff "${TEST_OUTPUT_DIR}/content.json" "${TEST_OUTPUT_DIR}/ls_test.json"; then
+    echo "[PASS] $TEST_STDIN_1"
+else
+    echo "[FAIL] $TEST_STDIN_1"
+    echo "Check unexpected output in ${TEST_OUTPUT_DIR}"
+fi
+
+TEST_STDIN_2="mlx_whisper produces identical output when stdin comes via: cmd < file"、
+mlx_whisper --input-name stdin_test_2 --output-dir "$TEST_OUTPUT_DIR" --output-format tsv --temperature 0 --verbose=False < "$TEST_AUDIO"
+if diff "${TEST_OUTPUT_DIR}/stdin_test_2.tsv" "${TEST_OUTPUT_DIR}/ls_test.tsv"; then
+    echo "[PASS] $TEST_STDIN_2"
+else
+    echo "[FAIL] $TEST_STDIN_2"
+    echo "Check unexpected output in ${TEST_OUTPUT_DIR}"
+fi