From bd6b08e8139408a9b53d709b12ac4f0f53abc7d5 Mon Sep 17 00:00:00 2001 From: Awni Hannun Date: Mon, 4 Nov 2024 13:59:53 -0800 Subject: [PATCH] some nits --- whisper/README.md | 15 +++----- whisper/mlx_whisper/audio.py | 2 +- whisper/mlx_whisper/cli.py | 17 ++++++--- whisper/mlx_whisper/writers.py | 23 +++--------- whisper/test_cli.sh | 69 ---------------------------------- 5 files changed, 24 insertions(+), 102 deletions(-) delete mode 100755 whisper/test_cli.sh diff --git a/whisper/README.md b/whisper/README.md index 23d37783..cd3bc684 100644 --- a/whisper/README.md +++ b/whisper/README.md @@ -26,7 +26,7 @@ pip install mlx-whisper At its simplest: ```sh -mlx_whisper audio_file.mp3 # output name will re-use basename of audio file path +mlx_whisper audio_file.mp3 ``` This will make a text file `audio_file.txt` with the results. @@ -35,19 +35,14 @@ Use `-f` to specify the output format and `--model` to specify the model. There are many other supported command line options. To see them all, run `mlx_whisper -h`. -Alternatively, you can pipe in the audio content of other programs via stdin, -useful when `mlx_whisper` acts as a composable command line utility. +You can also pipe the audio content of other programs via stdin: ```sh -# hypothetical demo of audio content via stdin -# default output file name will be content.* -some-process | mlx_whisper - -# hypothetical demo of media content via stdin -# use --output-name to name your output artifacts -some-downloader https://some.url/media?id=lecture42 | mlx_whisper --output-name mlx-demo +some-process | mlx_whisper - ``` +The default output file name will be `content.*`. You can specify the name with +the `--output-name` flag. #### API diff --git a/whisper/mlx_whisper/audio.py b/whisper/mlx_whisper/audio.py index f8bd8e69..c8cca07c 100644 --- a/whisper/mlx_whisper/audio.py +++ b/whisper/mlx_whisper/audio.py @@ -39,7 +39,7 @@ def load_audio(file: str = Optional[str], sr: int = SAMPLE_RATE, from_stdin=Fals """ # This launches a subprocess to decode audio while down-mixing - # and resampling as necessary. Requires the ffmpeg CLI in PATH. + # and resampling as necessary. Requires the ffmpeg CLI in PATH. if from_stdin: cmd = ["ffmpeg", "-i", "pipe:0"] else: diff --git a/whisper/mlx_whisper/cli.py b/whisper/mlx_whisper/cli.py index 956d83a0..7d08a043 100644 --- a/whisper/mlx_whisper/cli.py +++ b/whisper/mlx_whisper/cli.py @@ -2,6 +2,7 @@ import argparse import os +import pathlib import traceback import warnings @@ -40,8 +41,11 @@ def build_parser(): parser.add_argument( "--output-name", type=str, - default="{basename}", - help="logical name of transcription/translation output files, before --output-format extensions", + default=None, + help=( + "The name of transcription/translation output files before " + "--output-format extensions" + ), ) parser.add_argument( "--output-dir", @@ -207,10 +211,10 @@ def main(): path_or_hf_repo: str = args.pop("model") output_dir: str = args.pop("output_dir") output_format: str = args.pop("output_format") - output_name_template: str = args.pop("output_name") + output_name: str = args.pop("output_name") os.makedirs(output_dir, exist_ok=True) - writer = get_writer(output_format, output_dir, output_name_template) + writer = get_writer(output_format, output_dir) word_options = [ "highlight_words", "max_line_count", @@ -233,13 +237,16 @@ def main(): # receive the contents from stdin rather than read a file audio_obj = audio.load_audio(from_stdin=True) + output_name = output_name or "content" + else: + output_name = output_name or pathlib.Path(audio_obj).stem try: result = transcribe( audio_obj, path_or_hf_repo=path_or_hf_repo, **args, ) - writer(result, audio_obj, **writer_args) + writer(result, output_name, **writer_args) except Exception as e: traceback.print_exc() print(f"Skipping {audio_obj} due to {type(e).__name__}: {str(e)}") diff --git a/whisper/mlx_whisper/writers.py b/whisper/mlx_whisper/writers.py index cbfe1f66..cdb35063 100644 --- a/whisper/mlx_whisper/writers.py +++ b/whisper/mlx_whisper/writers.py @@ -37,22 +37,13 @@ def get_start(segments: List[dict]) -> Optional[float]: class ResultWriter: extension: str - def __init__(self, output_dir: str, output_name_template: str): + def __init__(self, output_dir: str): self.output_dir = output_dir - self.output_name_template = output_name_template def __call__( - self, result: dict, audio_obj: str, options: Optional[dict] = None, **kwargs + self, result: dict, output_name: str, options: Optional[dict] = None, **kwargs ): - if isinstance(audio_obj, (str, pathlib.Path)): - basename = pathlib.Path(audio_obj).stem - else: - # mx.array, np.ndarray, etc - basename = "content" - - output_basename = self.output_name_template.format(basename=basename) - - output_path = (pathlib.Path(self.output_dir) / output_basename).with_suffix( + output_path = (pathlib.Path(self.output_dir) / output_name).with_suffix( f".{self.extension}" ) @@ -253,7 +244,7 @@ class WriteJSON(ResultWriter): def get_writer( - output_format: str, output_dir: str, output_name_template: str + output_format: str, output_dir: str ) -> Callable[[dict, TextIO, dict], None]: writers = { "txt": WriteTXT, @@ -264,9 +255,7 @@ def get_writer( } if output_format == "all": - all_writers = [ - writer(output_dir, output_name_template) for writer in writers.values() - ] + all_writers = [writer(output_dir) for writer in writers.values()] def write_all( result: dict, file: TextIO, options: Optional[dict] = None, **kwargs @@ -276,4 +265,4 @@ def get_writer( return write_all - return writers[output_format](output_dir, output_name_template) + return writers[output_format](output_dir) diff --git a/whisper/test_cli.sh b/whisper/test_cli.sh deleted file mode 100755 index 4e34a217..00000000 --- a/whisper/test_cli.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/zsh -e - -set -o err_exit - -TEST_AUDIO="mlx_whisper/assets/ls_test.flac" -TEST_OUTPUT_DIR=$(mktemp -d -t mlx_whisper_cli_test) - -# the control output - cli called with audio position arg -# expected output file name is ls_test.json -TEST_OUTPUT_NAME_FOR_ALL="--output-name arg is used for all output formats" -mlx_whisper "$TEST_AUDIO" \ - --output-dir "$TEST_OUTPUT_DIR" \ - --output-format all \ - --output-name '{basename}_transcribed' \ - --temperature 0 \ - --verbose=False -if /bin/ls ${TEST_OUTPUT_DIR}/ls_test_transcribed.{json,srt,tsv,txt,vtt} > /dev/null; then - echo "[PASS] $TEST_OUTPUT_NAME_FOR_ALL" -else - echo "[FAIL] $TEST_OUTPUT_NAME_FOR_ALL" -fi - - -TEST_OUTPUT_NAME_TEMPLATE="testing the output name template usage scenario" -for test_val in $(seq 10 10 60); do - mlx_whisper "$TEST_AUDIO" \ - --output-name "{basename}_mwpl_${test_val}" \ - --output-dir "$TEST_OUTPUT_DIR" \ - --output-format srt \ - --max-words-per-line $test_val \ - --word-timestamps True \ - --verbose=False - TEST_DESC="testing output name template while varying --max-words-per-line=${test_val}" - if /bin/ls $TEST_OUTPUT_DIR/ls_test_mwpl_${test_val}.srt > /dev/null; then - echo "[PASS] $TEST_DESC" - else - echo "[FAIL] $TEST_DESC" - fi -done - - -TEST_STDIN_1="mlx_whisper produces identical output whether provided audio arg or stdin of same content" -/bin/cat "$TEST_AUDIO" | mlx_whisper - \ - --output-dir "$TEST_OUTPUT_DIR" \ - --output-format json \ - --temperature 0 \ - --verbose=False -if diff "${TEST_OUTPUT_DIR}/content.json" "${TEST_OUTPUT_DIR}/ls_test_transcribed.json"; then - echo "[PASS] $TEST_STDIN_1" -else - echo "[FAIL] $TEST_STDIN_1" - echo "Check unexpected output in ${TEST_OUTPUT_DIR}" -fi - -TEST_STDIN_2="mlx_whisper produces identical output when stdin comes via: cmd < file"、 -mlx_whisper - \ - --output-name '{basename}_transcribed' \ - --output-dir "$TEST_OUTPUT_DIR" \ - --output-format tsv \ - --temperature 0 \ - --verbose=False < "$TEST_AUDIO" -if diff "${TEST_OUTPUT_DIR}/content_transcribed.tsv" "${TEST_OUTPUT_DIR}/ls_test_transcribed.tsv"; then - echo "[PASS] $TEST_STDIN_2" -else - echo "[FAIL] $TEST_STDIN_2" - echo "Check unexpected output in ${TEST_OUTPUT_DIR}" -fi - -echo "Outputs can be verified in ${TEST_OUTPUT_DIR}"