[Whisper] Large-v3 requires 128 Mel frequency bins (#193)

* Large-v3 requires 128 Mel frequency bins

* extract correct model dimensions and use argparse

* format

* format

---------

Co-authored-by: Awni Hannun <awni@apple.com>
This commit is contained in:
Dimo 2023-12-28 22:50:35 +01:00 committed by GitHub
parent e1e56a625b
commit 07c163d9d9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,5 +1,5 @@
# Copyright © 2023 Apple Inc. # Copyright © 2023 Apple Inc.
import argparse
import sys import sys
import time import time
@ -10,6 +10,22 @@ from whisper import audio, decoding, load_models, transcribe
audio_file = "whisper/assets/ls_test.flac" audio_file = "whisper/assets/ls_test.flac"
def parse_arguments(argv=None):
    """Parse the benchmark script's command-line options.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so
            existing zero-argument calls behave exactly as before.

    Returns:
        An ``argparse.Namespace`` with two attributes:
        ``all`` (bool, ``True`` when ``--all`` was given) and
        ``models`` (the raw comma-separated string passed to
        ``-m/--models``, or ``None`` when the option is absent).
    """
    parser = argparse.ArgumentParser(description="Benchmark script.")
    parser.add_argument(
        "--all",
        action="store_true",
        help="Use all available models, i.e. tiny,small,medium,large-v3",
    )
    parser.add_argument(
        "-m",
        "--models",
        type=str,
        help="Specify models as a comma-separated list (e.g., tiny,small,medium)",
    )
    # Passing argv through keeps the parser usable from tests and other
    # entry points without mutating the process-wide sys.argv.
    return parser.parse_args(argv)
def timer(fn, *args): def timer(fn, *args):
for _ in range(5): for _ in range(5):
fn(*args) fn(*args)
@ -23,10 +39,10 @@ def timer(fn, *args):
return (toc - tic) / num_its return (toc - tic) / num_its
def feats(): def feats(n_mels: int = 80):
data = audio.load_audio(audio_file) data = audio.load_audio(audio_file)
data = audio.pad_or_trim(data) data = audio.pad_or_trim(data)
mels = audio.log_mel_spectrogram(data) mels = audio.log_mel_spectrogram(data, n_mels)
mx.eval(mels) mx.eval(mels)
return mels return mels
@ -46,20 +62,20 @@ def everything(model_name):
if __name__ == "__main__": if __name__ == "__main__":
args = parse_arguments()
# get command line arguments without 3rd party libraries if args.all:
# the command line argument to benchmark all models is "all" models = ["tiny", "small", "medium", "large-v3"]
elif args.models:
models = args.models.split(",")
else:
models = ["tiny"] models = ["tiny"]
if len(sys.argv) > 1:
if sys.argv[1] == "--all": print("Selected models:", models)
models = ["tiny", "small", "medium", "large"]
feat_time = timer(feats) feat_time = timer(feats)
print(f"\nFeature time {feat_time:.3f}") print(f"\nFeature time {feat_time:.3f}")
mels = feats()[None].astype(mx.float16)
for model_name in models: for model_name in models:
print(f"\nModel: {model_name.upper()}") print(f"\nModel: {model_name.upper()}")
tokens = mx.array( tokens = mx.array(
[ [
@ -95,6 +111,7 @@ if __name__ == "__main__":
mx.int32, mx.int32,
)[None] )[None]
model = load_models.load_model(f"{model_name}", dtype=mx.float16) model = load_models.load_model(f"{model_name}", dtype=mx.float16)
mels = feats(model.dims.n_mels)[None].astype(mx.float16)
model_forward_time = timer(model_forward, model, mels, tokens) model_forward_time = timer(model_forward, model, mels, tokens)
print(f"Model forward time {model_forward_time:.3f}") print(f"Model forward time {model_forward_time:.3f}")
decode_time = timer(decode, model, mels) decode_time = timer(decode, model, mels)