help="Language spoken in the audio, specify None to auto-detect",
)
# --- Decoding / sampling options -------------------------------------------
parser.add_argument(
    "--temperature",
    type=float,
    default=0,
    help="Temperature for sampling",
)
parser.add_argument(
    "--best-of",
    type=optional_int,
    default=5,
    help="Number of candidates when sampling with non-zero temperature",
)
parser.add_argument(
    "--patience",
    type=float,
    default=None,
    help="Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search",
)
parser.add_argument(
    "--length-penalty",
    type=float,
    default=None,
    help="Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default.",
)
parser.add_argument(
    "--suppress-tokens",
    type=str,
    default="-1",
    help="Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations",
)
parser.add_argument(
    "--initial-prompt",
    type=str,
    default=None,
    help="Optional text to provide as a prompt for the first window.",
)
parser.add_argument(
    "--condition-on-previous-text",
    type=str2bool,
    default=True,
    help="If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop",
)
parser.add_argument(
    "--fp16",
    type=str2bool,
    default=True,
    help="Whether to perform inference in fp16",
)

# --- Decoding-failure thresholds --------------------------------------------
parser.add_argument(
    "--compression-ratio-threshold",
    type=optional_float,
    default=2.4,
    help="If the gzip compression ratio is higher than this value, treat the decoding as failed",
)
parser.add_argument(
    "--logprob-threshold",
    type=optional_float,
    default=-1.0,
    help="If the average log probability is lower than this value, treat the decoding as failed",
)
parser.add_argument(
    "--no-speech-threshold",
    type=optional_float,
    default=0.6,
    help="If the probability of the no-speech token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence",
)

# --- Word-level timestamp options -------------------------------------------
# NOTE: the help texts below spell option names with hyphens, exactly as they
# are registered here; argparse does not accept the underscore spellings
# (e.g. `--word_timestamps`) on the command line.
parser.add_argument(
    "--word-timestamps",
    type=str2bool,
    default=False,
    help="Extract word-level timestamps and refine the results based on them",
)
parser.add_argument(
    "--prepend-punctuations",
    type=str,
    default="\"'“¿([{-",
    help="If --word-timestamps is True, merge these punctuation symbols with the next word",
)
parser.add_argument(
    "--append-punctuations",
    type=str,
    default="\"'.。,,!!??::”)]}、",
    help="If --word-timestamps is True, merge these punctuation symbols with the previous word",
)
parser.add_argument(
    "--highlight-words",
    type=str2bool,
    default=False,
    help="(requires --word-timestamps True) underline each word as it is spoken in srt and vtt",
)
parser.add_argument(
    "--max-line-width",
    type=int,
    default=None,
    help="(requires --word-timestamps True) the maximum number of characters in a line before breaking the line",
)
parser.add_argument(
    "--max-line-count",
    type=int,
    default=None,
    help="(requires --word-timestamps True) the maximum number of lines in a segment",
)
parser.add_argument(
    "--max-words-per-line",
    type=int,
    default=None,
    help="(requires --word-timestamps True, no effect with --max-line-width) the maximum number of words in a segment",
)
parser.add_argument(
    "--hallucination-silence-threshold",
    type=optional_float,
    default=None,  # explicit; argparse would default to None anyway
    help="(requires --word-timestamps True) skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected",
)
parser.add_argument(
"--clip-timestamps",
type=str,
default="0",
help="Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process, where the last end timestamp defaults to the end of the file",