This commit is contained in:
Awni Hannun
2024-09-23 11:38:00 -07:00
parent f3c6ed99c8
commit 4111473f9d
3 changed files with 3 additions and 4 deletions

View File

@@ -27,6 +27,7 @@ Some more useful examples are listed below.
### Audio Models
- Speech recognition with [OpenAI's Whisper](whisper).
- Audio compression and generation with [Meta's EnCodec](encodec).
### Multimodal models

View File

@@ -474,7 +474,6 @@ class EncodecModel(nn.Module):
self.encoder = EncodecEncoder(config) self.encoder = EncodecEncoder(config)
self.decoder = EncodecDecoder(config) self.decoder = EncodecDecoder(config)
self.quantizer = EncodecResidualVectorQuantizer(config) self.quantizer = EncodecResidualVectorQuantizer(config)
self.bits_per_codebook = int(math.log2(self.config.codebook_size))
def _encode_frame( def _encode_frame(
self, input_values: mx.array, bandwidth: float, padding_mask: mx.array self, input_values: mx.array, bandwidth: float, padding_mask: mx.array
@@ -527,8 +526,8 @@ class EncodecModel(nn.Module):
A list of frames containing the discrete encoded codes for the
input audio waveform, along with rescaling factors for each chunk
when ``config.normalize==True``. Each frame is a tuple ``(codebook,
scale)``, with ``codebook`` of shape ``[batch_size, num_codebooks, scale)``, with ``codebook`` of shape ``(batch_size, num_codebooks,
frames]``. frames)``.
""" """
if bandwidth is None: if bandwidth is None:

View File

@@ -2,7 +2,6 @@
import functools import functools
import json import json
import math
from pathlib import Path from pathlib import Path
from types import SimpleNamespace from types import SimpleNamespace
from typing import List, Optional, Union from typing import List, Optional, Union