From d9d0777c2ea4eb1bf82377375282f5a980d19aab Mon Sep 17 00:00:00 2001 From: Awni Hannun Date: Wed, 10 Jan 2024 14:14:38 -0800 Subject: [PATCH] docs up --- .../_sources/examples/llama-inference.rst | 2 +- docs/build/html/_sources/index.rst | 5 +- docs/build/html/_sources/install.rst | 24 + .../python/_autosummary/mlx.core.array.rst | 1 + .../python/_autosummary/mlx.core.divmod.rst | 6 + .../python/_autosummary/mlx.core.inner.rst | 6 + .../_autosummary/mlx.core.logical_and.rst | 6 + .../_autosummary/mlx.core.logical_or.rst | 6 + .../python/_autosummary/mlx.core.outer.rst | 6 + .../_autosummary/mlx.core.save_gguf.rst | 6 + .../build/html/_sources/python/data_types.rst | 4 +- .../mlx.nn.losses.cosine_similarity_loss.rst | 8 + docs/build/html/_sources/python/nn/losses.rst | 3 +- docs/build/html/_sources/python/ops.rst | 6 + docs/build/html/_sources/usage/indexing.rst | 123 +++ .../html/_sources/usage/lazy_evaluation.rst | 144 +++ docs/build/html/_sources/usage/numpy.rst | 5 + .../build/html/_sources/usage/quick_start.rst | 3 + .../_sources/usage/saving_and_loading.rst | 81 ++ docs/build/html/cpp/ops.html | 12 +- docs/build/html/dev/extensions.html | 482 +++++----- .../html/examples/linear_regression.html | 18 +- docs/build/html/examples/llama-inference.html | 54 +- docs/build/html/examples/mlp.html | 12 +- docs/build/html/genindex.html | 40 +- docs/build/html/index.html | 17 +- docs/build/html/install.html | 77 +- docs/build/html/objects.inv | Bin 6489 -> 6886 bytes .../python/_autosummary/mlx.core.Device.html | 12 +- .../python/_autosummary/mlx.core.Dtype.html | 12 +- .../python/_autosummary/mlx.core.Stream.html | 12 +- .../python/_autosummary/mlx.core.abs.html | 12 +- .../python/_autosummary/mlx.core.add.html | 12 +- .../python/_autosummary/mlx.core.all.html | 12 +- .../_autosummary/mlx.core.allclose.html | 12 +- .../python/_autosummary/mlx.core.any.html | 12 +- .../python/_autosummary/mlx.core.arange.html | 12 +- .../python/_autosummary/mlx.core.arccos.html | 12 +- 
.../python/_autosummary/mlx.core.arccosh.html | 12 +- .../python/_autosummary/mlx.core.arcsin.html | 12 +- .../python/_autosummary/mlx.core.arcsinh.html | 12 +- .../python/_autosummary/mlx.core.arctan.html | 12 +- .../python/_autosummary/mlx.core.arctanh.html | 12 +- .../python/_autosummary/mlx.core.argmax.html | 12 +- .../python/_autosummary/mlx.core.argmin.html | 12 +- .../_autosummary/mlx.core.argpartition.html | 12 +- .../python/_autosummary/mlx.core.argsort.html | 12 +- .../python/_autosummary/mlx.core.array.T.html | 12 +- .../_autosummary/mlx.core.array.abs.html | 12 +- .../_autosummary/mlx.core.array.all.html | 12 +- .../_autosummary/mlx.core.array.any.html | 12 +- .../_autosummary/mlx.core.array.argmax.html | 12 +- .../_autosummary/mlx.core.array.argmin.html | 12 +- .../_autosummary/mlx.core.array.astype.html | 12 +- .../_autosummary/mlx.core.array.cos.html | 12 +- .../_autosummary/mlx.core.array.dtype.html | 12 +- .../_autosummary/mlx.core.array.exp.html | 12 +- .../python/_autosummary/mlx.core.array.html | 27 +- .../_autosummary/mlx.core.array.item.html | 12 +- .../_autosummary/mlx.core.array.log.html | 12 +- .../_autosummary/mlx.core.array.log1p.html | 12 +- .../mlx.core.array.logsumexp.html | 12 +- .../_autosummary/mlx.core.array.max.html | 12 +- .../_autosummary/mlx.core.array.mean.html | 12 +- .../_autosummary/mlx.core.array.min.html | 12 +- .../_autosummary/mlx.core.array.ndim.html | 12 +- .../_autosummary/mlx.core.array.prod.html | 12 +- .../mlx.core.array.reciprocal.html | 12 +- .../_autosummary/mlx.core.array.reshape.html | 12 +- .../_autosummary/mlx.core.array.round.html | 12 +- .../_autosummary/mlx.core.array.rsqrt.html | 12 +- .../_autosummary/mlx.core.array.shape.html | 12 +- .../_autosummary/mlx.core.array.sin.html | 12 +- .../_autosummary/mlx.core.array.size.html | 12 +- .../_autosummary/mlx.core.array.split.html | 12 +- .../_autosummary/mlx.core.array.sqrt.html | 12 +- .../_autosummary/mlx.core.array.square.html | 12 +- 
.../_autosummary/mlx.core.array.sum.html | 12 +- .../_autosummary/mlx.core.array.tolist.html | 12 +- .../mlx.core.array.transpose.html | 12 +- .../_autosummary/mlx.core.array.var.html | 12 +- .../_autosummary/mlx.core.array_equal.html | 12 +- .../_autosummary/mlx.core.broadcast_to.html | 12 +- .../python/_autosummary/mlx.core.ceil.html | 12 +- .../python/_autosummary/mlx.core.clip.html | 12 +- .../_autosummary/mlx.core.concatenate.html | 12 +- .../python/_autosummary/mlx.core.conv1d.html | 12 +- .../python/_autosummary/mlx.core.conv2d.html | 12 +- .../_autosummary/mlx.core.convolve.html | 12 +- .../python/_autosummary/mlx.core.cos.html | 12 +- .../python/_autosummary/mlx.core.cosh.html | 12 +- .../_autosummary/mlx.core.default_device.html | 12 +- .../_autosummary/mlx.core.default_stream.html | 12 +- .../_autosummary/mlx.core.dequantize.html | 12 +- .../python/_autosummary/mlx.core.divide.html | 18 +- .../python/_autosummary/mlx.core.divmod.html | 756 +++++++++++++++ .../python/_autosummary/mlx.core.equal.html | 18 +- .../python/_autosummary/mlx.core.erf.html | 12 +- .../python/_autosummary/mlx.core.erfinv.html | 12 +- .../python/_autosummary/mlx.core.eval.html | 23 +- .../python/_autosummary/mlx.core.exp.html | 12 +- .../_autosummary/mlx.core.expand_dims.html | 12 +- .../python/_autosummary/mlx.core.eye.html | 12 +- .../python/_autosummary/mlx.core.fft.fft.html | 12 +- .../_autosummary/mlx.core.fft.fft2.html | 12 +- .../_autosummary/mlx.core.fft.fftn.html | 12 +- .../_autosummary/mlx.core.fft.ifft.html | 12 +- .../_autosummary/mlx.core.fft.ifft2.html | 12 +- .../_autosummary/mlx.core.fft.ifftn.html | 12 +- .../_autosummary/mlx.core.fft.irfft.html | 12 +- .../_autosummary/mlx.core.fft.irfft2.html | 12 +- .../_autosummary/mlx.core.fft.irfftn.html | 12 +- .../_autosummary/mlx.core.fft.rfft.html | 12 +- .../_autosummary/mlx.core.fft.rfft2.html | 12 +- .../_autosummary/mlx.core.fft.rfftn.html | 12 +- .../python/_autosummary/mlx.core.flatten.html | 12 +- 
.../python/_autosummary/mlx.core.floor.html | 12 +- .../_autosummary/mlx.core.floor_divide.html | 12 +- .../python/_autosummary/mlx.core.full.html | 12 +- .../python/_autosummary/mlx.core.grad.html | 12 +- .../python/_autosummary/mlx.core.greater.html | 12 +- .../_autosummary/mlx.core.greater_equal.html | 12 +- .../_autosummary/mlx.core.identity.html | 18 +- .../python/_autosummary/mlx.core.inner.html | 753 +++++++++++++++ .../python/_autosummary/mlx.core.jvp.html | 12 +- .../python/_autosummary/mlx.core.less.html | 18 +- .../_autosummary/mlx.core.less_equal.html | 12 +- .../_autosummary/mlx.core.linalg.norm.html | 12 +- .../_autosummary/mlx.core.linspace.html | 12 +- .../python/_autosummary/mlx.core.load.html | 20 +- .../python/_autosummary/mlx.core.log.html | 12 +- .../python/_autosummary/mlx.core.log10.html | 12 +- .../python/_autosummary/mlx.core.log1p.html | 12 +- .../python/_autosummary/mlx.core.log2.html | 12 +- .../_autosummary/mlx.core.logaddexp.html | 12 +- .../_autosummary/mlx.core.logical_and.html | 753 +++++++++++++++ .../_autosummary/mlx.core.logical_not.html | 18 +- .../_autosummary/mlx.core.logical_or.html | 753 +++++++++++++++ .../_autosummary/mlx.core.logsumexp.html | 18 +- .../python/_autosummary/mlx.core.matmul.html | 12 +- .../python/_autosummary/mlx.core.max.html | 12 +- .../python/_autosummary/mlx.core.maximum.html | 12 +- .../python/_autosummary/mlx.core.mean.html | 12 +- .../python/_autosummary/mlx.core.min.html | 12 +- .../python/_autosummary/mlx.core.minimum.html | 12 +- .../_autosummary/mlx.core.moveaxis.html | 12 +- .../_autosummary/mlx.core.multiply.html | 12 +- .../_autosummary/mlx.core.negative.html | 12 +- .../_autosummary/mlx.core.new_stream.html | 12 +- .../python/_autosummary/mlx.core.ones.html | 12 +- .../_autosummary/mlx.core.ones_like.html | 18 +- .../python/_autosummary/mlx.core.outer.html | 753 +++++++++++++++ .../python/_autosummary/mlx.core.pad.html | 12 +- .../_autosummary/mlx.core.partition.html | 18 +- 
.../python/_autosummary/mlx.core.prod.html | 12 +- .../_autosummary/mlx.core.quantize.html | 12 +- .../mlx.core.quantized_matmul.html | 12 +- .../mlx.core.random.bernoulli.html | 12 +- .../mlx.core.random.categorical.html | 12 +- .../_autosummary/mlx.core.random.gumbel.html | 12 +- .../_autosummary/mlx.core.random.key.html | 12 +- .../_autosummary/mlx.core.random.normal.html | 12 +- .../_autosummary/mlx.core.random.randint.html | 12 +- .../_autosummary/mlx.core.random.seed.html | 12 +- .../_autosummary/mlx.core.random.split.html | 12 +- .../mlx.core.random.truncated_normal.html | 12 +- .../_autosummary/mlx.core.random.uniform.html | 12 +- .../_autosummary/mlx.core.reciprocal.html | 12 +- .../python/_autosummary/mlx.core.repeat.html | 12 +- .../python/_autosummary/mlx.core.reshape.html | 12 +- .../python/_autosummary/mlx.core.round.html | 12 +- .../python/_autosummary/mlx.core.rsqrt.html | 12 +- .../python/_autosummary/mlx.core.save.html | 17 +- .../_autosummary/mlx.core.save_gguf.html | 749 +++++++++++++++ .../mlx.core.save_safetensors.html | 28 +- .../python/_autosummary/mlx.core.savez.html | 12 +- .../mlx.core.savez_compressed.html | 18 +- .../mlx.core.set_default_device.html | 12 +- .../mlx.core.set_default_stream.html | 12 +- .../python/_autosummary/mlx.core.sigmoid.html | 12 +- .../python/_autosummary/mlx.core.sign.html | 12 +- .../_autosummary/mlx.core.simplify.html | 12 +- .../python/_autosummary/mlx.core.sin.html | 12 +- .../python/_autosummary/mlx.core.sinh.html | 12 +- .../python/_autosummary/mlx.core.softmax.html | 12 +- .../python/_autosummary/mlx.core.sort.html | 12 +- .../python/_autosummary/mlx.core.split.html | 12 +- .../python/_autosummary/mlx.core.sqrt.html | 12 +- .../python/_autosummary/mlx.core.square.html | 12 +- .../python/_autosummary/mlx.core.squeeze.html | 12 +- .../python/_autosummary/mlx.core.stack.html | 12 +- .../_autosummary/mlx.core.stop_gradient.html | 12 +- .../_autosummary/mlx.core.subtract.html | 12 +- 
.../python/_autosummary/mlx.core.sum.html | 12 +- .../_autosummary/mlx.core.swapaxes.html | 12 +- .../python/_autosummary/mlx.core.take.html | 12 +- .../mlx.core.take_along_axis.html | 12 +- .../python/_autosummary/mlx.core.tan.html | 12 +- .../python/_autosummary/mlx.core.tanh.html | 12 +- .../_autosummary/mlx.core.tensordot.html | 12 +- .../_autosummary/mlx.core.transpose.html | 12 +- .../python/_autosummary/mlx.core.tri.html | 12 +- .../python/_autosummary/mlx.core.tril.html | 12 +- .../python/_autosummary/mlx.core.triu.html | 12 +- .../_autosummary/mlx.core.value_and_grad.html | 12 +- .../python/_autosummary/mlx.core.var.html | 12 +- .../python/_autosummary/mlx.core.vjp.html | 12 +- .../python/_autosummary/mlx.core.vmap.html | 12 +- .../python/_autosummary/mlx.core.where.html | 12 +- .../python/_autosummary/mlx.core.zeros.html | 12 +- .../_autosummary/mlx.core.zeros_like.html | 12 +- .../_autosummary/mlx.nn.value_and_grad.html | 12 +- .../_autosummary/mlx.optimizers.AdaDelta.html | 12 +- .../_autosummary/mlx.optimizers.Adagrad.html | 12 +- .../_autosummary/mlx.optimizers.Adam.html | 12 +- .../_autosummary/mlx.optimizers.AdamW.html | 12 +- .../_autosummary/mlx.optimizers.Adamax.html | 12 +- .../_autosummary/mlx.optimizers.Lion.html | 12 +- .../mlx.optimizers.Optimizer.html | 12 +- .../mlx.optimizers.OptimizerState.html | 12 +- .../_autosummary/mlx.optimizers.RMSprop.html | 12 +- .../_autosummary/mlx.optimizers.SGD.html | 12 +- .../_autosummary/mlx.utils.tree_flatten.html | 12 +- .../_autosummary/mlx.utils.tree_map.html | 12 +- .../mlx.utils.tree_unflatten.html | 12 +- docs/build/html/python/array.html | 12 +- docs/build/html/python/data_types.html | 16 +- .../html/python/devices_and_streams.html | 12 +- docs/build/html/python/fft.html | 12 +- docs/build/html/python/linalg.html | 12 +- docs/build/html/python/nn.html | 19 +- .../python/nn/_autosummary/mlx.nn.ALiBi.html | 12 +- .../nn/_autosummary/mlx.nn.BatchNorm.html | 12 +- 
.../python/nn/_autosummary/mlx.nn.Conv1d.html | 12 +- .../python/nn/_autosummary/mlx.nn.Conv2d.html | 12 +- .../nn/_autosummary/mlx.nn.Dropout.html | 12 +- .../nn/_autosummary/mlx.nn.Dropout2d.html | 12 +- .../nn/_autosummary/mlx.nn.Dropout3d.html | 12 +- .../nn/_autosummary/mlx.nn.Embedding.html | 12 +- .../python/nn/_autosummary/mlx.nn.GELU.html | 12 +- .../nn/_autosummary/mlx.nn.GroupNorm.html | 12 +- .../nn/_autosummary/mlx.nn.InstanceNorm.html | 12 +- .../nn/_autosummary/mlx.nn.LayerNorm.html | 12 +- .../python/nn/_autosummary/mlx.nn.Linear.html | 12 +- .../python/nn/_autosummary/mlx.nn.Mish.html | 12 +- .../nn/_autosummary/mlx.nn.Module.apply.html | 12 +- .../mlx.nn.Module.apply_to_modules.html | 12 +- .../_autosummary/mlx.nn.Module.children.html | 12 +- .../nn/_autosummary/mlx.nn.Module.eval.html | 12 +- .../mlx.nn.Module.filter_and_map.html | 12 +- .../nn/_autosummary/mlx.nn.Module.freeze.html | 12 +- .../mlx.nn.Module.leaf_modules.html | 12 +- .../mlx.nn.Module.load_weights.html | 12 +- .../_autosummary/mlx.nn.Module.modules.html | 12 +- .../mlx.nn.Module.named_modules.html | 12 +- .../mlx.nn.Module.parameters.html | 12 +- .../mlx.nn.Module.save_weights.html | 12 +- .../nn/_autosummary/mlx.nn.Module.train.html | 12 +- .../mlx.nn.Module.trainable_parameters.html | 12 +- .../_autosummary/mlx.nn.Module.training.html | 12 +- .../_autosummary/mlx.nn.Module.unfreeze.html | 12 +- .../nn/_autosummary/mlx.nn.Module.update.html | 12 +- .../mlx.nn.Module.update_modules.html | 12 +- .../mlx.nn.MultiHeadAttention.html | 12 +- .../python/nn/_autosummary/mlx.nn.PReLU.html | 12 +- .../_autosummary/mlx.nn.QuantizedLinear.html | 12 +- .../nn/_autosummary/mlx.nn.RMSNorm.html | 12 +- .../python/nn/_autosummary/mlx.nn.ReLU.html | 12 +- .../python/nn/_autosummary/mlx.nn.RoPE.html | 34 +- .../python/nn/_autosummary/mlx.nn.SELU.html | 12 +- .../nn/_autosummary/mlx.nn.Sequential.html | 12 +- .../python/nn/_autosummary/mlx.nn.SiLU.html | 12 +- 
.../mlx.nn.SinusoidalPositionalEncoding.html | 12 +- .../python/nn/_autosummary/mlx.nn.Step.html | 12 +- .../nn/_autosummary/mlx.nn.Transformer.html | 12 +- .../_autosummary_functions/mlx.nn.gelu.html | 12 +- .../mlx.nn.gelu_approx.html | 12 +- .../mlx.nn.gelu_fast_approx.html | 12 +- .../mlx.nn.losses.binary_cross_entropy.html | 12 +- .../mlx.nn.losses.cosine_similarity_loss.html | 762 +++++++++++++++ .../mlx.nn.losses.cross_entropy.html | 12 +- .../mlx.nn.losses.hinge_loss.html | 12 +- .../mlx.nn.losses.huber_loss.html | 12 +- .../mlx.nn.losses.kl_div_loss.html | 12 +- .../mlx.nn.losses.l1_loss.html | 12 +- .../mlx.nn.losses.log_cosh_loss.html | 18 +- .../mlx.nn.losses.mse_loss.html | 12 +- .../mlx.nn.losses.nll_loss.html | 12 +- .../mlx.nn.losses.smooth_l1_loss.html | 12 +- .../mlx.nn.losses.triplet_loss.html | 12 +- .../_autosummary_functions/mlx.nn.mish.html | 12 +- .../_autosummary_functions/mlx.nn.prelu.html | 12 +- .../_autosummary_functions/mlx.nn.relu.html | 12 +- .../_autosummary_functions/mlx.nn.selu.html | 12 +- .../_autosummary_functions/mlx.nn.silu.html | 12 +- .../_autosummary_functions/mlx.nn.step.html | 12 +- docs/build/html/python/nn/functions.html | 12 +- docs/build/html/python/nn/layers.html | 12 +- docs/build/html/python/nn/losses.html | 15 +- docs/build/html/python/nn/module.html | 12 +- docs/build/html/python/ops.html | 86 +- docs/build/html/python/optimizers.html | 18 +- docs/build/html/python/random.html | 12 +- docs/build/html/python/transforms.html | 14 +- docs/build/html/python/tree_utils.html | 12 +- docs/build/html/search.html | 12 +- docs/build/html/searchindex.js | 2 +- docs/build/html/usage/indexing.html | 855 +++++++++++++++++ docs/build/html/usage/lazy_evaluation.html | 879 ++++++++++++++++++ docs/build/html/usage/numpy.html | 29 +- docs/build/html/usage/quick_start.html | 19 +- docs/build/html/usage/saving_and_loading.html | 808 ++++++++++++++++ docs/build/html/usage/unified_memory.html | 24 +- 
docs/build/html/usage/using_streams.html | 24 +- 314 files changed, 11870 insertions(+), 691 deletions(-) create mode 100644 docs/build/html/_sources/python/_autosummary/mlx.core.divmod.rst create mode 100644 docs/build/html/_sources/python/_autosummary/mlx.core.inner.rst create mode 100644 docs/build/html/_sources/python/_autosummary/mlx.core.logical_and.rst create mode 100644 docs/build/html/_sources/python/_autosummary/mlx.core.logical_or.rst create mode 100644 docs/build/html/_sources/python/_autosummary/mlx.core.outer.rst create mode 100644 docs/build/html/_sources/python/_autosummary/mlx.core.save_gguf.rst create mode 100644 docs/build/html/_sources/python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.rst create mode 100644 docs/build/html/_sources/usage/indexing.rst create mode 100644 docs/build/html/_sources/usage/lazy_evaluation.rst create mode 100644 docs/build/html/_sources/usage/saving_and_loading.rst create mode 100644 docs/build/html/python/_autosummary/mlx.core.divmod.html create mode 100644 docs/build/html/python/_autosummary/mlx.core.inner.html create mode 100644 docs/build/html/python/_autosummary/mlx.core.logical_and.html create mode 100644 docs/build/html/python/_autosummary/mlx.core.logical_or.html create mode 100644 docs/build/html/python/_autosummary/mlx.core.outer.html create mode 100644 docs/build/html/python/_autosummary/mlx.core.save_gguf.html create mode 100644 docs/build/html/python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.html create mode 100644 docs/build/html/usage/indexing.html create mode 100644 docs/build/html/usage/lazy_evaluation.html create mode 100644 docs/build/html/usage/saving_and_loading.html diff --git a/docs/build/html/_sources/examples/llama-inference.rst b/docs/build/html/_sources/examples/llama-inference.rst index 20019e911..0e080146b 100644 --- a/docs/build/html/_sources/examples/llama-inference.rst +++ b/docs/build/html/_sources/examples/llama-inference.rst @@ -371,7 +371,7 @@ 
Scripts The full example code is available in `mlx-examples`_. -.. _mlx-examples: https://github.com/ml-explore/mlx-examples/tree/main/llama +.. _mlx-examples: https://github.com/ml-explore/mlx-examples/tree/main/llms/llama .. [1] Su, J., Lu, Y., Pan, S., Murtadha, A., Wen, B. and Liu, Y., 2021. Roformer: Enhanced transformer with rotary position embedding. arXiv diff --git a/docs/build/html/_sources/index.rst b/docs/build/html/_sources/index.rst index f1fe468ca..cd3db34b3 100644 --- a/docs/build/html/_sources/index.rst +++ b/docs/build/html/_sources/index.rst @@ -36,9 +36,12 @@ are the CPU and GPU. :maxdepth: 1 usage/quick_start + usage/lazy_evaluation usage/unified_memory - usage/using_streams + usage/indexing + usage/saving_and_loading usage/numpy + usage/using_streams .. toctree:: :caption: Examples diff --git a/docs/build/html/_sources/install.rst b/docs/build/html/_sources/install.rst index 92669ab6e..1883329fb 100644 --- a/docs/build/html/_sources/install.rst +++ b/docs/build/html/_sources/install.rst @@ -48,6 +48,9 @@ Build Requirements - `cmake `_ -- version 3.24 or later, and ``make`` - Xcode >= 14.3 (Xcode >= 15.0 for macOS 14 and above) +.. note:: + Ensure your shell environment is native ``arm``, not ``x86`` via Rosetta. If + the output of ``uname -p`` is ``x86``, see the :ref:`troubleshooting section <build shell>` below. Python API ^^^^^^^^^^ @@ -169,6 +172,7 @@ should point to the path to the built metal library. Troubleshooting ^^^^^^^^^^^^^^^ + Metal not found ~~~~~~~~~~~~~~~ @@ -189,3 +193,23 @@ Then set the active developer directory: .. code-block:: shell sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer + +x86 Shell +~~~~~~~~~ + +.. _build shell: + +If the output of ``uname -p`` is ``x86`` then your shell is running as x86 via +Rosetta instead of natively. + +To fix this, find the application in Finder (``/Applications`` for iTerm, +``/Applications/Utilities`` for Terminal), right-click, and click “Get Info”.
+Uncheck “Open using Rosetta”, close the “Get Info” window, and restart your +terminal. + +Verify the terminal is now running natively with the following command: + +.. code-block:: shell + + $ uname -p + arm diff --git a/docs/build/html/_sources/python/_autosummary/mlx.core.array.rst b/docs/build/html/_sources/python/_autosummary/mlx.core.array.rst index 65b6384cc..af3098cfa 100644 --- a/docs/build/html/_sources/python/_autosummary/mlx.core.array.rst +++ b/docs/build/html/_sources/python/_autosummary/mlx.core.array.rst @@ -62,6 +62,7 @@ .. autosummary:: ~array.T + ~array.at ~array.dtype ~array.itemsize ~array.nbytes diff --git a/docs/build/html/_sources/python/_autosummary/mlx.core.divmod.rst b/docs/build/html/_sources/python/_autosummary/mlx.core.divmod.rst new file mode 100644 index 000000000..15f631e97 --- /dev/null +++ b/docs/build/html/_sources/python/_autosummary/mlx.core.divmod.rst @@ -0,0 +1,6 @@ +mlx.core.divmod +=============== + +.. currentmodule:: mlx.core + +.. autofunction:: divmod \ No newline at end of file diff --git a/docs/build/html/_sources/python/_autosummary/mlx.core.inner.rst b/docs/build/html/_sources/python/_autosummary/mlx.core.inner.rst new file mode 100644 index 000000000..a22b2a30e --- /dev/null +++ b/docs/build/html/_sources/python/_autosummary/mlx.core.inner.rst @@ -0,0 +1,6 @@ +mlx.core.inner +============== + +.. currentmodule:: mlx.core + +.. autofunction:: inner \ No newline at end of file diff --git a/docs/build/html/_sources/python/_autosummary/mlx.core.logical_and.rst b/docs/build/html/_sources/python/_autosummary/mlx.core.logical_and.rst new file mode 100644 index 000000000..64862529e --- /dev/null +++ b/docs/build/html/_sources/python/_autosummary/mlx.core.logical_and.rst @@ -0,0 +1,6 @@ +mlx.core.logical\_and +===================== + +.. currentmodule:: mlx.core + +..
autofunction:: logical_and \ No newline at end of file diff --git a/docs/build/html/_sources/python/_autosummary/mlx.core.logical_or.rst b/docs/build/html/_sources/python/_autosummary/mlx.core.logical_or.rst new file mode 100644 index 000000000..ba1eb496e --- /dev/null +++ b/docs/build/html/_sources/python/_autosummary/mlx.core.logical_or.rst @@ -0,0 +1,6 @@ +mlx.core.logical\_or +==================== + +.. currentmodule:: mlx.core + +.. autofunction:: logical_or \ No newline at end of file diff --git a/docs/build/html/_sources/python/_autosummary/mlx.core.outer.rst b/docs/build/html/_sources/python/_autosummary/mlx.core.outer.rst new file mode 100644 index 000000000..c48566525 --- /dev/null +++ b/docs/build/html/_sources/python/_autosummary/mlx.core.outer.rst @@ -0,0 +1,6 @@ +mlx.core.outer +============== + +.. currentmodule:: mlx.core + +.. autofunction:: outer \ No newline at end of file diff --git a/docs/build/html/_sources/python/_autosummary/mlx.core.save_gguf.rst b/docs/build/html/_sources/python/_autosummary/mlx.core.save_gguf.rst new file mode 100644 index 000000000..55abf70c4 --- /dev/null +++ b/docs/build/html/_sources/python/_autosummary/mlx.core.save_gguf.rst @@ -0,0 +1,6 @@ +mlx.core.save\_gguf +=================== + +.. currentmodule:: mlx.core + +.. 
autofunction:: save_gguf \ No newline at end of file diff --git a/docs/build/html/_sources/python/data_types.rst b/docs/build/html/_sources/python/data_types.rst index cbb5c9a3f..c1b240d86 100644 --- a/docs/build/html/_sources/python/data_types.rst +++ b/docs/build/html/_sources/python/data_types.rst @@ -29,9 +29,9 @@ The default floating point type is ``float32`` and the default integer type is * - ``uint32`` - 4 - 32-bit unsigned integer - * - ``uint32`` + * - ``uint64`` - 8 - - 32-bit unsigned integer + - 64-bit unsigned integer * - ``int8`` - 1 - 8-bit signed integer diff --git a/docs/build/html/_sources/python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.rst b/docs/build/html/_sources/python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.rst new file mode 100644 index 000000000..7970aaca7 --- /dev/null +++ b/docs/build/html/_sources/python/nn/_autosummary_functions/mlx.nn.losses.cosine_similarity_loss.rst @@ -0,0 +1,8 @@ +mlx.nn.losses.cosine\_similarity\_loss +====================================== + +.. currentmodule:: mlx.nn.losses + +.. 
autoclass:: cosine_similarity_loss + + \ No newline at end of file diff --git a/docs/build/html/_sources/python/nn/losses.rst b/docs/build/html/_sources/python/nn/losses.rst index 3fb7589f8..5a80ba947 100644 --- a/docs/build/html/_sources/python/nn/losses.rst +++ b/docs/build/html/_sources/python/nn/losses.rst @@ -19,4 +19,5 @@ Loss Functions triplet_loss hinge_loss huber_loss - log_cosh_loss \ No newline at end of file + log_cosh_loss + cosine_similarity_loss \ No newline at end of file diff --git a/docs/build/html/_sources/python/ops.rst b/docs/build/html/_sources/python/ops.rst index 4e399524e..3dcd3660d 100644 --- a/docs/build/html/_sources/python/ops.rst +++ b/docs/build/html/_sources/python/ops.rst @@ -36,6 +36,7 @@ Operations cosh dequantize divide + divmod equal erf erfinv @@ -49,6 +50,7 @@ Operations greater greater_equal identity + inner less less_equal linspace @@ -59,6 +61,8 @@ Operations log1p logaddexp logical_not + logical_and + logical_or logsumexp matmul max @@ -71,6 +75,7 @@ Operations negative ones ones_like + outer partition pad prod @@ -84,6 +89,7 @@ Operations save savez savez_compressed + save_gguf save_safetensors sigmoid sign diff --git a/docs/build/html/_sources/usage/indexing.rst b/docs/build/html/_sources/usage/indexing.rst new file mode 100644 index 000000000..458541923 --- /dev/null +++ b/docs/build/html/_sources/usage/indexing.rst @@ -0,0 +1,123 @@ +.. _indexing: + +Indexing Arrays +=============== + +.. currentmodule:: mlx.core + +For the most part, indexing an MLX :obj:`array` works the same as indexing a +NumPy :obj:`numpy.ndarray`. See the `NumPy documentation +`_ for more details on +how that works. + +For example, you can use regular integers and slices (:obj:`slice`) to index arrays: + +.. 
code-block:: shell + + >>> arr = mx.arange(10) + >>> arr[3] + array(3, dtype=int32) + >>> arr[-2] # negative indexing works + array(8, dtype=int32) + >>> arr[2:8:2] # start, stop, stride + array([2, 4, 6], dtype=int32) + +For multi-dimensional arrays, the ``...`` or :obj:`Ellipsis` syntax works as in NumPy: + +.. code-block:: shell + + >>> arr = mx.arange(8).reshape(2, 2, 2) + >>> arr[:, :, 0] + array(3, dtype=int32) + array([[0, 2], + [4, 6]], dtype=int32) + >>> arr[..., 0] + array([[0, 2], + [4, 6]], dtype=int32) + +You can index with ``None`` to create a new axis: + +.. code-block:: shell + + >>> arr = mx.arange(8) + >>> arr.shape + [8] + >>> arr[None].shape + [1, 8] + + +You can also use an :obj:`array` to index another :obj:`array`: + +.. code-block:: shell + + >>> arr = mx.arange(10) + >>> idx = mx.array([5, 7]) + >>> arr[idx] + array([5, 7], dtype=int32) + +Mixing and matching integers, :obj:`slice`, ``...``, and :obj:`array` indices +works just as in NumPy. + +Other functions which may be useful for indexing arrays are :func:`take` and +:func:`take_along_axis`. + +Differences from NumPy +---------------------- + +.. Note:: + + MLX indexing is different from NumPy indexing in two important ways: + + * Indexing does not perform bounds checking. Indexing out of bounds is + undefined behavior. + * Boolean mask based indexing is not yet supported. + +The reason for the lack of bounds checking is that exceptions cannot propagate +from the GPU. Performing bounds checking for array indices before launching the +kernel would be extremely inefficient. + +Indexing with boolean masks is something that MLX may support in the future. In +general, MLX has limited support for operations for which output +*shapes* are dependent on input *data*. Other examples of these types of +operations which MLX does not yet support include :func:`numpy.nonzero` and the +single input version of :func:`numpy.where`.
+ +In Place Updates +---------------- + +In place updates to indexed arrays are possible in MLX. For example: + +.. code-block:: shell + + >>> a = mx.array([1, 2, 3]) + >>> a[2] = 0 + >>> a + array([1, 2, 0], dtype=int32) + +Just as in NumPy, in place updates will be reflected in all references to the +same array: + +.. code-block:: shell + + >>> a = mx.array([1, 2, 3]) + >>> b = a + >>> b[2] = 0 + >>> b + array([1, 2, 0], dtype=int32) + >>> a + array([1, 2, 0], dtype=int32) + +Transformations of functions which use in-place updates are allowed and work as +expected. For example: + +.. code-block:: python + + def fun(x, idx): + x[idx] = 2.0 + return x.sum() + + dfdx = mx.grad(fun)(mx.array([1.0, 2.0, 3.0]), mx.array([1])) + print(dfdx) # Prints: array([1, 0, 1], dtype=float32) + +In the above ``dfdx`` will have the correct gradient, namely zeros at ``idx`` +and ones elsewhere. diff --git a/docs/build/html/_sources/usage/lazy_evaluation.rst b/docs/build/html/_sources/usage/lazy_evaluation.rst new file mode 100644 index 000000000..4f14ceeed --- /dev/null +++ b/docs/build/html/_sources/usage/lazy_evaluation.rst @@ -0,0 +1,144 @@ +.. _lazy eval: + +Lazy Evaluation +=============== + +.. currentmodule:: mlx.core + +Why Lazy Evaluation +------------------- + +When you perform operations in MLX, no computation actually happens. Instead a +compute graph is recorded. The actual computation only happens if an +:func:`eval` is performed. + +MLX uses lazy evaluation because it has some nice features, some of which we +describe below. + +Transforming Compute Graphs +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Lazy evaluation lets us record a compute graph without actually doing any +computations. This is useful for function transformations like :func:`grad` and +:func:`vmap` and graph optimizations like :func:`simplify`. + +Currently, MLX does not compile and rerun compute graphs. They are all +generated dynamically.
However, lazy evaluation makes it much easier to +integrate compilation for future performance enhancements. + +Only Compute What You Use +^^^^^^^^^^^^^^^^^^^^^^^^^ + +In MLX you do not need to worry as much about computing outputs that are never +used. For example: + +.. code-block:: python + + def fun(x): + a = fun1(x) + b = expensive_fun(a) + return a, b + + y, _ = fun(x) + +Here, we never actually compute the output of ``expensive_fun``. Use this +pattern with care though, as the graph of ``expensive_fun`` is still built, and +that has some cost associated with it. + +Similarly, lazy evaluation can be beneficial for saving memory while keeping +code simple. Say you have a very large model ``Model`` derived from +:obj:`mlx.nn.Module`. You can instantiate this model with ``model = Model()``. +Typically, this will initialize all of the weights as ``float32``, but the +initialization does not actually compute anything until you perform an +:func:`eval`. If you update the model with ``float16`` weights, your maximum +consumed memory will be half that required if eager computation was used +instead. + +This pattern is simple to do in MLX thanks to lazy computation: + +.. code-block:: python + + model = Model() # no memory used yet + model.load_weights("weights_fp16.safetensors") + +When to Evaluate +---------------- + +A common question is when to use :func:`eval`. The trade-off is between +letting graphs get too large and not batching enough useful work. + +For example: + +.. code-block:: python + + for _ in range(100): + a = a + b + mx.eval(a) + b = b * 2 + mx.eval(b) + +This is a bad idea because there is some fixed overhead with each graph +evaluation. On the other hand, there is some slight overhead which grows with +the compute graph size, so extremely large graphs (while computationally +correct) can be costly.
+ +Luckily, a wide range of compute graph sizes work pretty well with MLX: +anything from a few tens of operations to many thousands of operations per +evaluation should be okay. + +Most numerical computations have an iterative outer loop (e.g. the iteration in +stochastic gradient descent). A natural and usually efficient place to use +:func:`eval` is at each iteration of this outer loop. + +Here is a concrete example: + +.. code-block:: python + + for batch in dataset: + + # Nothing has been evaluated yet + loss, grad = value_and_grad_fn(model, batch) + + # Still nothing has been evaluated + optimizer.update(model, grad) + + # Evaluate the loss and the new parameters which will + # run the full gradient computation and optimizer update + mx.eval(loss, model.parameters()) + + +An important behavior to be aware of is when the graph will be implicitly +evaluated. Anytime you ``print`` an array, convert it to an +:obj:`numpy.ndarray`, or otherwise access its memory via :obj:`memoryview`, +the graph will be evaluated. Saving arrays via :func:`save` (or any other MLX +saving functions) will also evaluate the array. + + +Calling :func:`array.item` on a scalar array will also evaluate it. In the +example above, printing the loss (``print(loss)``) or adding the loss scalar to +a list (``losses.append(loss.item())``) would cause a graph evaluation. If +these lines are before ``mx.eval(loss, model.parameters())`` then this +will be a partial evaluation, computing only the forward pass. + +Also, calling :func:`eval` on an array or set of arrays multiple times is +perfectly fine. This is effectively a no-op. + +.. warning:: + + Using scalar arrays for control-flow will cause an evaluation. + +Here is an example: + +.. code-block:: python + + def fun(x): + h, y = first_layer(x) + if y > 0: # An evaluation is done here! + z = second_layer_a(h) + else: + z = second_layer_b(h) + return z + +Using arrays for control flow should be done with care.
The above example works +and can even be used with gradient transformations. However, this can be very +inefficient if evaluations are done too frequently. diff --git a/docs/build/html/_sources/usage/numpy.rst b/docs/build/html/_sources/usage/numpy.rst index ef075ad0c..1ed801454 100644 --- a/docs/build/html/_sources/usage/numpy.rst +++ b/docs/build/html/_sources/usage/numpy.rst @@ -62,6 +62,11 @@ even though no in-place operations on MLX memory are executed. PyTorch ------- +.. warning:: + + PyTorch Support for :obj:`memoryview` is experimental and can break for + multi-dimensional arrays. Casting to NumPy first is advised for now. + PyTorch supports the buffer protocol, but it requires an explicit :obj:`memoryview`. .. code-block:: python diff --git a/docs/build/html/_sources/usage/quick_start.rst b/docs/build/html/_sources/usage/quick_start.rst index 9ffd29ae6..251f5344c 100644 --- a/docs/build/html/_sources/usage/quick_start.rst +++ b/docs/build/html/_sources/usage/quick_start.rst @@ -40,6 +40,9 @@ automatically evaluate the array. >> np.array(c) # Also evaluates c array([2., 4., 6., 8.], dtype=float32) + +See the page on :ref:`Lazy Evaluation ` for more details. + Function and Graph Transformations ---------------------------------- diff --git a/docs/build/html/_sources/usage/saving_and_loading.rst b/docs/build/html/_sources/usage/saving_and_loading.rst new file mode 100644 index 000000000..895ca342f --- /dev/null +++ b/docs/build/html/_sources/usage/saving_and_loading.rst @@ -0,0 +1,81 @@ +.. _saving_and_loading: + +Saving and Loading Arrays +========================= + +.. currentmodule:: mlx.core + +MLX supports multiple array serialization formats. + +.. 
list-table:: Serialization Formats + :widths: 20 8 25 25 + :header-rows: 1 + + * - Format + - Extension + - Function + - Notes + * - NumPy + - ``.npy`` + - :func:`save` + - Single arrays only + * - NumPy archive + - ``.npz`` + - :func:`savez` and :func:`savez_compressed` + - Multiple arrays + * - Safetensors + - ``.safetensors`` + - :func:`save_safetensors` + - Multiple arrays + * - GGUF + - ``.gguf`` + - :func:`save_gguf` + - Multiple arrays + +The :func:`load` function will load any of the supported serialization +formats. It determines the format from the file extension. The output of +:func:`load` depends on the format. + +Here's an example of saving a single array to a file: + +.. code-block:: shell + + >>> a = mx.array([1.0]) + >>> mx.save("array", a) + +The array ``a`` will be saved in the file ``array.npy`` (notice the extension +is automatically added). Including the extension is optional; if it is missing +it will be added. You can load the array with: + +.. code-block:: shell + + >>> mx.load("array.npy") + array([1], dtype=float32) + +Here's an example of saving several arrays to a single file: + +.. code-block:: shell + + >>> a = mx.array([1.0]) + >>> b = mx.array([2.0]) + >>> mx.savez("arrays", a, b=b) + +For compatibility with :func:`numpy.savez` the MLX :func:`savez` takes arrays +as arguments. If the keywords are missing, then default names will be +provided. This can be loaded with: + +.. code-block:: shell + + >>> mx.load("arrays.npz") + {'b': array([2], dtype=float32), 'arr_0': array([1], dtype=float32)} + +In this case :func:`load` returns a dictionary of names to arrays. + +The functions :func:`save_safetensors` and :func:`save_gguf` are similar to +:func:`savez`, but they take as input a :obj:`dict` of string names to arrays: + +.. 
code-block:: shell + + >>> a = mx.array([1.0]) + >>> b = mx.array([2.0]) + >>> mx.save_safetensors("arrays", {"a": a, "b": b}) diff --git a/docs/build/html/cpp/ops.html b/docs/build/html/cpp/ops.html index dbce60725..e7d6a6bc9 100644 --- a/docs/build/html/cpp/ops.html +++ b/docs/build/html/cpp/ops.html @@ -148,9 +148,12 @@

Usage

Examples

diff --git a/docs/build/html/dev/extensions.html b/docs/build/html/dev/extensions.html index bde17609a..95b0bb07f 100644 --- a/docs/build/html/dev/extensions.html +++ b/docs/build/html/dev/extensions.html @@ -147,9 +147,12 @@

Usage

Examples

@@ -726,33 +736,33 @@ C++ API:

* * Follow numpy style broadcasting between x and y * Inputs are upcasted to floats if needed -**/ -array axpby( +**/ +array axpby( const array& x, // Input array x const array& y, // Input array y const float alpha, // Scaling factor for x const float beta, // Scaling factor for y StreamOrDevice s = {} // Stream on which to schedule the operation -); +);

This operation itself can call other operations within it if needed. So, the simplest way to go about implementing this operation would be to do so in terms of existing operations.

-
array axpby(
+
array axpby(
     const array& x, // Input array x
     const array& y, // Input array y
     const float alpha, // Scaling factor for x
     const float beta, // Scaling factor for y
     StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
-) {
+) {
     // Scale x and y on the provided stream
-    auto ax = multiply(array(alpha), x, s);
-    auto by = multiply(array(beta), y, s);
+    auto ax = multiply(array(alpha), x, s);
+    auto by = multiply(array(beta), y, s);
 
     // Add and return
-    return add(ax, by, s);
-}
+    return add(ax, by, s);
+}
 

However, as we discussed earlier, this is not our goal. The operations themselves @@ -768,10 +778,10 @@ a on the CPU or GPU, and how it acts under transformations such as vjp and jvp. These words on their own can be a bit abstract, so lets take a step back and go to our example to give ourselves a more concrete image.

-
class Axpby : public Primitive {
-  public:
-    explicit Axpby(Stream stream, float alpha, float beta)
-        : Primitive(stream), alpha_(alpha), beta_(beta){};
+
class Axpby : public Primitive {
+  public:
+    explicit Axpby(Stream stream, float alpha, float beta)
+        : Primitive(stream), alpha_(alpha), beta_(beta){};
 
     /**
     * A primitive must know how to evaluate itself on the CPU/GPU
@@ -779,47 +789,47 @@ back and go to our example to give ourselves a more concrete image.

* * To avoid unnecessary allocations, the evaluation function * is responsible for allocating space for the array. - */ - void eval_cpu(const std::vector<array>& inputs, array& out) override; - void eval_gpu(const std::vector<array>& inputs, array& out) override; + */ + void eval_cpu(const std::vector<array>& inputs, array& out) override; + void eval_gpu(const std::vector<array>& inputs, array& out) override; - /** The Jacobian-vector product. */ - array jvp( - const std::vector<array>& primals, - const std::vector<array>& tangents, - const std::vector<int>& argnums) override; + /** The Jacobian-vector product. */ + array jvp( + const std::vector<array>& primals, + const std::vector<array>& tangents, + const std::vector<int>& argnums) override; - /** The vector-Jacobian product. */ - std::vector<array> vjp( - const std::vector<array>& primals, - const array& cotan, - const std::vector<int>& argnums) override; + /** The vector-Jacobian product. */ + std::vector<array> vjp( + const std::vector<array>& primals, + const array& cotan, + const std::vector<int>& argnums) override; /** * The primitive must know how to vectorize itself across * the given axes. The output is a pair containing the array * representing the vectorized computation and the axis which * corresponds to the output vectorized dimension. - */ - std::pair<array, int> vmap( - const std::vector<array>& inputs, - const std::vector<int>& axes) override; + */ + std::pair<array, int> vmap( + const std::vector<array>& inputs, + const std::vector<int>& axes) override; - /** Print the primitive. */ - void print(std::ostream& os) override { - os << "Axpby"; - } + /** Print the primitive. 
*/ + void print(std::ostream& os) override { + os << "Axpby"; + } - /** Equivalence check **/ - bool is_equivalent(const Primitive& other) const override; + /** Equivalence check **/ + bool is_equivalent(const Primitive& other) const override; - private: - float alpha_; - float beta_; + private: + float alpha_; + float beta_; - /** Fall back implementation for evaluation on CPU */ - void eval(const std::vector<array>& inputs, array& out); -}; + /** Fall back implementation for evaluation on CPU */ + void eval(const std::vector<array>& inputs, array& out); +};

The Axpby class derives from the base Primitive class and @@ -836,38 +846,38 @@ the computation graph. An Primitive that computes it, and the array inputs that are passed to the primitive.

Let’s re-implement our operation now in terms of our Axpby primitive.

-
array axpby(
+
array axpby(
     const array& x, // Input array x
     const array& y, // Input array y
     const float alpha, // Scaling factor for x
     const float beta, // Scaling factor for y
     StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
-) {
+) {
     // Promote dtypes between x and y as needed
-    auto promoted_dtype = promote_types(x.dtype(), y.dtype());
+    auto promoted_dtype = promote_types(x.dtype(), y.dtype());
 
     // Upcast to float32 for non-floating point inputs x and y
-    auto out_dtype = is_floating_point(promoted_dtype)
-        ? promoted_dtype
-        : promote_types(promoted_dtype, float32);
+    auto out_dtype = is_floating_point(promoted_dtype)
+        ? promoted_dtype
+        : promote_types(promoted_dtype, float32);
 
     // Cast x and y up to the determined dtype (on the same stream s)
-    auto x_casted = astype(x, out_dtype, s);
-    auto y_casted = astype(y, out_dtype, s);
+    auto x_casted = astype(x, out_dtype, s);
+    auto y_casted = astype(y, out_dtype, s);
 
     // Broadcast the shapes of x and y (on the same stream s)
-    auto broadcasted_inputs = broadcast_arrays({x_casted, y_casted}, s);
-    auto out_shape = broadcasted_inputs[0].shape();
+    auto broadcasted_inputs = broadcast_arrays({x_casted, y_casted}, s);
+    auto out_shape = broadcasted_inputs[0].shape();
 
     // Construct the array as the output of the Axpby primitive
     // with the broadcasted and upcasted arrays as inputs
-    return array(
-        /* const std::vector<int>& shape = */ out_shape,
-        /* Dtype dtype = */ out_dtype,
-        /* std::unique_ptr<Primitive> primitive = */
-        std::make_unique<Axpby>(to_stream(s), alpha, beta),
-        /* const std::vector<array>& inputs = */ broadcasted_inputs);
-}
+    return array(
+        /* const std::vector<int>& shape = */ out_shape,
+        /* Dtype dtype = */ out_dtype,
+        /* std::unique_ptr<Primitive> primitive = */
+        std::make_unique<Axpby>(to_stream(s), alpha, beta),
+        /* const std::vector<array>& inputs = */ broadcasted_inputs);
+}
 

This operation now handles the following:

@@ -900,66 +910,66 @@ of these functions to allocate memory as needed

Our naive method will go over each element of the output array, find the corresponding input elements of x and y and perform the operation pointwise. This is captured in the templated function axpby_impl().

-
template <typename T>
-void axpby_impl(
-        const array& x,
-        const array& y,
-        array& out,
-        float alpha_,
-        float beta_) {
+
template <typename T>
+void axpby_impl(
+        const array& x,
+        const array& y,
+        array& out,
+        float alpha_,
+        float beta_) {
     // We only allocate memory when we are ready to fill the output
     // malloc_or_wait synchronously allocates available memory
     // There may be a wait executed here if the allocation is requested
     // under memory-pressured conditions
-    out.set_data(allocator::malloc_or_wait(out.nbytes()));
+    out.set_data(allocator::malloc_or_wait(out.nbytes()));
 
     // Collect input and output data pointers
-    const T* x_ptr = x.data<T>();
-    const T* y_ptr = y.data<T>();
-    T* out_ptr = out.data<T>();
+    const T* x_ptr = x.data<T>();
+    const T* y_ptr = y.data<T>();
+    T* out_ptr = out.data<T>();
 
     // Cast alpha and beta to the relevant types
-    T alpha = static_cast<T>(alpha_);
-    T beta = static_cast<T>(beta_);
+    T alpha = static_cast<T>(alpha_);
+    T beta = static_cast<T>(beta_);
 
     // Do the element-wise operation for each output
-    for (size_t out_idx = 0; out_idx < out.size(); out_idx++) {
+    for (size_t out_idx = 0; out_idx < out.size(); out_idx++) {
         // Map linear indices to offsets in x and y
-        auto x_offset = elem_to_loc(out_idx, x.shape(), x.strides());
-        auto y_offset = elem_to_loc(out_idx, y.shape(), y.strides());
+        auto x_offset = elem_to_loc(out_idx, x.shape(), x.strides());
+        auto y_offset = elem_to_loc(out_idx, y.shape(), y.strides());
 
         // We allocate the output to be contiguous and regularly strided
         // (defaults to row major) and hence it doesn't need additional mapping
-        out_ptr[out_idx] = alpha * x_ptr[x_offset] + beta * y_ptr[y_offset];
-    }
-}
+        out_ptr[out_idx] = alpha * x_ptr[x_offset] + beta * y_ptr[y_offset];
+    }
+}
 

Now, we would like our implementation to be able to do this pointwise operation for all incoming floating point arrays. Accordingly, we add dispatches for float32, float16, bfloat16 and complex64. We throw an error if we encounter an unexpected type.

-
/** Fall back implementation for evaluation on CPU */
-void Axpby::eval(const std::vector<array>& inputs, array& out) {
+
/** Fall back implementation for evaluation on CPU */
+void Axpby::eval(const std::vector<array>& inputs, array& out) {
     // Check the inputs (registered in the op while constructing the out array)
-    assert(inputs.size() == 2);
-    auto& x = inputs[0];
-    auto& y = inputs[1];
+    assert(inputs.size() == 2);
+    auto& x = inputs[0];
+    auto& y = inputs[1];
 
     // Dispatch to the correct dtype
-    if (out.dtype() == float32) {
-        return axpby_impl<float>(x, y, out, alpha_, beta_);
-    } else if (out.dtype() == float16) {
-        return axpby_impl<float16_t>(x, y, out, alpha_, beta_);
-    } else if (out.dtype() == bfloat16) {
-        return axpby_impl<bfloat16_t>(x, y, out, alpha_, beta_);
-    } else if (out.dtype() == complex64) {
-        return axpby_impl<complex64_t>(x, y, out, alpha_, beta_);
-    } else {
-        throw std::runtime_error(
-            "Axpby is only supported for floating point types.");
-    }
-}
+    if (out.dtype() == float32) {
+        return axpby_impl<float>(x, y, out, alpha_, beta_);
+    } else if (out.dtype() == float16) {
+        return axpby_impl<float16_t>(x, y, out, alpha_, beta_);
+    } else if (out.dtype() == bfloat16) {
+        return axpby_impl<bfloat16_t>(x, y, out, alpha_, beta_);
+    } else if (out.dtype() == complex64) {
+        return axpby_impl<complex64_t>(x, y, out, alpha_, beta_);
+    } else {
+        throw std::runtime_error(
+            "Axpby is only supported for floating point types.");
+    }
+}
 

We have a fallback implementation! Now, to do what we are really here to do. @@ -980,13 +990,13 @@ of y

Let’s write out an implementation that uses Accelerate in the right conditions. It must simply allocate data for the output, copy elements of y into it, and then call the catlas_saxpby() from accelerate.

-
template <typename T>
-void axpby_impl_accelerate(
-        const array& x,
-        const array& y,
-        array& out,
-        float alpha_,
-        float beta_) {
+
template <typename T>
+void axpby_impl_accelerate(
+        const array& x,
+        const array& y,
+        array& out,
+        float alpha_,
+        float beta_) {
     // Accelerate library provides catlas_saxpby which does
     // Y = (alpha * X) + (beta * Y) in place
     // To use it, we first copy the data in y over to the output array
@@ -996,54 +1006,54 @@ and then call the     // The data in the output array is allocated to match the strides in y
     // such that x, y, and out are contiguous in the same mode and
     // no transposition is needed
-    out.set_data(
-        allocator::malloc_or_wait(y.data_size() * out.itemsize()),
-        y.data_size(),
-        y.strides(),
-        y.flags());
+    out.set_data(
+        allocator::malloc_or_wait(y.data_size() * out.itemsize()),
+        y.data_size(),
+        y.strides(),
+        y.flags());
 
     // We then copy over the elements using the contiguous vector specialization
-    copy_inplace(y, out, CopyType::Vector);
+    copy_inplace(y, out, CopyType::Vector);
 
     // Get x and y pointers for catlas_saxpby
-    const T* x_ptr = x.data<T>();
-    T* y_ptr = out.data<T>();
+    const T* x_ptr = x.data<T>();
+    T* y_ptr = out.data<T>();
 
-    T alpha = static_cast<T>(alpha_);
-    T beta = static_cast<T>(beta_);
+    T alpha = static_cast<T>(alpha_);
+    T beta = static_cast<T>(beta_);
 
     // Call the inplace accelerate operator
-    catlas_saxpby(
-        /* N = */ out.size(),
-        /* ALPHA = */ alpha,
-        /* X = */ x_ptr,
-        /* INCX = */ 1,
-        /* BETA = */ beta,
-        /* Y = */ y_ptr,
-        /* INCY = */ 1);
-}
+    catlas_saxpby(
+        /* N = */ out.size(),
+        /* ALPHA = */ alpha,
+        /* X = */ x_ptr,
+        /* INCX = */ 1,
+        /* BETA = */ beta,
+        /* Y = */ y_ptr,
+        /* INCY = */ 1);
+}
 

Great! But what about the inputs that do not fit the criteria for accelerate? Luckily, we can always just direct back to Axpby::eval().

With this in mind, let's finally implement our Axpby::eval_cpu().

-
/** Evaluate primitive on CPU using accelerate specializations */
-void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
-    assert(inputs.size() == 2);
-    auto& x = inputs[0];
-    auto& y = inputs[1];
+
/** Evaluate primitive on CPU using accelerate specializations */
+void Axpby::eval_cpu(const std::vector<array>& inputs, array& out) {
+    assert(inputs.size() == 2);
+    auto& x = inputs[0];
+    auto& y = inputs[1];
 
     // Accelerate specialization for contiguous single precision float arrays
-    if (out.dtype() == float32 &&
-        ((x.flags().row_contiguous && y.flags().row_contiguous) ||
-        (x.flags().col_contiguous && y.flags().col_contiguous))) {
-        axpby_impl_accelerate<float>(x, y, out, alpha_, beta_);
-        return;
-    }
+    if (out.dtype() == float32 &&
+        ((x.flags().row_contiguous && y.flags().row_contiguous) ||
+        (x.flags().col_contiguous && y.flags().col_contiguous))) {
+        axpby_impl_accelerate<float>(x, y, out, alpha_, beta_);
+        return;
+    }
 
     // Fall back to common backend if specializations are not available
-    eval(inputs, out);
-}
+    eval(inputs, out);
+}
 

We have now hit a milestone! Just this much is enough to run the operation @@ -1069,26 +1079,26 @@ all GPU kernels in MLX are written using metal.

as there are elements in the output. Each thread will pick the element it needs from x and y, do the pointwise operation, and then update its assigned element in the output.

-
template <typename T>
-[[kernel]] void axpby_general(
-        device const T* x [[buffer(0)]],
-        device const T* y [[buffer(1)]],
-        device T* out [[buffer(2)]],
-        constant const float& alpha [[buffer(3)]],
-        constant const float& beta [[buffer(4)]],
-        constant const int* shape [[buffer(5)]],
-        constant const size_t* x_strides [[buffer(6)]],
-        constant const size_t* y_strides [[buffer(7)]],
-        constant const int& ndim [[buffer(8)]],
-        uint index [[thread_position_in_grid]]) {
+
template <typename T>
+[[kernel]] void axpby_general(
+        device const T* x [[buffer(0)]],
+        device const T* y [[buffer(1)]],
+        device T* out [[buffer(2)]],
+        constant const float& alpha [[buffer(3)]],
+        constant const float& beta [[buffer(4)]],
+        constant const int* shape [[buffer(5)]],
+        constant const size_t* x_strides [[buffer(6)]],
+        constant const size_t* y_strides [[buffer(7)]],
+        constant const int& ndim [[buffer(8)]],
+        uint index [[thread_position_in_grid]]) {
     // Convert linear indices to offsets in array
-    auto x_offset = elem_to_loc(index, shape, x_strides, ndim);
-    auto y_offset = elem_to_loc(index, shape, y_strides, ndim);
+    auto x_offset = elem_to_loc(index, shape, x_strides, ndim);
+    auto y_offset = elem_to_loc(index, shape, y_strides, ndim);
 
     // Do the operation and update the output
-    out[index] =
-        static_cast<T>(alpha) * x[x_offset] + static_cast<T>(beta) * y[y_offset];
-}
+    out[index] =
+        static_cast<T>(alpha) * x[x_offset] + static_cast<T>(beta) * y[y_offset];
+}
 

We then need to instantiate this template for all floating point types and give @@ -1108,10 +1118,10 @@ each data type.

constant const int& ndim [[buffer(8)]], \ uint index [[thread_position_in_grid]]); -instantiate_axpby(float32, float); -instantiate_axpby(float16, half); -instantiate_axpby(bfloat16, bfloat16_t); -instantiate_axpby(complex64, complex64_t); +instantiate_axpby(float32, float); +instantiate_axpby(float16, half); +instantiate_axpby(bfloat16, bfloat16_t); +instantiate_axpby(complex64, complex64_t);

This kernel will be compiled into a metal library mlx_ext.metallib as we @@ -1127,73 +1137,73 @@ go over this process in more detail later.

The logic to determine the kernel, set the inputs, resolve the grid dimensions and dispatch it to the GPU is contained in Axpby::eval_gpu() as shown below.

-
/** Evaluate primitive on GPU */
-void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
+
/** Evaluate primitive on GPU */
+void Axpby::eval_gpu(const std::vector<array>& inputs, array& out) {
     // Prepare inputs
-    assert(inputs.size() == 2);
-    auto& x = inputs[0];
-    auto& y = inputs[1];
+    assert(inputs.size() == 2);
+    auto& x = inputs[0];
+    auto& y = inputs[1];
 
     // Each primitive carries the stream it should execute on
     // and each stream carries its device identifiers
-    auto& s = stream();
+    auto& s = stream();
     // We get the needed metal device using the stream
-    auto& d = metal::device(s.device);
+    auto& d = metal::device(s.device);
 
     // Allocate output memory
-    out.set_data(allocator::malloc_or_wait(out.nbytes()));
+    out.set_data(allocator::malloc_or_wait(out.nbytes()));
 
     // Resolve name of kernel (corresponds to axpby.metal)
-    std::ostringstream kname;
-    kname << "axpby_" << "general_" << type_to_name(out);
+    std::ostringstream kname;
+    kname << "axpby_" << "general_" << type_to_name(out);
 
     // Make sure the metal library is available and look for it
     // in the same folder as this executable if needed
-    d.register_library("mlx_ext", metal::get_colocated_mtllib_path);
+    d.register_library("mlx_ext", metal::get_colocated_mtllib_path);
 
     // Make a kernel from this metal library
-    auto kernel = d.get_kernel(kname.str(), "mlx_ext");
+    auto kernel = d.get_kernel(kname.str(), "mlx_ext");
 
     // Prepare to encode kernel
-    auto compute_encoder = d.get_command_encoder(s.index);
-    compute_encoder->setComputePipelineState(kernel);
+    auto compute_encoder = d.get_command_encoder(s.index);
+    compute_encoder->setComputePipelineState(kernel);
 
     // Kernel parameters are registered with buffer indices corresponding to
     // those in the kernel declaration at axpby.metal
-    int ndim = out.ndim();
-    size_t nelem = out.size();
+    int ndim = out.ndim();
+    size_t nelem = out.size();
 
     // Encode input arrays to kernel
-    set_array_buffer(compute_encoder, x, 0);
-    set_array_buffer(compute_encoder, y, 1);
+    set_array_buffer(compute_encoder, x, 0);
+    set_array_buffer(compute_encoder, y, 1);
 
     // Encode output arrays to kernel
-    set_array_buffer(compute_encoder, out, 2);
+    set_array_buffer(compute_encoder, out, 2);
 
     // Encode alpha and beta
-    compute_encoder->setBytes(&alpha_, sizeof(float), 3);
-    compute_encoder->setBytes(&beta_, sizeof(float), 4);
+    compute_encoder->setBytes(&alpha_, sizeof(float), 3);
+    compute_encoder->setBytes(&beta_, sizeof(float), 4);
 
     // Encode shape, strides and ndim
-    compute_encoder->setBytes(x.shape().data(), ndim * sizeof(int), 5);
-    compute_encoder->setBytes(x.strides().data(), ndim * sizeof(size_t), 6);
-    compute_encoder->setBytes(y.strides().data(), ndim * sizeof(size_t), 7);
-    compute_encoder->setBytes(&ndim, sizeof(int), 8);
+    compute_encoder->setBytes(x.shape().data(), ndim * sizeof(int), 5);
+    compute_encoder->setBytes(x.strides().data(), ndim * sizeof(size_t), 6);
+    compute_encoder->setBytes(y.strides().data(), ndim * sizeof(size_t), 7);
+    compute_encoder->setBytes(&ndim, sizeof(int), 8);
 
     // We launch 1 thread for each input and make sure that the number of
     // threads in any given threadgroup is not higher than the max allowed
-    size_t tgp_size = std::min(nelem, kernel->maxTotalThreadsPerThreadgroup());
+    size_t tgp_size = std::min(nelem, kernel->maxTotalThreadsPerThreadgroup());
 
     // Fix the 3D size of each threadgroup (in terms of threads)
-    MTL::Size group_dims = MTL::Size(tgp_size, 1, 1);
+    MTL::Size group_dims = MTL::Size(tgp_size, 1, 1);
 
     // Fix the 3D size of the launch grid (in terms of threads)
-    MTL::Size grid_dims = MTL::Size(nelem, 1, 1);
+    MTL::Size grid_dims = MTL::Size(nelem, 1, 1);
 
     // Launch the grid with the given number of threads divided among
     // the given threadgroups
-    compute_encoder->dispatchThreads(grid_dims, group_dims);
-}
+    compute_encoder->dispatchThreads(grid_dims, group_dims);
+}
 

We can now call the axpby() operation on both the CPU and the GPU!

@@ -1213,11 +1223,11 @@ command buffers as needed. We suggest taking a deeper dive into transformations in a Primitive. These transformations can be built on top of our operations, including the one we just defined now. Which then gives us the following Axpby::jvp() and Axpby::vjp() implementations.

-
/** The Jacobian-vector product. */
-array Axpby::jvp(
-        const std::vector<array>& primals,
-        const std::vector<array>& tangents,
-        const std::vector<int>& argnums) {
+
/** The Jacobian-vector product. */
+array Axpby::jvp(
+        const std::vector<array>& primals,
+        const std::vector<array>& tangents,
+        const std::vector<int>& argnums) {
     // Forward mode diff that pushes along the tangents
     // The jvp transform on the primitive can built with ops
     // that are scheduled on the same stream as the primitive
@@ -1226,43 +1236,43 @@ us the following     // jvp is just the tangent scaled by alpha
     // Similarly, if argnums = {1}, the jvp is just the tangent
     // scaled by beta
-    if (argnums.size() > 1) {
-        auto scale = argnums[0] == 0 ? alpha_ : beta_;
-        auto scale_arr = array(scale, tangents[0].dtype());
-        return multiply(scale_arr, tangents[0], stream());
-    }
+    if (argnums.size() > 1) {
+        auto scale = argnums[0] == 0 ? alpha_ : beta_;
+        auto scale_arr = array(scale, tangents[0].dtype());
+        return multiply(scale_arr, tangents[0], stream());
+    }
     // If, argnums = {0, 1}, we take contributions from both
     // which gives us jvp = tangent_x * alpha + tangent_y * beta
-    else {
-        return axpby(tangents[0], tangents[1], alpha_, beta_, stream());
-    }
-}
+    else {
+        return axpby(tangents[0], tangents[1], alpha_, beta_, stream());
+    }
+}
 
-
/** The vector-Jacobian product. */
-std::vector<array> Axpby::vjp(
-        const std::vector<array>& primals,
-        const array& cotan,
-        const std::vector<int>& argnums) {
+
/** The vector-Jacobian product. */
+std::vector<array> Axpby::vjp(
+        const std::vector<array>& primals,
+        const array& cotan,
+        const std::vector<int>& argnums) {
     // Reverse mode diff
-    std::vector<array> vjps;
-    for (auto arg : argnums) {
-        auto scale = arg == 0 ? alpha_ : beta_;
-        auto scale_arr = array(scale, cotan.dtype());
-        vjps.push_back(multiply(scale_arr, cotan, stream()));
-    }
-    return vjps;
-}
+    std::vector<array> vjps;
+    for (auto arg : argnums) {
+        auto scale = arg == 0 ? alpha_ : beta_;
+        auto scale_arr = array(scale, cotan.dtype());
+        vjps.push_back(multiply(scale_arr, cotan, stream()));
+    }
+    return vjps;
+}
 

Finally, you need not have a transformation fully defined to start using your own Primitive.

-
/** Vectorize primitive along given axis */
-std::pair<array, int> Axpby::vmap(
-        const std::vector<array>& inputs,
-        const std::vector<int>& axes) {
-    throw std::runtime_error("Axpby has no vmap implementation.");
-}
+
/** Vectorize primitive along given axis */
+std::pair<array, int> Axpby::vmap(
+        const std::vector<array>& inputs,
+        const std::vector<int>& axes) {
+    throw std::runtime_error("Axpby has no vmap implementation.");
+}
 
@@ -1297,20 +1307,20 @@ the python package

We use PyBind11 to build a Python API for the C++ library. Since bindings for all needed components such as mlx.core.array, mlx.core.stream, etc. are already provided, adding our axpby() becomes very simple!

-
PYBIND11_MODULE(mlx_sample_extensions, m) {
-    m.doc() = "Sample C++ and metal extensions for MLX";
+
PYBIND11_MODULE(mlx_sample_extensions, m) {
+    m.doc() = "Sample C++ and metal extensions for MLX";
 
-    m.def(
-        "axpby",
-        &axpby,
-        "x"_a,
-        "y"_a,
-        py::pos_only(),
-        "alpha"_a,
-        "beta"_a,
-        py::kw_only(),
-        "stream"_a = py::none(),
-        R"pbdoc(
+    m.def(
+        "axpby",
+        &axpby,
+        "x"_a,
+        "y"_a,
+        py::pos_only(),
+        "alpha"_a,
+        "beta"_a,
+        py::kw_only(),
+        "stream"_a = py::none(),
+        R"pbdoc(
             Scale and sum two vectors element-wise
             ``z = alpha * x + beta * y``
 
@@ -1325,8 +1335,8 @@ are already provided, adding our             Returns:
                 array: ``alpha * x + beta * y``
-        )pbdoc");
-}
+        )pbdoc");
+}
 

Most of the complexity in the above example comes from additional bells and @@ -1463,7 +1473,7 @@ import the python package and play with it as you would any other MLX operation! print(f"c shape: {c.shape}") print(f"c dtype: {c.dtype}") -print(f"c correctness: {mx.all(c == 6.0).item()}") +print(f"c correctness: {mx.all(c == 6.0).item()}")

Output:

diff --git a/docs/build/html/examples/linear_regression.html b/docs/build/html/examples/linear_regression.html index 5d2417057..80d34e46e 100644 --- a/docs/build/html/examples/linear_regression.html +++ b/docs/build/html/examples/linear_regression.html @@ -47,7 +47,7 @@ - + @@ -148,9 +148,12 @@

Usage

Examples

@@ -706,12 +716,12 @@ examples are available in the MLX GitHub repo.

previous

-

Conversion to NumPy and Other Frameworks

+

Using Streams

Usage

Examples

@@ -949,19 +959,19 @@ like layers.2.atten

which can then be used to update the model. Note that the method above incurs several unnecessary copies from disk to numpy and then from numpy to MLX. It will be replaced in the future with direct loading to MLX.

-

You can download the full example code in mlx-examples. Assuming, the +

You can download the full example code in mlx-examples. Assuming, the existence of weights.pth and tokenizer.model in the current working directory we can play around with our inference script as follows (the timings are representative of an M1 Ultra and the 7B parameter Llama model):

-
$ python convert.py weights.pth llama-7B.mlx.npz
-$ python llama.py llama-7B.mlx.npz tokenizer.model 'Call me Ishmael. Some years ago never mind how long precisely'
-[INFO] Loading model from disk: 5.247 s
-Press enter to start generation
+
$ python convert.py weights.pth llama-7B.mlx.npz
+$ python llama.py llama-7B.mlx.npz tokenizer.model 'Call me Ishmael. Some years ago never mind how long precisely'
+[INFO] Loading model from disk: 5.247 s
+Press enter to start generation
 ------
-, having little or no money in my purse, and nothing of greater consequence in my mind, I happened to be walking down Gower Street in the afternoon, in the heavy rain, and I saw a few steps off, a man in rags, who sat upon his bundle and looked hard into the wet as if he were going to cry. I watched him attentively for some time, and could not but observe that, though a numerous crowd was hurrying up and down,
+, having little or no money in my purse, and nothing of greater consequence in my mind, I happened to be walking down Gower Street in the afternoon, in the heavy rain, and I saw a few steps off, a man in rags, who sat upon his bundle and looked hard into the wet as if he were going to cry. I watched him attentively for some time, and could not but observe that, though a numerous crowd was hurrying up and down,
 ------
-[INFO] Prompt processing: 0.437 s
-[INFO] Full generation: 4.330 s
+[INFO] Prompt processing: 0.437 s
+[INFO] Full generation: 4.330 s
 

We observe that 4.3 seconds are required to generate 100 tokens and 0.4 seconds @@ -969,22 +979,22 @@ of those are spent processing the prompt. This amounts to a little over per token.

By running with a much bigger prompt we can see that the per token generation time as well as the prompt processing time remains almost constant.

-
$ python llama.py llama-7B.mlx.npz tokenizer.model 'Call me Ishmael. Some years ago never mind how long precisely, having little or no money in my purse, and nothing of greater consequence in my mind, I happened to be walking down Gower Street in the afternoon, in the heavy rain, and I saw a few steps off, a man in rags, who sat upon his bundle and looked hard into the wet as if he were going to cry. I watched him attentively for some time, and could not but observe that, though a numerous crowd was hurrying up and down, nobody took the least notice of him. I stopped at last, at a little distance, as if I had been in doubt, and after looking on a few minutes, walked straight up to him. He slowly raised his eyes, and fixed them upon me for a moment, without speaking, and then resumed his place and posture as before. I stood looking at him for a while, feeling very much pain at heart, and then said to him, “What are you doing there?” Something like a smile passed over his face, as he said slowly, “I am waiting for someone; but it has been three quarters of an hour now, and he has not come.” “What is it you are waiting for?” said I. Still he made no immediate reply, but again put his face down upon his hands, and did not'
-[INFO] Loading model from disk: 5.247 s
-Press enter to start generation
+
$ python llama.py llama-7B.mlx.npz tokenizer.model 'Call me Ishmael. Some years ago never mind how long precisely, having little or no money in my purse, and nothing of greater consequence in my mind, I happened to be walking down Gower Street in the afternoon, in the heavy rain, and I saw a few steps off, a man in rags, who sat upon his bundle and looked hard into the wet as if he were going to cry. I watched him attentively for some time, and could not but observe that, though a numerous crowd was hurrying up and down, nobody took the least notice of him. I stopped at last, at a little distance, as if I had been in doubt, and after looking on a few minutes, walked straight up to him. He slowly raised his eyes, and fixed them upon me for a moment, without speaking, and then resumed his place and posture as before. I stood looking at him for a while, feeling very much pain at heart, and then said to him, “What are you doing there?” Something like a smile passed over his face, as he said slowly, “I am waiting for someone; but it has been three quarters of an hour now, and he has not come.” “What is it you are waiting for?” said I. Still he made no immediate reply, but again put his face down upon his hands, and did not'
+[INFO] Loading model from disk: 5.247 s
+Press enter to start generation
 ------
-take his eyes from the ground. “What is it you are waiting for?” said I. “I am not accustomed to be thus questioned,” said he. “You look like a reasonable man—tell me, then, what are you waiting for?” “You would not understand,” he replied; “and how could you help me, if I were to tell you?” “I should not only understand, but would do all that I could,” said I. He did not
+take his eyes from the ground. “What is it you are waiting for?” said I. “I am not accustomed to be thus questioned,” said he. “You look like a reasonable man—tell me, then, what are you waiting for?” “You would not understand,” he replied; “and how could you help me, if I were to tell you?” “I should not only understand, but would do all that I could,” said I. He did not
 ------
-[INFO] Prompt processing: 0.579 s
-[INFO] Full generation: 4.690 s
-$ python llama.py --num-tokens 500 llama-7B.mlx.npz tokenizer.model 'Call me Ishmael. Some years ago never mind how long precisely, having little or no money in my purse, and nothing of greater consequence in my mind, I happened to be walking down Gower Street in the afternoon, in the heavy rain, and I saw a few steps off, a man in rags, who sat upon his bundle and looked hard into the wet as if he were going to cry. I watched him attentively for some time, and could not but observe that, though a numerous crowd was hurrying up and down, nobody took the least notice of him. I stopped at last, at a little distance, as if I had been in doubt, and after looking on a few minutes, walked straight up to him. He slowly raised his eyes, and fixed them upon me for a moment, without speaking, and then resumed his place and posture as before. I stood looking at him for a while, feeling very much pain at heart, and then said to him, “What are you doing there?” Something like a smile passed over his face, as he said slowly, “I am waiting for someone; but it has been three quarters of an hour now, and he has not come.” “What is it you are waiting for?” said I. Still he made no immediate reply, but again put his face down upon his hands, and did not'
-[INFO] Loading model from disk: 5.628 s
-Press enter to start generation
+[INFO] Prompt processing: 0.579 s
+[INFO] Full generation: 4.690 s
+$ python llama.py --num-tokens 500 llama-7B.mlx.npz tokenizer.model 'Call me Ishmael. Some years ago never mind how long precisely, having little or no money in my purse, and nothing of greater consequence in my mind, I happened to be walking down Gower Street in the afternoon, in the heavy rain, and I saw a few steps off, a man in rags, who sat upon his bundle and looked hard into the wet as if he were going to cry. I watched him attentively for some time, and could not but observe that, though a numerous crowd was hurrying up and down, nobody took the least notice of him. I stopped at last, at a little distance, as if I had been in doubt, and after looking on a few minutes, walked straight up to him. He slowly raised his eyes, and fixed them upon me for a moment, without speaking, and then resumed his place and posture as before. I stood looking at him for a while, feeling very much pain at heart, and then said to him, “What are you doing there?” Something like a smile passed over his face, as he said slowly, “I am waiting for someone; but it has been three quarters of an hour now, and he has not come.” “What is it you are waiting for?” said I. Still he made no immediate reply, but again put his face down upon his hands, and did not'
+[INFO] Loading model from disk: 5.628 s
+Press enter to start generation
 ------
-take his eyes from the ground. “What is it you are waiting for?” said I. “I am not accustomed to be thus questioned,” said he. “You look like a reasonable man—tell me, then, what are you waiting for?” “You would not understand,” he replied; “and how could you help me, if I were to tell you?” “I should not only understand, but would do all that I could,” said I. He did not reply, but still went on looking at the ground, and took hold of his bundle with a nervous trembling. I waited some time, and then resumed. “It is of no use to say you would not understand, if I were to tell you,” said he. “I have not told you why I am waiting for him,” said I. “And I am sure I should not understand,” replied he. “I will tell you then,” said I, “and, perhaps, you would not be surprised.” “No matter,” said he, “I shall be surprised anyhow; so tell me why you are waiting for him.” “He is my friend,” said I. “Yes,” said he, with a slight smile, “I know.” “He has been kind to me,” said I, “and I am waiting for him. I want to see him, and could have waited as I am now, for a much longer time.” “He will not soon come,” said he. “Unless he sees you here, he will not know of your having waited, and he will be very unlikely to come.” “No matter,” said I, “I shall wait for him.” “This is a strange thing,” said he, still with the same amused smile. “How did you know,” said I, “that he was coming? How should you be waiting?” “That is my secret,” said he. “And you expect him?” “Yes,” said I. “Are you disappointed then, if he does not come?” “No,” said I, “it is his secret, not mine.” “If he comes,” said he, “do you mean to go straight away?” “Yes,” said I, “I cannot be happy if I do not go straight away after him.” “Did you know this place before?” asked he. “Yes,” said I. “Is there any shop to buy food here?” “
+take his eyes from the ground. “What is it you are waiting for?” said I. “I am not accustomed to be thus questioned,” said he. “You look like a reasonable man—tell me, then, what are you waiting for?” “You would not understand,” he replied; “and how could you help me, if I were to tell you?” “I should not only understand, but would do all that I could,” said I. He did not reply, but still went on looking at the ground, and took hold of his bundle with a nervous trembling. I waited some time, and then resumed. “It is of no use to say you would not understand, if I were to tell you,” said he. “I have not told you why I am waiting for him,” said I. “And I am sure I should not understand,” replied he. “I will tell you then,” said I, “and, perhaps, you would not be surprised.” “No matter,” said he, “I shall be surprised anyhow; so tell me why you are waiting for him.” “He is my friend,” said I. “Yes,” said he, with a slight smile, “I know.” “He has been kind to me,” said I, “and I am waiting for him. I want to see him, and could have waited as I am now, for a much longer time.” “He will not soon come,” said he. “Unless he sees you here, he will not know of your having waited, and he will be very unlikely to come.” “No matter,” said I, “I shall wait for him.” “This is a strange thing,” said he, still with the same amused smile. “How did you know,” said I, “that he was coming? How should you be waiting?” “That is my secret,” said he. “And you expect him?” “Yes,” said I. “Are you disappointed then, if he does not come?” “No,” said I, “it is his secret, not mine.” “If he comes,” said he, “do you mean to go straight away?” “Yes,” said I, “I cannot be happy if I do not go straight away after him.” “Did you know this place before?” asked he. “Yes,” said I. “Is there any shop to buy food here?” “
 ------
-[INFO] Prompt processing: 0.633 s
-[INFO] Full generation: 21.475 s
+[INFO] Prompt processing: 0.633 s
+[INFO] Full generation: 21.475 s
 
@@ -992,7 +1002,7 @@ take his eyes Scripts#

Download the code

-

The full example code is available in mlx-examples.

+

The full example code is available in mlx-examples.

+
+ + +
+ + + +
+
+ + + + + +
+
+ + \ No newline at end of file diff --git a/docs/build/html/usage/unified_memory.html b/docs/build/html/usage/unified_memory.html index b02a16092..0afec7b74 100644 --- a/docs/build/html/usage/unified_memory.html +++ b/docs/build/html/usage/unified_memory.html @@ -46,8 +46,8 @@ - - + + @@ -148,9 +148,12 @@

Usage

Examples

@@ -712,20 +722,20 @@ Max.

previous

-

Quick Start Guide

+

Lazy Evaluation

next

-

Using Streams

+

Indexing Arrays

diff --git a/docs/build/html/usage/using_streams.html b/docs/build/html/usage/using_streams.html index 41e64e617..166369f68 100644 --- a/docs/build/html/usage/using_streams.html +++ b/docs/build/html/usage/using_streams.html @@ -46,8 +46,8 @@ - - + + @@ -148,9 +148,12 @@

Usage

Examples

@@ -668,20 +678,20 @@ run on the default stream of the provided device