Remove "using namespace mlx::core" in benchmarks/examples (#1685)

* Remove "using namespace mlx::core" in benchmarks/examples

* Fix building example extension

* Fix a missing one in a comment

* Fix building on M chips
Author: Cheng
Date: 2024-12-12 00:08:29 +09:00
Committed by: GitHub
Parent: f76a49e555
Commit: 4f9b60dd53

12 changed files with 373 additions and 367 deletions
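
Every file in this commit follows the same pattern: the file-wide "using namespace mlx::core;" directive is dropped and MLX symbols are qualified through a short mx alias instead. A minimal sketch of that pattern, with illustrative names that are not taken from any of the changed files:

#include "mlx/mlx.h"

// Before: `using namespace mlx::core;` pulled every MLX symbol into scope.
// After: a short alias keeps call sites readable without the blanket import.
namespace mx = mlx::core;

int main() {
  mx::array a = mx::ones({2, 2}); // was: array a = ones({2, 2});
  auto b = mx::square(a);         // was: auto b = square(a);
  mx::eval(b);                    // was: eval(b);
  return 0;
}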

View File

@@ -4,19 +4,19 @@
 #include "mlx/mlx.h"
-using namespace mlx::core;
+namespace mx = mlx::core;
 int main() {
-  if (!distributed::is_available()) {
+  if (!mx::distributed::is_available()) {
     std::cout << "No communication backend found" << std::endl;
     return 1;
   }
-  auto global_group = distributed::init();
+  auto global_group = mx::distributed::init();
   std::cout << global_group.rank() << " / " << global_group.size() << std::endl;
-  array x = ones({10});
-  array out = distributed::all_sum(x, global_group);
+  mx::array x = mx::ones({10});
+  mx::array out = mx::distributed::all_sum(x, global_group);
   std::cout << out << std::endl;
 }

View File

@@ -10,7 +10,7 @@
 /**
  * An example of linear regression with MLX.
  */
-using namespace mlx::core;
+namespace mx = mlx::core;
 int main() {
   int num_features = 100;
@@ -19,35 +19,35 @@ int main() {
   float learning_rate = 0.01;
   // True parameters
-  auto w_star = random::normal({num_features});
+  auto w_star = mx::random::normal({num_features});
   // The input examples (design matrix)
-  auto X = random::normal({num_examples, num_features});
+  auto X = mx::random::normal({num_examples, num_features});
   // Noisy labels
-  auto eps = 1e-2 * random::normal({num_examples});
-  auto y = matmul(X, w_star) + eps;
+  auto eps = 1e-2 * mx::random::normal({num_examples});
+  auto y = mx::matmul(X, w_star) + eps;
   // Initialize random parameters
-  array w = 1e-2 * random::normal({num_features});
+  mx::array w = 1e-2 * mx::random::normal({num_features});
-  auto loss_fn = [&](array w) {
-    auto yhat = matmul(X, w);
-    return (0.5f / num_examples) * sum(square(yhat - y));
+  auto loss_fn = [&](mx::array w) {
+    auto yhat = mx::matmul(X, w);
+    return (0.5f / num_examples) * mx::sum(mx::square(yhat - y));
   };
-  auto grad_fn = grad(loss_fn);
+  auto grad_fn = mx::grad(loss_fn);
   auto tic = timer::time();
   for (int it = 0; it < num_iters; ++it) {
-    auto grad = grad_fn(w);
-    w = w - learning_rate * grad;
-    eval(w);
+    auto grads = grad_fn(w);
+    w = w - learning_rate * grads;
+    mx::eval(w);
   }
   auto toc = timer::time();
   auto loss = loss_fn(w);
-  auto error_norm = std::sqrt(sum(square(w - w_star)).item<float>());
+  auto error_norm = std::sqrt(mx::sum(mx::square(w - w_star)).item<float>());
   auto throughput = num_iters / timer::seconds(toc - tic);
   std::cout << "Loss " << loss << ", |w - w*| = " << error_norm
             << ", Throughput " << throughput << " (it/s)." << std::endl;

View File

@@ -10,7 +10,7 @@
 /**
  * An example of logistic regression with MLX.
  */
-using namespace mlx::core;
+namespace mx = mlx::core;
 int main() {
   int num_features = 100;
@@ -19,35 +19,35 @@ int main() {
   float learning_rate = 0.1;
   // True parameters
-  auto w_star = random::normal({num_features});
+  auto w_star = mx::random::normal({num_features});
   // The input examples
-  auto X = random::normal({num_examples, num_features});
+  auto X = mx::random::normal({num_examples, num_features});
   // Labels
-  auto y = matmul(X, w_star) > 0;
+  auto y = mx::matmul(X, w_star) > 0;
   // Initialize random parameters
-  array w = 1e-2 * random::normal({num_features});
+  mx::array w = 1e-2 * mx::random::normal({num_features});
-  auto loss_fn = [&](array w) {
-    auto logits = matmul(X, w);
+  auto loss_fn = [&](mx::array w) {
+    auto logits = mx::matmul(X, w);
     auto scale = (1.0f / num_examples);
-    return scale * sum(logaddexp(array(0.0f), logits) - y * logits);
+    return scale * mx::sum(mx::logaddexp(mx::array(0.0f), logits) - y * logits);
   };
-  auto grad_fn = grad(loss_fn);
+  auto grad_fn = mx::grad(loss_fn);
   auto tic = timer::time();
   for (int it = 0; it < num_iters; ++it) {
-    auto grad = grad_fn(w);
-    w = w - learning_rate * grad;
-    eval(w);
+    auto grads = grad_fn(w);
+    w = w - learning_rate * grads;
+    mx::eval(w);
   }
   auto toc = timer::time();
   auto loss = loss_fn(w);
-  auto acc = sum((matmul(X, w) > 0) == y) / num_examples;
+  auto acc = mx::sum((mx::matmul(X, w) > 0) == y) / num_examples;
   auto throughput = num_iters / timer::seconds(toc - tic);
   std::cout << "Loss " << loss << ", Accuracy, " << acc << ", Throughput "
             << throughput << " (it/s)." << std::endl;

View File

@@ -5,27 +5,27 @@
 #include "mlx/mlx.h"
-using namespace mlx::core;
+namespace mx = mlx::core;
 int main() {
   // To use Metal debugging and profiling:
   // 1. Build with the MLX_METAL_DEBUG CMake option (i.e. -DMLX_METAL_DEBUG=ON).
   // 2. Run with MTL_CAPTURE_ENABLED=1.
-  metal::start_capture("mlx_trace.gputrace");
+  mx::metal::start_capture("mlx_trace.gputrace");
   // Start at index two because the default GPU and CPU streams have indices
   // zero and one, respectively. This naming matches the label assigned to each
   // stream's command queue.
-  auto s2 = new_stream(Device::gpu);
-  auto s3 = new_stream(Device::gpu);
+  auto s2 = new_stream(mx::Device::gpu);
+  auto s3 = new_stream(mx::Device::gpu);
-  auto a = arange(1.f, 10.f, 1.f, float32, s2);
-  auto b = arange(1.f, 10.f, 1.f, float32, s3);
-  auto x = add(a, a, s2);
-  auto y = add(b, b, s3);
+  auto a = mx::arange(1.f, 10.f, 1.f, mx::float32, s2);
+  auto b = mx::arange(1.f, 10.f, 1.f, mx::float32, s3);
+  auto x = mx::add(a, a, s2);
+  auto y = mx::add(b, b, s3);
   // The multiply will happen on the default stream.
-  std::cout << multiply(x, y) << std::endl;
+  std::cout << mx::multiply(x, y) << std::endl;
-  metal::stop_capture();
+  mx::metal::stop_capture();
 }

View File

@@ -5,11 +5,11 @@
 #include "mlx/mlx.h"
-using namespace mlx::core;
+namespace mx = mlx::core;
 void array_basics() {
   // Make a scalar array:
-  array x(1.0);
+  mx::array x(1.0);
   // Get the value out of it:
   auto s = x.item<float>();
@@ -29,31 +29,31 @@ void array_basics() {
   // The datatype should be float32:
   auto dtype = x.dtype();
-  assert(dtype == float32);
+  assert(dtype == mx::float32);
   // Specify the dtype when constructing the array:
-  x = array(1, int32);
-  assert(x.dtype() == int32);
+  x = mx::array(1, mx::int32);
+  assert(x.dtype() == mx::int32);
   x.item<int>(); // OK
   // x.item<float>(); // Undefined!
   // Make a multidimensional array:
-  x = array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
+  x = mx::array({1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
   // mlx is row-major by default so the first row of this array
   // is [1.0, 2.0] and the second row is [3.0, 4.0]
   // Make an array of shape {2, 2} filled with ones:
-  auto y = ones({2, 2});
+  auto y = mx::ones({2, 2});
   // Pointwise add x and y:
-  auto z = add(x, y);
+  auto z = mx::add(x, y);
   // Same thing:
   z = x + y;
   // mlx is lazy by default. At this point `z` only
   // has a shape and a type but no actual data:
-  assert(z.dtype() == float32);
+  assert(z.dtype() == mx::float32);
   assert(z.shape(0) == 2);
   assert(z.shape(1) == 2);
@@ -63,33 +63,33 @@ void array_basics() {
   // and inputs. When `eval` is called on an array (or arrays), the array and
   // all of its dependencies are recursively evaluated to produce the result.
   // Once an array is evaluated, it has data and is detached from its inputs.
-  eval(z);
+  mx::eval(z);
-  // Of course the array can still be an input to other operations. You can even
-  // call eval on the array again, this will just be a no-op:
-  eval(z); // no-op
+  // Of course the array can still be an input to other operations. You can
+  // even call eval on the array again, this will just be a no-op:
+  mx::eval(z); // no-op
   // Some functions or methods on arrays implicitly evaluate them. For example
   // accessing a value in an array or printing the array implicitly evaluate it:
-  z = ones({1});
+  z = mx::ones({1});
   z.item<float>(); // implicit evaluation
-  z = ones({2, 2});
+  z = mx::ones({2, 2});
   std::cout << z << std::endl; // implicit evaluation
 }
 void automatic_differentiation() {
-  auto fn = [](array x) { return square(x); };
+  auto fn = [](mx::array x) { return mx::square(x); };
   // Computing the derivative function of a function
-  auto grad_fn = grad(fn);
+  auto grad_fn = mx::grad(fn);
   // Call grad_fn on the input to get the derivative
-  auto x = array(1.5);
+  auto x = mx::array(1.5);
   auto dfdx = grad_fn(x);
   // dfdx is 2 * x
   // Get the second derivative by composing grad with grad
-  auto d2fdx2 = grad(grad(fn))(x);
+  auto d2fdx2 = mx::grad(mx::grad(fn))(x);
   // d2fdx2 is 2
 }

View File

@@ -19,7 +19,7 @@
 #include "mlx/backend/metal/utils.h"
 #endif
-namespace mlx::core {
+namespace my_ext {
 ///////////////////////////////////////////////////////////////////////////////
 // Operation Implementation
@@ -32,24 +32,24 @@ namespace mlx::core {
  * Follow numpy style broadcasting between x and y
  * Inputs are upcasted to floats if needed
 **/
-array axpby(
-    const array& x, // Input array x
-    const array& y, // Input array y
+mx::array axpby(
+    const mx::array& x, // Input mx::array x
+    const mx::array& y, // Input mx::array y
     const float alpha, // Scaling factor for x
     const float beta, // Scaling factor for y
-    StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
+    mx::StreamOrDevice s /* = {} */ // Stream on which to schedule the operation
 ) {
   // Promote dtypes between x and y as needed
   auto promoted_dtype = promote_types(x.dtype(), y.dtype());
   // Upcast to float32 for non-floating point inputs x and y
-  auto out_dtype = issubdtype(promoted_dtype, float32)
+  auto out_dtype = mx::issubdtype(promoted_dtype, mx::float32)
       ? promoted_dtype
-      : promote_types(promoted_dtype, float32);
+      : promote_types(promoted_dtype, mx::float32);
   // Cast x and y up to the determined dtype (on the same stream s)
-  auto x_casted = astype(x, out_dtype, s);
-  auto y_casted = astype(y, out_dtype, s);
+  auto x_casted = mx::astype(x, out_dtype, s);
+  auto y_casted = mx::astype(y, out_dtype, s);
   // Broadcast the shapes of x and y (on the same stream s)
   auto broadcasted_inputs = broadcast_arrays({x_casted, y_casted}, s);
@@ -57,12 +57,12 @@ array axpby(
   // Construct the array as the output of the Axpby primitive
   // with the broadcasted and upcasted arrays as inputs
-  return array(
+  return mx::array(
       /* const std::vector<int>& shape = */ out_shape,
-      /* Dtype dtype = */ out_dtype,
-      /* std::unique_ptr<Primitive> primitive = */
+      /* mx::Dtype dtype = */ out_dtype,
+      /* std::unique_ptr<mx::Primitive> primitive = */
       std::make_shared<Axpby>(to_stream(s), alpha, beta),
-      /* const std::vector<array>& inputs = */ broadcasted_inputs);
+      /* const std::vector<mx::array>& inputs = */ broadcasted_inputs);
 }
 ///////////////////////////////////////////////////////////////////////////////
@@ -71,16 +71,16 @@ array axpby(
 template <typename T>
 void axpby_impl(
-    const array& x,
-    const array& y,
-    array& out,
+    const mx::array& x,
+    const mx::array& y,
+    mx::array& out,
     float alpha_,
     float beta_) {
   // We only allocate memory when we are ready to fill the output
   // malloc_or_wait synchronously allocates available memory
   // There may be a wait executed here if the allocation is requested
   // under memory-pressured conditions
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  out.set_data(mx::allocator::malloc_or_wait(out.nbytes()));
   // Collect input and output data pointers
   const T* x_ptr = x.data<T>();
@@ -94,8 +94,8 @@ void axpby_impl(
   // Do the element-wise operation for each output
   for (size_t out_idx = 0; out_idx < out.size(); out_idx++) {
     // Map linear indices to offsets in x and y
-    auto x_offset = elem_to_loc(out_idx, x.shape(), x.strides());
-    auto y_offset = elem_to_loc(out_idx, y.shape(), y.strides());
+    auto x_offset = mx::elem_to_loc(out_idx, x.shape(), x.strides());
+    auto y_offset = mx::elem_to_loc(out_idx, y.shape(), y.strides());
     // We allocate the output to be contiguous and regularly strided
     // (defaults to row major) and hence it doesn't need additional mapping
@@ -105,8 +105,8 @@
 /** Fall back implementation for evaluation on CPU */
 void Axpby::eval(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
+    const std::vector<mx::array>& inputs,
+    std::vector<mx::array>& outputs) {
   // Check the inputs (registered in the op while constructing the out array)
   assert(inputs.size() == 2);
   auto& x = inputs[0];
@@ -114,14 +114,14 @@ void Axpby::eval(
   auto& out = outputs[0];
   // Dispatch to the correct dtype
-  if (out.dtype() == float32) {
+  if (out.dtype() == mx::float32) {
     return axpby_impl<float>(x, y, out, alpha_, beta_);
-  } else if (out.dtype() == float16) {
-    return axpby_impl<float16_t>(x, y, out, alpha_, beta_);
-  } else if (out.dtype() == bfloat16) {
-    return axpby_impl<bfloat16_t>(x, y, out, alpha_, beta_);
-  } else if (out.dtype() == complex64) {
-    return axpby_impl<complex64_t>(x, y, out, alpha_, beta_);
+  } else if (out.dtype() == mx::float16) {
+    return axpby_impl<mx::float16_t>(x, y, out, alpha_, beta_);
+  } else if (out.dtype() == mx::bfloat16) {
+    return axpby_impl<mx::bfloat16_t>(x, y, out, alpha_, beta_);
+  } else if (out.dtype() == mx::complex64) {
+    return axpby_impl<mx::complex64_t>(x, y, out, alpha_, beta_);
   } else {
     throw std::runtime_error(
         "Axpby is only supported for floating point types.");
@@ -136,9 +136,9 @@ void Axpby::eval(
 template <typename T>
 void axpby_impl_accelerate(
-    const array& x,
-    const array& y,
-    array& out,
+    const mx::array& x,
+    const mx::array& y,
+    mx::array& out,
     float alpha_,
     float beta_) {
   // Accelerate library provides catlas_saxpby which does
@@ -150,10 +150,10 @@ void axpby_impl_accelerate(
   // The data in the output array is allocated to match the strides in y
   // such that x, y, and out are contiguous in the same mode and
   // no transposition is needed
-  out.set_data(allocator::malloc_or_wait(out.nbytes()));
+  out.set_data(mx::allocator::malloc_or_wait(out.nbytes()));
   // We then copy over the elements using the contiguous vector specialization
-  copy_inplace(y, out, CopyType::Vector);
+  copy_inplace(y, out, mx::CopyType::Vector);
   // Get x and y pointers for catlas_saxpby
   const T* x_ptr = x.data<T>();
@@ -175,15 +175,15 @@ void axpby_impl_accelerate(
 /** Evaluate primitive on CPU using accelerate specializations */
 void Axpby::eval_cpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
+    const std::vector<mx::array>& inputs,
+    std::vector<mx::array>& outputs) {
   assert(inputs.size() == 2);
   auto& x = inputs[0];
   auto& y = inputs[1];
   auto& out = outputs[0];
   // Accelerate specialization for contiguous single precision float arrays
-  if (out.dtype() == float32 &&
+  if (out.dtype() == mx::float32 &&
       ((x.flags().row_contiguous && y.flags().row_contiguous) ||
        (x.flags().col_contiguous && y.flags().col_contiguous))) {
     axpby_impl_accelerate<float>(x, y, out, alpha_, beta_);
@@ -198,8 +198,8 @@ void Axpby::eval_cpu(
 /** Evaluate primitive on CPU falling back to common backend */
 void Axpby::eval_cpu(
-    const std::vector<array>& inputs,
-    const std::vector<array>& outputs) {
+    const std::vector<mx::array>& inputs,
+    std::vector<mx::array>& outputs) {
   eval(inputs, outputs);
 }
@@ -213,8 +213,8 @@ void Axpby::eval_cpu(
 /** Evaluate primitive on GPU */
 void Axpby::eval_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& outputs) {
+    const std::vector<mx::array>& inputs,
+    std::vector<mx::array>& outputs) {
   // Prepare inputs
   assert(inputs.size() == 2);
   auto& x = inputs[0];
@@ -225,7 +225,7 @@ void Axpby::eval_gpu(
   // and each stream carries its device identifiers
   auto& s = stream();
   // We get the needed metal device using the stream
-  auto& d = metal::device(s.device);
+  auto& d = mx::metal::device(s.device);
   // Prepare to specialize based on contiguity
   bool contiguous_kernel =
@@ -235,12 +235,12 @@
   // Allocate output memory with strides based on specialization
   if (contiguous_kernel) {
     out.set_data(
-        allocator::malloc_or_wait(x.data_size() * out.itemsize()),
+        mx::allocator::malloc_or_wait(x.data_size() * out.itemsize()),
         x.data_size(),
         x.strides(),
         x.flags());
   } else {
-    out.set_data(allocator::malloc_or_wait(out.nbytes()));
+    out.set_data(mx::allocator::malloc_or_wait(out.nbytes()));
   }
   // Resolve name of kernel (corresponds to axpby.metal)
@@ -302,8 +302,8 @@ void Axpby::eval_gpu(
 /** Fail evaluation on GPU */
 void Axpby::eval_gpu(
-    const std::vector<array>& inputs,
-    std::vector<array>& out) {
+    const std::vector<mx::array>& inputs,
+    std::vector<mx::array>& out) {
   throw std::runtime_error("Axpby has no GPU implementation.");
 }
@@ -314,9 +314,9 @@ void Axpby::eval_gpu(
 ///////////////////////////////////////////////////////////////////////////////
 /** The Jacobian-vector product. */
-std::vector<array> Axpby::jvp(
-    const std::vector<array>& primals,
-    const std::vector<array>& tangents,
+std::vector<mx::array> Axpby::jvp(
+    const std::vector<mx::array>& primals,
+    const std::vector<mx::array>& tangents,
     const std::vector<int>& argnums) {
   // Forward mode diff that pushes along the tangents
   // The jvp transform on the primitive can built with ops
@@ -328,8 +328,8 @@ std::vector<array> Axpby::jvp(
   // scaled by beta
   if (argnums.size() > 1) {
     auto scale = argnums[0] == 0 ? alpha_ : beta_;
-    auto scale_arr = array(scale, tangents[0].dtype());
-    return {multiply(scale_arr, tangents[0], stream())};
+    auto scale_arr = mx::array(scale, tangents[0].dtype());
+    return {mx::multiply(scale_arr, tangents[0], stream())};
   }
   // If, argnums = {0, 1}, we take contributions from both
   // which gives us jvp = tangent_x * alpha + tangent_y * beta
@@ -339,24 +339,24 @@
 }
 /** The vector-Jacobian product. */
-std::vector<array> Axpby::vjp(
-    const std::vector<array>& primals,
-    const std::vector<array>& cotangents,
+std::vector<mx::array> Axpby::vjp(
+    const std::vector<mx::array>& primals,
+    const std::vector<mx::array>& cotangents,
     const std::vector<int>& argnums,
-    const std::vector<array>&) {
+    const std::vector<mx::array>&) {
   // Reverse mode diff
-  std::vector<array> vjps;
+  std::vector<mx::array> vjps;
   for (auto arg : argnums) {
     auto scale = arg == 0 ? alpha_ : beta_;
-    auto scale_arr = array(scale, cotangents[0].dtype());
-    vjps.push_back(multiply(scale_arr, cotangents[0], stream()));
+    auto scale_arr = mx::array(scale, cotangents[0].dtype());
+    vjps.push_back(mx::multiply(scale_arr, cotangents[0], stream()));
   }
   return vjps;
 }
 /** Vectorize primitive along given axis */
-std::pair<std::vector<array>, std::vector<int>> Axpby::vmap(
-    const std::vector<array>& inputs,
+std::pair<std::vector<mx::array>, std::vector<int>> Axpby::vmap(
+    const std::vector<mx::array>& inputs,
     const std::vector<int>& axes) {
   throw std::runtime_error("Axpby has no vmap implementation.");
 }
@@ -367,4 +367,4 @@ bool Axpby::is_equivalent(const Primitive& other) const {
   return alpha_ == r_other.alpha_ && beta_ == r_other.beta_;
 }
-} // namespace mlx::core
+} // namespace my_ext

View File

@@ -5,7 +5,9 @@
 #include "mlx/ops.h"
 #include "mlx/primitives.h"
-namespace mlx::core {
+namespace mx = mlx::core;
+namespace my_ext {
 ///////////////////////////////////////////////////////////////////////////////
 // Operation
@@ -18,22 +20,22 @@ namespace mlx::core {
  * Follow numpy style broadcasting between x and y
  * Inputs are upcasted to floats if needed
 **/
-array axpby(
-    const array& x, // Input array x
-    const array& y, // Input array y
+mx::array axpby(
+    const mx::array& x, // Input array x
+    const mx::array& y, // Input array y
     const float alpha, // Scaling factor for x
     const float beta, // Scaling factor for y
-    StreamOrDevice s = {} // Stream on which to schedule the operation
+    mx::StreamOrDevice s = {} // Stream on which to schedule the operation
 );
 ///////////////////////////////////////////////////////////////////////////////
 // Primitive
 ///////////////////////////////////////////////////////////////////////////////
-class Axpby : public Primitive {
+class Axpby : public mx::Primitive {
  public:
-  explicit Axpby(Stream stream, float alpha, float beta)
-      : Primitive(stream), alpha_(alpha), beta_(beta) {};
+  explicit Axpby(mx::Stream stream, float alpha, float beta)
+      : mx::Primitive(stream), alpha_(alpha), beta_(beta) {};
   /**
    * A primitive must know how to evaluate itself on the CPU/GPU
@@ -42,23 +44,25 @@ class Axpby : public Primitive {
    * To avoid unnecessary allocations, the evaluation function
    * is responsible for allocating space for the array.
    */
-  void eval_cpu(const std::vector<array>& inputs, std::vector<array>& outputs)
-      override;
-  void eval_gpu(const std::vector<array>& inputs, std::vector<array>& outputs)
-      override;
+  void eval_cpu(
+      const std::vector<mx::array>& inputs,
+      std::vector<mx::array>& outputs) override;
+  void eval_gpu(
+      const std::vector<mx::array>& inputs,
+      std::vector<mx::array>& outputs) override;
   /** The Jacobian-vector product. */
-  std::vector<array> jvp(
-      const std::vector<array>& primals,
-      const std::vector<array>& tangents,
+  std::vector<mx::array> jvp(
+      const std::vector<mx::array>& primals,
+      const std::vector<mx::array>& tangents,
       const std::vector<int>& argnums) override;
   /** The vector-Jacobian product. */
-  std::vector<array> vjp(
-      const std::vector<array>& primals,
-      const std::vector<array>& cotangents,
+  std::vector<mx::array> vjp(
+      const std::vector<mx::array>& primals,
+      const std::vector<mx::array>& cotangents,
       const std::vector<int>& argnums,
-      const std::vector<array>& outputs) override;
+      const std::vector<mx::array>& outputs) override;
   /**
    * The primitive must know how to vectorize itself across
@@ -66,8 +70,8 @@ class Axpby : public Primitive {
    * representing the vectorized computation and the axis which
    * corresponds to the output vectorized dimension.
    */
-  std::pair<std::vector<array>, std::vector<int>> vmap(
-      const std::vector<array>& inputs,
+  std::pair<std::vector<mx::array>, std::vector<int>> vmap(
+      const std::vector<mx::array>& inputs,
       const std::vector<int>& axes) override;
   /** Print the primitive. */
@@ -76,14 +80,16 @@ class Axpby : public Primitive {
   }
   /** Equivalence check **/
-  bool is_equivalent(const Primitive& other) const override;
+  bool is_equivalent(const mx::Primitive& other) const override;
  private:
   float alpha_;
   float beta_;
   /** Fall back implementation for evaluation on CPU */
-  void eval(const std::vector<array>& inputs, std::vector<array>& outputs);
+  void eval(
+      const std::vector<mx::array>& inputs,
+      std::vector<mx::array>& outputs);
 };
-} // namespace mlx::core
+} // namespace my_ext
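
Since the extension op and its primitive now live in my_ext rather than mlx::core, C++ callers have to qualify the op explicitly, mirroring the &my_ext::axpby reference passed to nanobind below. A minimal sketch of such a call site; the include path and variable names here are assumptions for illustration, not taken from the diff:

#include <iostream>

#include "mlx/mlx.h"
#include "axpby/axpby.h" // assumed include path for the extension header

namespace mx = mlx::core;

int main() {
  auto x = mx::ones({4});
  auto y = mx::ones({4});
  // Computes alpha * x + beta * y on the default stream.
  auto out = my_ext::axpby(x, y, 2.0f, 3.0f);
  mx::eval(out);
  std::cout << out << std::endl;
  return 0;
}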

View File

@@ -8,14 +8,12 @@
 namespace nb = nanobind;
 using namespace nb::literals;
-using namespace mlx::core;
 NB_MODULE(_ext, m) {
   m.doc() = "Sample extension for MLX";
   m.def(
       "axpby",
-      &axpby,
+      &my_ext::axpby,
       "x"_a,
       "y"_a,
       "alpha"_a,