More jitting (#1132)

* docs + circle min size build
* jit scan, arange, softmax
* add sort
* jit reductions
* remove print
* fix deps
* clean includes / nits
2025-10-24 20:28:16 +08:00 · 2024-05-23 16:23:44 -07:00
parent 9401507336
commit 0189ab6ab6
41 changed files with 2377 additions and 1846 deletions
--- a/mlx/backend/metal/jit/arange.h
+++ b/mlx/backend/metal/jit/arange.h
@@ -0,0 +1,9 @@
+// Copyright © 2024 Apple Inc.
+
+// Source-text template declaring one explicit instantiation of the `arange`
+// kernel. The numbered fields are presumably substituted by a fmt-style
+// formatter at the point of use (not shown here):
+//   {0} - name the instantiation is exposed under via [[host_name]]
+//   {1} - template argument, also the type of `start`, `step` and `out`
+constexpr std::string_view arange_kernels = R"(
+template [[host_name("{0}")]] [[kernel]] void arange<{1}>(
+    constant const {1}& start,
+    constant const {1}& step,
+    device {1}* out,
+    uint index [[thread_position_in_grid]]);
+)";
--- a/mlx/backend/metal/jit/includes.h
+++ b/mlx/backend/metal/jit/includes.h
@@ -1,4 +1,4 @@
-// Copyright © 2023-24 Apple Inc.
+// Copyright © 2023-2024 Apple Inc.

 #pragma once

@@ -8,14 +8,19 @@ const char* utils();
 const char* binary_ops();
 const char* unary_ops();
 const char* ternary_ops();
-const char* reduction();
+const char* reduce_utils();
 const char* gather();
 const char* scatter();

+const char* arange();
 const char* unary();
 const char* binary();
 const char* binary_two();
 const char* copy();
 const char* ternary();
+const char* scan();
+const char* softmax();
+const char* sort();
+const char* reduce();

 } // namespace mlx::core::metal
--- a/mlx/backend/metal/jit/reduce.h
+++ b/mlx/backend/metal/jit/reduce.h
@@ -0,0 +1,168 @@
+// Copyright © 2024 Apple Inc.
+
+// Source-text template for a complete kernel definition (not just an
+// instantiation): each thread stores `{2}<{1}>::init` into `out[tid]`.
+//   {0} - kernel function name
+//   {1} - element type of `out`
+//   {2} - template whose `init` member supplies the stored value
+// The doubled braces are presumably escapes that a fmt-style formatter turns
+// into literal `{` / `}` -- confirm against the code that formats this string.
+constexpr std::string_view reduce_init_kernels = R"(
+[[kernel]] void {0}(
+    device {1}* out [[buffer(0)]],
+    uint tid [[thread_position_in_grid]]) {{
+  out[tid] = {2}<{1}>::init;
+}}
+)";
+
+// Source-text template declaring explicit instantiations of six reduction
+// kernels: all_reduce, col_reduce_general, col_reduce_small,
+// row_reduce_general_small, row_reduce_general_med and row_reduce_general.
+//   {0} - suffix appended to each host name (all_, colGeneral_, colSmall_,
+//         rowGeneralSmall_, rowGeneralMed_, rowGeneral_)
+//   {1} - element type of `in`
+//   {2} - element type of `out`; wrapped as mlx_atomic<{2}> for all_reduce,
+//         col_reduce_general and row_reduce_general
+//   {3} - template instantiated as {3}<{2}> for the third template argument
+// NOTE(review): row_reduce_general_med binds simd_per_group to
+// [[dispatch_simdgroups_per_threadgroup]] while the other kernels in this
+// string use [[simdgroups_per_threadgroup]] -- confirm this matches the
+// kernel definition.
+constexpr std::string_view reduce_kernels = R"(
+template [[host_name("all_{0}")]] [[kernel]] void
+all_reduce<{1}, {2}, {3}<{2}>>(
+    const device {1}* in [[buffer(0)]],
+    device mlx_atomic<{2}>* out [[buffer(1)]],
+    const device size_t& in_size [[buffer(2)]],
+    uint gid [[thread_position_in_grid]],
+    uint lid [[thread_position_in_threadgroup]],
+    uint grid_size [[threads_per_grid]],
+    uint simd_per_group [[simdgroups_per_threadgroup]],
+    uint simd_lane_id [[thread_index_in_simdgroup]],
+    uint simd_group_id [[simdgroup_index_in_threadgroup]]);
+template [[host_name("colGeneral_{0}")]] [[kernel]] void
+col_reduce_general<{1}, {2}, {3}<{2}>>(
+    const device {1}* in [[buffer(0)]],
+    device mlx_atomic<{2}>* out [[buffer(1)]],
+    const constant size_t& reduction_size [[buffer(2)]],
+    const constant size_t& reduction_stride [[buffer(3)]],
+    const constant size_t& out_size [[buffer(4)]],
+    const constant int* shape [[buffer(5)]],
+    const constant size_t* strides [[buffer(6)]],
+    const constant int& ndim [[buffer(7)]],
+    threadgroup {2}* local_data [[threadgroup(0)]],
+    uint3 tid [[threadgroup_position_in_grid]],
+    uint3 lid [[thread_position_in_threadgroup]],
+    uint3 lsize [[threads_per_threadgroup]]);
+template [[host_name("colSmall_{0}")]] [[kernel]] void
+col_reduce_small<{1}, {2}, {3}<{2}>>(
+    const device {1}* in [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const constant size_t& reduction_size [[buffer(2)]],
+    const constant size_t& reduction_stride [[buffer(3)]],
+    const constant size_t& out_size [[buffer(4)]],
+    const constant int* shape [[buffer(5)]],
+    const constant size_t* strides [[buffer(6)]],
+    const constant int& ndim [[buffer(7)]],
+    const constant size_t& non_col_reductions [[buffer(8)]],
+    const constant int* non_col_shapes [[buffer(9)]],
+    const constant size_t* non_col_strides [[buffer(10)]],
+    const constant int& non_col_ndim [[buffer(11)]],
+    uint tid [[thread_position_in_grid]]);
+template [[host_name("rowGeneralSmall_{0}")]] [[kernel]] void
+row_reduce_general_small<{1}, {2}, {3}<{2}>>(
+    const device {1}* in [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const constant size_t& reduction_size [[buffer(2)]],
+    const constant size_t& out_size [[buffer(3)]],
+    const constant size_t& non_row_reductions [[buffer(4)]],
+    const constant int* shape [[buffer(5)]],
+    const constant size_t* strides [[buffer(6)]],
+    const constant int& ndim [[buffer(7)]],
+    uint lid [[thread_position_in_grid]]);
+template [[host_name("rowGeneralMed_{0}")]] [[kernel]] void
+row_reduce_general_med<{1}, {2}, {3}<{2}>>(
+    const device {1}* in [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const constant size_t& reduction_size [[buffer(2)]],
+    const constant size_t& out_size [[buffer(3)]],
+    const constant size_t& non_row_reductions [[buffer(4)]],
+    const constant int* shape [[buffer(5)]],
+    const constant size_t* strides [[buffer(6)]],
+    const constant int& ndim [[buffer(7)]],
+    uint tid [[threadgroup_position_in_grid]],
+    uint simd_lane_id [[thread_index_in_simdgroup]],
+    uint simd_per_group [[dispatch_simdgroups_per_threadgroup]],
+    uint simd_group_id [[simdgroup_index_in_threadgroup]]);
+template [[host_name("rowGeneral_{0}")]] [[kernel]] void
+row_reduce_general<{1}, {2}, {3}<{2}>>(
+    const device {1}* in [[buffer(0)]],
+    device mlx_atomic<{2}>* out [[buffer(1)]],
+    const constant size_t& reduction_size [[buffer(2)]],
+    const constant size_t& out_size [[buffer(3)]],
+    const constant size_t& non_row_reductions [[buffer(4)]],
+    const constant int* shape [[buffer(5)]],
+    const constant size_t* strides [[buffer(6)]],
+    const constant int& ndim [[buffer(7)]],
+    uint3 lid [[thread_position_in_threadgroup]],
+    uint3 lsize [[threads_per_threadgroup]],
+    uint3 tid [[threadgroup_position_in_grid]],
+    uint simd_lane_id [[thread_index_in_simdgroup]],
+    uint simd_per_group [[simdgroups_per_threadgroup]],
+    uint simd_group_id [[simdgroup_index_in_threadgroup]]);
+)";
+
+// Source-text template declaring explicit instantiations of five reduction
+// kernels whose `out` argument is always a plain `device {2}*` (no
+// mlx_atomic<> wrapper): all_reduce_no_atomics,
+// col_reduce_general_no_atomics, col_reduce_small, row_reduce_general_small
+// and row_reduce_general_no_atomics.
+//   {0} - suffix appended to each host name (allNoAtomics_,
+//         colGeneralNoAtomics_, colSmall_, rowGeneralSmall_,
+//         rowGeneralNoAtomics_)
+//   {1} - element type of `in`
+//   {2} - element type of `out`
+//   {3} - template instantiated as {3}<{2}> for the third template argument
+// NOTE(review): colSmall_{0} and rowGeneralSmall_{0} use the same host names
+// as the entries in reduce_kernels; presumably only one of the two strings is
+// formatted for a given {0} -- confirm at the call site.
+constexpr std::string_view reduce_non_atomic_kernels = R"(
+template [[host_name("allNoAtomics_{0}")]] [[kernel]] void
+all_reduce_no_atomics<{1}, {2}, {3}<{2}>>(
+    const device {1}* in [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const device size_t& in_size [[buffer(2)]],
+    uint gid [[thread_position_in_grid]],
+    uint lid [[thread_position_in_threadgroup]],
+    uint grid_size [[threads_per_grid]],
+    uint simd_per_group [[simdgroups_per_threadgroup]],
+    uint simd_lane_id [[thread_index_in_simdgroup]],
+    uint simd_group_id [[simdgroup_index_in_threadgroup]],
+    uint thread_group_id [[threadgroup_position_in_grid]]);
+
+template [[host_name("colGeneralNoAtomics_{0}")]] [[kernel]] void
+  col_reduce_general_no_atomics<{1}, {2}, {3}<{2}>>(
+      const device {1}* in [[buffer(0)]],
+      device {2}* out [[buffer(1)]],
+      const constant size_t& reduction_size [[buffer(2)]],
+      const constant size_t& reduction_stride [[buffer(3)]],
+      const constant size_t& out_size [[buffer(4)]],
+      const constant int* shape [[buffer(5)]],
+      const constant size_t* strides [[buffer(6)]],
+      const constant int& ndim [[buffer(7)]],
+      threadgroup {2}* local_data [[threadgroup(0)]],
+      uint3 tid [[threadgroup_position_in_grid]],
+      uint3 lid [[thread_position_in_threadgroup]],
+      uint3 gid [[thread_position_in_grid]],
+      uint3 lsize [[threads_per_threadgroup]],
+      uint3 gsize [[threads_per_grid]]);
+template [[host_name("colSmall_{0}")]] [[kernel]] void
+col_reduce_small<{1}, {2}, {3}<{2}>>(
+    const device {1}* in [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const constant size_t& reduction_size [[buffer(2)]],
+    const constant size_t& reduction_stride [[buffer(3)]],
+    const constant size_t& out_size [[buffer(4)]],
+    const constant int* shape [[buffer(5)]],
+    const constant size_t* strides [[buffer(6)]],
+    const constant int& ndim [[buffer(7)]],
+    const constant size_t& non_col_reductions [[buffer(8)]],
+    const constant int* non_col_shapes [[buffer(9)]],
+    const constant size_t* non_col_strides [[buffer(10)]],
+    const constant int& non_col_ndim [[buffer(11)]],
+    uint tid [[thread_position_in_grid]]);
+template [[host_name("rowGeneralSmall_{0}")]] [[kernel]] void
+row_reduce_general_small<{1}, {2}, {3}<{2}>>(
+    const device {1}* in [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const constant size_t& reduction_size [[buffer(2)]],
+    const constant size_t& out_size [[buffer(3)]],
+    const constant size_t& non_row_reductions [[buffer(4)]],
+    const constant int* shape [[buffer(5)]],
+    const constant size_t* strides [[buffer(6)]],
+    const constant int& ndim [[buffer(7)]],
+    uint lid [[thread_position_in_grid]]);
+template [[host_name("rowGeneralNoAtomics_{0}")]] [[kernel]] void
+row_reduce_general_no_atomics<{1}, {2}, {3}<{2}>>(
+    const device {1}* in [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const constant size_t& reduction_size [[buffer(2)]],
+    const constant size_t& out_size [[buffer(3)]],
+    const constant size_t& non_row_reductions [[buffer(4)]],
+    const constant int* shape [[buffer(5)]],
+    const constant size_t* strides [[buffer(6)]],
+    const constant int& ndim [[buffer(7)]],
+    uint3 lid [[thread_position_in_threadgroup]],
+    uint3 lsize [[threads_per_threadgroup]],
+    uint3 gsize [[threads_per_grid]],
+    uint3 tid [[threadgroup_position_in_grid]],
+    uint simd_lane_id [[thread_index_in_simdgroup]],
+    uint simd_per_group [[simdgroups_per_threadgroup]],
+    uint simd_group_id [[simdgroup_index_in_threadgroup]]);
+)";
--- a/mlx/backend/metal/jit/scan.h
+++ b/mlx/backend/metal/jit/scan.h
@@ -0,0 +1,26 @@
+// Copyright © 2024 Apple Inc.
+
+// Source-text template declaring explicit instantiations of the
+// contiguous_scan and strided_scan kernels.
+//   {0} - suffix appended to the host names contig_ and strided_
+//   {1} - element type of `in`
+//   {2} - element type of `out`
+//   {3} - template instantiated as {3}<{2}> for the third template argument
+//   {4}, {5} - fifth and sixth template arguments; their meaning is defined
+//         by the kernel templates, which are not visible here
+// The fourth template argument is fixed to the literal 4 for both kernels.
+constexpr std::string_view scan_kernels = R"(
+template [[host_name("contig_{0}")]] [[kernel]] void
+contiguous_scan<{1}, {2}, {3}<{2}>, 4, {4}, {5}>(
+    const device {1}* in [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const constant size_t& axis_size [[buffer(2)]],
+    uint gid [[thread_position_in_grid]],
+    uint lid [[thread_position_in_threadgroup]],
+    uint lsize [[threads_per_threadgroup]],
+    uint simd_size [[threads_per_simdgroup]],
+    uint simd_lane_id [[thread_index_in_simdgroup]],
+    uint simd_group_id [[simdgroup_index_in_threadgroup]]);
+
+template [[host_name("strided_{0}")]] [[kernel]] void
+strided_scan<{1}, {2}, {3}<{2}>, 4, {4}, {5}>(
+    const device {1}* in [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const constant size_t& axis_size [[buffer(2)]],
+    const constant size_t& stride [[buffer(3)]],
+    uint2 gid [[thread_position_in_grid]],
+    uint2 lid [[thread_position_in_threadgroup]],
+    uint2 lsize [[threads_per_threadgroup]],
+    uint simd_size [[threads_per_simdgroup]]);
+)";
--- a/mlx/backend/metal/jit/softmax.h
+++ b/mlx/backend/metal/jit/softmax.h
@@ -0,0 +1,23 @@
+// Copyright © 2024 Apple Inc.
+
+// Source-text template declaring explicit instantiations of the
+// softmax_single_row and softmax_looped kernels.
+//   {0} - suffix appended to the host names block_ and looped_
+//   {1} - first template argument, also the element type of `in` and `out`
+//   {2} - second template argument; its meaning is defined by the kernel
+//         templates, which are not visible here
+// NOTE(review): `gid` is bound to [[thread_position_in_grid]] for
+// softmax_single_row but to [[threadgroup_position_in_grid]] for
+// softmax_looped -- confirm each matches its kernel definition.
+constexpr std::string_view softmax_kernels = R"(
+template [[host_name("block_{0}")]] [[kernel]] void
+softmax_single_row<{1}, {2}>(
+    const device {1}* in,
+    device {1}* out,
+    constant int& axis_size,
+    uint gid [[thread_position_in_grid]],
+    uint _lid [[thread_position_in_threadgroup]],
+    uint simd_lane_id [[thread_index_in_simdgroup]],
+    uint simd_group_id [[simdgroup_index_in_threadgroup]]);
+template [[host_name("looped_{0}")]] [[kernel]] void
+softmax_looped<{1}, {2}>(
+    const device {1}* in,
+    device {1}* out,
+    constant int& axis_size,
+    uint gid [[threadgroup_position_in_grid]],
+    uint lid [[thread_position_in_threadgroup]],
+    uint lsize [[threads_per_threadgroup]],
+    uint simd_lane_id [[thread_index_in_simdgroup]],
+    uint simd_group_id [[simdgroup_index_in_threadgroup]]);
+)";
--- a/mlx/backend/metal/jit/sort.h
+++ b/mlx/backend/metal/jit/sort.h
@@ -0,0 +1,81 @@
+// Copyright © 2024 Apple Inc.
+
+// Source-text template declaring four explicit instantiations: block_sort
+// and block_sort_nc, each once with the third template argument `true`
+// (host names carg_ / ncarg_) and once with `false` (host names c_ / nc_).
+//   {0} - suffix appended to each host name
+//   {1} - element type of `inp`
+//   {2} - element type of `out`
+//   {3}, {4} - fourth and fifth template arguments; their meaning is defined
+//         by the kernel templates, which are not visible here
+// The `true` variants presumably select arg-sort behaviour, judging by the
+// "arg" in their host names -- confirm against the kernel templates.
+constexpr std::string_view block_sort_kernels = R"(
+template [[host_name("carg_{0}")]] [[kernel]] void
+block_sort<{1}, {2}, true, {3}, {4}>(
+    const device {1}* inp [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const constant int& size_sorted_axis [[buffer(2)]],
+    const constant int& stride_sorted_axis [[buffer(3)]],
+    const constant int& stride_segment_axis [[buffer(4)]],
+    uint3 tid [[threadgroup_position_in_grid]],
+    uint3 lid [[thread_position_in_threadgroup]]);
+template [[host_name("ncarg_{0}")]] [[kernel]] void
+block_sort_nc<{1}, {2}, true, {3}, {4}>(
+    const device {1}* inp [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const constant int& size_sorted_axis [[buffer(2)]],
+    const constant int& stride_sorted_axis [[buffer(3)]],
+    const constant int& nc_dim [[buffer(4)]],
+    const device int* nc_shape [[buffer(5)]],
+    const device size_t* nc_strides [[buffer(6)]],
+    uint3 tid [[threadgroup_position_in_grid]],
+    uint3 lid [[thread_position_in_threadgroup]]);
+template [[host_name("c_{0}")]] [[kernel]] void
+block_sort<{1}, {2}, false, {3}, {4}>(
+    const device {1}* inp [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const constant int& size_sorted_axis [[buffer(2)]],
+    const constant int& stride_sorted_axis [[buffer(3)]],
+    const constant int& stride_segment_axis [[buffer(4)]],
+    uint3 tid [[threadgroup_position_in_grid]],
+    uint3 lid [[thread_position_in_threadgroup]]);
+template [[host_name("nc_{0}")]] [[kernel]] void
+block_sort_nc<{1}, {2}, false, {3}, {4}>(
+    const device {1}* inp [[buffer(0)]],
+    device {2}* out [[buffer(1)]],
+    const constant int& size_sorted_axis [[buffer(2)]],
+    const constant int& stride_sorted_axis [[buffer(3)]],
+    const constant int& nc_dim [[buffer(4)]],
+    const device int* nc_shape [[buffer(5)]],
+    const device size_t* nc_strides [[buffer(6)]],
+    uint3 tid [[threadgroup_position_in_grid]],
+    uint3 lid [[thread_position_in_threadgroup]]);
+)";
+
+// Source-text template declaring explicit instantiations of three kernels:
+// mb_block_sort (host name sort_), mb_block_partition (partition_) and
+// mb_block_merge (merge_). All three fix the third template argument to
+// `true`.
+//   {0} - suffix appended to each host name
+//   {1} - element type of the value buffers (inp, out_vals, dev_vals*)
+//   {2} - element type of the index buffers (out_idxs, dev_idxs*) and of
+//         block_partitions
+//   {3}, {4} - fourth and fifth template arguments; their meaning is defined
+//         by the kernel templates, which are not visible here
+constexpr std::string_view multiblock_sort_kernels = R"(
+template [[host_name("sort_{0}")]] [[kernel]] void
+mb_block_sort<{1}, {2}, true, {3}, {4}>(
+    const device {1}* inp [[buffer(0)]],
+    device {1}* out_vals [[buffer(1)]],
+    device {2}* out_idxs [[buffer(2)]],
+    const constant int& size_sorted_axis [[buffer(3)]],
+    const constant int& stride_sorted_axis [[buffer(4)]],
+    const constant int& nc_dim [[buffer(5)]],
+    const device int* nc_shape [[buffer(6)]],
+    const device size_t* nc_strides [[buffer(7)]],
+    uint3 tid [[threadgroup_position_in_grid]],
+    uint3 lid [[thread_position_in_threadgroup]]);
+template [[host_name("partition_{0}")]] [[kernel]] void
+mb_block_partition<{1}, {2}, true, {3}, {4}>(
+    device {2}* block_partitions [[buffer(0)]],
+    const device {1}* dev_vals [[buffer(1)]],
+    const device {2}* dev_idxs [[buffer(2)]],
+    const constant int& size_sorted_axis [[buffer(3)]],
+    const constant int& merge_tiles [[buffer(4)]],
+    uint3 tid [[threadgroup_position_in_grid]],
+    uint3 lid [[thread_position_in_threadgroup]],
+    uint3 tgp_dims [[threads_per_threadgroup]]);
+template [[host_name("merge_{0}")]] [[kernel]] void
+mb_block_merge<{1}, {2}, true, {3}, {4}>(
+    const device {2}* block_partitions [[buffer(0)]],
+    const device {1}* dev_vals_in [[buffer(1)]],
+    const device {2}* dev_idxs_in [[buffer(2)]],
+    device {1}* dev_vals_out [[buffer(3)]],
+    device {2}* dev_idxs_out [[buffer(4)]],
+    const constant int& size_sorted_axis [[buffer(5)]],
+    const constant int& merge_tiles [[buffer(6)]],
+    const constant int& num_tiles [[buffer(7)]],
+    uint3 tid [[threadgroup_position_in_grid]],
+    uint3 lid [[thread_position_in_threadgroup]]);
+)";