Add load_safe to conv general loader

This commit is contained in:
Angelos Katharopoulos 2025-06-03 10:13:16 -07:00
parent 5866b3857b
commit c9af09d118
7 changed files with 198 additions and 22 deletions

View File

@ -1,6 +1,7 @@
// Copyright © 2023-2024 Apple Inc.
#include <algorithm>
#include <cassert>
#include <iostream>
#include <numeric>
#include <sstream>
@ -391,6 +392,7 @@ void implicit_gemm_conv_2D_general_gpu(
// Get channel iteration info
int channel_k_iters = ((conv_params.C + bk - 1) / bk);
int gemm_k_iters = channel_k_iters;
bool align_C = conv_params.C % bk == 0;
// Fix host side helper params
int sign = (conv_params.flip ? -1 : 1);
@ -419,14 +421,33 @@ void implicit_gemm_conv_2D_general_gpu(
/* const int swizzle_log = */ swizzle_log};
// Determine kernel
std::ostringstream kname;
kname << "implicit_gemm_conv_2d_general_" << type_to_name(out) << "_bm" << bm
<< "_bn" << bn << "_bk" << bk << "_wm" << wm << "_wn" << wn;
std::string kname;
kname.reserve(64);
concatenate(
kname,
"implicit_gemm_conv_2d_general_",
type_to_name(out),
"_bm",
bm,
"_bn",
bn,
"_bk",
bk,
"_wm",
wm,
"_wn",
wn);
std::string hash_name;
hash_name.reserve(64);
concatenate(hash_name, kname, "_alC_", align_C);
metal::MTLFCList func_consts = {
{&align_C, MTL::DataType::DataTypeBool, 200},
};
// Encode and dispatch kernel
auto& compute_encoder = d.get_command_encoder(s.index);
auto kernel =
get_steel_conv_general_kernel(d, kname.str(), out, bm, bn, bk, wm, wn);
auto kernel = get_steel_conv_general_kernel(
d, kname, hash_name, func_consts, out, bm, bn, bk, wm, wn);
compute_encoder.set_compute_pipeline_state(kernel);
// Deduce grid launch dimensions
@ -728,8 +749,10 @@ void dispatch_conv_2D_gpu(
// Direct to winograd conv
bool inp_large =
(conv_params.N * conv_params.iS[0] * conv_params.iS[1]) >= 1ul << 12;
(conv_params.N * conv_params.iS[0] * conv_params.iS[1]) >= 4096;
bool channels_large = (conv_params.C + conv_params.O) >= 256;
bool out_large =
(conv_params.N * conv_params.oS[0] * conv_params.oS[1]) >= 256;
if (!conv_params.flip && is_stride_one && is_kdil_one && is_idil_one &&
conv_params.wS[0] == 3 && conv_params.wS[1] == 3 &&
conv_params.C % 32 == 0 && conv_params.O % 32 == 0 && inp_large &&
@ -743,7 +766,9 @@ void dispatch_conv_2D_gpu(
return implicit_gemm_conv_2D_gpu(s, d, in, wt, out, conv_params);
}
else if (conv_params.C % 16 == 0 && conv_params.O % 16 == 0) {
else if (
(conv_params.C % 16 == 0 && conv_params.O % 16 == 0) || out_large ||
true) {
return implicit_gemm_conv_2D_general_gpu(s, d, in, wt, out, conv_params);
}

View File

@ -727,6 +727,8 @@ MTL::ComputePipelineState* get_steel_conv_kernel(
MTL::ComputePipelineState* get_steel_conv_general_kernel(
metal::Device& d,
const std::string& kernel_name,
const std::string& hash_name,
const metal::MTLFCList& func_consts,
const array& out,
int bm,
int bn,
@ -749,7 +751,7 @@ MTL::ComputePipelineState* get_steel_conv_general_kernel(
wn);
return kernel_source.str();
});
return d.get_kernel(kernel_name, lib);
return d.get_kernel(kernel_name, lib, hash_name, func_consts);
}
MTL::ComputePipelineState* get_fft_kernel(

View File

@ -205,6 +205,8 @@ MTL::ComputePipelineState* get_gemv_masked_kernel(
MTL::ComputePipelineState* get_steel_conv_general_kernel(
metal::Device& d,
const std::string& kernel_name,
const std::string& hash_name,
const metal::MTLFCList& func_consts,
const array& out,
int bm,
int bn,

View File

@ -2,6 +2,8 @@
#include "mlx/backend/metal/kernels/steel/conv/loaders/loader_general.h"
constant bool align_C [[function_constant(200)]];
template <
typename T,
int BM,
@ -118,23 +120,58 @@ implicit_gemm_conv_2d_general(
// Prepare threadgroup mma operation
mma_t mma_op(simd_gid, simd_lid);
int gemm_k_iterations =
base_wh_size * base_ww_size * gemm_params->gemm_k_iterations;
if (align_C) {
int gemm_k_iterations =
base_wh_size * base_ww_size * gemm_params->gemm_k_iterations;
for (int k = 0; k < gemm_k_iterations; k++) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_unsafe();
loader_b.load_unsafe();
for (int k = 0; k < gemm_k_iterations; k++) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_unsafe();
loader_b.load_unsafe();
threadgroup_barrier(mem_flags::mem_threadgroup);
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
}
else {
for (int k = 1; k < gemm_params->gemm_k_iterations; k++) {
for (int j = 0; j < base_wh_size * base_ww_size; j++) {
threadgroup_barrier(mem_flags::mem_threadgroup);
// Load elements into threadgroup
loader_a.load_unsafe();
loader_b.load_unsafe();
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
}
const short remaining_k = params->C % BK;
for (int j = 0; j < base_wh_size * base_ww_size; j++) {
// Load elements into threadgroup
threadgroup_barrier(mem_flags::mem_threadgroup);
loader_a.load_safe(remaining_k);
loader_b.load_safe(remaining_k);
threadgroup_barrier(mem_flags::mem_threadgroup);
// Multiply and accumulate threadgroup elements
mma_op.mma(As, Bs);
// Prepare for next iteration
loader_a.next();
loader_b.next();
}
}
threadgroup_barrier(mem_flags::mem_none);

View File

@ -137,6 +137,52 @@ struct Conv2DInputBlockLoaderGeneral {
}
}
METAL_FUNC void load_safe(const short remaining_k) const {
STEEL_PRAGMA_UNROLL
for (short i = 0, is = 0; i < n_rows; ++i, is += TROWS) {
// Find bounds
int n = read_n[i];
int h_flip = params->flip ? params->wS[0] - weight_h - 1 : weight_h;
int w_flip = params->flip ? params->wS[1] - weight_w - 1 : weight_w;
int ih_dil = read_ih[i] + h_flip * params->kdil[0];
int iw_dil = read_iw[i] + w_flip * params->kdil[1];
int ih = ih_dil / params->idil[0];
int iw = iw_dil / params->idil[1];
size_t offset = ih * params->in_strides[1] + iw * params->in_strides[2];
// Read from input if in bounds
if ((n < params->N) && (ih_dil >= 0 && ih < params->iS[0]) &&
(iw_dil >= 0 && iw < params->iS[1])) {
if (bj + vec_size <= remaining_k) {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; ++j) {
dst[is * dst_ld + j] = (src[i])[offset + j];
}
} else {
for (short j = 0; j < vec_size; ++j) {
if (bj + j < remaining_k) {
dst[is * dst_ld + j] = (src[i])[offset + j];
} else {
dst[is * dst_ld + j] = T(0);
}
}
}
}
// Zero pad otherwise
else {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; ++j) {
dst[is * dst_ld + j] = T(0);
}
}
}
}
/* Iteration helper */
METAL_FUNC void next() {
weight_w += jump_params->f_wgt_jump_w;
@ -262,6 +308,55 @@ struct Conv2DWeightBlockLoaderGeneral {
}
}
METAL_FUNC void load_safe(const short remaining_k) const {
const device T* curr_src = src + weight_h * params->wt_strides[1] +
weight_w * params->wt_strides[2];
if ((start_row + BN <= params->O)) {
STEEL_PRAGMA_UNROLL
for (short i = 0; i < BN; i += TROWS) {
if (bj + vec_size <= remaining_k) {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = curr_src[i * src_ld + j];
}
} else {
for (short j = 0; j < vec_size; j++) {
if (bj + j < remaining_k) {
dst[i * dst_ld + j] = curr_src[i * src_ld + j];
} else {
dst[i * dst_ld + j] = T(0);
}
}
}
}
} else {
for (short i = 0; i < BN; i += TROWS) {
if ((start_row + i) < params->O) {
if (bj + vec_size <= remaining_k) {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = curr_src[i * src_ld + j];
}
} else {
for (short j = 0; j < vec_size; j++) {
if (bj + j < remaining_k) {
dst[i * dst_ld + j] = curr_src[i * src_ld + j];
} else {
dst[i * dst_ld + j] = T(0);
}
}
}
} else {
STEEL_PRAGMA_UNROLL
for (short j = 0; j < vec_size; j++) {
dst[i * dst_ld + j] = T(0);
}
}
}
}
}
/* Iteration helper */
METAL_FUNC void next() {
weight_w += jump_params->f_wgt_jump_w;

View File

@ -244,13 +244,15 @@ MTL::ComputePipelineState* get_steel_conv_kernel(
MTL::ComputePipelineState* get_steel_conv_general_kernel(
metal::Device& d,
const std::string& kernel_name,
const std::string& hash_name,
const metal::MTLFCList& func_consts,
const array&,
int,
int,
int,
int,
int) {
return d.get_kernel(kernel_name);
return d.get_kernel(kernel_name, hash_name, func_consts);
}
MTL::ComputePipelineState* get_fft_kernel(

View File

@ -1173,6 +1173,19 @@ class TestConv(mlx_tests.MLXTestCase):
self.assertTrue(mx.allclose(out, out_2d.squeeze(2)))
def test_conv2d_unaligned_channels(self):
x = mx.random.uniform(shape=(2, 16, 16, 21))
w = mx.random.uniform(shape=(32, 3, 3, 21))
y = mx.conv2d(x, w, stream=mx.cpu)
y_hat = mx.conv2d(x, w)
self.assertTrue(mx.allclose(y, y_hat))
x = mx.random.uniform(shape=(2, 16, 16, 21))
w = mx.random.uniform(shape=(21, 3, 3, 21))
y = mx.conv2d(x, w, stream=mx.cpu)
y_hat = mx.conv2d(x, w)
self.assertTrue(mx.allclose(y, y_hat))
if __name__ == "__main__":
unittest.main()