2024-11-23 04:24:16 +08:00
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
< html xmlns = "http://www.w3.org/1999/xhtml" lang = "en-US" >
< head >
< meta http-equiv = "Content-Type" content = "text/xhtml;charset=UTF-8" / >
< meta http-equiv = "X-UA-Compatible" content = "IE=11" / >
2025-02-07 04:16:29 +08:00
< meta name = "generator" content = "Doxygen 1.13.2" / >
2024-11-23 04:24:16 +08:00
< meta name = "viewport" content = "width=device-width, initial-scale=1" / >
< title > MLX: mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h Source File< / title >
< link href = "tabs.css" rel = "stylesheet" type = "text/css" / >
< script type = "text/javascript" src = "jquery.js" > < / script >
< script type = "text/javascript" src = "dynsections.js" > < / script >
< script type = "text/javascript" src = "clipboard.js" > < / script >
< link href = "navtree.css" rel = "stylesheet" type = "text/css" / >
2025-01-10 05:56:20 +08:00
< script type = "text/javascript" src = "navtreedata.js" > < / script >
< script type = "text/javascript" src = "navtree.js" > < / script >
2024-11-23 04:24:16 +08:00
< script type = "text/javascript" src = "resize.js" > < / script >
< script type = "text/javascript" src = "cookie.js" > < / script >
< link href = "search/search.css" rel = "stylesheet" type = "text/css" / >
< script type = "text/javascript" src = "search/searchdata.js" > < / script >
< script type = "text/javascript" src = "search/search.js" > < / script >
2025-01-10 05:56:20 +08:00
< script type = "text/javascript" >
/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
$(function() { init_search(); });
/* @license-end */
< / script >
2024-11-23 04:24:16 +08:00
< link href = "doxygen.css" rel = "stylesheet" type = "text/css" / >
< / head >
< body >
< div id = "top" > <!-- do not remove this div, it is closed by doxygen! -->
< div id = "titlearea" >
< table cellspacing = "0" cellpadding = "0" >
< tbody >
< tr id = "projectrow" >
< td id = "projectalign" >
< div id = "projectname" > MLX
< / div >
< / td >
2025-01-10 05:56:20 +08:00
< td > < div id = "MSearchBox" class = "MSearchBoxInactive" >
< span class = "left" >
< span id = "MSearchSelect" onmouseover = "return searchBox.OnSearchSelectShow()" onmouseout = "return searchBox.OnSearchSelectHide()" >   < / span >
< input type = "text" id = "MSearchField" value = "" placeholder = "Search" accesskey = "S"
onfocus="searchBox.OnSearchFieldFocus(true)"
onblur="searchBox.OnSearchFieldFocus(false)"
onkeyup="searchBox.OnSearchFieldChange(event)"/>
< / span > < span class = "right" >
< a id = "MSearchClose" href = "javascript:searchBox.CloseResultsWindow()" > < img id = "MSearchCloseImg" border = "0" src = "search/close.svg" alt = "" / > < / a >
< / span >
< / div >
< / td >
2024-11-23 04:24:16 +08:00
< / tr >
< / tbody >
< / table >
< / div >
<!-- end header part -->
2025-02-07 04:16:29 +08:00
<!-- Generated by Doxygen 1.13.2 -->
2024-11-23 04:24:16 +08:00
< script type = "text/javascript" >
/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
var searchBox = new SearchBox("searchBox", "search/",'.html');
/* @license-end */
< / script >
< script type = "text/javascript" >
/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
$(function() { codefold.init(0); });
/* @license-end */
< / script >
2025-01-10 05:56:20 +08:00
< / div > <!-- top -->
< div id = "side-nav" class = "ui-resizable side-nav-resizable" >
< div id = "nav-tree" >
< div id = "nav-tree-contents" >
< div id = "nav-sync" class = "sync" > < / div >
< / div >
< / div >
< div id = "splitbar" style = "-moz-user-select:none;"
class="ui-resizable-handle">
< / div >
< / div >
2024-11-23 04:24:16 +08:00
< script type = "text/javascript" >
/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
2025-01-10 05:56:20 +08:00
$(function(){initNavTree('steel__attention_8h_source.html',''); initResizable(true); });
2024-11-23 04:24:16 +08:00
/* @license-end */
< / script >
2025-01-10 05:56:20 +08:00
< div id = "doc-content" >
2024-11-23 04:24:16 +08:00
<!-- window showing the filter options -->
< div id = "MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
< / div >
<!-- iframe showing the search results (closed by default) -->
< div id = "MSearchResultsWindow" >
< div id = "MSearchResults" >
< div class = "SRPage" >
< div id = "SRIndex" >
< div id = "SRResults" > < / div >
< div class = "SRStatus" id = "Loading" > Loading...< / div >
< div class = "SRStatus" id = "Searching" > Searching...< / div >
< div class = "SRStatus" id = "NoMatches" > No Matches< / div >
< / div >
< / div >
< / div >
< / div >
< div class = "header" >
< div class = "headertitle" > < div class = "title" > steel_attention.h< / div > < / div >
< / div > <!-- header -->
< div class = "contents" >
< a href = "steel__attention_8h.html" > Go to the documentation of this file.< / a > < div class = "fragment" > < div class = "line" > < a id = "l00001" name = "l00001" > < / a > < span class = "lineno" > 1< / span > < span class = "comment" > // Copyright © 2024 Apple Inc.< / span > < / div >
< div class = "line" > < a id = "l00002" name = "l00002" > < / a > < span class = "lineno" > 2< / span > < / div >
< div class = "line" > < a id = "l00003" name = "l00003" > < / a > < span class = "lineno" > 3< / span > < span class = "keyword" > using namespace < / span > < a class = "code hl_namespace" href = "namespacemlx_1_1steel.html" > mlx::steel< / a > ;< / div >
2025-01-10 05:56:20 +08:00
< div class = "line" > < a id = "l00004" name = "l00004" > < / a > < span class = "lineno" > 4< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00006" name = "l00006" > < / a > < span class = "lineno" > 6< / span > < span class = "comment" > // GEMM kernels< / span > < / div >
< div class = "line" > < a id = "l00008" name = "l00008" > < / a > < span class = "lineno" > 8< / span > < / div >
< div class = "line" > < a id = "l00009" name = "l00009" > < / a > < span class = "lineno" > < a class = "line" href = "steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982" > 9< / a > < / span > constant < span class = "keywordtype" > bool< / span > < a class = "code hl_variable" href = "steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982" > align_Q< / a > [[function_constant(200)]];< / div >
< div class = "line" > < a id = "l00010" name = "l00010" > < / a > < span class = "lineno" > < a class = "line" href = "steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416" > 10< / a > < / span > constant < span class = "keywordtype" > bool< / span > < a class = "code hl_variable" href = "steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416" > align_K< / a > [[function_constant(201)]];< / div >
< div class = "line" > < a id = "l00011" name = "l00011" > < / a > < span class = "lineno" > 11< / span > < / div >
< div class = "line" > < a id = "l00012" name = "l00012" > < / a > < span class = "lineno" > 12< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T> < / div >
< div class = "foldopen" id = "foldopen00013" data-start = "{" data-end = "};" >
2025-01-10 05:56:20 +08:00
< div class = "line" > < a id = "l00013" name = "l00013" > < / a > < span class = "lineno" > < a class = "line" href = "struct_transform_scale.html" > 13< / a > < / span > < span class = "keyword" > struct < / span > < a class = "code hl_function" href = "struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70" > TransformScale< / a > {< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00014" name = "l00014" > < / a > < span class = "lineno" > < a class = "line" href = "struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6" > 14< / a > < / span > T < a class = "code hl_variable" href = "struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6" > scale< / a > ;< / div >
< div class = "line" > < a id = "l00015" name = "l00015" > < / a > < span class = "lineno" > < a class = "line" href = "struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70" > 15< / a > < / span > METAL_FUNC < a class = "code hl_function" href = "struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70" > TransformScale< / a > (T scale_) : < a class = "code hl_variable" href = "struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6" > scale< / a > (scale_) {}< / div >
< div class = "line" > < a id = "l00016" name = "l00016" > < / a > < span class = "lineno" > 16< / span > < / div >
< div class = "foldopen" id = "foldopen00017" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00017" name = "l00017" > < / a > < span class = "lineno" > < a class = "line" href = "struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16" > 17< / a > < / span > METAL_FUNC T < a class = "code hl_function" href = "struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16" > apply< / a > (T x)< span class = "keyword" > const < / span > {< / div >
< div class = "line" > < a id = "l00018" name = "l00018" > < / a > < span class = "lineno" > 18< / span > < span class = "keywordflow" > return< / span > < a class = "code hl_variable" href = "struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6" > scale< / a > * x;< / div >
< div class = "line" > < a id = "l00019" name = "l00019" > < / a > < span class = "lineno" > 19< / span > }< / div >
< / div >
< div class = "line" > < a id = "l00020" name = "l00020" > < / a > < span class = "lineno" > 20< / span > };< / div >
< / div >
< div class = "line" > < a id = "l00021" name = "l00021" > < / a > < span class = "lineno" > 21< / span > < / div >
< div class = "foldopen" id = "foldopen00022" data-start = "{" data-end = "};" >
< div class = "line" > < a id = "l00022" name = "l00022" > < / a > < span class = "lineno" > < a class = "line" href = "struct_max_op.html" > 22< / a > < / span > < span class = "keyword" > struct < / span > < a class = "code hl_struct" href = "struct_max_op.html" > MaxOp< / a > {< / div >
< div class = "line" > < a id = "l00023" name = "l00023" > < / a > < span class = "lineno" > 23< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T> < / div >
< div class = "foldopen" id = "foldopen00024" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00024" name = "l00024" > < / a > < span class = "lineno" > < a class = "line" href = "struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e" > 24< / a > < / span > METAL_FUNC < span class = "keyword" > static< / span > < span class = "keyword" > constexpr< / span > T < a class = "code hl_function" href = "struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e" > apply< / a > (T x, T y) {< / div >
< div class = "line" > < a id = "l00025" name = "l00025" > < / a > < span class = "lineno" > 25< / span > < span class = "keywordflow" > return< / span > < a class = "code hl_function" href = "namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b" > metal::max< / a > (x, y);< / div >
< div class = "line" > < a id = "l00026" name = "l00026" > < / a > < span class = "lineno" > 26< / span > }< / div >
< / div >
< div class = "line" > < a id = "l00027" name = "l00027" > < / a > < span class = "lineno" > 27< / span > };< / div >
< / div >
< div class = "line" > < a id = "l00028" name = "l00028" > < / a > < span class = "lineno" > 28< / span > < / div >
< div class = "foldopen" id = "foldopen00029" data-start = "{" data-end = "};" >
< div class = "line" > < a id = "l00029" name = "l00029" > < / a > < span class = "lineno" > < a class = "line" href = "struct_sum_op.html" > 29< / a > < / span > < span class = "keyword" > struct < / span > < a class = "code hl_struct" href = "struct_sum_op.html" > SumOp< / a > {< / div >
< div class = "line" > < a id = "l00030" name = "l00030" > < / a > < span class = "lineno" > 30< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T> < / div >
< div class = "foldopen" id = "foldopen00031" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00031" name = "l00031" > < / a > < span class = "lineno" > < a class = "line" href = "struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d" > 31< / a > < / span > METAL_FUNC < span class = "keyword" > static< / span > < span class = "keyword" > constexpr< / span > T < a class = "code hl_function" href = "struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d" > apply< / a > (T x, T y) {< / div >
< div class = "line" > < a id = "l00032" name = "l00032" > < / a > < span class = "lineno" > 32< / span > < span class = "keywordflow" > return< / span > x + y;< / div >
< div class = "line" > < a id = "l00033" name = "l00033" > < / a > < span class = "lineno" > 33< / span > }< / div >
< / div >
< div class = "line" > < a id = "l00034" name = "l00034" > < / a > < span class = "lineno" > 34< / span > };< / div >
< / div >
< div class = "line" > < a id = "l00035" name = "l00035" > < / a > < span class = "lineno" > 35< / span > < / div >
< div class = "foldopen" id = "foldopen00036" data-start = "{" data-end = "};" >
< div class = "line" > < a id = "l00036" name = "l00036" > < / a > < span class = "lineno" > < a class = "line" href = "struct_mul_op.html" > 36< / a > < / span > < span class = "keyword" > struct < / span > < a class = "code hl_struct" href = "struct_mul_op.html" > MulOp< / a > {< / div >
< div class = "line" > < a id = "l00037" name = "l00037" > < / a > < span class = "lineno" > 37< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T> < / div >
< div class = "foldopen" id = "foldopen00038" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00038" name = "l00038" > < / a > < span class = "lineno" > < a class = "line" href = "struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756" > 38< / a > < / span > METAL_FUNC < span class = "keyword" > static< / span > < span class = "keyword" > constexpr< / span > T < a class = "code hl_function" href = "struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756" > apply< / a > (T x, T y) {< / div >
< div class = "line" > < a id = "l00039" name = "l00039" > < / a > < span class = "lineno" > 39< / span > < span class = "keywordflow" > return< / span > x * y;< / div >
< div class = "line" > < a id = "l00040" name = "l00040" > < / a > < span class = "lineno" > 40< / span > }< / div >
< / div >
< div class = "line" > < a id = "l00041" name = "l00041" > < / a > < span class = "lineno" > 41< / span > };< / div >
< / div >
< div class = "line" > < a id = "l00042" name = "l00042" > < / a > < span class = "lineno" > 42< / span > < / div >
< div class = "foldopen" id = "foldopen00043" data-start = "{" data-end = "};" >
< div class = "line" > < a id = "l00043" name = "l00043" > < / a > < span class = "lineno" > < a class = "line" href = "struct_sub_op.html" > 43< / a > < / span > < span class = "keyword" > struct < / span > < a class = "code hl_struct" href = "struct_sub_op.html" > SubOp< / a > {< / div >
< div class = "line" > < a id = "l00044" name = "l00044" > < / a > < span class = "lineno" > 44< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T> < / div >
< div class = "foldopen" id = "foldopen00045" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00045" name = "l00045" > < / a > < span class = "lineno" > < a class = "line" href = "struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143" > 45< / a > < / span > METAL_FUNC < span class = "keyword" > static< / span > < span class = "keyword" > constexpr< / span > T < a class = "code hl_function" href = "struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143" > apply< / a > (T x, T y) {< / div >
< div class = "line" > < a id = "l00046" name = "l00046" > < / a > < span class = "lineno" > 46< / span > < span class = "keywordflow" > return< / span > x - y;< / div >
< div class = "line" > < a id = "l00047" name = "l00047" > < / a > < span class = "lineno" > 47< / span > }< / div >
< / div >
< div class = "line" > < a id = "l00048" name = "l00048" > < / a > < span class = "lineno" > 48< / span > };< / div >
< / div >
< div class = "line" > < a id = "l00049" name = "l00049" > < / a > < span class = "lineno" > 49< / span > < / div >
< div class = "foldopen" id = "foldopen00050" data-start = "{" data-end = "};" >
< div class = "line" > < a id = "l00050" name = "l00050" > < / a > < span class = "lineno" > < a class = "line" href = "struct_exp_sub_op.html" > 50< / a > < / span > < span class = "keyword" > struct < / span > < a class = "code hl_struct" href = "struct_exp_sub_op.html" > ExpSubOp< / a > {< / div >
< div class = "line" > < a id = "l00051" name = "l00051" > < / a > < span class = "lineno" > 51< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T> < / div >
< div class = "foldopen" id = "foldopen00052" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00052" name = "l00052" > < / a > < span class = "lineno" > < a class = "line" href = "struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334" > 52< / a > < / span > METAL_FUNC < span class = "keyword" > static< / span > < span class = "keyword" > constexpr< / span > T < a class = "code hl_function" href = "struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334" > apply< / a > (T x, T y) {< / div >
< div class = "line" > < a id = "l00053" name = "l00053" > < / a > < span class = "lineno" > 53< / span > < span class = "keywordflow" > return< / span > fast::exp(x - y);< / div >
< div class = "line" > < a id = "l00054" name = "l00054" > < / a > < span class = "lineno" > 54< / span > }< / div >
< / div >
< div class = "line" > < a id = "l00055" name = "l00055" > < / a > < span class = "lineno" > 55< / span > };< / div >
< / div >
< div class = "line" > < a id = "l00056" name = "l00056" > < / a > < span class = "lineno" > 56< / span > < / div >
< div class = "foldopen" id = "foldopen00057" data-start = "{" data-end = "};" >
< div class = "line" > < a id = "l00057" name = "l00057" > < / a > < span class = "lineno" > < a class = "line" href = "struct_div_op.html" > 57< / a > < / span > < span class = "keyword" > struct < / span > < a class = "code hl_struct" href = "struct_div_op.html" > DivOp< / a > {< / div >
< div class = "line" > < a id = "l00058" name = "l00058" > < / a > < span class = "lineno" > 58< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T> < / div >
< div class = "foldopen" id = "foldopen00059" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00059" name = "l00059" > < / a > < span class = "lineno" > < a class = "line" href = "struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221" > 59< / a > < / span > METAL_FUNC < span class = "keyword" > static< / span > < span class = "keyword" > constexpr< / span > T < a class = "code hl_function" href = "struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221" > apply< / a > (T x, T y) {< / div >
< div class = "line" > < a id = "l00060" name = "l00060" > < / a > < span class = "lineno" > 60< / span > < span class = "keywordflow" > return< / span > x / y;< / div >
< div class = "line" > < a id = "l00061" name = "l00061" > < / a > < span class = "lineno" > 61< / span > }< / div >
< / div >
< div class = "line" > < a id = "l00062" name = "l00062" > < / a > < span class = "lineno" > 62< / span > };< / div >
< / div >
< div class = "line" > < a id = "l00063" name = "l00063" > < / a > < span class = "lineno" > 63< / span > < / div >
< div class = "line" > < a id = "l00064" name = "l00064" > < / a > < span class = "lineno" > 64< / span > < span class = "comment" > // clang-format off< / span > < / div >
< div class = "line" > < a id = "l00065" name = "l00065" > < / a > < span class = "lineno" > 65< / span > < span class = "keyword" > template< / span > < < / div >
< div class = "line" > < a id = "l00066" name = "l00066" > < / a > < span class = "lineno" > 66< / span > < span class = "keyword" > typename< / span > T,< / div >
< div class = "line" > < a id = "l00067" name = "l00067" > < / a > < span class = "lineno" > 67< / span > < span class = "keywordtype" > int< / span > BQ,< / div >
< div class = "line" > < a id = "l00068" name = "l00068" > < / a > < span class = "lineno" > 68< / span > < span class = "keywordtype" > int< / span > BK,< / div >
< div class = "line" > < a id = "l00069" name = "l00069" > < / a > < span class = "lineno" > 69< / span > < span class = "keywordtype" > int< / span > BD,< / div >
< div class = "line" > < a id = "l00070" name = "l00070" > < / a > < span class = "lineno" > 70< / span > < span class = "keywordtype" > int< / span > WM,< / div >
< div class = "line" > < a id = "l00071" name = "l00071" > < / a > < span class = "lineno" > 71< / span > < span class = "keywordtype" > int< / span > WN,< / div >
< div class = "line" > < a id = "l00072" name = "l00072" > < / a > < span class = "lineno" > 72< / span > < span class = "keyword" > typename< / span > AccumType = < span class = "keywordtype" > float< / span > > < / div >
< div class = "foldopen" id = "foldopen00073" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00073" name = "l00073" > < / a > < span class = "lineno" > < a class = "line" href = "steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33" > 73< / a > < / span > [[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33" > attention< / a > (< / div >
< div class = "line" > < a id = "l00074" name = "l00074" > < / a > < span class = "lineno" > 74< / span > < span class = "keyword" > const< / span > device T* Q [[buffer(0)]],< / div >
< div class = "line" > < a id = "l00075" name = "l00075" > < / a > < span class = "lineno" > 75< / span > < span class = "keyword" > const< / span > device T* K [[buffer(1)]],< / div >
< div class = "line" > < a id = "l00076" name = "l00076" > < / a > < span class = "lineno" > 76< / span > < span class = "keyword" > const< / span > device T* V [[buffer(2)]],< / div >
< div class = "line" > < a id = "l00077" name = "l00077" > < / a > < span class = "lineno" > 77< / span > device T* O [[buffer(3)]],< / div >
< div class = "line" > < a id = "l00078" name = "l00078" > < / a > < span class = "lineno" > 78< / span > < span class = "keyword" > const< / span > constant < a class = "code hl_struct" href = "structmlx_1_1steel_1_1_attn_params.html" > AttnParams< / a > * params [[buffer(4)]],< / div >
< div class = "line" > < a id = "l00079" name = "l00079" > < / a > < span class = "lineno" > 79< / span > uint simd_lane_id [[thread_index_in_simdgroup]],< / div >
< div class = "line" > < a id = "l00080" name = "l00080" > < / a > < span class = "lineno" > 80< / span > uint simd_group_id [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l00081" name = "l00081" > < / a > < span class = "lineno" > 81< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l00082" name = "l00082" > < / a > < span class = "lineno" > 82< / span > uint3 lid [[thread_position_in_threadgroup]]) { < span class = "comment" > // clang-format on< / span > < / div >
< div class = "line" > < a id = "l00083" name = "l00083" > < / a > < span class = "lineno" > 83< / span > < / div >
< div class = "line" > < a id = "l00084" name = "l00084" > < / a > < span class = "lineno" > 84< / span > < span class = "comment" > // Pacifying compiler< / span > < / div >
< div class = "line" > < a id = "l00085" name = "l00085" > < / a > < span class = "lineno" > 85< / span > (void)lid;< / div >
< div class = "line" > < a id = "l00086" name = "l00086" > < / a > < span class = "lineno" > 86< / span > < / div >
< div class = "line" > < a id = "l00087" name = "l00087" > < / a > < span class = "lineno" > 87< / span > < span class = "comment" > // Move to correct block< / span > < / div >
< div class = "line" > < a id = "l00088" name = "l00088" > < / a > < span class = "lineno" > 88< / span > ulong3 tidl{tid.x, tid.y, tid.z};< / div >
< div class = "line" > < a id = "l00089" name = "l00089" > < / a > < span class = "lineno" > 89< / span > < / div >
< div class = "line" > < a id = "l00090" name = "l00090" > < / a > < span class = "lineno" > 90< / span > Q += tidl.z * params-> Q_strides[0] + < span class = "comment" > // Batch< / span > < / div >
< div class = "line" > < a id = "l00091" name = "l00091" > < / a > < span class = "lineno" > 91< / span > tidl.y * params-> Q_strides[1] + < span class = "comment" > // Head< / span > < / div >
< div class = "line" > < a id = "l00092" name = "l00092" > < / a > < span class = "lineno" > 92< / span > tidl.x * BQ * params-> Q_strides[2]; < span class = "comment" > // Seqeunce< / span > < / div >
< div class = "line" > < a id = "l00093" name = "l00093" > < / a > < span class = "lineno" > 93< / span > < / div >
< div class = "line" > < a id = "l00094" name = "l00094" > < / a > < span class = "lineno" > 94< / span > ulong kv_head_idx = int(tid.y) / params-> gqa_factor;< / div >
< div class = "line" > < a id = "l00095" name = "l00095" > < / a > < span class = "lineno" > 95< / span > K += tidl.z * params-> K_strides[0] + < span class = "comment" > // Batch< / span > < / div >
< div class = "line" > < a id = "l00096" name = "l00096" > < / a > < span class = "lineno" > 96< / span > kv_head_idx * params-> K_strides[1]; < span class = "comment" > // Head< / span > < / div >
< div class = "line" > < a id = "l00097" name = "l00097" > < / a > < span class = "lineno" > 97< / span > < / div >
< div class = "line" > < a id = "l00098" name = "l00098" > < / a > < span class = "lineno" > 98< / span > V += tidl.z * params-> V_strides[0] + < span class = "comment" > // Batch< / span > < / div >
< div class = "line" > < a id = "l00099" name = "l00099" > < / a > < span class = "lineno" > 99< / span > kv_head_idx * params-> V_strides[1]; < span class = "comment" > // Head< / span > < / div >
< div class = "line" > < a id = "l00100" name = "l00100" > < / a > < span class = "lineno" > 100< / span > < / div >
< div class = "line" > < a id = "l00101" name = "l00101" > < / a > < span class = "lineno" > 101< / span > O += tidl.z * params-> O_strides[0] + < span class = "comment" > // Batch< / span > < / div >
< div class = "line" > < a id = "l00102" name = "l00102" > < / a > < span class = "lineno" > 102< / span > tidl.y * params-> O_strides[1] + < span class = "comment" > // Head< / span > < / div >
< div class = "line" > < a id = "l00103" name = "l00103" > < / a > < span class = "lineno" > 103< / span > tidl.x * BQ * params-> O_strides[2]; < span class = "comment" > // Seqeunce< / span > < / div >
< div class = "line" > < a id = "l00104" name = "l00104" > < / a > < span class = "lineno" > 104< / span > < / div >
< div class = "line" > < a id = "l00105" name = "l00105" > < / a > < span class = "lineno" > 105< / span > < span class = "comment" > // Prepare threadgroup memory< / span > < / div >
< div class = "line" > < a id = "l00106" name = "l00106" > < / a > < span class = "lineno" > 106< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > short< / span > padQ = 0; < span class = "comment" > // 16 / sizeof(T);< / span > < / div >
< div class = "line" > < a id = "l00107" name = "l00107" > < / a > < span class = "lineno" > 107< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > short< / span > padK = 0; < span class = "comment" > // 16 / sizeof(T);< / span > < / div >
< div class = "line" > < a id = "l00108" name = "l00108" > < / a > < span class = "lineno" > 108< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > short< / span > padV = 0; < span class = "comment" > // 16 / sizeof(T);< / span > < / div >
< div class = "line" > < a id = "l00109" name = "l00109" > < / a > < span class = "lineno" > 109< / span > < / div >
< div class = "line" > < a id = "l00110" name = "l00110" > < / a > < span class = "lineno" > 110< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > short< / span > LDQ_tgp = BD + padQ;< / div >
< div class = "line" > < a id = "l00111" name = "l00111" > < / a > < span class = "lineno" > 111< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > short< / span > LDK_tgp = BK + padK;< / div >
< div class = "line" > < a id = "l00112" name = "l00112" > < / a > < span class = "lineno" > 112< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > short< / span > LDV_tgp = BD + padV;< / div >
< div class = "line" > < a id = "l00113" name = "l00113" > < / a > < span class = "lineno" > 113< / span > < / div >
< div class = "line" > < a id = "l00114" name = "l00114" > < / a > < span class = "lineno" > 114< / span > threadgroup T Qs[BQ * (BD + padQ)];< / div >
< div class = "line" > < a id = "l00115" name = "l00115" > < / a > < span class = "lineno" > 115< / span > threadgroup T Ks[(BK + padK) * BD];< / div >
< div class = "line" > < a id = "l00116" name = "l00116" > < / a > < span class = "lineno" > 116< / span > threadgroup T Vs[BK * (BD + padV)];< / div >
< div class = "line" > < a id = "l00117" name = "l00117" > < / a > < span class = "lineno" > 117< / span > < / div >
< div class = "line" > < a id = "l00118" name = "l00118" > < / a > < span class = "lineno" > 118< / span > < span class = "comment" > // Prepare block loaders< / span > < / div >
< div class = "line" > < a id = "l00119" name = "l00119" > < / a > < span class = "lineno" > 119< / span > < span class = "keyword" > using < / span > QBlockLoader = < a class = "code hl_struct" href = "structmlx_1_1steel_1_1_block_loader_t.html" > BlockLoaderT< / a > < < / div >
< div class = "line" > < a id = "l00120" name = "l00120" > < / a > < span class = "lineno" > 120< / span > < span class = "comment" > /* typename T = */< / span > T,< / div >
< div class = "line" > < a id = "l00121" name = "l00121" > < / a > < span class = "lineno" > 121< / span > < span class = "comment" > /* short BROWS = */< / span > BQ,< / div >
< div class = "line" > < a id = "l00122" name = "l00122" > < / a > < span class = "lineno" > 122< / span > < span class = "comment" > /* short BCOLS = */< / span > BD,< / div >
< div class = "line" > < a id = "l00123" name = "l00123" > < / a > < span class = "lineno" > 123< / span > < span class = "comment" > /* short kDstStrRow = */< / span > LDQ_tgp,< / div >
< div class = "line" > < a id = "l00124" name = "l00124" > < / a > < span class = "lineno" > 124< / span > < span class = "comment" > /* short kDstStrCol = */< / span > 1,< / div >
< div class = "line" > < a id = "l00125" name = "l00125" > < / a > < span class = "lineno" > 125< / span > < span class = "comment" > /* short reduction_dim = */< / span > 1,< / div >
< div class = "line" > < a id = "l00126" name = "l00126" > < / a > < span class = "lineno" > 126< / span > < span class = "comment" > /* short tgp_size = */< / span > WM * WN * 32> ;< / div >
< div class = "line" > < a id = "l00127" name = "l00127" > < / a > < span class = "lineno" > 127< / span > < / div >
< div class = "line" > < a id = "l00128" name = "l00128" > < / a > < span class = "lineno" > 128< / span > < span class = "comment" > // K is loaded in transposed< / span > < / div >
< div class = "line" > < a id = "l00129" name = "l00129" > < / a > < span class = "lineno" > 129< / span > < span class = "keyword" > using < / span > KBlockLoader = < a class = "code hl_struct" href = "structmlx_1_1steel_1_1_block_loader_t.html" > BlockLoaderT< / a > < < / div >
< div class = "line" > < a id = "l00130" name = "l00130" > < / a > < span class = "lineno" > 130< / span > < span class = "comment" > /* typename T = */< / span > T,< / div >
< div class = "line" > < a id = "l00131" name = "l00131" > < / a > < span class = "lineno" > 131< / span > < span class = "comment" > /* short BROWS = */< / span > BK,< / div >
< div class = "line" > < a id = "l00132" name = "l00132" > < / a > < span class = "lineno" > 132< / span > < span class = "comment" > /* short BCOLS = */< / span > BD,< / div >
< div class = "line" > < a id = "l00133" name = "l00133" > < / a > < span class = "lineno" > 133< / span > < span class = "comment" > /* short kDstStrRow = */< / span > 1,< / div >
< div class = "line" > < a id = "l00134" name = "l00134" > < / a > < span class = "lineno" > 134< / span > < span class = "comment" > /* short kDstStrCol = */< / span > LDK_tgp,< / div >
< div class = "line" > < a id = "l00135" name = "l00135" > < / a > < span class = "lineno" > 135< / span > < span class = "comment" > /* short reduction_dim = */< / span > 0,< / div >
< div class = "line" > < a id = "l00136" name = "l00136" > < / a > < span class = "lineno" > 136< / span > < span class = "comment" > /* short tgp_size = */< / span > WM * WN * 32> ;< / div >
< div class = "line" > < a id = "l00137" name = "l00137" > < / a > < span class = "lineno" > 137< / span > < / div >
< div class = "line" > < a id = "l00138" name = "l00138" > < / a > < span class = "lineno" > 138< / span > < span class = "keyword" > using < / span > VBlockLoader = < a class = "code hl_struct" href = "structmlx_1_1steel_1_1_block_loader_t.html" > BlockLoaderT< / a > < < / div >
< div class = "line" > < a id = "l00139" name = "l00139" > < / a > < span class = "lineno" > 139< / span > < span class = "comment" > /* typename T = */< / span > T,< / div >
< div class = "line" > < a id = "l00140" name = "l00140" > < / a > < span class = "lineno" > 140< / span > < span class = "comment" > /* short BROWS = */< / span > BK,< / div >
< div class = "line" > < a id = "l00141" name = "l00141" > < / a > < span class = "lineno" > 141< / span > < span class = "comment" > /* short BCOLS = */< / span > BD,< / div >
< div class = "line" > < a id = "l00142" name = "l00142" > < / a > < span class = "lineno" > 142< / span > < span class = "comment" > /* short kDstStrRow = */< / span > LDV_tgp,< / div >
< div class = "line" > < a id = "l00143" name = "l00143" > < / a > < span class = "lineno" > 143< / span > < span class = "comment" > /* short kDstStrCol = */< / span > 1,< / div >
< div class = "line" > < a id = "l00144" name = "l00144" > < / a > < span class = "lineno" > 144< / span > < span class = "comment" > /* short reduction_dim = */< / span > 0,< / div >
< div class = "line" > < a id = "l00145" name = "l00145" > < / a > < span class = "lineno" > 145< / span > < span class = "comment" > /* short tgp_size = */< / span > WM * WN * 32> ;< / div >
< div class = "line" > < a id = "l00146" name = "l00146" > < / a > < span class = "lineno" > 146< / span > < / div >
< div class = "line" > < a id = "l00147" name = "l00147" > < / a > < span class = "lineno" > 147< / span > QBlockLoader loader_q(< / div >
< div class = "line" > < a id = "l00148" name = "l00148" > < / a > < span class = "lineno" > 148< / span > Q, params-> Q_strides[2], Qs, simd_group_id, simd_lane_id);< / div >
< div class = "line" > < a id = "l00149" name = "l00149" > < / a > < span class = "lineno" > 149< / span > KBlockLoader loader_k(< / div >
< div class = "line" > < a id = "l00150" name = "l00150" > < / a > < span class = "lineno" > 150< / span > K, params-> K_strides[2], Ks, simd_group_id, simd_lane_id);< / div >
< div class = "line" > < a id = "l00151" name = "l00151" > < / a > < span class = "lineno" > 151< / span > VBlockLoader loader_v(< / div >
< div class = "line" > < a id = "l00152" name = "l00152" > < / a > < span class = "lineno" > 152< / span > V, params-> V_strides[2], Vs, simd_group_id, simd_lane_id);< / div >
< div class = "line" > < a id = "l00153" name = "l00153" > < / a > < span class = "lineno" > 153< / span > < / div >
< div class = "line" > < a id = "l00154" name = "l00154" > < / a > < span class = "lineno" > 154< / span > < a class = "code hl_struct" href = "struct_transform_scale.html" > TransformScale< T> < / a > ts(< span class = "keyword" > static_cast< < / span > T< span class = "keyword" > > < / span > (params-> scale));< / div >
< div class = "line" > < a id = "l00155" name = "l00155" > < / a > < span class = "lineno" > 155< / span > < / div >
< div class = "line" > < a id = "l00156" name = "l00156" > < / a > < span class = "lineno" > 156< / span > < span class = "comment" > // Prepare MMA tiles< / span > < / div >
< div class = "line" > < a id = "l00157" name = "l00157" > < / a > < span class = "lineno" > 157< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > short< / span > kFragSize = 8; < span class = "comment" > // MMAFrag size< / span > < / div >
< div class = "line" > < a id = "l00158" name = "l00158" > < / a > < span class = "lineno" > 158< / span > < span class = "keyword" > using < / span > MMAFrag_acc_t = < a class = "code hl_struct" href = "structmlx_1_1steel_1_1_base_m_m_a_frag.html" > BaseMMAFrag< AccumType, kFragSize, kFragSize> < / a > ;< / div >
< div class = "line" > < a id = "l00159" name = "l00159" > < / a > < span class = "lineno" > 159< / span > < / div >
< div class = "line" > < a id = "l00160" name = "l00160" > < / a > < span class = "lineno" > 160< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > kNWarps = WM * WN;< / div >
< div class = "line" > < a id = "l00161" name = "l00161" > < / a > < span class = "lineno" > 161< / span > < span class = "keyword" > static_assert< / span > (< / div >
< div class = "line" > < a id = "l00162" name = "l00162" > < / a > < span class = "lineno" > 162< / span > BQ > = (kNWarps * kFragSize) & & BQ % (kNWarps * kFragSize) == 0,< / div >
< div class = "line" > < a id = "l00163" name = "l00163" > < / a > < span class = "lineno" > 163< / span > < span class = "stringliteral" > " Each simdgroup must host atleast 1 simdgroup matrix along Q sequence." < / span > );< / div >
< div class = "line" > < a id = "l00164" name = "l00164" > < / a > < span class = "lineno" > 164< / span > < / div >
< div class = "line" > < a id = "l00165" name = "l00165" > < / a > < span class = "lineno" > 165< / span > < span class = "comment" > // Q seq frags per warp< / span > < / div >
< div class = "line" > < a id = "l00166" name = "l00166" > < / a > < span class = "lineno" > 166< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > TQ = BQ / (kNWarps * kFragSize);< / div >
< div class = "line" > < a id = "l00167" name = "l00167" > < / a > < span class = "lineno" > 167< / span > < span class = "comment" > // KV sequence frags (all warps load the same frags)< / span > < / div >
< div class = "line" > < a id = "l00168" name = "l00168" > < / a > < span class = "lineno" > 168< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > TK = BK / kFragSize;< / div >
< div class = "line" > < a id = "l00169" name = "l00169" > < / a > < span class = "lineno" > 169< / span > < span class = "comment" > // HeadDim frags (all warps load the same frags)< / span > < / div >
< div class = "line" > < a id = "l00170" name = "l00170" > < / a > < span class = "lineno" > 170< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > TD = BD / kFragSize;< / div >
< div class = "line" > < a id = "l00171" name = "l00171" > < / a > < span class = "lineno" > 171< / span > < / div >
< div class = "line" > < a id = "l00172" name = "l00172" > < / a > < span class = "lineno" > 172< / span > < span class = "keyword" > static_assert< / span > (TQ == 1, < span class = "stringliteral" > " Check TQ" < / span > );< / div >
< div class = "line" > < a id = "l00173" name = "l00173" > < / a > < span class = "lineno" > 173< / span > < / div >
< div class = "line" > < a id = "l00174" name = "l00174" > < / a > < span class = "lineno" > 174< / span > < a class = "code hl_struct" href = "structmlx_1_1steel_1_1_m_m_a_tile.html" > MMATile< AccumType, TQ, 1, MMAFrag_acc_t> < / a > Qtile;< / div >
< div class = "line" > < a id = "l00175" name = "l00175" > < / a > < span class = "lineno" > 175< / span > < a class = "code hl_struct" href = "structmlx_1_1steel_1_1_m_m_a_tile.html" > MMATile< AccumType, 1, TK, MMAFrag_acc_t> < / a > Ktile;< / div >
< div class = "line" > < a id = "l00176" name = "l00176" > < / a > < span class = "lineno" > 176< / span > < a class = "code hl_struct" href = "structmlx_1_1steel_1_1_m_m_a_tile.html" > MMATile< AccumType, TQ, TK, MMAFrag_acc_t> < / a > Stile;< / div >
< div class = "line" > < a id = "l00177" name = "l00177" > < / a > < span class = "lineno" > 177< / span > < a class = "code hl_struct" href = "structmlx_1_1steel_1_1_m_m_a_tile.html" > MMATile< AccumType, TK, TD, MMAFrag_acc_t> < / a > Vtile;< / div >
< div class = "line" > < a id = "l00178" name = "l00178" > < / a > < span class = "lineno" > 178< / span > < a class = "code hl_struct" href = "structmlx_1_1steel_1_1_m_m_a_tile.html" > MMATile< AccumType, TQ, TD, MMAFrag_acc_t> < / a > Otile;< / div >
< div class = "line" > < a id = "l00179" name = "l00179" > < / a > < span class = "lineno" > 179< / span > < / div >
< div class = "line" > < a id = "l00180" name = "l00180" > < / a > < span class = "lineno" > 180< / span > Otile.< a class = "code hl_function" href = "structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7" > clear< / a > ();< / div >
< div class = "line" > < a id = "l00181" name = "l00181" > < / a > < span class = "lineno" > 181< / span > < / div >
< div class = "line" > < a id = "l00182" name = "l00182" > < / a > < span class = "lineno" > 182< / span > < span class = "comment" > // Prepare mma tile offsets< / span > < / div >
< div class = "line" > < a id = "l00183" name = "l00183" > < / a > < span class = "lineno" > 183< / span > < span class = "keyword" > const< / span > short2 simd_coord = MMAFrag_acc_t::get_coord(simd_lane_id);< / div >
< div class = "line" > < a id = "l00184" name = "l00184" > < / a > < span class = "lineno" > 184< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > sm = simd_coord.y;< / div >
< div class = "line" > < a id = "l00185" name = "l00185" > < / a > < span class = "lineno" > 185< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > sn = simd_coord.x;< / div >
< div class = "line" > < a id = "l00186" name = "l00186" > < / a > < span class = "lineno" > 186< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > tm = kFragSize * TQ * simd_group_id;< / div >
< div class = "line" > < a id = "l00187" name = "l00187" > < / a > < span class = "lineno" > 187< / span > < / div >
< div class = "line" > < a id = "l00188" name = "l00188" > < / a > < span class = "lineno" > 188< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > Qs_offset = (tm + sm) * LDQ_tgp + sn;< / div >
< div class = "line" > < a id = "l00189" name = "l00189" > < / a > < span class = "lineno" > 189< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > Ks_offset = sm * LDK_tgp + sn;< / div >
< div class = "line" > < a id = "l00190" name = "l00190" > < / a > < span class = "lineno" > 190< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > Vs_offset = sm * LDV_tgp + sn;< / div >
< div class = "line" > < a id = "l00191" name = "l00191" > < / a > < span class = "lineno" > 191< / span > < / div >
< div class = "line" > < a id = "l00192" name = "l00192" > < / a > < span class = "lineno" > 192< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > short< / span > Qs_tile_stride = kFragSize;< / div >
< div class = "line" > < a id = "l00193" name = "l00193" > < / a > < span class = "lineno" > 193< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > short< / span > Ks_tile_stride = kFragSize * LDK_tgp;< / div >
< div class = "line" > < a id = "l00194" name = "l00194" > < / a > < span class = "lineno" > 194< / span > < / div >
< div class = "line" > < a id = "l00195" name = "l00195" > < / a > < span class = "lineno" > 195< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l00196" name = "l00196" > < / a > < span class = "lineno" > 196< / span > < / div >
< div class = "line" > < a id = "l00197" name = "l00197" > < / a > < span class = "lineno" > 197< / span > < span class = "comment" > // Load Q blocks and apply scale< / span > < / div >
< div class = "line" > < a id = "l00198" name = "l00198" > < / a > < span class = "lineno" > 198< / span > < span class = "keywordflow" > if< / span > (!< a class = "code hl_variable" href = "steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982" > align_Q< / a > & & < span class = "keywordtype" > int< / span > (tid.x) == (params-> NQ_aligned)) {< / div >
< div class = "line" > < a id = "l00199" name = "l00199" > < / a > < span class = "lineno" > 199< / span > loader_q.load_safe(short2(BD, params-> qL - params-> NQ_aligned * BQ));< / div >
< div class = "line" > < a id = "l00200" name = "l00200" > < / a > < span class = "lineno" > 200< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l00201" name = "l00201" > < / a > < span class = "lineno" > 201< / span > loader_q.load_unsafe();< / div >
< div class = "line" > < a id = "l00202" name = "l00202" > < / a > < span class = "lineno" > 202< / span > }< / div >
< div class = "line" > < a id = "l00203" name = "l00203" > < / a > < span class = "lineno" > 203< / span > loader_q.apply_inplace_op(ts);< / div >
< div class = "line" > < a id = "l00204" name = "l00204" > < / a > < span class = "lineno" > 204< / span > < / div >
< div class = "line" > < a id = "l00205" name = "l00205" > < / a > < span class = "lineno" > 205< / span > < span class = "comment" > // Init row reduction variables< / span > < / div >
< div class = "line" > < a id = "l00206" name = "l00206" > < / a > < span class = "lineno" > 206< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > short< / span > kRowsPT = < span class = "keyword" > decltype< / span > (Stile)::kRowsPerThread;< / div >
< div class = "line" > < a id = "l00207" name = "l00207" > < / a > < span class = "lineno" > 207< / span > < / div >
< div class = "line" > < a id = "l00208" name = "l00208" > < / a > < span class = "lineno" > 208< / span > AccumType max_score[kRowsPT];< / div >
< div class = "line" > < a id = "l00209" name = "l00209" > < / a > < span class = "lineno" > 209< / span > AccumType sum_score[kRowsPT] = {0};< / div >
< div class = "line" > < a id = "l00210" name = "l00210" > < / a > < span class = "lineno" > 210< / span > < / div >
< div class = "line" > < a id = "l00211" name = "l00211" > < / a > < span class = "lineno" > 211< / span > < span class = "comment" > // Init to -Inf< / span > < / div >
< div class = "line" > < a id = "l00212" name = "l00212" > < / a > < span class = "lineno" > 212< / span > < a class = "code hl_define" href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div >
< div class = "line" > < a id = "l00213" name = "l00213" > < / a > < span class = "lineno" > 213< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > short< / span > i = 0; i < kRowsPT; ++i) {< / div >
2025-01-10 05:56:20 +08:00
< div class = "line" > < a id = "l00214" name = "l00214" > < / a > < span class = "lineno" > 214< / span > max_score[i] = < a class = "code hl_variable" href = "struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e" > Limits< AccumType> ::min< / a > ;< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00215" name = "l00215" > < / a > < span class = "lineno" > 215< / span > }< / div >
< div class = "line" > < a id = "l00216" name = "l00216" > < / a > < span class = "lineno" > 216< / span > < / div >
< div class = "line" > < a id = "l00217" name = "l00217" > < / a > < span class = "lineno" > 217< / span > < span class = "comment" > // Loop over KV seq length< / span > < / div >
< div class = "line" > < a id = "l00218" name = "l00218" > < / a > < span class = "lineno" > 218< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > kb = 0; kb < params-> NK; kb++) {< / div >
< div class = "line" > < a id = "l00219" name = "l00219" > < / a > < span class = "lineno" > 219< / span > < span class = "comment" > // Load K block (scale is already applied to Q)< / span > < / div >
< div class = "line" > < a id = "l00220" name = "l00220" > < / a > < span class = "lineno" > 220< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l00221" name = "l00221" > < / a > < span class = "lineno" > 221< / span > < span class = "keywordflow" > if< / span > (!< a class = "code hl_variable" href = "steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416" > align_K< / a > & & kb == (params-> NK_aligned)) {< / div >
< div class = "line" > < a id = "l00222" name = "l00222" > < / a > < span class = "lineno" > 222< / span > loader_k.load_safe(short2(BD, params-> kL - params-> NK_aligned * BK));< / div >
< div class = "line" > < a id = "l00223" name = "l00223" > < / a > < span class = "lineno" > 223< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l00224" name = "l00224" > < / a > < span class = "lineno" > 224< / span > loader_k.load_unsafe();< / div >
< div class = "line" > < a id = "l00225" name = "l00225" > < / a > < span class = "lineno" > 225< / span > }< / div >
< div class = "line" > < a id = "l00226" name = "l00226" > < / a > < span class = "lineno" > 226< / span > < / div >
< div class = "line" > < a id = "l00227" name = "l00227" > < / a > < span class = "lineno" > 227< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l00228" name = "l00228" > < / a > < span class = "lineno" > 228< / span > < / div >
< div class = "line" > < a id = "l00229" name = "l00229" > < / a > < span class = "lineno" > 229< / span > < span class = "comment" > // Do S = Q @ K.T< / span > < / div >
< div class = "line" > < a id = "l00230" name = "l00230" > < / a > < span class = "lineno" > 230< / span > Stile.< a class = "code hl_function" href = "structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7" > clear< / a > ();< / div >
< div class = "line" > < a id = "l00231" name = "l00231" > < / a > < span class = "lineno" > 231< / span > < / div >
< div class = "line" > < a id = "l00232" name = "l00232" > < / a > < span class = "lineno" > 232< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > short< / span > dd = 0; dd < TD; dd++) {< / div >
< div class = "line" > < a id = "l00233" name = "l00233" > < / a > < span class = "lineno" > 233< / span > simdgroup_barrier(mem_flags::mem_none);< / div >
< div class = "line" > < a id = "l00234" name = "l00234" > < / a > < span class = "lineno" > 234< / span > < / div >
< div class = "line" > < a id = "l00235" name = "l00235" > < / a > < span class = "lineno" > 235< / span > Qtile.template load< T, 1, 1, LDQ_tgp, 1> (< / div >
< div class = "line" > < a id = "l00236" name = "l00236" > < / a > < span class = "lineno" > 236< / span > & Qs[Qs_offset + dd * Qs_tile_stride]);< / div >
< div class = "line" > < a id = "l00237" name = "l00237" > < / a > < span class = "lineno" > 237< / span > Ktile.template load< T, 1, 1, LDK_tgp, 1> (< / div >
< div class = "line" > < a id = "l00238" name = "l00238" > < / a > < span class = "lineno" > 238< / span > & Ks[Ks_offset + dd * Ks_tile_stride]);< / div >
< div class = "line" > < a id = "l00239" name = "l00239" > < / a > < span class = "lineno" > 239< / span > < / div >
< div class = "line" > < a id = "l00240" name = "l00240" > < / a > < span class = "lineno" > 240< / span > simdgroup_barrier(mem_flags::mem_none);< / div >
< div class = "line" > < a id = "l00241" name = "l00241" > < / a > < span class = "lineno" > 241< / span > < / div >
< div class = "line" > < a id = "l00242" name = "l00242" > < / a > < span class = "lineno" > 242< / span > < a class = "code hl_function" href = "namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad" > tile_matmad< / a > (Stile, Qtile, Ktile, Stile);< / div >
< div class = "line" > < a id = "l00243" name = "l00243" > < / a > < span class = "lineno" > 243< / span > }< / div >
< div class = "line" > < a id = "l00244" name = "l00244" > < / a > < span class = "lineno" > 244< / span > < / div >
< div class = "line" > < a id = "l00245" name = "l00245" > < / a > < span class = "lineno" > 245< / span > < span class = "comment" > // Mask columns beyond the sequence length< / span > < / div >
< div class = "line" > < a id = "l00246" name = "l00246" > < / a > < span class = "lineno" > 246< / span > < span class = "keywordflow" > if< / span > (!< a class = "code hl_variable" href = "steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416" > align_K< / a > & & kb == (params-> NK_aligned)) {< / div >
< div class = "line" > < a id = "l00247" name = "l00247" > < / a > < span class = "lineno" > 247< / span > < span class = "keyword" > using < / span > stile_t = < span class = "keyword" > decltype< / span > (Stile);< / div >
< div class = "line" > < a id = "l00248" name = "l00248" > < / a > < span class = "lineno" > 248< / span > < span class = "keyword" > using < / span > selem_t = < span class = "keyword" > typename< / span > stile_t::elem_type;< / div >
< div class = "line" > < a id = "l00249" name = "l00249" > < / a > < span class = "lineno" > 249< / span > < span class = "keyword" > constexpr< / span > < span class = "keyword" > auto< / span > neg_inf = -metal::numeric_limits< selem_t> ::infinity();< / div >
< div class = "line" > < a id = "l00250" name = "l00250" > < / a > < span class = "lineno" > 250< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > lim = params-> kL - params-> NK_aligned * BK;< / div >
< div class = "line" > < a id = "l00251" name = "l00251" > < / a > < span class = "lineno" > 251< / span > < / div >
< div class = "line" > < a id = "l00252" name = "l00252" > < / a > < span class = "lineno" > 252< / span > < a class = "code hl_define" href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div >
< div class = "line" > < a id = "l00253" name = "l00253" > < / a > < span class = "lineno" > 253< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > short< / span > i = 0; i < stile_t::kTileRows; i++) {< / div >
< div class = "line" > < a id = "l00254" name = "l00254" > < / a > < span class = "lineno" > 254< / span > < a class = "code hl_define" href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div >
< div class = "line" > < a id = "l00255" name = "l00255" > < / a > < span class = "lineno" > 255< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > short< / span > j = 0; j < stile_t::kTileCols; j++) {< / div >
< div class = "line" > < a id = "l00256" name = "l00256" > < / a > < span class = "lineno" > 256< / span > < span class = "keywordtype" > short< / span > col_pos = sn + (j * stile_t::kFragCols);< / div >
< div class = "line" > < a id = "l00257" name = "l00257" > < / a > < span class = "lineno" > 257< / span > < a class = "code hl_define" href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div >
< div class = "line" > < a id = "l00258" name = "l00258" > < / a > < span class = "lineno" > 258< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > short< / span > jj = 0; jj < stile_t::MMAFrag_t::kElemCols; jj++) {< / div >
< div class = "line" > < a id = "l00259" name = "l00259" > < / a > < span class = "lineno" > 259< / span > < span class = "keywordflow" > if< / span > ((col_pos + jj) > = lim) {< / div >
< div class = "line" > < a id = "l00260" name = "l00260" > < / a > < span class = "lineno" > 260< / span > Stile.< a class = "code hl_function" href = "structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4" > frag_at< / a > (i, j)[jj] = neg_inf;< / div >
< div class = "line" > < a id = "l00261" name = "l00261" > < / a > < span class = "lineno" > 261< / span > }< / div >
< div class = "line" > < a id = "l00262" name = "l00262" > < / a > < span class = "lineno" > 262< / span > }< / div >
< div class = "line" > < a id = "l00263" name = "l00263" > < / a > < span class = "lineno" > 263< / span > }< / div >
< div class = "line" > < a id = "l00264" name = "l00264" > < / a > < span class = "lineno" > 264< / span > }< / div >
< div class = "line" > < a id = "l00265" name = "l00265" > < / a > < span class = "lineno" > 265< / span > }< / div >
< div class = "line" > < a id = "l00266" name = "l00266" > < / a > < span class = "lineno" > 266< / span > < / div >
< div class = "line" > < a id = "l00267" name = "l00267" > < / a > < span class = "lineno" > 267< / span > simdgroup_barrier(mem_flags::mem_none);< / div >
< div class = "line" > < a id = "l00268" name = "l00268" > < / a > < span class = "lineno" > 268< / span > < / div >
< div class = "line" > < a id = "l00269" name = "l00269" > < / a > < span class = "lineno" > 269< / span > < span class = "comment" > // Load V blocks< / span > < / div >
< div class = "line" > < a id = "l00270" name = "l00270" > < / a > < span class = "lineno" > 270< / span > < span class = "keywordflow" > if< / span > (!< a class = "code hl_variable" href = "steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416" > align_K< / a > & & kb == (params-> NK_aligned)) {< / div >
< div class = "line" > < a id = "l00271" name = "l00271" > < / a > < span class = "lineno" > 271< / span > loader_v.load_safe(short2(BD, params-> kL - params-> NK_aligned * BK));< / div >
< div class = "line" > < a id = "l00272" name = "l00272" > < / a > < span class = "lineno" > 272< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l00273" name = "l00273" > < / a > < span class = "lineno" > 273< / span > loader_v.load_unsafe();< / div >
< div class = "line" > < a id = "l00274" name = "l00274" > < / a > < span class = "lineno" > 274< / span > }< / div >
< div class = "line" > < a id = "l00275" name = "l00275" > < / a > < span class = "lineno" > 275< / span > < / div >
< div class = "line" > < a id = "l00276" name = "l00276" > < / a > < span class = "lineno" > 276< / span > < span class = "comment" > // Do softmax< / span > < / div >
< div class = "line" > < a id = "l00277" name = "l00277" > < / a > < span class = "lineno" > 277< / span > < / div >
< div class = "line" > < a id = "l00278" name = "l00278" > < / a > < span class = "lineno" > 278< / span > < span class = "comment" > // Temp variables< / span > < / div >
< div class = "line" > < a id = "l00279" name = "l00279" > < / a > < span class = "lineno" > 279< / span > AccumType new_max[kRowsPT];< / div >
< div class = "line" > < a id = "l00280" name = "l00280" > < / a > < span class = "lineno" > 280< / span > AccumType factor[kRowsPT];< / div >
< div class = "line" > < a id = "l00281" name = "l00281" > < / a > < span class = "lineno" > 281< / span > < a class = "code hl_define" href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div >
< div class = "line" > < a id = "l00282" name = "l00282" > < / a > < span class = "lineno" > 282< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > short< / span > i = 0; i < kRowsPT; ++i) {< / div >
< div class = "line" > < a id = "l00283" name = "l00283" > < / a > < span class = "lineno" > 283< / span > new_max[i] = max_score[i];< / div >
< div class = "line" > < a id = "l00284" name = "l00284" > < / a > < span class = "lineno" > 284< / span > }< / div >
< div class = "line" > < a id = "l00285" name = "l00285" > < / a > < span class = "lineno" > 285< / span > < / div >
< div class = "line" > < a id = "l00286" name = "l00286" > < / a > < span class = "lineno" > 286< / span > < span class = "comment" > // Row max< / span > < / div >
< div class = "line" > < a id = "l00287" name = "l00287" > < / a > < span class = "lineno" > 287< / span > Stile.template row_reduce< MaxOp> (new_max);< / div >
< div class = "line" > < a id = "l00288" name = "l00288" > < / a > < span class = "lineno" > 288< / span > < / div >
< div class = "line" > < a id = "l00289" name = "l00289" > < / a > < span class = "lineno" > 289< / span > < span class = "comment" > // exp(Si - rowmax(Si))< / span > < / div >
< div class = "line" > < a id = "l00290" name = "l00290" > < / a > < span class = "lineno" > 290< / span > Stile.template row_bin_op< ExpSubOp> (new_max);< / div >
< div class = "line" > < a id = "l00291" name = "l00291" > < / a > < span class = "lineno" > 291< / span > < / div >
< div class = "line" > < a id = "l00292" name = "l00292" > < / a > < span class = "lineno" > 292< / span > < span class = "comment" > // Factor exp(rowmax(Si-1) - rowmax(Si))< / span > < / div >
< div class = "line" > < a id = "l00293" name = "l00293" > < / a > < span class = "lineno" > 293< / span > < a class = "code hl_define" href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div >
< div class = "line" > < a id = "l00294" name = "l00294" > < / a > < span class = "lineno" > 294< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > short< / span > i = 0; i < kRowsPT; ++i) {< / div >
< div class = "line" > < a id = "l00295" name = "l00295" > < / a > < span class = "lineno" > 295< / span > factor[i] = fast::exp(max_score[i] - new_max[i]);< / div >
< div class = "line" > < a id = "l00296" name = "l00296" > < / a > < span class = "lineno" > 296< / span > }< / div >
< div class = "line" > < a id = "l00297" name = "l00297" > < / a > < span class = "lineno" > 297< / span > < / div >
< div class = "line" > < a id = "l00298" name = "l00298" > < / a > < span class = "lineno" > 298< / span > < span class = "comment" > // Save max for next iteration< / span > < / div >
< div class = "line" > < a id = "l00299" name = "l00299" > < / a > < span class = "lineno" > 299< / span > < a class = "code hl_define" href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div >
< div class = "line" > < a id = "l00300" name = "l00300" > < / a > < span class = "lineno" > 300< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > short< / span > i = 0; i < kRowsPT; ++i) {< / div >
< div class = "line" > < a id = "l00301" name = "l00301" > < / a > < span class = "lineno" > 301< / span > max_score[i] = new_max[i];< / div >
< div class = "line" > < a id = "l00302" name = "l00302" > < / a > < span class = "lineno" > 302< / span > }< / div >
< div class = "line" > < a id = "l00303" name = "l00303" > < / a > < span class = "lineno" > 303< / span > < / div >
< div class = "line" > < a id = "l00304" name = "l00304" > < / a > < span class = "lineno" > 304< / span > < span class = "comment" > // Row Sum< / span > < / div >
< div class = "line" > < a id = "l00305" name = "l00305" > < / a > < span class = "lineno" > 305< / span > AccumType sum_score_tmp[kRowsPT] = {0};< / div >
< div class = "line" > < a id = "l00306" name = "l00306" > < / a > < span class = "lineno" > 306< / span > Stile.template row_reduce< SumOp> (sum_score_tmp);< / div >
< div class = "line" > < a id = "l00307" name = "l00307" > < / a > < span class = "lineno" > 307< / span > < / div >
< div class = "line" > < a id = "l00308" name = "l00308" > < / a > < span class = "lineno" > 308< / span > < span class = "comment" > // Update norm< / span > < / div >
< div class = "line" > < a id = "l00309" name = "l00309" > < / a > < span class = "lineno" > 309< / span > < a class = "code hl_define" href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div >
< div class = "line" > < a id = "l00310" name = "l00310" > < / a > < span class = "lineno" > 310< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > short< / span > i = 0; i < kRowsPT; ++i) {< / div >
< div class = "line" > < a id = "l00311" name = "l00311" > < / a > < span class = "lineno" > 311< / span > sum_score[i] = sum_score[i] * factor[i] + sum_score_tmp[i];< / div >
< div class = "line" > < a id = "l00312" name = "l00312" > < / a > < span class = "lineno" > 312< / span > }< / div >
< div class = "line" > < a id = "l00313" name = "l00313" > < / a > < span class = "lineno" > 313< / span > < / div >
< div class = "line" > < a id = "l00314" name = "l00314" > < / a > < span class = "lineno" > 314< / span > < span class = "comment" > // Update O< / span > < / div >
< div class = "line" > < a id = "l00315" name = "l00315" > < / a > < span class = "lineno" > 315< / span > Otile.template row_bin_op< MulOp> (factor);< / div >
< div class = "line" > < a id = "l00316" name = "l00316" > < / a > < span class = "lineno" > 316< / span > < / div >
< div class = "line" > < a id = "l00317" name = "l00317" > < / a > < span class = "lineno" > 317< / span > < span class = "comment" > // Load V into registers< / span > < / div >
< div class = "line" > < a id = "l00318" name = "l00318" > < / a > < span class = "lineno" > 318< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l00319" name = "l00319" > < / a > < span class = "lineno" > 319< / span > Vtile.template load< T, 1, 1, LDV_tgp, 1> (& Vs[Vs_offset]);< / div >
< div class = "line" > < a id = "l00320" name = "l00320" > < / a > < span class = "lineno" > 320< / span > < / div >
< div class = "line" > < a id = "l00321" name = "l00321" > < / a > < span class = "lineno" > 321< / span > simdgroup_barrier(mem_flags::mem_none);< / div >
< div class = "line" > < a id = "l00322" name = "l00322" > < / a > < span class = "lineno" > 322< / span > < / div >
< div class = "line" > < a id = "l00323" name = "l00323" > < / a > < span class = "lineno" > 323< / span > < span class = "comment" > // Do O = S @ V< / span > < / div >
< div class = "line" > < a id = "l00324" name = "l00324" > < / a > < span class = "lineno" > 324< / span > < a class = "code hl_function" href = "namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad" > tile_matmad< / a > (Otile, Stile, Vtile, Otile);< / div >
< div class = "line" > < a id = "l00325" name = "l00325" > < / a > < span class = "lineno" > 325< / span > < / div >
< div class = "line" > < a id = "l00326" name = "l00326" > < / a > < span class = "lineno" > 326< / span > < span class = "comment" > // Prepare for next iteration< / span > < / div >
< div class = "line" > < a id = "l00327" name = "l00327" > < / a > < span class = "lineno" > 327< / span > loader_k.next();< / div >
< div class = "line" > < a id = "l00328" name = "l00328" > < / a > < span class = "lineno" > 328< / span > loader_v.next();< / div >
< div class = "line" > < a id = "l00329" name = "l00329" > < / a > < span class = "lineno" > 329< / span > }< / div >
< div class = "line" > < a id = "l00330" name = "l00330" > < / a > < span class = "lineno" > 330< / span > < / div >
< div class = "line" > < a id = "l00331" name = "l00331" > < / a > < span class = "lineno" > 331< / span > < span class = "comment" > // Normalize output< / span > < / div >
< div class = "line" > < a id = "l00332" name = "l00332" > < / a > < span class = "lineno" > 332< / span > Otile.template row_bin_op< DivOp> (sum_score);< / div >
< div class = "line" > < a id = "l00333" name = "l00333" > < / a > < span class = "lineno" > 333< / span > threadgroup_barrier(mem_flags::mem_none);< / div >
< div class = "line" > < a id = "l00334" name = "l00334" > < / a > < span class = "lineno" > 334< / span > < / div >
< div class = "line" > < a id = "l00335" name = "l00335" > < / a > < span class = "lineno" > 335< / span > < span class = "comment" > // Store results< / span > < / div >
< div class = "line" > < a id = "l00336" name = "l00336" > < / a > < span class = "lineno" > 336< / span > O += (tm + sm) * params-> O_strides[2] + sn;< / div >
< div class = "line" > < a id = "l00337" name = "l00337" > < / a > < span class = "lineno" > 337< / span > < / div >
< div class = "line" > < a id = "l00338" name = "l00338" > < / a > < span class = "lineno" > 338< / span > < span class = "keywordflow" > if< / span > (!< a class = "code hl_variable" href = "steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982" > align_Q< / a > & & < span class = "keywordtype" > int< / span > (tid.x) == (params-> NQ_aligned)) {< / div >
< div class = "line" > < a id = "l00339" name = "l00339" > < / a > < span class = "lineno" > 339< / span > < span class = "keyword" > auto< / span > dst_tile_dims =< / div >
< div class = "line" > < a id = "l00340" name = "l00340" > < / a > < span class = "lineno" > 340< / span > short2(BD - sn, params-> qL - BQ * params-> NQ_aligned - (tm + sm));< / div >
< div class = "line" > < a id = "l00341" name = "l00341" > < / a > < span class = "lineno" > 341< / span > < / div >
< div class = "line" > < a id = "l00342" name = "l00342" > < / a > < span class = "lineno" > 342< / span > < span class = "keywordflow" > if< / span > (dst_tile_dims.x < = 0 || dst_tile_dims.y < = 0)< / div >
< div class = "line" > < a id = "l00343" name = "l00343" > < / a > < span class = "lineno" > 343< / span > < span class = "keywordflow" > return< / span > ;< / div >
< div class = "line" > < a id = "l00344" name = "l00344" > < / a > < span class = "lineno" > 344< / span > < / div >
< div class = "line" > < a id = "l00345" name = "l00345" > < / a > < span class = "lineno" > 345< / span > Otile.template store_safe< T, 1, 1> (O, params-> O_strides[2], dst_tile_dims);< / div >
< div class = "line" > < a id = "l00346" name = "l00346" > < / a > < span class = "lineno" > 346< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l00347" name = "l00347" > < / a > < span class = "lineno" > 347< / span > Otile.template store< T, 1, 1> (O, params-> O_strides[2]);< / div >
< div class = "line" > < a id = "l00348" name = "l00348" > < / a > < span class = "lineno" > 348< / span > }< / div >
< div class = "line" > < a id = "l00349" name = "l00349" > < / a > < span class = "lineno" > 349< / span > }< / div >
< / div >
< div class = "ttc" id = "anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b" > < div class = "ttname" > < a href = "namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b" > metal::max< / a > < / div > < div class = "ttdeci" > METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)< / div > < div class = "ttdef" > < b > Definition< / b > bf16_math.h:232< / div > < / div >
< div class = "ttc" id = "anamespacemlx_1_1steel_html" > < div class = "ttname" > < a href = "namespacemlx_1_1steel.html" > mlx::steel< / a > < / div > < div class = "ttdef" > < b > Definition< / b > attn.h:19< / div > < / div >
< div class = "ttc" id = "anamespacemlx_1_1steel_html_ad583e6038efc119542410f43b603d4ad" > < div class = "ttname" > < a href = "namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad" > mlx::steel::tile_matmad< / a > < / div > < div class = "ttdeci" > METAL_FUNC void tile_matmad(thread MMATile< T, M, N > & D, thread MMATile< U, M, K > & A, thread MMATile< U, K, N > & B, thread MMATile< T, M, N > & C)< / div > < div class = "ttdef" > < b > Definition< / b > mma.h:413< / div > < / div >
< div class = "ttc" id = "asteel_2defines_8h_html_a5a5c3095b132a7589bc19cd5cb80e2c6" > < div class = "ttname" > < a href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > < div class = "ttdeci" > #define STEEL_PRAGMA_UNROLL< / div > < div class = "ttdef" > < b > Definition< / b > defines.h:4< / div > < / div >
< div class = "ttc" id = "asteel__attention_8h_html_a171fdea1b23976453f5dc5e6b3161982" > < div class = "ttname" > < a href = "steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982" > align_Q< / a > < / div > < div class = "ttdeci" > constant bool align_Q< / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:9< / div > < / div >
< div class = "ttc" id = "asteel__attention_8h_html_a5423b2a414f5e3c14166d568dedfbd33" > < div class = "ttname" > < a href = "steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33" > attention< / a > < / div > < div class = "ttdeci" > void attention(const device T *Q, const device T *K, const device T *V, device T *O, const constant AttnParams *params, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)< / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:73< / div > < / div >
< div class = "ttc" id = "asteel__attention_8h_html_a8bdd2cecf97aa5b033152b1d0f0d2416" > < div class = "ttname" > < a href = "steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416" > align_K< / a > < / div > < div class = "ttdeci" > constant bool align_K< / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:10< / div > < / div >
< div class = "ttc" id = "astruct_div_op_html" > < div class = "ttname" > < a href = "struct_div_op.html" > DivOp< / a > < / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:57< / div > < / div >
< div class = "ttc" id = "astruct_div_op_html_a1b8df47142dc6ea15315ce3a310f9221" > < div class = "ttname" > < a href = "struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221" > DivOp::apply< / a > < / div > < div class = "ttdeci" > static METAL_FUNC constexpr T apply(T x, T y)< / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:59< / div > < / div >
< div class = "ttc" id = "astruct_exp_sub_op_html" > < div class = "ttname" > < a href = "struct_exp_sub_op.html" > ExpSubOp< / a > < / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:50< / div > < / div >
< div class = "ttc" id = "astruct_exp_sub_op_html_a00e457a01cb38f959dfd789455e7f334" > < div class = "ttname" > < a href = "struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334" > ExpSubOp::apply< / a > < / div > < div class = "ttdeci" > static METAL_FUNC constexpr T apply(T x, T y)< / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:52< / div > < / div >
2025-01-10 05:56:20 +08:00
< div class = "ttc" id = "astruct_limits_html_a6e81584ba65a4dc6ff9366b458e3a20e" > < div class = "ttname" > < a href = "struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e" > Limits::min< / a > < / div > < div class = "ttdeci" > static const constant U min< / div > < div class = "ttdef" > < b > Definition< / b > utils.h:25< / div > < / div >
2024-11-23 04:24:16 +08:00
< div class = "ttc" id = "astruct_max_op_html" > < div class = "ttname" > < a href = "struct_max_op.html" > MaxOp< / a > < / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:22< / div > < / div >
< div class = "ttc" id = "astruct_max_op_html_ab3d3c3040017a13c170e7bdd1ffac46e" > < div class = "ttname" > < a href = "struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e" > MaxOp::apply< / a > < / div > < div class = "ttdeci" > static METAL_FUNC constexpr T apply(T x, T y)< / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:24< / div > < / div >
< div class = "ttc" id = "astruct_mul_op_html" > < div class = "ttname" > < a href = "struct_mul_op.html" > MulOp< / a > < / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:36< / div > < / div >
< div class = "ttc" id = "astruct_mul_op_html_a1b93d804653d92fc7e46747de9e9c756" > < div class = "ttname" > < a href = "struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756" > MulOp::apply< / a > < / div > < div class = "ttdeci" > static METAL_FUNC constexpr T apply(T x, T y)< / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:38< / div > < / div >
< div class = "ttc" id = "astruct_sub_op_html" > < div class = "ttname" > < a href = "struct_sub_op.html" > SubOp< / a > < / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:43< / div > < / div >
< div class = "ttc" id = "astruct_sub_op_html_ad211f879a212ed0e98136217ca8e4143" > < div class = "ttname" > < a href = "struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143" > SubOp::apply< / a > < / div > < div class = "ttdeci" > static METAL_FUNC constexpr T apply(T x, T y)< / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:45< / div > < / div >
< div class = "ttc" id = "astruct_sum_op_html" > < div class = "ttname" > < a href = "struct_sum_op.html" > SumOp< / a > < / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:29< / div > < / div >
< div class = "ttc" id = "astruct_sum_op_html_aa9563a98cbbe1b1921ade0c63ab38b4d" > < div class = "ttname" > < a href = "struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d" > SumOp::apply< / a > < / div > < div class = "ttdeci" > static METAL_FUNC constexpr T apply(T x, T y)< / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:31< / div > < / div >
< div class = "ttc" id = "astruct_transform_scale_html" > < div class = "ttname" > < a href = "struct_transform_scale.html" > TransformScale< / a > < / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:13< / div > < / div >
< div class = "ttc" id = "astruct_transform_scale_html_a9dd329422e5b8da43486cdce17132e16" > < div class = "ttname" > < a href = "struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16" > TransformScale::apply< / a > < / div > < div class = "ttdeci" > METAL_FUNC T apply(T x) const< / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:17< / div > < / div >
< div class = "ttc" id = "astruct_transform_scale_html_aa56b8e107acf16fdf77006625c2b8bc6" > < div class = "ttname" > < a href = "struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6" > TransformScale::scale< / a > < / div > < div class = "ttdeci" > T scale< / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:14< / div > < / div >
< div class = "ttc" id = "astruct_transform_scale_html_ae109cf7c963ba13df96977e7563f7b70" > < div class = "ttname" > < a href = "struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70" > TransformScale::TransformScale< / a > < / div > < div class = "ttdeci" > METAL_FUNC TransformScale(T scale_)< / div > < div class = "ttdef" > < b > Definition< / b > steel_attention.h:15< / div > < / div >
< div class = "ttc" id = "astructmlx_1_1steel_1_1_attn_params_html" > < div class = "ttname" > < a href = "structmlx_1_1steel_1_1_attn_params.html" > mlx::steel::AttnParams< / a > < / div > < div class = "ttdef" > < b > Definition< / b > params.h:12< / div > < / div >
2025-02-07 04:16:29 +08:00
< div class = "ttc" id = "astructmlx_1_1steel_1_1_base_m_m_a_frag_html" > < div class = "ttname" > < a href = "structmlx_1_1steel_1_1_base_m_m_a_frag.html" > mlx::steel::BaseMMAFrag< / a > < / div > < div class = "ttdef" > < b > Definition< / b > mma.h:37< / div > < / div >
2024-11-23 04:24:16 +08:00
< div class = "ttc" id = "astructmlx_1_1steel_1_1_block_loader_t_html" > < div class = "ttname" > < a href = "structmlx_1_1steel_1_1_block_loader_t.html" > mlx::steel::BlockLoaderT< / a > < / div > < div class = "ttdef" > < b > Definition< / b > loader.h:153< / div > < / div >
2025-02-07 04:16:29 +08:00
< div class = "ttc" id = "astructmlx_1_1steel_1_1_m_m_a_tile_html" > < div class = "ttname" > < a href = "structmlx_1_1steel_1_1_m_m_a_tile.html" > mlx::steel::MMATile< / a > < / div > < div class = "ttdef" > < b > Definition< / b > mma.h:223< / div > < / div >
2024-11-23 04:24:16 +08:00
< div class = "ttc" id = "astructmlx_1_1steel_1_1_m_m_a_tile_html_a1a6b1446e8c8da46885bbaa8e8fdc7e4" > < div class = "ttname" > < a href = "structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4" > mlx::steel::MMATile::frag_at< / a > < / div > < div class = "ttdeci" > METAL_FUNC constexpr thread frag_type & frag_at(const short i, const short j)< / div > < div class = "ttdef" > < b > Definition< / b > mma.h:256< / div > < / div >
< div class = "ttc" id = "astructmlx_1_1steel_1_1_m_m_a_tile_html_aa97a98e423827a889c13a92217626ec7" > < div class = "ttname" > < a href = "structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7" > mlx::steel::MMATile::clear< / a > < / div > < div class = "ttdeci" > METAL_FUNC constexpr void clear()< / div > < div class = "ttdef" > < b > Definition< / b > mma.h:249< / div > < / div >
< / div > <!-- fragment --> < / div > <!-- contents -->
< / div > <!-- doc - content -->
2025-01-10 05:56:20 +08:00
<!-- start footer part -->
< div id = "nav-path" class = "navpath" > <!-- id is needed for treeview function! -->
< ul >
< li class = "navelem" > < a class = "el" href = "dir_938ab0ecf10b8b860ff766c820f665fd.html" > mlx< / a > < / li > < li class = "navelem" > < a class = "el" href = "dir_1d446c9bd3c99228254c9484e0bc5c06.html" > backend< / a > < / li > < li class = "navelem" > < a class = "el" href = "dir_d0c977ea65824390717cdb7efc36c157.html" > metal< / a > < / li > < li class = "navelem" > < a class = "el" href = "dir_70a37effa88bcbd6b791977fa1e64356.html" > kernels< / a > < / li > < li class = "navelem" > < a class = "el" href = "dir_76215a6c54e2b67053e723fc2395583c.html" > steel< / a > < / li > < li class = "navelem" > < a class = "el" href = "dir_e1756c7634b0c14aead026895ad71c6d.html" > attn< / a > < / li > < li class = "navelem" > < a class = "el" href = "dir_5aea41cce495e77a0857a0aecf063e33.html" > kernels< / a > < / li > < li class = "navelem" > < a class = "el" href = "steel__attention_8h.html" > steel_attention.h< / a > < / li >
2025-02-07 04:16:29 +08:00
< li class = "footer" > Generated by < a href = "https://www.doxygen.org/index.html" > < img class = "footer" src = "doxygen.svg" width = "104" height = "31" alt = "doxygen" / > < / a > 1.13.2 < / li >
2025-01-10 05:56:20 +08:00
< / ul >
< / div >
2024-11-23 04:24:16 +08:00
< / body >
< / html >