2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< html  xmlns = "http://www.w3.org/1999/xhtml"  lang = "en-US" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< head > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< meta  http-equiv = "Content-Type"  content = "text/xhtml;charset=UTF-8" / > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< meta  http-equiv = "X-UA-Compatible"  content = "IE=11" / > 
							 
						 
					
						
							
								
									
										
										
										
											2025-02-06 20:16:29 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< meta  name = "generator"  content = "Doxygen 1.13.2" / > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< meta  name = "viewport"  content = "width=device-width, initial-scale=1" / > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< title > MLX: mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h Source File< / title > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< link  href = "tabs.css"  rel = "stylesheet"  type = "text/css" / > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "jquery.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "dynsections.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "clipboard.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< link  href = "navtree.css"  rel = "stylesheet"  type = "text/css" / > 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-09 21:56:20 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "navtreedata.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "navtree.js" > < / script > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "resize.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "cookie.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< link  href = "search/search.css"  rel = "stylesheet"  type = "text/css" / > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "search/searchdata.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "search/search.js" > < / script > 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-09 21:56:20 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  $(function() { init_search(); });
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license-end */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / script > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< link  href = "doxygen.css"  rel = "stylesheet"  type = "text/css"  / > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / head > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< body > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "top" > <!--  do not remove this div, it is closed by doxygen!  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "titlearea" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< table  cellspacing = "0"  cellpadding = "0" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								 < tbody > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								 < tr  id = "projectrow" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  < td  id = "projectalign" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								   < div  id = "projectname" > MLX
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								   < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  < / td > 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-09 21:56:20 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								    < td >         < div  id = "MSearchBox"  class = "MSearchBoxInactive" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        < span  class = "left" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          < span  id = "MSearchSelect"                 onmouseover = "return searchBox.OnSearchSelectShow()"                 onmouseout = "return searchBox.OnSearchSelectHide()" >   < / span > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          < input  type = "text"  id = "MSearchField"  value = ""  placeholder = "Search"  accesskey = "S" 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								               onfocus="searchBox.OnSearchFieldFocus(true)" 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								               onblur="searchBox.OnSearchFieldFocus(false)" 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								               onkeyup="searchBox.OnSearchFieldChange(event)"/>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          < / span > < span  class = "right" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            < a  id = "MSearchClose"  href = "javascript:searchBox.CloseResultsWindow()" > < img  id = "MSearchCloseImg"  border = "0"  src = "search/close.svg"  alt = "" / > < / a > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          < / span > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / td > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								 < / tr > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								 < / tbody > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / table > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<!--  end header part  --> 
							 
						 
					
						
							
								
									
										
										
										
											2025-02-06 20:16:29 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								<!--  Generated by Doxygen 1.13.2  --> 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								var searchBox = new SearchBox("searchBox", "search/",'.html');
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license-end */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								$(function() { codefold.init(0); });
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license-end */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / script > 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-09 21:56:20 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< / div > <!--  top  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "side-nav"  class = "ui-resizable side-nav-resizable" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  < div  id = "nav-tree" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    < div  id = "nav-tree-contents" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      < div  id = "nav-sync"  class = "sync" > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  < div  id = "splitbar"  style = "-moz-user-select:none;"  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								       class="ui-resizable-handle">
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
							 
						 
					
						
							
								
									
										
										
										
											2025-01-09 21:56:20 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								$(function(){initNavTree('steel__attention_8h_source.html',''); initResizable(true); });
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								/* @license-end */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / script > 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-09 21:56:20 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< div  id = "doc-content" > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								<!--  window showing the filter options  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "MSearchSelectWindow" 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								     onmouseover="return searchBox.OnSearchSelectShow()"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								     onmouseout="return searchBox.OnSearchSelectHide()"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								     onkeydown="return searchBox.OnSearchSelectKey(event)">
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<!--  iframe showing the search results (closed by default)  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "MSearchResultsWindow" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "MSearchResults" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "SRPage" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "SRIndex" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "SRResults" > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "SRStatus"  id = "Loading" > Loading...< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "SRStatus"  id = "Searching" > Searching...< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "SRStatus"  id = "NoMatches" > No Matches< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "header" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  < div  class = "headertitle" > < div  class = "title" > steel_attention.h< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > <!-- header --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "contents" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< a  href = "steel__attention_8h.html" > Go to the documentation of this file.< / a > < div  class = "fragment" > < div  class = "line" > < a  id = "l00001"  name = "l00001" > < / a > < span  class = "lineno" >     1< / span > < span  class = "comment" > // Copyright © 2024 Apple Inc.< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00002"  name = "l00002" > < / a > < span  class = "lineno" >     2< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00003"  name = "l00003" > < / a > < span  class = "lineno" >     3< / span > < span  class = "keyword" > using namespace < / span > < a  class = "code hl_namespace"  href = "namespacemlx_1_1steel.html" > mlx::steel< / a > ;< / div > 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-09 21:56:20 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00004"  name = "l00004" > < / a > < span  class = "lineno" >     4< / span > < / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00006"  name = "l00006" > < / a > < span  class = "lineno" >     6< / span > < span  class = "comment" > // GEMM kernels< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00008"  name = "l00008" > < / a > < span  class = "lineno" >     8< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00009"  name = "l00009" > < / a > < span  class = "lineno" > < a  class = "line"  href = "steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982" >     9< / a > < / span > constant < span  class = "keywordtype" > bool< / span >  < a  class = "code hl_variable"  href = "steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982" > align_Q< / a >  [[function_constant(200)]];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00010"  name = "l00010" > < / a > < span  class = "lineno" > < a  class = "line"  href = "steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416" >    10< / a > < / span > constant < span  class = "keywordtype" > bool< / span >  < a  class = "code hl_variable"  href = "steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416" > align_K< / a >  [[function_constant(201)]];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00011"  name = "l00011" > < / a > < span  class = "lineno" >    11< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00012"  name = "l00012" > < / a > < span  class = "lineno" >    12< / span > < span  class = "keyword" > template< / span >  < < span  class = "keyword" > typename< / span >  T> < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00013"  data-start = "{"  data-end = "};" > 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-09 21:56:20 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00013"  name = "l00013" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_transform_scale.html" >    13< / a > < / span > < span  class = "keyword" > struct < / span > < a  class = "code hl_function"  href = "struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70" > TransformScale< / a >  {< / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00014"  name = "l00014" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6" >    14< / a > < / span >   T < a  class = "code hl_variable"  href = "struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6" > scale< / a > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00015"  name = "l00015" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70" >    15< / a > < / span >   METAL_FUNC < a  class = "code hl_function"  href = "struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70" > TransformScale< / a > (T scale_) : < a  class = "code hl_variable"  href = "struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6" > scale< / a > (scale_) {}< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00016"  name = "l00016" > < / a > < span  class = "lineno" >    16< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00017"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00017"  name = "l00017" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16" >    17< / a > < / span >   METAL_FUNC T < a  class = "code hl_function"  href = "struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16" > apply< / a > (T x)< span  class = "keyword" >  const < / span > {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00018"  name = "l00018" > < / a > < span  class = "lineno" >    18< / span >     < span  class = "keywordflow" > return< / span >  < a  class = "code hl_variable"  href = "struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6" > scale< / a >  * x;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00019"  name = "l00019" > < / a > < span  class = "lineno" >    19< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00020"  name = "l00020" > < / a > < span  class = "lineno" >    20< / span > };< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00021"  name = "l00021" > < / a > < span  class = "lineno" >    21< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00022"  data-start = "{"  data-end = "};" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00022"  name = "l00022" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_max_op.html" >    22< / a > < / span > < span  class = "keyword" > struct < / span > < a  class = "code hl_struct"  href = "struct_max_op.html" > MaxOp< / a >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00023"  name = "l00023" > < / a > < span  class = "lineno" >    23< / span >   < span  class = "keyword" > template< / span >  < < span  class = "keyword" > typename< / span >  T> < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00024"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00024"  name = "l00024" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e" >    24< / a > < / span >   METAL_FUNC < span  class = "keyword" > static< / span >  < span  class = "keyword" > constexpr< / span >  T < a  class = "code hl_function"  href = "struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e" > apply< / a > (T x, T y) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00025"  name = "l00025" > < / a > < span  class = "lineno" >    25< / span >     < span  class = "keywordflow" > return< / span >  < a  class = "code hl_function"  href = "namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b" > metal::max< / a > (x, y);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00026"  name = "l00026" > < / a > < span  class = "lineno" >    26< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00027"  name = "l00027" > < / a > < span  class = "lineno" >    27< / span > };< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00028"  name = "l00028" > < / a > < span  class = "lineno" >    28< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00029"  data-start = "{"  data-end = "};" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00029"  name = "l00029" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_sum_op.html" >    29< / a > < / span > < span  class = "keyword" > struct < / span > < a  class = "code hl_struct"  href = "struct_sum_op.html" > SumOp< / a >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00030"  name = "l00030" > < / a > < span  class = "lineno" >    30< / span >   < span  class = "keyword" > template< / span >  < < span  class = "keyword" > typename< / span >  T> < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00031"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00031"  name = "l00031" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d" >    31< / a > < / span >   METAL_FUNC < span  class = "keyword" > static< / span >  < span  class = "keyword" > constexpr< / span >  T < a  class = "code hl_function"  href = "struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d" > apply< / a > (T x, T y) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00032"  name = "l00032" > < / a > < span  class = "lineno" >    32< / span >     < span  class = "keywordflow" > return< / span >  x + y;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00033"  name = "l00033" > < / a > < span  class = "lineno" >    33< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00034"  name = "l00034" > < / a > < span  class = "lineno" >    34< / span > };< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00035"  name = "l00035" > < / a > < span  class = "lineno" >    35< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00036"  data-start = "{"  data-end = "};" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00036"  name = "l00036" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_mul_op.html" >    36< / a > < / span > < span  class = "keyword" > struct < / span > < a  class = "code hl_struct"  href = "struct_mul_op.html" > MulOp< / a >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00037"  name = "l00037" > < / a > < span  class = "lineno" >    37< / span >   < span  class = "keyword" > template< / span >  < < span  class = "keyword" > typename< / span >  T> < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00038"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00038"  name = "l00038" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756" >    38< / a > < / span >   METAL_FUNC < span  class = "keyword" > static< / span >  < span  class = "keyword" > constexpr< / span >  T < a  class = "code hl_function"  href = "struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756" > apply< / a > (T x, T y) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00039"  name = "l00039" > < / a > < span  class = "lineno" >    39< / span >     < span  class = "keywordflow" > return< / span >  x * y;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00040"  name = "l00040" > < / a > < span  class = "lineno" >    40< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00041"  name = "l00041" > < / a > < span  class = "lineno" >    41< / span > };< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00042"  name = "l00042" > < / a > < span  class = "lineno" >    42< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00043"  data-start = "{"  data-end = "};" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00043"  name = "l00043" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_sub_op.html" >    43< / a > < / span > < span  class = "keyword" > struct < / span > < a  class = "code hl_struct"  href = "struct_sub_op.html" > SubOp< / a >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00044"  name = "l00044" > < / a > < span  class = "lineno" >    44< / span >   < span  class = "keyword" > template< / span >  < < span  class = "keyword" > typename< / span >  T> < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00045"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00045"  name = "l00045" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143" >    45< / a > < / span >   METAL_FUNC < span  class = "keyword" > static< / span >  < span  class = "keyword" > constexpr< / span >  T < a  class = "code hl_function"  href = "struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143" > apply< / a > (T x, T y) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00046"  name = "l00046" > < / a > < span  class = "lineno" >    46< / span >     < span  class = "keywordflow" > return< / span >  x - y;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00047"  name = "l00047" > < / a > < span  class = "lineno" >    47< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00048"  name = "l00048" > < / a > < span  class = "lineno" >    48< / span > };< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00049"  name = "l00049" > < / a > < span  class = "lineno" >    49< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00050"  data-start = "{"  data-end = "};" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00050"  name = "l00050" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_exp_sub_op.html" >    50< / a > < / span > < span  class = "keyword" > struct < / span > < a  class = "code hl_struct"  href = "struct_exp_sub_op.html" > ExpSubOp< / a >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00051"  name = "l00051" > < / a > < span  class = "lineno" >    51< / span >   < span  class = "keyword" > template< / span >  < < span  class = "keyword" > typename< / span >  T> < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00052"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00052"  name = "l00052" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334" >    52< / a > < / span >   METAL_FUNC < span  class = "keyword" > static< / span >  < span  class = "keyword" > constexpr< / span >  T < a  class = "code hl_function"  href = "struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334" > apply< / a > (T x, T y) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00053"  name = "l00053" > < / a > < span  class = "lineno" >    53< / span >     < span  class = "keywordflow" > return< / span >  fast::exp(x - y);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00054"  name = "l00054" > < / a > < span  class = "lineno" >    54< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00055"  name = "l00055" > < / a > < span  class = "lineno" >    55< / span > };< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00056"  name = "l00056" > < / a > < span  class = "lineno" >    56< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00057"  data-start = "{"  data-end = "};" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00057"  name = "l00057" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_div_op.html" >    57< / a > < / span > < span  class = "keyword" > struct < / span > < a  class = "code hl_struct"  href = "struct_div_op.html" > DivOp< / a >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00058"  name = "l00058" > < / a > < span  class = "lineno" >    58< / span >   < span  class = "keyword" > template< / span >  < < span  class = "keyword" > typename< / span >  T> < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00059"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00059"  name = "l00059" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221" >    59< / a > < / span >   METAL_FUNC < span  class = "keyword" > static< / span >  < span  class = "keyword" > constexpr< / span >  T < a  class = "code hl_function"  href = "struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221" > apply< / a > (T x, T y) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00060"  name = "l00060" > < / a > < span  class = "lineno" >    60< / span >     < span  class = "keywordflow" > return< / span >  x / y;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00061"  name = "l00061" > < / a > < span  class = "lineno" >    61< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00062"  name = "l00062" > < / a > < span  class = "lineno" >    62< / span > };< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00063"  name = "l00063" > < / a > < span  class = "lineno" >    63< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00064"  name = "l00064" > < / a > < span  class = "lineno" >    64< / span > < span  class = "comment" > // clang-format off< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00065"  name = "l00065" > < / a > < span  class = "lineno" >    65< / span > < span  class = "keyword" > template< / span >  < < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00066"  name = "l00066" > < / a > < span  class = "lineno" >    66< / span >     < span  class = "keyword" > typename< / span >  T,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00067"  name = "l00067" > < / a > < span  class = "lineno" >    67< / span >     < span  class = "keywordtype" > int< / span >  BQ,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00068"  name = "l00068" > < / a > < span  class = "lineno" >    68< / span >     < span  class = "keywordtype" > int< / span >  BK,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00069"  name = "l00069" > < / a > < span  class = "lineno" >    69< / span >     < span  class = "keywordtype" > int< / span >  BD,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00070"  name = "l00070" > < / a > < span  class = "lineno" >    70< / span >     < span  class = "keywordtype" > int< / span >  WM,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00071"  name = "l00071" > < / a > < span  class = "lineno" >    71< / span >     < span  class = "keywordtype" > int< / span >  WN,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00072"  name = "l00072" > < / a > < span  class = "lineno" >    72< / span >     < span  class = "keyword" > typename< / span >  AccumType = < span  class = "keywordtype" > float< / span > > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00073"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00073"  name = "l00073" > < / a > < span  class = "lineno" > < a  class = "line"  href = "steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33" >    73< / a > < / span > [[kernel, max_total_threads_per_threadgroup(WM * WN * 32)]] < span  class = "keywordtype" > void< / span >  < a  class = "code hl_function"  href = "steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33" > attention< / a > (< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00074"  name = "l00074" > < / a > < span  class = "lineno" >    74< / span >     < span  class = "keyword" > const< / span >  device T* Q [[buffer(0)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00075"  name = "l00075" > < / a > < span  class = "lineno" >    75< / span >     < span  class = "keyword" > const< / span >  device T* K [[buffer(1)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00076"  name = "l00076" > < / a > < span  class = "lineno" >    76< / span >     < span  class = "keyword" > const< / span >  device T* V [[buffer(2)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00077"  name = "l00077" > < / a > < span  class = "lineno" >    77< / span >     device T* O [[buffer(3)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00078"  name = "l00078" > < / a > < span  class = "lineno" >    78< / span >     < span  class = "keyword" > const< / span >  constant < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_attn_params.html" > AttnParams< / a > * params [[buffer(4)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00079"  name = "l00079" > < / a > < span  class = "lineno" >    79< / span >     uint simd_lane_id [[thread_index_in_simdgroup]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00080"  name = "l00080" > < / a > < span  class = "lineno" >    80< / span >     uint simd_group_id [[simdgroup_index_in_threadgroup]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00081"  name = "l00081" > < / a > < span  class = "lineno" >    81< / span >     uint3 tid [[threadgroup_position_in_grid]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00082"  name = "l00082" > < / a > < span  class = "lineno" >    82< / span >     uint3 lid [[thread_position_in_threadgroup]]) { < span  class = "comment" > // clang-format on< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00083"  name = "l00083" > < / a > < span  class = "lineno" >    83< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00084"  name = "l00084" > < / a > < span  class = "lineno" >    84< / span >   < span  class = "comment" > // Pacifying compiler< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00085"  name = "l00085" > < / a > < span  class = "lineno" >    85< / span >   (void)lid;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00086"  name = "l00086" > < / a > < span  class = "lineno" >    86< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00087"  name = "l00087" > < / a > < span  class = "lineno" >    87< / span >   < span  class = "comment" > // Move to correct block< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00088"  name = "l00088" > < / a > < span  class = "lineno" >    88< / span >   ulong3 tidl{tid.x, tid.y, tid.z};< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00089"  name = "l00089" > < / a > < span  class = "lineno" >    89< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00090"  name = "l00090" > < / a > < span  class = "lineno" >    90< / span >   Q += tidl.z * params-> Q_strides[0] + < span  class = "comment" > // Batch< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00091"  name = "l00091" > < / a > < span  class = "lineno" >    91< / span >       tidl.y * params-> Q_strides[1] + < span  class = "comment" > // Head< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00092"  name = "l00092" > < / a > < span  class = "lineno" >    92< / span >       tidl.x * BQ * params-> Q_strides[2]; < span  class = "comment" > // Sequence< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00093"  name = "l00093" > < / a > < span  class = "lineno" >    93< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00094"  name = "l00094" > < / a > < span  class = "lineno" >    94< / span >   ulong kv_head_idx = int(tid.y) / params-> gqa_factor;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00095"  name = "l00095" > < / a > < span  class = "lineno" >    95< / span >   K += tidl.z * params-> K_strides[0] + < span  class = "comment" > // Batch< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00096"  name = "l00096" > < / a > < span  class = "lineno" >    96< / span >       kv_head_idx * params-> K_strides[1]; < span  class = "comment" > // Head< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00097"  name = "l00097" > < / a > < span  class = "lineno" >    97< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00098"  name = "l00098" > < / a > < span  class = "lineno" >    98< / span >   V += tidl.z * params-> V_strides[0] + < span  class = "comment" > // Batch< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00099"  name = "l00099" > < / a > < span  class = "lineno" >    99< / span >       kv_head_idx * params-> V_strides[1]; < span  class = "comment" > // Head< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00100"  name = "l00100" > < / a > < span  class = "lineno" >   100< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00101"  name = "l00101" > < / a > < span  class = "lineno" >   101< / span >   O += tidl.z * params-> O_strides[0] + < span  class = "comment" > // Batch< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00102"  name = "l00102" > < / a > < span  class = "lineno" >   102< / span >       tidl.y * params-> O_strides[1] + < span  class = "comment" > // Head< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00103"  name = "l00103" > < / a > < span  class = "lineno" >   103< / span >       tidl.x * BQ * params-> O_strides[2]; < span  class = "comment" > // Sequence< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00104"  name = "l00104" > < / a > < span  class = "lineno" >   104< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00105"  name = "l00105" > < / a > < span  class = "lineno" >   105< / span >   < span  class = "comment" > // Prepare threadgroup memory< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00106"  name = "l00106" > < / a > < span  class = "lineno" >   106< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  padQ = 0; < span  class = "comment" > // 16 / sizeof(T);< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00107"  name = "l00107" > < / a > < span  class = "lineno" >   107< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  padK = 0; < span  class = "comment" > // 16 / sizeof(T);< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00108"  name = "l00108" > < / a > < span  class = "lineno" >   108< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  padV = 0; < span  class = "comment" > // 16 / sizeof(T);< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00109"  name = "l00109" > < / a > < span  class = "lineno" >   109< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00110"  name = "l00110" > < / a > < span  class = "lineno" >   110< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  LDQ_tgp = BD + padQ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00111"  name = "l00111" > < / a > < span  class = "lineno" >   111< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  LDK_tgp = BK + padK;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00112"  name = "l00112" > < / a > < span  class = "lineno" >   112< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  LDV_tgp = BD + padV;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00113"  name = "l00113" > < / a > < span  class = "lineno" >   113< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00114"  name = "l00114" > < / a > < span  class = "lineno" >   114< / span >   threadgroup T Qs[BQ * (BD + padQ)];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00115"  name = "l00115" > < / a > < span  class = "lineno" >   115< / span >   threadgroup T Ks[(BK + padK) * BD];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00116"  name = "l00116" > < / a > < span  class = "lineno" >   116< / span >   threadgroup T Vs[BK * (BD + padV)];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00117"  name = "l00117" > < / a > < span  class = "lineno" >   117< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00118"  name = "l00118" > < / a > < span  class = "lineno" >   118< / span >   < span  class = "comment" > // Prepare block loaders< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00119"  name = "l00119" > < / a > < span  class = "lineno" >   119< / span >   < span  class = "keyword" > using < / span > QBlockLoader = < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_block_loader_t.html" > BlockLoaderT< / a > < < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00120"  name = "l00120" > < / a > < span  class = "lineno" >   120< / span >       < span  class = "comment" > /* typename T = */< / span >  T,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00121"  name = "l00121" > < / a > < span  class = "lineno" >   121< / span >       < span  class = "comment" > /* short BROWS = */< / span >  BQ,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00122"  name = "l00122" > < / a > < span  class = "lineno" >   122< / span >       < span  class = "comment" > /* short BCOLS = */< / span >  BD,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00123"  name = "l00123" > < / a > < span  class = "lineno" >   123< / span >       < span  class = "comment" > /* short kDstStrRow = */< / span >  LDQ_tgp,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00124"  name = "l00124" > < / a > < span  class = "lineno" >   124< / span >       < span  class = "comment" > /* short kDstStrCol = */< / span >  1,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00125"  name = "l00125" > < / a > < span  class = "lineno" >   125< / span >       < span  class = "comment" > /* short reduction_dim = */< / span >  1,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00126"  name = "l00126" > < / a > < span  class = "lineno" >   126< / span >       < span  class = "comment" > /* short tgp_size = */< / span >  WM * WN * 32> ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00127"  name = "l00127" > < / a > < span  class = "lineno" >   127< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00128"  name = "l00128" > < / a > < span  class = "lineno" >   128< / span >   < span  class = "comment" > // K is loaded transposed< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00129"  name = "l00129" > < / a > < span  class = "lineno" >   129< / span >   < span  class = "keyword" > using < / span > KBlockLoader = < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_block_loader_t.html" > BlockLoaderT< / a > < < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00130"  name = "l00130" > < / a > < span  class = "lineno" >   130< / span >       < span  class = "comment" > /* typename T = */< / span >  T,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00131"  name = "l00131" > < / a > < span  class = "lineno" >   131< / span >       < span  class = "comment" > /* short BROWS = */< / span >  BK,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00132"  name = "l00132" > < / a > < span  class = "lineno" >   132< / span >       < span  class = "comment" > /* short BCOLS = */< / span >  BD,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00133"  name = "l00133" > < / a > < span  class = "lineno" >   133< / span >       < span  class = "comment" > /* short kDstStrRow = */< / span >  1,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00134"  name = "l00134" > < / a > < span  class = "lineno" >   134< / span >       < span  class = "comment" > /* short kDstStrCol = */< / span >  LDK_tgp,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00135"  name = "l00135" > < / a > < span  class = "lineno" >   135< / span >       < span  class = "comment" > /* short reduction_dim = */< / span >  0,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00136"  name = "l00136" > < / a > < span  class = "lineno" >   136< / span >       < span  class = "comment" > /* short tgp_size = */< / span >  WM * WN * 32> ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00137"  name = "l00137" > < / a > < span  class = "lineno" >   137< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00138"  name = "l00138" > < / a > < span  class = "lineno" >   138< / span >   < span  class = "keyword" > using < / span > VBlockLoader = < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_block_loader_t.html" > BlockLoaderT< / a > < < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00139"  name = "l00139" > < / a > < span  class = "lineno" >   139< / span >       < span  class = "comment" > /* typename T = */< / span >  T,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00140"  name = "l00140" > < / a > < span  class = "lineno" >   140< / span >       < span  class = "comment" > /* short BROWS = */< / span >  BK,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00141"  name = "l00141" > < / a > < span  class = "lineno" >   141< / span >       < span  class = "comment" > /* short BCOLS = */< / span >  BD,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00142"  name = "l00142" > < / a > < span  class = "lineno" >   142< / span >       < span  class = "comment" > /* short kDstStrRow = */< / span >  LDV_tgp,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00143"  name = "l00143" > < / a > < span  class = "lineno" >   143< / span >       < span  class = "comment" > /* short kDstStrCol = */< / span >  1,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00144"  name = "l00144" > < / a > < span  class = "lineno" >   144< / span >       < span  class = "comment" > /* short reduction_dim = */< / span >  0,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00145"  name = "l00145" > < / a > < span  class = "lineno" >   145< / span >       < span  class = "comment" > /* short tgp_size = */< / span >  WM * WN * 32> ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00146"  name = "l00146" > < / a > < span  class = "lineno" >   146< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00147"  name = "l00147" > < / a > < span  class = "lineno" >   147< / span >   QBlockLoader loader_q(< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00148"  name = "l00148" > < / a > < span  class = "lineno" >   148< / span >       Q, params-> Q_strides[2], Qs, simd_group_id, simd_lane_id);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00149"  name = "l00149" > < / a > < span  class = "lineno" >   149< / span >   KBlockLoader loader_k(< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00150"  name = "l00150" > < / a > < span  class = "lineno" >   150< / span >       K, params-> K_strides[2], Ks, simd_group_id, simd_lane_id);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00151"  name = "l00151" > < / a > < span  class = "lineno" >   151< / span >   VBlockLoader loader_v(< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00152"  name = "l00152" > < / a > < span  class = "lineno" >   152< / span >       V, params-> V_strides[2], Vs, simd_group_id, simd_lane_id);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00153"  name = "l00153" > < / a > < span  class = "lineno" >   153< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00154"  name = "l00154" > < / a > < span  class = "lineno" >   154< / span >   < a  class = "code hl_struct"  href = "struct_transform_scale.html" > TransformScale< T> < / a >  ts(< span  class = "keyword" > static_cast< < / span > T< span  class = "keyword" > > < / span > (params-> scale));< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00155"  name = "l00155" > < / a > < span  class = "lineno" >   155< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00156"  name = "l00156" > < / a > < span  class = "lineno" >   156< / span >   < span  class = "comment" > // Prepare MMA tiles< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00157"  name = "l00157" > < / a > < span  class = "lineno" >   157< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  kFragSize = 8; < span  class = "comment" > // MMAFrag size< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00158"  name = "l00158" > < / a > < span  class = "lineno" >   158< / span >   < span  class = "keyword" > using < / span > MMAFrag_acc_t = < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_base_m_m_a_frag.html" > BaseMMAFrag< AccumType, kFragSize, kFragSize> < / a > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00159"  name = "l00159" > < / a > < span  class = "lineno" >   159< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00160"  name = "l00160" > < / a > < span  class = "lineno" >   160< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > int< / span >  kNWarps = WM * WN;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00161"  name = "l00161" > < / a > < span  class = "lineno" >   161< / span >   < span  class = "keyword" > static_assert< / span > (< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00162"  name = "l00162" > < / a > < span  class = "lineno" >   162< / span >       BQ > = (kNWarps * kFragSize) & &  BQ % (kNWarps * kFragSize) == 0,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00163"  name = "l00163" > < / a > < span  class = "lineno" >   163< / span >       < span  class = "stringliteral" > " Each simdgroup must host atleast 1 simdgroup matrix along Q sequence." < / span > );< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00164"  name = "l00164" > < / a > < span  class = "lineno" >   164< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00165"  name = "l00165" > < / a > < span  class = "lineno" >   165< / span >   < span  class = "comment" > // Q seq frags per warp< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00166"  name = "l00166" > < / a > < span  class = "lineno" >   166< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > int< / span >  TQ = BQ / (kNWarps * kFragSize);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00167"  name = "l00167" > < / a > < span  class = "lineno" >   167< / span >   < span  class = "comment" > // KV sequence frags (all warps load the same frags)< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00168"  name = "l00168" > < / a > < span  class = "lineno" >   168< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > int< / span >  TK = BK / kFragSize;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00169"  name = "l00169" > < / a > < span  class = "lineno" >   169< / span >   < span  class = "comment" > // HeadDim frags (all warps load the same frags)< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00170"  name = "l00170" > < / a > < span  class = "lineno" >   170< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > int< / span >  TD = BD / kFragSize;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00171"  name = "l00171" > < / a > < span  class = "lineno" >   171< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00172"  name = "l00172" > < / a > < span  class = "lineno" >   172< / span >   < span  class = "keyword" > static_assert< / span > (TQ == 1, < span  class = "stringliteral" > " Check TQ" < / span > );< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00173"  name = "l00173" > < / a > < span  class = "lineno" >   173< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00174"  name = "l00174" > < / a > < span  class = "lineno" >   174< / span >   < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_m_m_a_tile.html" > MMATile< AccumType, TQ, 1, MMAFrag_acc_t> < / a >  Qtile;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00175"  name = "l00175" > < / a > < span  class = "lineno" >   175< / span >   < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_m_m_a_tile.html" > MMATile< AccumType, 1, TK, MMAFrag_acc_t> < / a >  Ktile;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00176"  name = "l00176" > < / a > < span  class = "lineno" >   176< / span >   < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_m_m_a_tile.html" > MMATile< AccumType, TQ, TK, MMAFrag_acc_t> < / a >  Stile;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00177"  name = "l00177" > < / a > < span  class = "lineno" >   177< / span >   < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_m_m_a_tile.html" > MMATile< AccumType, TK, TD, MMAFrag_acc_t> < / a >  Vtile;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00178"  name = "l00178" > < / a > < span  class = "lineno" >   178< / span >   < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_m_m_a_tile.html" > MMATile< AccumType, TQ, TD, MMAFrag_acc_t> < / a >  Otile;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00179"  name = "l00179" > < / a > < span  class = "lineno" >   179< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00180"  name = "l00180" > < / a > < span  class = "lineno" >   180< / span >   Otile.< a  class = "code hl_function"  href = "structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7" > clear< / a > ();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00181"  name = "l00181" > < / a > < span  class = "lineno" >   181< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00182"  name = "l00182" > < / a > < span  class = "lineno" >   182< / span >   < span  class = "comment" > // Prepare mma tile offsets< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00183"  name = "l00183" > < / a > < span  class = "lineno" >   183< / span >   < span  class = "keyword" > const< / span >  short2 simd_coord = MMAFrag_acc_t::get_coord(simd_lane_id);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00184"  name = "l00184" > < / a > < span  class = "lineno" >   184< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  sm = simd_coord.y;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00185"  name = "l00185" > < / a > < span  class = "lineno" >   185< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  sn = simd_coord.x;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00186"  name = "l00186" > < / a > < span  class = "lineno" >   186< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  tm = kFragSize * TQ * simd_group_id;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00187"  name = "l00187" > < / a > < span  class = "lineno" >   187< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00188"  name = "l00188" > < / a > < span  class = "lineno" >   188< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  Qs_offset = (tm + sm) * LDQ_tgp + sn;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00189"  name = "l00189" > < / a > < span  class = "lineno" >   189< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  Ks_offset = sm * LDK_tgp + sn;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00190"  name = "l00190" > < / a > < span  class = "lineno" >   190< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  Vs_offset = sm * LDV_tgp + sn;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00191"  name = "l00191" > < / a > < span  class = "lineno" >   191< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00192"  name = "l00192" > < / a > < span  class = "lineno" >   192< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  Qs_tile_stride = kFragSize;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00193"  name = "l00193" > < / a > < span  class = "lineno" >   193< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  Ks_tile_stride = kFragSize * LDK_tgp;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00194"  name = "l00194" > < / a > < span  class = "lineno" >   194< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00195"  name = "l00195" > < / a > < span  class = "lineno" >   195< / span >   threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00196"  name = "l00196" > < / a > < span  class = "lineno" >   196< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00197"  name = "l00197" > < / a > < span  class = "lineno" >   197< / span >   < span  class = "comment" > // Load Q blocks apply scale< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00198"  name = "l00198" > < / a > < span  class = "lineno" >   198< / span >   < span  class = "keywordflow" > if< / span >  (!< a  class = "code hl_variable"  href = "steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982" > align_Q< / a >  & &  < span  class = "keywordtype" > int< / span > (tid.x) == (params-> NQ_aligned)) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00199"  name = "l00199" > < / a > < span  class = "lineno" >   199< / span >     loader_q.load_safe(short2(BD, params-> qL - params-> NQ_aligned * BQ));< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00200"  name = "l00200" > < / a > < span  class = "lineno" >   200< / span >   } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00201"  name = "l00201" > < / a > < span  class = "lineno" >   201< / span >     loader_q.load_unsafe();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00202"  name = "l00202" > < / a > < span  class = "lineno" >   202< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00203"  name = "l00203" > < / a > < span  class = "lineno" >   203< / span >   loader_q.apply_inplace_op(ts);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00204"  name = "l00204" > < / a > < span  class = "lineno" >   204< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00205"  name = "l00205" > < / a > < span  class = "lineno" >   205< / span >   < span  class = "comment" > // Init row reduction variables< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00206"  name = "l00206" > < / a > < span  class = "lineno" >   206< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  kRowsPT = < span  class = "keyword" > decltype< / span > (Stile)::kRowsPerThread;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00207"  name = "l00207" > < / a > < span  class = "lineno" >   207< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00208"  name = "l00208" > < / a > < span  class = "lineno" >   208< / span >   AccumType max_score[kRowsPT];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00209"  name = "l00209" > < / a > < span  class = "lineno" >   209< / span >   AccumType sum_score[kRowsPT] = {0};< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00210"  name = "l00210" > < / a > < span  class = "lineno" >   210< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00211"  name = "l00211" > < / a > < span  class = "lineno" >   211< / span >   < span  class = "comment" > // Init to -Inf< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00212"  name = "l00212" > < / a > < span  class = "lineno" >   212< / span >   < a  class = "code hl_define"  href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00213"  name = "l00213" > < / a > < span  class = "lineno" >   213< / span >   < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  i = 0; i <  kRowsPT; ++i) {< / div > 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-09 21:56:20 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00214"  name = "l00214" > < / a > < span  class = "lineno" >   214< / span >     max_score[i] = < a  class = "code hl_variable"  href = "struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e" > Limits< AccumType> ::min< / a > ;< / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00215"  name = "l00215" > < / a > < span  class = "lineno" >   215< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00216"  name = "l00216" > < / a > < span  class = "lineno" >   216< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00217"  name = "l00217" > < / a > < span  class = "lineno" >   217< / span >   < span  class = "comment" > // Loop over KV seq length< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00218"  name = "l00218" > < / a > < span  class = "lineno" >   218< / span >   < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > int< / span >  kb = 0; kb <  params-> NK; kb++) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00219"  name = "l00219" > < / a > < span  class = "lineno" >   219< / span >     < span  class = "comment" > // Load K block and apply scale< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00220"  name = "l00220" > < / a > < span  class = "lineno" >   220< / span >     threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00221"  name = "l00221" > < / a > < span  class = "lineno" >   221< / span >     < span  class = "keywordflow" > if< / span >  (!< a  class = "code hl_variable"  href = "steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416" > align_K< / a >  & &  kb == (params-> NK_aligned)) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00222"  name = "l00222" > < / a > < span  class = "lineno" >   222< / span >       loader_k.load_safe(short2(BD, params-> kL - params-> NK_aligned * BK));< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00223"  name = "l00223" > < / a > < span  class = "lineno" >   223< / span >     } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00224"  name = "l00224" > < / a > < span  class = "lineno" >   224< / span >       loader_k.load_unsafe();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00225"  name = "l00225" > < / a > < span  class = "lineno" >   225< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00226"  name = "l00226" > < / a > < span  class = "lineno" >   226< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00227"  name = "l00227" > < / a > < span  class = "lineno" >   227< / span >     threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00228"  name = "l00228" > < / a > < span  class = "lineno" >   228< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00229"  name = "l00229" > < / a > < span  class = "lineno" >   229< / span >     < span  class = "comment" > // Do S = Q @ K.T< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00230"  name = "l00230" > < / a > < span  class = "lineno" >   230< / span >     Stile.< a  class = "code hl_function"  href = "structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7" > clear< / a > ();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00231"  name = "l00231" > < / a > < span  class = "lineno" >   231< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00232"  name = "l00232" > < / a > < span  class = "lineno" >   232< / span >     < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  dd = 0; dd <  TD; dd++) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00233"  name = "l00233" > < / a > < span  class = "lineno" >   233< / span >       simdgroup_barrier(mem_flags::mem_none);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00234"  name = "l00234" > < / a > < span  class = "lineno" >   234< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00235"  name = "l00235" > < / a > < span  class = "lineno" >   235< / span >       Qtile.template load< T, 1, 1, LDQ_tgp, 1> (< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00236"  name = "l00236" > < / a > < span  class = "lineno" >   236< / span >           & Qs[Qs_offset + dd * Qs_tile_stride]);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00237"  name = "l00237" > < / a > < span  class = "lineno" >   237< / span >       Ktile.template load< T, 1, 1, LDK_tgp, 1> (< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00238"  name = "l00238" > < / a > < span  class = "lineno" >   238< / span >           & Ks[Ks_offset + dd * Ks_tile_stride]);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00239"  name = "l00239" > < / a > < span  class = "lineno" >   239< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00240"  name = "l00240" > < / a > < span  class = "lineno" >   240< / span >       simdgroup_barrier(mem_flags::mem_none);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00241"  name = "l00241" > < / a > < span  class = "lineno" >   241< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00242"  name = "l00242" > < / a > < span  class = "lineno" >   242< / span >       < a  class = "code hl_function"  href = "namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad" > tile_matmad< / a > (Stile, Qtile, Ktile, Stile);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00243"  name = "l00243" > < / a > < span  class = "lineno" >   243< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00244"  name = "l00244" > < / a > < span  class = "lineno" >   244< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00245"  name = "l00245" > < / a > < span  class = "lineno" >   245< / span >     < span  class = "comment" > // Mask out of length sequence< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00246"  name = "l00246" > < / a > < span  class = "lineno" >   246< / span >     < span  class = "keywordflow" > if< / span >  (!< a  class = "code hl_variable"  href = "steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416" > align_K< / a >  & &  kb == (params-> NK_aligned)) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00247"  name = "l00247" > < / a > < span  class = "lineno" >   247< / span >       < span  class = "keyword" > using < / span > stile_t = < span  class = "keyword" > decltype< / span > (Stile);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00248"  name = "l00248" > < / a > < span  class = "lineno" >   248< / span >       < span  class = "keyword" > using < / span > selem_t = < span  class = "keyword" > typename< / span >  stile_t::elem_type;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00249"  name = "l00249" > < / a > < span  class = "lineno" >   249< / span >       < span  class = "keyword" > constexpr< / span >  < span  class = "keyword" > auto< / span >  neg_inf = -metal::numeric_limits< selem_t> ::infinity();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00250"  name = "l00250" > < / a > < span  class = "lineno" >   250< / span >       < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  lim = params-> kL - params-> NK_aligned * BK;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00251"  name = "l00251" > < / a > < span  class = "lineno" >   251< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00252"  name = "l00252" > < / a > < span  class = "lineno" >   252< / span >       < a  class = "code hl_define"  href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00253"  name = "l00253" > < / a > < span  class = "lineno" >   253< / span >       < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  i = 0; i <  stile_t::kTileRows; i++) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00254"  name = "l00254" > < / a > < span  class = "lineno" >   254< / span >         < a  class = "code hl_define"  href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00255"  name = "l00255" > < / a > < span  class = "lineno" >   255< / span >         < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  j = 0; j <  stile_t::kTileCols; j++) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00256"  name = "l00256" > < / a > < span  class = "lineno" >   256< / span >           < span  class = "keywordtype" > short< / span >  col_pos = sn + (j * stile_t::kFragCols);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00257"  name = "l00257" > < / a > < span  class = "lineno" >   257< / span >           < a  class = "code hl_define"  href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00258"  name = "l00258" > < / a > < span  class = "lineno" >   258< / span >           < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  jj = 0; jj <  stile_t::MMAFrag_t::kElemCols; jj++) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00259"  name = "l00259" > < / a > < span  class = "lineno" >   259< / span >             < span  class = "keywordflow" > if< / span >  ((col_pos + jj) > = lim) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00260"  name = "l00260" > < / a > < span  class = "lineno" >   260< / span >               Stile.< a  class = "code hl_function"  href = "structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4" > frag_at< / a > (i, j)[jj] = neg_inf;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00261"  name = "l00261" > < / a > < span  class = "lineno" >   261< / span >             }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00262"  name = "l00262" > < / a > < span  class = "lineno" >   262< / span >           }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00263"  name = "l00263" > < / a > < span  class = "lineno" >   263< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00264"  name = "l00264" > < / a > < span  class = "lineno" >   264< / span >       }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00265"  name = "l00265" > < / a > < span  class = "lineno" >   265< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00266"  name = "l00266" > < / a > < span  class = "lineno" >   266< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00267"  name = "l00267" > < / a > < span  class = "lineno" >   267< / span >     simdgroup_barrier(mem_flags::mem_none);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00268"  name = "l00268" > < / a > < span  class = "lineno" >   268< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00269"  name = "l00269" > < / a > < span  class = "lineno" >   269< / span >     < span  class = "comment" > // Load V blocks< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00270"  name = "l00270" > < / a > < span  class = "lineno" >   270< / span >     < span  class = "keywordflow" > if< / span >  (!< a  class = "code hl_variable"  href = "steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416" > align_K< / a >  & &  kb == (params-> NK_aligned)) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00271"  name = "l00271" > < / a > < span  class = "lineno" >   271< / span >       loader_v.load_safe(short2(BD, params-> kL - params-> NK_aligned * BK));< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00272"  name = "l00272" > < / a > < span  class = "lineno" >   272< / span >     } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00273"  name = "l00273" > < / a > < span  class = "lineno" >   273< / span >       loader_v.load_unsafe();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00274"  name = "l00274" > < / a > < span  class = "lineno" >   274< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00275"  name = "l00275" > < / a > < span  class = "lineno" >   275< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00276"  name = "l00276" > < / a > < span  class = "lineno" >   276< / span >     < span  class = "comment" > // Do softmax< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00277"  name = "l00277" > < / a > < span  class = "lineno" >   277< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00278"  name = "l00278" > < / a > < span  class = "lineno" >   278< / span >     < span  class = "comment" > // Temp variables< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00279"  name = "l00279" > < / a > < span  class = "lineno" >   279< / span >     AccumType new_max[kRowsPT];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00280"  name = "l00280" > < / a > < span  class = "lineno" >   280< / span >     AccumType factor[kRowsPT];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00281"  name = "l00281" > < / a > < span  class = "lineno" >   281< / span >     < a  class = "code hl_define"  href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00282"  name = "l00282" > < / a > < span  class = "lineno" >   282< / span >     < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  i = 0; i <  kRowsPT; ++i) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00283"  name = "l00283" > < / a > < span  class = "lineno" >   283< / span >       new_max[i] = max_score[i];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00284"  name = "l00284" > < / a > < span  class = "lineno" >   284< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00285"  name = "l00285" > < / a > < span  class = "lineno" >   285< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00286"  name = "l00286" > < / a > < span  class = "lineno" >   286< / span >     < span  class = "comment" > // Row max< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00287"  name = "l00287" > < / a > < span  class = "lineno" >   287< / span >     Stile.template row_reduce< MaxOp> (new_max);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00288"  name = "l00288" > < / a > < span  class = "lineno" >   288< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00289"  name = "l00289" > < / a > < span  class = "lineno" >   289< / span >     < span  class = "comment" > // exp(Si - rowmax(Si))< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00290"  name = "l00290" > < / a > < span  class = "lineno" >   290< / span >     Stile.template row_bin_op< ExpSubOp> (new_max);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00291"  name = "l00291" > < / a > < span  class = "lineno" >   291< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00292"  name = "l00292" > < / a > < span  class = "lineno" >   292< / span >     < span  class = "comment" > // Factor exp(rowmax(Si) - rowmax(Si-1))< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00293"  name = "l00293" > < / a > < span  class = "lineno" >   293< / span >     < a  class = "code hl_define"  href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00294"  name = "l00294" > < / a > < span  class = "lineno" >   294< / span >     < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  i = 0; i <  kRowsPT; ++i) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00295"  name = "l00295" > < / a > < span  class = "lineno" >   295< / span >       factor[i] = fast::exp(max_score[i] - new_max[i]);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00296"  name = "l00296" > < / a > < span  class = "lineno" >   296< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00297"  name = "l00297" > < / a > < span  class = "lineno" >   297< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00298"  name = "l00298" > < / a > < span  class = "lineno" >   298< / span >     < span  class = "comment" > // Save max for next iteration< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00299"  name = "l00299" > < / a > < span  class = "lineno" >   299< / span >     < a  class = "code hl_define"  href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00300"  name = "l00300" > < / a > < span  class = "lineno" >   300< / span >     < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  i = 0; i <  kRowsPT; ++i) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00301"  name = "l00301" > < / a > < span  class = "lineno" >   301< / span >       max_score[i] = new_max[i];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00302"  name = "l00302" > < / a > < span  class = "lineno" >   302< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00303"  name = "l00303" > < / a > < span  class = "lineno" >   303< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00304"  name = "l00304" > < / a > < span  class = "lineno" >   304< / span >     < span  class = "comment" > // Row Sum< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00305"  name = "l00305" > < / a > < span  class = "lineno" >   305< / span >     AccumType sum_score_tmp[kRowsPT] = {0};< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00306"  name = "l00306" > < / a > < span  class = "lineno" >   306< / span >     Stile.template row_reduce< SumOp> (sum_score_tmp);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00307"  name = "l00307" > < / a > < span  class = "lineno" >   307< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00308"  name = "l00308" > < / a > < span  class = "lineno" >   308< / span >     < span  class = "comment" > // Update norm< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00309"  name = "l00309" > < / a > < span  class = "lineno" >   309< / span >     < a  class = "code hl_define"  href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00310"  name = "l00310" > < / a > < span  class = "lineno" >   310< / span >     < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  i = 0; i <  kRowsPT; ++i) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00311"  name = "l00311" > < / a > < span  class = "lineno" >   311< / span >       sum_score[i] = sum_score[i] * factor[i] + sum_score_tmp[i];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00312"  name = "l00312" > < / a > < span  class = "lineno" >   312< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00313"  name = "l00313" > < / a > < span  class = "lineno" >   313< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00314"  name = "l00314" > < / a > < span  class = "lineno" >   314< / span >     < span  class = "comment" > // Update O< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00315"  name = "l00315" > < / a > < span  class = "lineno" >   315< / span >     Otile.template row_bin_op< MulOp> (factor);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00316"  name = "l00316" > < / a > < span  class = "lineno" >   316< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00317"  name = "l00317" > < / a > < span  class = "lineno" >   317< / span >     < span  class = "comment" > // Load V into registers< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00318"  name = "l00318" > < / a > < span  class = "lineno" >   318< / span >     threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00319"  name = "l00319" > < / a > < span  class = "lineno" >   319< / span >     Vtile.template load< T, 1, 1, LDV_tgp, 1> (& Vs[Vs_offset]);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00320"  name = "l00320" > < / a > < span  class = "lineno" >   320< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00321"  name = "l00321" > < / a > < span  class = "lineno" >   321< / span >     simdgroup_barrier(mem_flags::mem_none);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00322"  name = "l00322" > < / a > < span  class = "lineno" >   322< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00323"  name = "l00323" > < / a > < span  class = "lineno" >   323< / span >     < span  class = "comment" > // Do O = S @ V< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00324"  name = "l00324" > < / a > < span  class = "lineno" >   324< / span >     < a  class = "code hl_function"  href = "namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad" > tile_matmad< / a > (Otile, Stile, Vtile, Otile);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00325"  name = "l00325" > < / a > < span  class = "lineno" >   325< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00326"  name = "l00326" > < / a > < span  class = "lineno" >   326< / span >     < span  class = "comment" > // Prepare for next iteration< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00327"  name = "l00327" > < / a > < span  class = "lineno" >   327< / span >     loader_k.next();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00328"  name = "l00328" > < / a > < span  class = "lineno" >   328< / span >     loader_v.next();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00329"  name = "l00329" > < / a > < span  class = "lineno" >   329< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00330"  name = "l00330" > < / a > < span  class = "lineno" >   330< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00331"  name = "l00331" > < / a > < span  class = "lineno" >   331< / span >   < span  class = "comment" > // Normalize output< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00332"  name = "l00332" > < / a > < span  class = "lineno" >   332< / span >   Otile.template row_bin_op< DivOp> (sum_score);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00333"  name = "l00333" > < / a > < span  class = "lineno" >   333< / span >   threadgroup_barrier(mem_flags::mem_none);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00334"  name = "l00334" > < / a > < span  class = "lineno" >   334< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00335"  name = "l00335" > < / a > < span  class = "lineno" >   335< / span >   < span  class = "comment" > // Store results< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00336"  name = "l00336" > < / a > < span  class = "lineno" >   336< / span >   O += (tm + sm) * params-> O_strides[2] + sn;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00337"  name = "l00337" > < / a > < span  class = "lineno" >   337< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00338"  name = "l00338" > < / a > < span  class = "lineno" >   338< / span >   < span  class = "keywordflow" > if< / span >  (!< a  class = "code hl_variable"  href = "steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982" > align_Q< / a >  & &  < span  class = "keywordtype" > int< / span > (tid.x) == (params-> NQ_aligned)) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00339"  name = "l00339" > < / a > < span  class = "lineno" >   339< / span >     < span  class = "keyword" > auto< / span >  dst_tile_dims =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00340"  name = "l00340" > < / a > < span  class = "lineno" >   340< / span >         short2(BD - sn, params-> qL - BQ * params-> NQ_aligned - (tm + sm));< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00341"  name = "l00341" > < / a > < span  class = "lineno" >   341< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00342"  name = "l00342" > < / a > < span  class = "lineno" >   342< / span >     < span  class = "keywordflow" > if< / span >  (dst_tile_dims.x < = 0 || dst_tile_dims.y < = 0)< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00343"  name = "l00343" > < / a > < span  class = "lineno" >   343< / span >       < span  class = "keywordflow" > return< / span > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00344"  name = "l00344" > < / a > < span  class = "lineno" >   344< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00345"  name = "l00345" > < / a > < span  class = "lineno" >   345< / span >     Otile.template store_safe< T, 1, 1> (O, params-> O_strides[2], dst_tile_dims);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00346"  name = "l00346" > < / a > < span  class = "lineno" >   346< / span >   } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00347"  name = "l00347" > < / a > < span  class = "lineno" >   347< / span >     Otile.template store< T, 1, 1> (O, params-> O_strides[2]);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00348"  name = "l00348" > < / a > < span  class = "lineno" >   348< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00349"  name = "l00349" > < / a > < span  class = "lineno" >   349< / span > }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b" > < div  class = "ttname" > < a  href = "namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b" > metal::max< / a > < / div > < div  class = "ttdeci" > METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)< / div > < div  class = "ttdef" > < b > Definition< / b >  bf16_math.h:232< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "anamespacemlx_1_1steel_html" > < div  class = "ttname" > < a  href = "namespacemlx_1_1steel.html" > mlx::steel< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  attn.h:19< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "anamespacemlx_1_1steel_html_ad583e6038efc119542410f43b603d4ad" > < div  class = "ttname" > < a  href = "namespacemlx_1_1steel.html#ad583e6038efc119542410f43b603d4ad" > mlx::steel::tile_matmad< / a > < / div > < div  class = "ttdeci" > METAL_FUNC void tile_matmad(thread MMATile<  T, M, N >  & D, thread MMATile<  U, M, K >  & A, thread MMATile<  U, K, N >  & B, thread MMATile<  T, M, N >  & C)< / div > < div  class = "ttdef" > < b > Definition< / b >  mma.h:413< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "asteel_2defines_8h_html_a5a5c3095b132a7589bc19cd5cb80e2c6" > < div  class = "ttname" > < a  href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > < div  class = "ttdeci" > #define STEEL_PRAGMA_UNROLL< / div > < div  class = "ttdef" > < b > Definition< / b >  defines.h:4< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "asteel__attention_8h_html_a171fdea1b23976453f5dc5e6b3161982" > < div  class = "ttname" > < a  href = "steel__attention_8h.html#a171fdea1b23976453f5dc5e6b3161982" > align_Q< / a > < / div > < div  class = "ttdeci" > constant bool align_Q< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:9< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "asteel__attention_8h_html_a5423b2a414f5e3c14166d568dedfbd33" > < div  class = "ttname" > < a  href = "steel__attention_8h.html#a5423b2a414f5e3c14166d568dedfbd33" > attention< / a > < / div > < div  class = "ttdeci" > void attention(const device T *Q, const device T *K, const device T *V, device T *O, const constant AttnParams *params, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:73< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "asteel__attention_8h_html_a8bdd2cecf97aa5b033152b1d0f0d2416" > < div  class = "ttname" > < a  href = "steel__attention_8h.html#a8bdd2cecf97aa5b033152b1d0f0d2416" > align_K< / a > < / div > < div  class = "ttdeci" > constant bool align_K< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:10< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_div_op_html" > < div  class = "ttname" > < a  href = "struct_div_op.html" > DivOp< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:57< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_div_op_html_a1b8df47142dc6ea15315ce3a310f9221" > < div  class = "ttname" > < a  href = "struct_div_op.html#a1b8df47142dc6ea15315ce3a310f9221" > DivOp::apply< / a > < / div > < div  class = "ttdeci" > static METAL_FUNC constexpr T apply(T x, T y)< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:59< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_exp_sub_op_html" > < div  class = "ttname" > < a  href = "struct_exp_sub_op.html" > ExpSubOp< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:50< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_exp_sub_op_html_a00e457a01cb38f959dfd789455e7f334" > < div  class = "ttname" > < a  href = "struct_exp_sub_op.html#a00e457a01cb38f959dfd789455e7f334" > ExpSubOp::apply< / a > < / div > < div  class = "ttdeci" > static METAL_FUNC constexpr T apply(T x, T y)< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:52< / div > < / div > 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-09 21:56:20 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_limits_html_a6e81584ba65a4dc6ff9366b458e3a20e" > < div  class = "ttname" > < a  href = "struct_limits.html#a6e81584ba65a4dc6ff9366b458e3a20e" > Limits::min< / a > < / div > < div  class = "ttdeci" > static const constant U min< / div > < div  class = "ttdef" > < b > Definition< / b >  utils.h:25< / div > < / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_max_op_html" > < div  class = "ttname" > < a  href = "struct_max_op.html" > MaxOp< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:22< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_max_op_html_ab3d3c3040017a13c170e7bdd1ffac46e" > < div  class = "ttname" > < a  href = "struct_max_op.html#ab3d3c3040017a13c170e7bdd1ffac46e" > MaxOp::apply< / a > < / div > < div  class = "ttdeci" > static METAL_FUNC constexpr T apply(T x, T y)< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:24< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_mul_op_html" > < div  class = "ttname" > < a  href = "struct_mul_op.html" > MulOp< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:36< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_mul_op_html_a1b93d804653d92fc7e46747de9e9c756" > < div  class = "ttname" > < a  href = "struct_mul_op.html#a1b93d804653d92fc7e46747de9e9c756" > MulOp::apply< / a > < / div > < div  class = "ttdeci" > static METAL_FUNC constexpr T apply(T x, T y)< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:38< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_sub_op_html" > < div  class = "ttname" > < a  href = "struct_sub_op.html" > SubOp< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:43< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_sub_op_html_ad211f879a212ed0e98136217ca8e4143" > < div  class = "ttname" > < a  href = "struct_sub_op.html#ad211f879a212ed0e98136217ca8e4143" > SubOp::apply< / a > < / div > < div  class = "ttdeci" > static METAL_FUNC constexpr T apply(T x, T y)< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:45< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_sum_op_html" > < div  class = "ttname" > < a  href = "struct_sum_op.html" > SumOp< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:29< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_sum_op_html_aa9563a98cbbe1b1921ade0c63ab38b4d" > < div  class = "ttname" > < a  href = "struct_sum_op.html#aa9563a98cbbe1b1921ade0c63ab38b4d" > SumOp::apply< / a > < / div > < div  class = "ttdeci" > static METAL_FUNC constexpr T apply(T x, T y)< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:31< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_transform_scale_html" > < div  class = "ttname" > < a  href = "struct_transform_scale.html" > TransformScale< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:13< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_transform_scale_html_a9dd329422e5b8da43486cdce17132e16" > < div  class = "ttname" > < a  href = "struct_transform_scale.html#a9dd329422e5b8da43486cdce17132e16" > TransformScale::apply< / a > < / div > < div  class = "ttdeci" > METAL_FUNC T apply(T x) const< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:17< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_transform_scale_html_aa56b8e107acf16fdf77006625c2b8bc6" > < div  class = "ttname" > < a  href = "struct_transform_scale.html#aa56b8e107acf16fdf77006625c2b8bc6" > TransformScale::scale< / a > < / div > < div  class = "ttdeci" > T scale< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:14< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_transform_scale_html_ae109cf7c963ba13df96977e7563f7b70" > < div  class = "ttname" > < a  href = "struct_transform_scale.html#ae109cf7c963ba13df96977e7563f7b70" > TransformScale::TransformScale< / a > < / div > < div  class = "ttdeci" > METAL_FUNC TransformScale(T scale_)< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_attention.h:15< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astructmlx_1_1steel_1_1_attn_params_html" > < div  class = "ttname" > < a  href = "structmlx_1_1steel_1_1_attn_params.html" > mlx::steel::AttnParams< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  params.h:12< / div > < / div > 
							 
						 
					
						
							
								
									
										
										
										
											2025-02-06 20:16:29 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astructmlx_1_1steel_1_1_base_m_m_a_frag_html" > < div  class = "ttname" > < a  href = "structmlx_1_1steel_1_1_base_m_m_a_frag.html" > mlx::steel::BaseMMAFrag< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  mma.h:37< / div > < / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astructmlx_1_1steel_1_1_block_loader_t_html" > < div  class = "ttname" > < a  href = "structmlx_1_1steel_1_1_block_loader_t.html" > mlx::steel::BlockLoaderT< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  loader.h:153< / div > < / div > 
							 
						 
					
						
							
								
									
										
										
										
											2025-02-06 20:16:29 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astructmlx_1_1steel_1_1_m_m_a_tile_html" > < div  class = "ttname" > < a  href = "structmlx_1_1steel_1_1_m_m_a_tile.html" > mlx::steel::MMATile< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  mma.h:223< / div > < / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astructmlx_1_1steel_1_1_m_m_a_tile_html_a1a6b1446e8c8da46885bbaa8e8fdc7e4" > < div  class = "ttname" > < a  href = "structmlx_1_1steel_1_1_m_m_a_tile.html#a1a6b1446e8c8da46885bbaa8e8fdc7e4" > mlx::steel::MMATile::frag_at< / a > < / div > < div  class = "ttdeci" > METAL_FUNC constexpr thread frag_type &  frag_at(const short i, const short j)< / div > < div  class = "ttdef" > < b > Definition< / b >  mma.h:256< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astructmlx_1_1steel_1_1_m_m_a_tile_html_aa97a98e423827a889c13a92217626ec7" > < div  class = "ttname" > < a  href = "structmlx_1_1steel_1_1_m_m_a_tile.html#aa97a98e423827a889c13a92217626ec7" > mlx::steel::MMATile::clear< / a > < / div > < div  class = "ttdeci" > METAL_FUNC constexpr void clear()< / div > < div  class = "ttdef" > < b > Definition< / b >  mma.h:249< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > <!--  fragment  --> < / div > <!--  contents  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > <!--  doc - content  --> 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-09 21:56:20 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								<!--  start footer part  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "nav-path"  class = "navpath" > <!--  id is needed for treeview function!  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  < ul > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    < li  class = "navelem" > < a  class = "el"  href = "dir_938ab0ecf10b8b860ff766c820f665fd.html" > mlx< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "dir_1d446c9bd3c99228254c9484e0bc5c06.html" > backend< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "dir_d0c977ea65824390717cdb7efc36c157.html" > metal< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "dir_70a37effa88bcbd6b791977fa1e64356.html" > kernels< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "dir_76215a6c54e2b67053e723fc2395583c.html" > steel< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "dir_e1756c7634b0c14aead026895ad71c6d.html" > attn< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "dir_5aea41cce495e77a0857a0aecf063e33.html" > kernels< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "steel__attention_8h.html" > steel_attention.h< / a > < / li > 
							 
						 
					
						
							
								
									
										
										
										
											2025-02-06 20:16:29 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								    < li  class = "footer" > Generated by < a  href = "https://www.doxygen.org/index.html" > < img  class = "footer"  src = "doxygen.svg"  width = "104"  height = "31"  alt = "doxygen" / > < / a >  1.13.2 < / li > 
							 
						 
					
						
							
								
									
										
										
										
											2025-01-09 21:56:20 +00:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								  < / ul > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-11-22 20:24:16 +00:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< / body > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / html >