2024-06-06 20:28:06 -07:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< html  xmlns = "http://www.w3.org/1999/xhtml"  lang = "en-US" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< head > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< meta  http-equiv = "Content-Type"  content = "text/xhtml;charset=UTF-8" / > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< meta  http-equiv = "X-UA-Compatible"  content = "IE=11" / > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< meta  name = "generator"  content = "Doxygen 1.10.0" / > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< meta  name = "viewport"  content = "width=device-width, initial-scale=1" / > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< title > MLX: mlx/backend/metal/kernels/steel/gemm/kernels/steel_gemm_masked.h Source File< / title > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< link  href = "tabs.css"  rel = "stylesheet"  type = "text/css" / > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "jquery.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "dynsections.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "clipboard.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "cookie.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< link  href = "search/search.css"  rel = "stylesheet"  type = "text/css" / > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "search/searchdata.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "search/search.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< link  href = "doxygen.css"  rel = "stylesheet"  type = "text/css"  / > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / head > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< body > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "top" > <!--  do not remove this div, it is closed by doxygen!  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "titlearea" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< table  cellspacing = "0"  cellpadding = "0" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								 < tbody > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								 < tr  id = "projectrow" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  < td  id = "projectalign" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								   < div  id = "projectname" > MLX
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								   < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  < / td > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								 < / tr > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								 < / tbody > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / table > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<!--  end header part  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<!--  Generated by Doxygen 1.10.0  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								var searchBox = new SearchBox("searchBox", "search/",'.html');
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license-end */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "menudata.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript"  src = "menu.js" > < / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								$(function() {
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  initMenu('',true,false,'search.php','Search');
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  $(function() { init_search(); });
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								});
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license-end */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "main-nav" > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< script  type = "text/javascript" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								$(function() { codefold.init(0); });
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/* @license-end */
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / script > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<!--  window showing the filter options  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "MSearchSelectWindow" 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								     onmouseover="return searchBox.OnSearchSelectShow()"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								     onmouseout="return searchBox.OnSearchSelectHide()"
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								     onkeydown="return searchBox.OnSearchSelectKey(event)">
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<!--  iframe showing the search results (closed by default)  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "MSearchResultsWindow" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "MSearchResults" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "SRPage" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "SRIndex" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "SRResults" > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "SRStatus"  id = "Loading" > Loading...< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "SRStatus"  id = "Searching" > Searching...< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "SRStatus"  id = "NoMatches" > No Matches< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  id = "nav-path"  class = "navpath" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  < ul > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< li  class = "navelem" > < a  class = "el"  href = "dir_938ab0ecf10b8b860ff766c820f665fd.html" > mlx< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "dir_1d446c9bd3c99228254c9484e0bc5c06.html" > backend< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "dir_d0c977ea65824390717cdb7efc36c157.html" > metal< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "dir_70a37effa88bcbd6b791977fa1e64356.html" > kernels< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "dir_76215a6c54e2b67053e723fc2395583c.html" > steel< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "dir_6768c99e6145fb9510ccdb40db8ede25.html" > gemm< / a > < / li > < li  class = "navelem" > < a  class = "el"  href = "dir_9c555e3d0f5b8c3fb3a7397c81fd5bf9.html" > kernels< / a > < / li >   < / ul > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > <!--  top  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "header" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  < div  class = "headertitle" > < div  class = "title" > steel_gemm_masked.h< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > <!-- header --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "contents" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< a  href = "steel__gemm__masked_8h.html" > Go to the documentation of this file.< / a > < div  class = "fragment" > < div  class = "line" > < a  id = "l00001"  name = "l00001" > < / a > < span  class = "lineno" >     1< / span > < span  class = "comment" > // Copyright © 2024 Apple Inc.< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00002"  name = "l00002" > < / a > < span  class = "lineno" >     2< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00003"  name = "l00003" > < / a > < span  class = "lineno" >     3< / span > < span  class = "preprocessor" > #include " < a  class = "code"  href = "steel_2defines_8h.html" > mlx/backend/metal/kernels/steel/defines.h< / a > " < / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00004"  name = "l00004" > < / a > < span  class = "lineno" >     4< / span > < span  class = "keyword" > using namespace < / span > < a  class = "code hl_namespace"  href = "namespacemetal.html" > metal< / a > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00005"  name = "l00005" > < / a > < span  class = "lineno" >     5< / span > < span  class = "keyword" > using namespace < / span > < a  class = "code hl_namespace"  href = "namespacemlx_1_1steel.html" > mlx::steel< / a > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00006"  name = "l00006" > < / a > < span  class = "lineno" >     6< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00008"  name = "l00008" > < / a > < span  class = "lineno" >     8< / span > < span  class = "comment" > // GEMM kernels< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00010"  name = "l00010" > < / a > < span  class = "lineno" >    10< / span >  < / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-08-10 09:24:35 -07:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00011"  name = "l00011" > < / a > < span  class = "lineno" >    11< / span > < span  class = "keyword" > struct < / span > < a  class = "code hl_struct"  href = "struct___no_mask.html" > _NoMask< / a >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00012"  name = "l00012" > < / a > < span  class = "lineno" >    12< / span >   < span  class = "keywordtype" > char< / span >  < a  class = "code hl_variable"  href = "struct___no_mask.html#a0c4a4557d5c97ceafe3a2c4e521cdf7e" > x< / a > ;< / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-06-06 20:28:06 -07:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00013"  name = "l00013" > < / a > < span  class = "lineno" >    13< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00014"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00014"  name = "l00014" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct___no_mask.html#ad3723c1e70e46beefd283ce6317416cb" >    14< / a > < / span >   < span  class = "keyword" > constexpr< / span >  METAL_FUNC < span  class = "keyword" > operator< / span >  bool() {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00015"  name = "l00015" > < / a > < span  class = "lineno" >    15< / span >     < span  class = "keywordflow" > return< / span >  < span  class = "keyword" > true< / span > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00016"  name = "l00016" > < / a > < span  class = "lineno" >    16< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00017"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00017"  name = "l00017" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct___no_mask.html#aafbf8a3201e1cc1abf74dd1f1b7272cd" >    17< / a > < / span >   < span  class = "keyword" > constexpr< / span >  METAL_FUNC < span  class = "keyword" > operator< / span >  bool() const threadgroup {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00018"  name = "l00018" > < / a > < span  class = "lineno" >    18< / span >     < span  class = "keywordflow" > return< / span >  < span  class = "keyword" > true< / span > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00019"  name = "l00019" > < / a > < span  class = "lineno" >    19< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00020"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00020"  name = "l00020" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct___no_mask.html#a73e9612a619885cbc97cbd8f40df71e7" >    20< / a > < / span >   < span  class = "keyword" > constexpr< / span >  METAL_FUNC < span  class = "keyword" > operator< / span >  bool() const device {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00021"  name = "l00021" > < / a > < span  class = "lineno" >    21< / span >     < span  class = "keywordflow" > return< / span >  < span  class = "keyword" > true< / span > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00022"  name = "l00022" > < / a > < span  class = "lineno" >    22< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00023"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00023"  name = "l00023" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct___no_mask.html#a4bf336d472bc677028250f76b9cdc08c" >    23< / a > < / span >   < span  class = "keyword" > constexpr< / span >  METAL_FUNC < span  class = "keyword" > operator< / span >  bool() const constant {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00024"  name = "l00024" > < / a > < span  class = "lineno" >    24< / span >     < span  class = "keywordflow" > return< / span >  < span  class = "keyword" > true< / span > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00025"  name = "l00025" > < / a > < span  class = "lineno" >    25< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00026"  name = "l00026" > < / a > < span  class = "lineno" >    26< / span > };< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00027"  name = "l00027" > < / a > < span  class = "lineno" >    27< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00028"  name = "l00028" > < / a > < span  class = "lineno" >    28< / span > < span  class = "keyword" > template< / span >  < < span  class = "keyword" > typename< / span >  OutT, < span  class = "keyword" > typename< / span >  InT = OutT> < / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-08-10 09:24:35 -07:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00029"  name = "l00029" > < / a > < span  class = "lineno" >    29< / span > < span  class = "keyword" > struct < / span > < a  class = "code hl_struct"  href = "struct_scale_op.html" > ScaleOp< / a >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00030"  name = "l00030" > < / a > < span  class = "lineno" >    30< / span >   OutT < a  class = "code hl_variable"  href = "struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b" > scale< / a > ;< / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-06-06 20:28:06 -07:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00031"  name = "l00031" > < / a > < span  class = "lineno" >    31< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00032"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00032"  name = "l00032" > < / a > < span  class = "lineno" > < a  class = "line"  href = "struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218" >    32< / a > < / span >   METAL_FUNC OutT < a  class = "code hl_function"  href = "struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218" > apply< / a > (InT x)< span  class = "keyword" >  const < / span > {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00033"  name = "l00033" > < / a > < span  class = "lineno" >    33< / span >     < span  class = "keywordflow" > return< / span >  < span  class = "keyword" > static_cast< < / span > OutT< span  class = "keyword" > > < / span > (x) * < a  class = "code hl_variable"  href = "struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b" > scale< / a > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00034"  name = "l00034" > < / a > < span  class = "lineno" >    34< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00035"  name = "l00035" > < / a > < span  class = "lineno" >    35< / span > };< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00036"  name = "l00036" > < / a > < span  class = "lineno" >    36< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00037"  name = "l00037" > < / a > < span  class = "lineno" > < a  class = "line"  href = "steel__gemm__masked_8h.html#a1480c8cdff1cae1462a5a71632969bca" >    37< / a > < / span > < span  class = "keyword" > typedef< / span >  < span  class = "keyword" > struct < / span > < a  class = "code hl_struct"  href = "struct___no_mask.html" > _NoMask< / a >  < a  class = "code hl_struct"  href = "struct___no_mask.html" > nomask_t< / a > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00038"  name = "l00038" > < / a > < span  class = "lineno" >    38< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00039"  name = "l00039" > < / a > < span  class = "lineno" >    39< / span > < span  class = "keyword" > template< / span >  < < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00040"  name = "l00040" > < / a > < span  class = "lineno" >    40< / span >     < span  class = "keyword" > typename< / span >  T,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00041"  name = "l00041" > < / a > < span  class = "lineno" >    41< / span >     < span  class = "keyword" > typename< / span >  out_mask_t,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00042"  name = "l00042" > < / a > < span  class = "lineno" >    42< / span >     < span  class = "keyword" > typename< / span >  op_mask_t,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00043"  name = "l00043" > < / a > < span  class = "lineno" >    43< / span >     < span  class = "keywordtype" > int< / span >  BM,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00044"  name = "l00044" > < / a > < span  class = "lineno" >    44< / span >     < span  class = "keywordtype" > int< / span >  BN,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00045"  name = "l00045" > < / a > < span  class = "lineno" >    45< / span >     < span  class = "keywordtype" > int< / span >  BK,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00046"  name = "l00046" > < / a > < span  class = "lineno" >    46< / span >     < span  class = "keywordtype" > int< / span >  WM,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00047"  name = "l00047" > < / a > < span  class = "lineno" >    47< / span >     < span  class = "keywordtype" > int< / span >  WN,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00048"  name = "l00048" > < / a > < span  class = "lineno" >    48< / span >     < span  class = "keywordtype" > bool< / span >  transpose_a,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00049"  name = "l00049" > < / a > < span  class = "lineno" >    49< / span >     < span  class = "keywordtype" > bool< / span >  transpose_b,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00050"  name = "l00050" > < / a > < span  class = "lineno" >    50< / span >     < span  class = "keywordtype" > bool< / span >  MN_aligned,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00051"  name = "l00051" > < / a > < span  class = "lineno" >    51< / span >     < span  class = "keywordtype" > bool< / span >  K_aligned> < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00052"  name = "l00052" > < / a > < span  class = "lineno" >    52< / span > [[kernel, max_total_threads_per_threadgroup(WM* WN * 32)]] < span  class = "keywordtype" > void< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00053"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00053"  name = "l00053" > < / a > < span  class = "lineno" > < a  class = "line"  href = "steel__gemm__masked_8h.html#af805e998b2046ee30c2b4be813e3af97" >    53< / a > < / span > < a  class = "code hl_function"  href = "steel__gemm__masked_8h.html#af805e998b2046ee30c2b4be813e3af97" > block_masked_gemm< / a > (< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00054"  name = "l00054" > < / a > < span  class = "lineno" >    54< / span >     < span  class = "keyword" > const< / span >  device T* A [[buffer(0)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00055"  name = "l00055" > < / a > < span  class = "lineno" >    55< / span >     < span  class = "keyword" > const< / span >  device T* B [[buffer(1)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00056"  name = "l00056" > < / a > < span  class = "lineno" >    56< / span >     device T* D [[buffer(3)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00057"  name = "l00057" > < / a > < span  class = "lineno" >    57< / span >     < span  class = "keyword" > const< / span >  constant < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_g_e_m_m_params.html" > GEMMParams< / a > * params [[buffer(4)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00058"  name = "l00058" > < / a > < span  class = "lineno" >    58< / span >     < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > int< / span > * batch_shape [[buffer(6)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00059"  name = "l00059" > < / a > < span  class = "lineno" >    59< / span >     < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > size_t< / span > * batch_strides [[buffer(7)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00060"  name = "l00060" > < / a > < span  class = "lineno" >    60< / span >     < span  class = "keyword" > const< / span >  device out_mask_t* out_mask [[buffer(10)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00061"  name = "l00061" > < / a > < span  class = "lineno" >    61< / span >     < span  class = "keyword" > const< / span >  device op_mask_t* lhs_mask [[buffer(11)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00062"  name = "l00062" > < / a > < span  class = "lineno" >    62< / span >     < span  class = "keyword" > const< / span >  device op_mask_t* rhs_mask [[buffer(12)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00063"  name = "l00063" > < / a > < span  class = "lineno" >    63< / span >     < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > int< / span > * mask_strides [[buffer(13)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00064"  name = "l00064" > < / a > < span  class = "lineno" >    64< / span >     uint simd_lane_id [[thread_index_in_simdgroup]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00065"  name = "l00065" > < / a > < span  class = "lineno" >    65< / span >     uint simd_group_id [[simdgroup_index_in_threadgroup]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00066"  name = "l00066" > < / a > < span  class = "lineno" >    66< / span >     uint3 tid [[threadgroup_position_in_grid]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00067"  name = "l00067" > < / a > < span  class = "lineno" >    67< / span >     uint3 lid [[thread_position_in_threadgroup]]) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00068"  name = "l00068" > < / a > < span  class = "lineno" >    68< / span >   < span  class = "comment" > // Appease the compiler< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00069"  name = "l00069" > < / a > < span  class = "lineno" >    69< / span >   (void)lid;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00070"  name = "l00070" > < / a > < span  class = "lineno" >    70< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00071"  name = "l00071" > < / a > < span  class = "lineno" >    71< / span >   < span  class = "keyword" > static_assert< / span > (< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00072"  name = "l00072" > < / a > < span  class = "lineno" >    72< / span >       BM == BN,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00073"  name = "l00073" > < / a > < span  class = "lineno" >    73< / span >       < span  class = "stringliteral" > " block_masked_gemm must have the same block M and block N size" < / span > );< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00074"  name = "l00074" > < / a > < span  class = "lineno" >    74< / span >   < span  class = "keyword" > static_assert< / span > (BM % BK == 0, < span  class = "stringliteral" > " block_masked_gemm must have BM % BK == 0" < / span > );< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00075"  name = "l00075" > < / a > < span  class = "lineno" >    75< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00076"  name = "l00076" > < / a > < span  class = "lineno" >    76< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > bool< / span >  has_operand_mask = !metal::is_same_v< op_mask_t, nomask_t> ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00077"  name = "l00077" > < / a > < span  class = "lineno" >    77< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > bool< / span >  has_output_mask = !metal::is_same_v< out_mask_t, nomask_t> ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00078"  name = "l00078" > < / a > < span  class = "lineno" >    78< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00079"  name = "l00079" > < / a > < span  class = "lineno" >    79< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > bool< / span >  has_mul_operand_mask =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00080"  name = "l00080" > < / a > < span  class = "lineno" >    80< / span >       has_operand_mask & &  !metal::is_same_v< op_mask_t, bool> ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00081"  name = "l00081" > < / a > < span  class = "lineno" >    81< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > bool< / span >  has_mul_output_mask =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00082"  name = "l00082" > < / a > < span  class = "lineno" >    82< / span >       has_output_mask & &  !metal::is_same_v< out_mask_t, bool> ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00083"  name = "l00083" > < / a > < span  class = "lineno" >    83< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00084"  name = "l00084" > < / a > < span  class = "lineno" >    84< / span >   < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  k_mask_factor = short(BM / BK);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00085"  name = "l00085" > < / a > < span  class = "lineno" >    85< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00086"  name = "l00086" > < / a > < span  class = "lineno" >    86< / span >   < span  class = "keyword" > using < / span > gemm_kernel = < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_g_e_m_m_kernel.html" > GEMMKernel< / a > < < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00087"  name = "l00087" > < / a > < span  class = "lineno" >    87< / span >       T,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00088"  name = "l00088" > < / a > < span  class = "lineno" >    88< / span >       T,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00089"  name = "l00089" > < / a > < span  class = "lineno" >    89< / span >       BM,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00090"  name = "l00090" > < / a > < span  class = "lineno" >    90< / span >       BN,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00091"  name = "l00091" > < / a > < span  class = "lineno" >    91< / span >       BK,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00092"  name = "l00092" > < / a > < span  class = "lineno" >    92< / span >       WM,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00093"  name = "l00093" > < / a > < span  class = "lineno" >    93< / span >       WN,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00094"  name = "l00094" > < / a > < span  class = "lineno" >    94< / span >       transpose_a,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00095"  name = "l00095" > < / a > < span  class = "lineno" >    95< / span >       transpose_b,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00096"  name = "l00096" > < / a > < span  class = "lineno" >    96< / span >       MN_aligned,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00097"  name = "l00097" > < / a > < span  class = "lineno" >    97< / span >       K_aligned> ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00098"  name = "l00098" > < / a > < span  class = "lineno" >    98< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00099"  name = "l00099" > < / a > < span  class = "lineno" >    99< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  tid_y = ((tid.y) < <  params-> swizzle_log) +< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00100"  name = "l00100" > < / a > < span  class = "lineno" >   100< / span >       ((tid.x) &  ((1 < <  params-> swizzle_log) - 1));< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00101"  name = "l00101" > < / a > < span  class = "lineno" >   101< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  tid_x = (tid.x) > >  params-> swizzle_log;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00102"  name = "l00102" > < / a > < span  class = "lineno" >   102< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00103"  name = "l00103" > < / a > < span  class = "lineno" >   103< / span >   < span  class = "keywordflow" > if< / span >  (params-> tiles_n < = tid_x || params-> tiles_m < = tid_y) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00104"  name = "l00104" > < / a > < span  class = "lineno" >   104< / span >     < span  class = "keywordflow" > return< / span > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00105"  name = "l00105" > < / a > < span  class = "lineno" >   105< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00106"  name = "l00106" > < / a > < span  class = "lineno" >   106< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00107"  name = "l00107" > < / a > < span  class = "lineno" >   107< / span >   < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > size_t< / span > * mask_batch_strides =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00108"  name = "l00108" > < / a > < span  class = "lineno" >   108< / span >       batch_strides + 2 * params-> batch_ndim;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00109"  name = "l00109" > < / a > < span  class = "lineno" >   109< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00110"  name = "l00110" > < / a > < span  class = "lineno" >   110< / span >   < span  class = "keywordflow" > if< / span >  (params-> batch_ndim >  1) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00111"  name = "l00111" > < / a > < span  class = "lineno" >   111< / span >     < span  class = "keywordflow" > if< / span >  (has_output_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00112"  name = "l00112" > < / a > < span  class = "lineno" >   112< / span >       out_mask += < a  class = "code hl_function"  href = "backend_2metal_2kernels_2utils_8h.html#a2e49fa7ab8f6348543455c6c45d7e2a9" > elem_to_loc< / a > (< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00113"  name = "l00113" > < / a > < span  class = "lineno" >   113< / span >           tid.z, batch_shape, mask_batch_strides, params-> batch_ndim);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00114"  name = "l00114" > < / a > < span  class = "lineno" >   114< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00115"  name = "l00115" > < / a > < span  class = "lineno" >   115< / span >       mask_batch_strides += params-> batch_ndim;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00116"  name = "l00116" > < / a > < span  class = "lineno" >   116< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00117"  name = "l00117" > < / a > < span  class = "lineno" >   117< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00118"  name = "l00118" > < / a > < span  class = "lineno" >   118< / span >     < span  class = "keywordflow" > if< / span >  (has_operand_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00119"  name = "l00119" > < / a > < span  class = "lineno" >   119< / span >       < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > size_t< / span > * mask_strides_lhs = mask_batch_strides;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00120"  name = "l00120" > < / a > < span  class = "lineno" >   120< / span >       < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > size_t< / span > * mask_strides_rhs =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00121"  name = "l00121" > < / a > < span  class = "lineno" >   121< / span >           mask_strides_lhs + params-> batch_ndim;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00122"  name = "l00122" > < / a > < span  class = "lineno" >   122< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00123"  name = "l00123" > < / a > < span  class = "lineno" >   123< / span >       ulong2 batch_offsets = < a  class = "code hl_function"  href = "backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f" > elem_to_loc_broadcast< / a > (< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00124"  name = "l00124" > < / a > < span  class = "lineno" >   124< / span >           tid.z,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00125"  name = "l00125" > < / a > < span  class = "lineno" >   125< / span >           batch_shape,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00126"  name = "l00126" > < / a > < span  class = "lineno" >   126< / span >           mask_strides_lhs,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00127"  name = "l00127" > < / a > < span  class = "lineno" >   127< / span >           mask_strides_rhs,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00128"  name = "l00128" > < / a > < span  class = "lineno" >   128< / span >           params-> batch_ndim);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00129"  name = "l00129" > < / a > < span  class = "lineno" >   129< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00130"  name = "l00130" > < / a > < span  class = "lineno" >   130< / span >       lhs_mask += batch_offsets.x;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00131"  name = "l00131" > < / a > < span  class = "lineno" >   131< / span >       rhs_mask += batch_offsets.y;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00132"  name = "l00132" > < / a > < span  class = "lineno" >   132< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00133"  name = "l00133" > < / a > < span  class = "lineno" >   133< / span >   } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00134"  name = "l00134" > < / a > < span  class = "lineno" >   134< / span >     < span  class = "keywordflow" > if< / span >  (has_output_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00135"  name = "l00135" > < / a > < span  class = "lineno" >   135< / span >       out_mask += tid.z * mask_batch_strides[0];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00136"  name = "l00136" > < / a > < span  class = "lineno" >   136< / span >       mask_batch_strides += params-> batch_ndim;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00137"  name = "l00137" > < / a > < span  class = "lineno" >   137< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00138"  name = "l00138" > < / a > < span  class = "lineno" >   138< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00139"  name = "l00139" > < / a > < span  class = "lineno" >   139< / span >     < span  class = "keywordflow" > if< / span >  (has_operand_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00140"  name = "l00140" > < / a > < span  class = "lineno" >   140< / span >       lhs_mask += tid.z * mask_batch_strides[0];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00141"  name = "l00141" > < / a > < span  class = "lineno" >   141< / span >       rhs_mask += tid.z * mask_batch_strides[params-> batch_ndim];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00142"  name = "l00142" > < / a > < span  class = "lineno" >   142< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00143"  name = "l00143" > < / a > < span  class = "lineno" >   143< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00144"  name = "l00144" > < / a > < span  class = "lineno" >   144< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00145"  name = "l00145" > < / a > < span  class = "lineno" >   145< / span >   < span  class = "comment" > // Adjust for batch< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<div class="line"><a id="l00146" name="l00146"></a><span class="lineno">  146</span>  <span class="keywordflow">if</span> (params-&gt;batch_ndim &gt; 1) {</div>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00147"  name = "l00147" > < / a > < span  class = "lineno" >   147< / span >     < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > size_t< / span > * A_bstrides = batch_strides;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00148"  name = "l00148" > < / a > < span  class = "lineno" >   148< / span >     < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > size_t< / span > * B_bstrides = batch_strides + params-> batch_ndim;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00149"  name = "l00149" > < / a > < span  class = "lineno" >   149< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00150"  name = "l00150" > < / a > < span  class = "lineno" >   150< / span >     ulong2 batch_offsets = < a  class = "code hl_function"  href = "backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f" > elem_to_loc_broadcast< / a > (< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00151"  name = "l00151" > < / a > < span  class = "lineno" >   151< / span >         tid.z, batch_shape, A_bstrides, B_bstrides, params-> batch_ndim);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00152"  name = "l00152" > < / a > < span  class = "lineno" >   152< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00153"  name = "l00153" > < / a > < span  class = "lineno" >   153< / span >     A += batch_offsets.x;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00154"  name = "l00154" > < / a > < span  class = "lineno" >   154< / span >     B += batch_offsets.y;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00155"  name = "l00155" > < / a > < span  class = "lineno" >   155< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00156"  name = "l00156" > < / a > < span  class = "lineno" >   156< / span >   } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00157"  name = "l00157" > < / a > < span  class = "lineno" >   157< / span >     A += params-> batch_stride_a * tid.z;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00158"  name = "l00158" > < / a > < span  class = "lineno" >   158< / span >     B += params-> batch_stride_b * tid.z;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00159"  name = "l00159" > < / a > < span  class = "lineno" >   159< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00160"  name = "l00160" > < / a > < span  class = "lineno" >   160< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00161"  name = "l00161" > < / a > < span  class = "lineno" >   161< / span >   D += params-> batch_stride_d * tid.z;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00162"  name = "l00162" > < / a > < span  class = "lineno" >   162< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00163"  name = "l00163" > < / a > < span  class = "lineno" >   163< / span >   < span  class = "comment" > // Find block in A, B, C< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00164"  name = "l00164" > < / a > < span  class = "lineno" >   164< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  c_row = tid_y * BM;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00165"  name = "l00165" > < / a > < span  class = "lineno" >   165< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  c_col = tid_x * BN;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00166"  name = "l00166" > < / a > < span  class = "lineno" >   166< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > size_t< / span >  c_row_long = size_t(c_row);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00167"  name = "l00167" > < / a > < span  class = "lineno" >   167< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > size_t< / span >  c_col_long = size_t(c_col);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00168"  name = "l00168" > < / a > < span  class = "lineno" >   168< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00169"  name = "l00169" > < / a > < span  class = "lineno" >   169< / span >   A += transpose_a ? c_row_long : c_row_long * params-> lda;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00170"  name = "l00170" > < / a > < span  class = "lineno" >   170< / span >   B += transpose_b ? c_col_long * params-> ldb : c_col_long;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00171"  name = "l00171" > < / a > < span  class = "lineno" >   171< / span >   D += c_row_long * params-> ldd + c_col_long;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00172"  name = "l00172" > < / a > < span  class = "lineno" >   172< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00173"  name = "l00173" > < / a > < span  class = "lineno" >   173< / span >   < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > int< / span > * out_mask_strides = mask_strides;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00174"  name = "l00174" > < / a > < span  class = "lineno" >   174< / span >   < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > int< / span > * lhs_mask_strides =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00175"  name = "l00175" > < / a > < span  class = "lineno" >   175< / span >       mask_strides + (has_output_mask ? 2 : 0);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00176"  name = "l00176" > < / a > < span  class = "lineno" >   176< / span >   < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > int< / span > * rhs_mask_strides =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00177"  name = "l00177" > < / a > < span  class = "lineno" >   177< / span >       lhs_mask_strides + (has_operand_mask ? 2 : 0);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00178"  name = "l00178" > < / a > < span  class = "lineno" >   178< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00179"  name = "l00179" > < / a > < span  class = "lineno" >   179< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  out_mask_offset = !has_output_mask< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00180"  name = "l00180" > < / a > < span  class = "lineno" >   180< / span >       ? 0< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00181"  name = "l00181" > < / a > < span  class = "lineno" >   181< / span >       : tid_y * out_mask_strides[1] + tid_x * out_mask_strides[0];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00182"  name = "l00182" > < / a > < span  class = "lineno" >   182< / span >   < span  class = "keywordtype" > int< / span >  lhs_mask_offset = !has_operand_mask ? 0 : tid_y * lhs_mask_strides[1];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00183"  name = "l00183" > < / a > < span  class = "lineno" >   183< / span >   < span  class = "keywordtype" > int< / span >  rhs_mask_offset = !has_operand_mask ? 0 : tid_x * rhs_mask_strides[0];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00184"  name = "l00184" > < / a > < span  class = "lineno" >   184< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  lhs_mask_step = !has_operand_mask ? 0 : lhs_mask_strides[0];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00185"  name = "l00185" > < / a > < span  class = "lineno" >   185< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  rhs_mask_step = !has_operand_mask ? 0 : rhs_mask_strides[1];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00186"  name = "l00186" > < / a > < span  class = "lineno" >   186< / span >   < span  class = "keywordtype" > short< / span >  k_factor_cnt = k_mask_factor;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00187"  name = "l00187" > < / a > < span  class = "lineno" >   187< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<div class="line"><a id="l00188" name="l00188"></a><span class="lineno">  188</span>  <a class="code hl_struct" href="struct_scale_op.html">ScaleOp&lt;float&gt;</a> out_mask_op;</div>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<div class="line"><a id="l00189" name="l00189"></a><span class="lineno">  189</span>  <a class="code hl_struct" href="struct_scale_op.html">ScaleOp&lt;T&gt;</a> lhs_mask_op;</div>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<div class="line"><a id="l00190" name="l00190"></a><span class="lineno">  190</span>  <a class="code hl_struct" href="struct_scale_op.html">ScaleOp&lt;T&gt;</a> rhs_mask_op;</div>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00191"  name = "l00191" > < / a > < span  class = "lineno" >   191< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00192"  name = "l00192" > < / a > < span  class = "lineno" >   192< / span >   < span  class = "keywordflow" > if< / span >  (has_output_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00193"  name = "l00193" > < / a > < span  class = "lineno" >   193< / span >     < span  class = "keyword" > auto< / span >  mask_out = out_mask[out_mask_offset];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00194"  name = "l00194" > < / a > < span  class = "lineno" >   194< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00195"  name = "l00195" > < / a > < span  class = "lineno" >   195< / span >     < span  class = "keywordflow" > if< / span >  (has_mul_output_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00196"  name = "l00196" > < / a > < span  class = "lineno" >   196< / span >       out_mask_op.< a  class = "code hl_variable"  href = "struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b" > scale< / a >  = float(mask_out);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00197"  name = "l00197" > < / a > < span  class = "lineno" >   197< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00198"  name = "l00198" > < / a > < span  class = "lineno" >   198< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00199"  name = "l00199" > < / a > < span  class = "lineno" >   199< / span >     < span  class = "comment" > // Write zeros and return< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00200"  name = "l00200" > < / a > < span  class = "lineno" >   200< / span >     < span  class = "keywordflow" > if< / span >  (!mask_out) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00201"  name = "l00201" > < / a > < span  class = "lineno" >   201< / span >       < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  tgp_size = WM * WN * 32;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00202"  name = "l00202" > < / a > < span  class = "lineno" >   202< / span >       < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  vec_size = 4;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00203"  name = "l00203" > < / a > < span  class = "lineno" >   203< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00204"  name = "l00204" > < / a > < span  class = "lineno" >   204< / span >       < span  class = "comment" > // Tile threads in threadgroup< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00205"  name = "l00205" > < / a > < span  class = "lineno" >   205< / span >       < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  TN = BN / vec_size;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00206"  name = "l00206" > < / a > < span  class = "lineno" >   206< / span >       < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  TM = tgp_size / TN;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00207"  name = "l00207" > < / a > < span  class = "lineno" >   207< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00208"  name = "l00208" > < / a > < span  class = "lineno" >   208< / span >       < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  thread_idx = simd_group_id * 32 + simd_lane_id;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00209"  name = "l00209" > < / a > < span  class = "lineno" >   209< / span >       < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  bi = thread_idx / TN;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00210"  name = "l00210" > < / a > < span  class = "lineno" >   210< / span >       < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  bj = vec_size * (thread_idx % TN);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00211"  name = "l00211" > < / a > < span  class = "lineno" >   211< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00212"  name = "l00212" > < / a > < span  class = "lineno" >   212< / span >       D += bi * params-> ldd + bj;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00213"  name = "l00213" > < / a > < span  class = "lineno" >   213< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<div class="line"><a id="l00214" name="l00214"></a><span class="lineno">  214</span>      <span class="keywordtype">short</span> tgp_bm = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(BM, params-&gt;M - c_row);</div>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<div class="line"><a id="l00215" name="l00215"></a><span class="lineno">  215</span>      <span class="keywordtype">short</span> tgp_bn = <a class="code hl_function" href="namespacemetal.html#a6653b28c9473087141eddce39878d4d3">min</a>(BN, params-&gt;N - c_col);</div>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00216"  name = "l00216" > < / a > < span  class = "lineno" >   216< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<div class="line"><a id="l00217" name="l00217"></a><span class="lineno">  217</span>      <span class="keywordflow">if</span> (MN_aligned || (tgp_bm == BM &amp;&amp; tgp_bn == BN)) {</div>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<div class="line"><a id="l00218" name="l00218"></a><span class="lineno">  218</span>        <span class="keywordflow">for</span> (<span class="keywordtype">short</span> ti = 0; ti &lt; BM; ti += TM) {</div>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00219"  name = "l00219" > < / a > < span  class = "lineno" >   219< / span >           < a  class = "code hl_define"  href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<div class="line"><a id="l00220" name="l00220"></a><span class="lineno">  220</span>          <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; vec_size; j++) {</div>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00221"  name = "l00221" > < / a > < span  class = "lineno" >   221< / span >             D[ti * params-> ldd + j] = T(0.);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00222"  name = "l00222" > < / a > < span  class = "lineno" >   222< / span >           }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00223"  name = "l00223" > < / a > < span  class = "lineno" >   223< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00224"  name = "l00224" > < / a > < span  class = "lineno" >   224< / span >       } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00225"  name = "l00225" > < / a > < span  class = "lineno" >   225< / span >         < span  class = "keywordtype" > short< / span >  jmax = tgp_bn - bj;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<div class="line"><a id="l00226" name="l00226"></a><span class="lineno">  226</span>        jmax = jmax &lt; vec_size ? jmax : vec_size;</div>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<div class="line"><a id="l00227" name="l00227"></a><span class="lineno">  227</span>        <span class="keywordflow">for</span> (<span class="keywordtype">short</span> ti = 0; (bi + ti) &lt; tgp_bm; ti += TM) {</div>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<div class="line"><a id="l00228" name="l00228"></a><span class="lineno">  228</span>          <span class="keywordflow">for</span> (<span class="keywordtype">short</span> j = 0; j &lt; jmax; j++) {</div>
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00229"  name = "l00229" > < / a > < span  class = "lineno" >   229< / span >             D[ti * params-> ldd + j] = T(0.);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00230"  name = "l00230" > < / a > < span  class = "lineno" >   230< / span >           }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00231"  name = "l00231" > < / a > < span  class = "lineno" >   231< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00232"  name = "l00232" > < / a > < span  class = "lineno" >   232< / span >       }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00233"  name = "l00233" > < / a > < span  class = "lineno" >   233< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00234"  name = "l00234" > < / a > < span  class = "lineno" >   234< / span >       < span  class = "keywordflow" > return< / span > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00235"  name = "l00235" > < / a > < span  class = "lineno" >   235< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00236"  name = "l00236" > < / a > < span  class = "lineno" >   236< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00237"  name = "l00237" > < / a > < span  class = "lineno" >   237< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00238"  name = "l00238" > < / a > < span  class = "lineno" >   238< / span >   threadgroup_barrier(mem_flags::mem_none);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00239"  name = "l00239" > < / a > < span  class = "lineno" >   239< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00240"  name = "l00240" > < / a > < span  class = "lineno" >   240< / span >   < span  class = "comment" > // Prepare threadgroup mma operation< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00241"  name = "l00241" > < / a > < span  class = "lineno" >   241< / span >   thread < span  class = "keyword" > typename< / span >  gemm_kernel::mma_t mma_op(simd_group_id, simd_lane_id);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00242"  name = "l00242" > < / a > < span  class = "lineno" >   242< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00243"  name = "l00243" > < / a > < span  class = "lineno" >   243< / span >   threadgroup T As[gemm_kernel::tgp_mem_size_a];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00244"  name = "l00244" > < / a > < span  class = "lineno" >   244< / span >   threadgroup T Bs[gemm_kernel::tgp_mem_size_b];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00245"  name = "l00245" > < / a > < span  class = "lineno" >   245< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00246"  name = "l00246" > < / a > < span  class = "lineno" >   246< / span >   < span  class = "comment" > // Prepare threadgroup loading operations< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00247"  name = "l00247" > < / a > < span  class = "lineno" >   247< / span >   thread < span  class = "keyword" > typename< / span >  gemm_kernel::loader_a_t loader_a(< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00248"  name = "l00248" > < / a > < span  class = "lineno" >   248< / span >       A, params-> lda, As, simd_group_id, simd_lane_id);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00249"  name = "l00249" > < / a > < span  class = "lineno" >   249< / span >   thread < span  class = "keyword" > typename< / span >  gemm_kernel::loader_b_t loader_b(< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00250"  name = "l00250" > < / a > < span  class = "lineno" >   250< / span >       B, params-> ldb, Bs, simd_group_id, simd_lane_id);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00251"  name = "l00251" > < / a > < span  class = "lineno" >   251< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00252"  name = "l00252" > < / a > < span  class = "lineno" >   252< / span >   < span  class = "comment" > // Prepare threadgroup bounds< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00253"  name = "l00253" > < / a > < span  class = "lineno" >   253< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  tgp_bm =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00254"  name = "l00254" > < / a > < span  class = "lineno" >   254< / span >       MN_aligned ? short(BM) : short(< a  class = "code hl_function"  href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > min< / a > (BM, params-> M - c_row));< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00255"  name = "l00255" > < / a > < span  class = "lineno" >   255< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  tgp_bn =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00256"  name = "l00256" > < / a > < span  class = "lineno" >   256< / span >       MN_aligned ? short(BN) : short(< a  class = "code hl_function"  href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > min< / a > (BN, params-> N - c_col));< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00257"  name = "l00257" > < / a > < span  class = "lineno" >   257< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00258"  name = "l00258" > < / a > < span  class = "lineno" >   258< / span >   < span  class = "keywordtype" > int< / span >  gemm_k_iterations = params-> gemm_k_iterations_aligned;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00259"  name = "l00259" > < / a > < span  class = "lineno" >   259< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00261"  name = "l00261" > < / a > < span  class = "lineno" >   261< / span >   < span  class = "comment" > // Do unaligned K iterations first< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00262"  name = "l00262" > < / a > < span  class = "lineno" >   262< / span >   < span  class = "keywordflow" > if< / span >  (!K_aligned) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00263"  name = "l00263" > < / a > < span  class = "lineno" >   263< / span >     < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  k_last = params-> gemm_k_iterations_aligned * BK;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00264"  name = "l00264" > < / a > < span  class = "lineno" >   264< / span >     < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  mask_idx_last = k_last / BM;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00265"  name = "l00265" > < / a > < span  class = "lineno" >   265< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00266"  name = "l00266" > < / a > < span  class = "lineno" >   266< / span >     < span  class = "keywordflow" > if< / span >  (!has_operand_mask ||< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00267"  name = "l00267" > < / a > < span  class = "lineno" >   267< / span >         (< span  class = "keywordtype" > bool< / span > (lhs_mask[lhs_mask_offset + mask_idx_last * lhs_mask_step]) & & < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00268"  name = "l00268" > < / a > < span  class = "lineno" >   268< / span >          < span  class = "keywordtype" > bool< / span > (rhs_mask[rhs_mask_offset + mask_idx_last * rhs_mask_step]))) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00269"  name = "l00269" > < / a > < span  class = "lineno" >   269< / span >       < span  class = "keywordflow" > if< / span >  (has_mul_operand_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00270"  name = "l00270" > < / a > < span  class = "lineno" >   270< / span >         lhs_mask_op.< a  class = "code hl_variable"  href = "struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b" > scale< / a >  =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00271"  name = "l00271" > < / a > < span  class = "lineno" >   271< / span >             lhs_mask[lhs_mask_offset + mask_idx_last * lhs_mask_step];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00272"  name = "l00272" > < / a > < span  class = "lineno" >   272< / span >         rhs_mask_op.< a  class = "code hl_variable"  href = "struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b" > scale< / a >  =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00273"  name = "l00273" > < / a > < span  class = "lineno" >   273< / span >             rhs_mask[rhs_mask_offset + mask_idx_last * rhs_mask_step];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00274"  name = "l00274" > < / a > < span  class = "lineno" >   274< / span >       }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00275"  name = "l00275" > < / a > < span  class = "lineno" >   275< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00276"  name = "l00276" > < / a > < span  class = "lineno" >   276< / span >       < span  class = "comment" > // Move loader source ahead to end< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00277"  name = "l00277" > < / a > < span  class = "lineno" >   277< / span >       < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  k_remain = params-> K - k_last;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00278"  name = "l00278" > < / a > < span  class = "lineno" >   278< / span >       < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > size_t< / span >  k_jump_a =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00279"  name = "l00279" > < / a > < span  class = "lineno" >   279< / span >           transpose_a ? params-> lda * size_t(k_last) : size_t(k_last);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00280"  name = "l00280" > < / a > < span  class = "lineno" >   280< / span >       < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > size_t< / span >  k_jump_b =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00281"  name = "l00281" > < / a > < span  class = "lineno" >   281< / span >           transpose_b ? size_t(k_last) : params-> ldb * size_t(k_last);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00282"  name = "l00282" > < / a > < span  class = "lineno" >   282< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00283"  name = "l00283" > < / a > < span  class = "lineno" >   283< / span >       loader_a.src += k_jump_a;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00284"  name = "l00284" > < / a > < span  class = "lineno" >   284< / span >       loader_b.src += k_jump_b;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00285"  name = "l00285" > < / a > < span  class = "lineno" >   285< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00286"  name = "l00286" > < / a > < span  class = "lineno" >   286< / span >       < span  class = "comment" > // Load tile< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00287"  name = "l00287" > < / a > < span  class = "lineno" >   287< / span >       < span  class = "keyword" > const< / span >  short2 tile_dims_A =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00288"  name = "l00288" > < / a > < span  class = "lineno" >   288< / span >           transpose_a ? short2(tgp_bm, k_remain) : short2(k_remain, tgp_bm);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00289"  name = "l00289" > < / a > < span  class = "lineno" >   289< / span >       < span  class = "keyword" > const< / span >  short2 tile_dims_B =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00290"  name = "l00290" > < / a > < span  class = "lineno" >   290< / span >           transpose_b ? short2(k_remain, tgp_bn) : short2(tgp_bn, k_remain);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00291"  name = "l00291" > < / a > < span  class = "lineno" >   291< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00292"  name = "l00292" > < / a > < span  class = "lineno" >   292< / span >       loader_a.load_safe(tile_dims_A);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00293"  name = "l00293" > < / a > < span  class = "lineno" >   293< / span >       loader_b.load_safe(tile_dims_B);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00294"  name = "l00294" > < / a > < span  class = "lineno" >   294< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00295"  name = "l00295" > < / a > < span  class = "lineno" >   295< / span >       < span  class = "keywordflow" > if< / span >  (has_mul_operand_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00296"  name = "l00296" > < / a > < span  class = "lineno" >   296< / span >         loader_a.apply_inplace_op(lhs_mask_op);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00297"  name = "l00297" > < / a > < span  class = "lineno" >   297< / span >         loader_b.apply_inplace_op(rhs_mask_op);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00298"  name = "l00298" > < / a > < span  class = "lineno" >   298< / span >       }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00299"  name = "l00299" > < / a > < span  class = "lineno" >   299< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00300"  name = "l00300" > < / a > < span  class = "lineno" >   300< / span >       threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00301"  name = "l00301" > < / a > < span  class = "lineno" >   301< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00302"  name = "l00302" > < / a > < span  class = "lineno" >   302< / span >       < span  class = "comment" > // Do matmul< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00303"  name = "l00303" > < / a > < span  class = "lineno" >   303< / span >       mma_op.mma(As, Bs);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00304"  name = "l00304" > < / a > < span  class = "lineno" >   304< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00305"  name = "l00305" > < / a > < span  class = "lineno" >   305< / span >       < span  class = "comment" > // Reset source back to start< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00306"  name = "l00306" > < / a > < span  class = "lineno" >   306< / span >       loader_a.src -= k_jump_a;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00307"  name = "l00307" > < / a > < span  class = "lineno" >   307< / span >       loader_b.src -= k_jump_b;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00308"  name = "l00308" > < / a > < span  class = "lineno" >   308< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00309"  name = "l00309" > < / a > < span  class = "lineno" >   309< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00310"  name = "l00310" > < / a > < span  class = "lineno" >   310< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00312"  name = "l00312" > < / a > < span  class = "lineno" >   312< / span >   < span  class = "comment" > // MNK aligned loop< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00313"  name = "l00313" > < / a > < span  class = "lineno" >   313< / span >   < span  class = "keywordflow" > if< / span >  (MN_aligned) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00314"  name = "l00314" > < / a > < span  class = "lineno" >   314< / span >     < span  class = "keywordflow" > for< / span >  (; gemm_k_iterations >  0; gemm_k_iterations--) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00315"  name = "l00315" > < / a > < span  class = "lineno" >   315< / span >       threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00316"  name = "l00316" > < / a > < span  class = "lineno" >   316< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00317"  name = "l00317" > < / a > < span  class = "lineno" >   317< / span >       < span  class = "keywordflow" > if< / span >  (!has_operand_mask ||< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00318"  name = "l00318" > < / a > < span  class = "lineno" >   318< / span >           (< span  class = "keywordtype" > bool< / span > (lhs_mask[lhs_mask_offset]) & & < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00319"  name = "l00319" > < / a > < span  class = "lineno" >   319< / span >            < span  class = "keywordtype" > bool< / span > (rhs_mask[rhs_mask_offset]))) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00320"  name = "l00320" > < / a > < span  class = "lineno" >   320< / span >         < span  class = "keywordflow" > if< / span >  (has_mul_operand_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00321"  name = "l00321" > < / a > < span  class = "lineno" >   321< / span >           lhs_mask_op.< a  class = "code hl_variable"  href = "struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b" > scale< / a >  = lhs_mask[lhs_mask_offset];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00322"  name = "l00322" > < / a > < span  class = "lineno" >   322< / span >           rhs_mask_op.< a  class = "code hl_variable"  href = "struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b" > scale< / a >  = rhs_mask[rhs_mask_offset];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00323"  name = "l00323" > < / a > < span  class = "lineno" >   323< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00324"  name = "l00324" > < / a > < span  class = "lineno" >   324< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00325"  name = "l00325" > < / a > < span  class = "lineno" >   325< / span >         < span  class = "comment" > // Load elements into threadgroup< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00326"  name = "l00326" > < / a > < span  class = "lineno" >   326< / span >         loader_a.load_unsafe();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00327"  name = "l00327" > < / a > < span  class = "lineno" >   327< / span >         loader_b.load_unsafe();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00328"  name = "l00328" > < / a > < span  class = "lineno" >   328< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00329"  name = "l00329" > < / a > < span  class = "lineno" >   329< / span >         < span  class = "keywordflow" > if< / span >  (has_mul_operand_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00330"  name = "l00330" > < / a > < span  class = "lineno" >   330< / span >           loader_a.apply_inplace_op(lhs_mask_op);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00331"  name = "l00331" > < / a > < span  class = "lineno" >   331< / span >           loader_b.apply_inplace_op(rhs_mask_op);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00332"  name = "l00332" > < / a > < span  class = "lineno" >   332< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00333"  name = "l00333" > < / a > < span  class = "lineno" >   333< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00334"  name = "l00334" > < / a > < span  class = "lineno" >   334< / span >         threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00335"  name = "l00335" > < / a > < span  class = "lineno" >   335< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00336"  name = "l00336" > < / a > < span  class = "lineno" >   336< / span >         < span  class = "comment" > // Multiply and accumulate threadgroup elements< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00337"  name = "l00337" > < / a > < span  class = "lineno" >   337< / span >         mma_op.mma(As, Bs);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00338"  name = "l00338" > < / a > < span  class = "lineno" >   338< / span >       }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00339"  name = "l00339" > < / a > < span  class = "lineno" >   339< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00340"  name = "l00340" > < / a > < span  class = "lineno" >   340< / span >       < span  class = "comment" > // Prepare for next iteration< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00341"  name = "l00341" > < / a > < span  class = "lineno" >   341< / span >       loader_a.next();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00342"  name = "l00342" > < / a > < span  class = "lineno" >   342< / span >       loader_b.next();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00343"  name = "l00343" > < / a > < span  class = "lineno" >   343< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00344"  name = "l00344" > < / a > < span  class = "lineno" >   344< / span >       k_factor_cnt--;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00345"  name = "l00345" > < / a > < span  class = "lineno" >   345< / span >       lhs_mask_offset += k_factor_cnt == 0 ? lhs_mask_step : 0;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00346"  name = "l00346" > < / a > < span  class = "lineno" >   346< / span >       rhs_mask_offset += k_factor_cnt == 0 ? rhs_mask_step : 0;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00347"  name = "l00347" > < / a > < span  class = "lineno" >   347< / span >       k_factor_cnt = k_factor_cnt == 0 ? k_mask_factor : k_factor_cnt;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00348"  name = "l00348" > < / a > < span  class = "lineno" >   348< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00349"  name = "l00349" > < / a > < span  class = "lineno" >   349< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00350"  name = "l00350" > < / a > < span  class = "lineno" >   350< / span >     < span  class = "keywordflow" > if< / span >  (has_mul_output_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00351"  name = "l00351" > < / a > < span  class = "lineno" >   351< / span >       mma_op.apply_epilogue(out_mask_op);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00352"  name = "l00352" > < / a > < span  class = "lineno" >   352< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00353"  name = "l00353" > < / a > < span  class = "lineno" >   353< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00354"  name = "l00354" > < / a > < span  class = "lineno" >   354< / span >     < span  class = "comment" > // Store results to device memory< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00355"  name = "l00355" > < / a > < span  class = "lineno" >   355< / span >     mma_op.store_result(D, params-> ldd);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00356"  name = "l00356" > < / a > < span  class = "lineno" >   356< / span >     < span  class = "keywordflow" > return< / span > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00357"  name = "l00357" > < / a > < span  class = "lineno" >   357< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00358"  name = "l00358" > < / a > < span  class = "lineno" >   358< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00360"  name = "l00360" > < / a > < span  class = "lineno" >   360< / span >   < span  class = "comment" > // MN unaligned loop< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00361"  name = "l00361" > < / a > < span  class = "lineno" >   361< / span >   < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00362"  name = "l00362" > < / a > < span  class = "lineno" >   362< / span >     < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > bool< / span >  M_aligned = (tgp_bm == BM);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00363"  name = "l00363" > < / a > < span  class = "lineno" >   363< / span >     < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > bool< / span >  N_aligned = (tgp_bn == BN);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00364"  name = "l00364" > < / a > < span  class = "lineno" >   364< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00365"  name = "l00365" > < / a > < span  class = "lineno" >   365< / span >     < span  class = "keyword" > const< / span >  short2 tile_dims_A =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00366"  name = "l00366" > < / a > < span  class = "lineno" >   366< / span >         transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00367"  name = "l00367" > < / a > < span  class = "lineno" >   367< / span >     < span  class = "keyword" > const< / span >  short2 tile_dims_B =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00368"  name = "l00368" > < / a > < span  class = "lineno" >   368< / span >         transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00369"  name = "l00369" > < / a > < span  class = "lineno" >   369< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00370"  name = "l00370" > < / a > < span  class = "lineno" >   370< / span >     < span  class = "keywordflow" > for< / span >  (; gemm_k_iterations >  0; gemm_k_iterations--) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00371"  name = "l00371" > < / a > < span  class = "lineno" >   371< / span >       threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00372"  name = "l00372" > < / a > < span  class = "lineno" >   372< / span >       < span  class = "keywordflow" > if< / span >  (!has_operand_mask ||< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00373"  name = "l00373" > < / a > < span  class = "lineno" >   373< / span >           (< span  class = "keywordtype" > bool< / span > (lhs_mask[lhs_mask_offset]) & & < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00374"  name = "l00374" > < / a > < span  class = "lineno" >   374< / span >            < span  class = "keywordtype" > bool< / span > (rhs_mask[rhs_mask_offset]))) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00375"  name = "l00375" > < / a > < span  class = "lineno" >   375< / span >         < span  class = "keywordflow" > if< / span >  (has_mul_operand_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00376"  name = "l00376" > < / a > < span  class = "lineno" >   376< / span >           lhs_mask_op.< a  class = "code hl_variable"  href = "struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b" > scale< / a >  = lhs_mask[lhs_mask_offset];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00377"  name = "l00377" > < / a > < span  class = "lineno" >   377< / span >           rhs_mask_op.< a  class = "code hl_variable"  href = "struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b" > scale< / a >  = rhs_mask[rhs_mask_offset];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00378"  name = "l00378" > < / a > < span  class = "lineno" >   378< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00379"  name = "l00379" > < / a > < span  class = "lineno" >   379< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00380"  name = "l00380" > < / a > < span  class = "lineno" >   380< / span >         < span  class = "comment" > // Load elements into threadgroup< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00381"  name = "l00381" > < / a > < span  class = "lineno" >   381< / span >         < span  class = "keywordflow" > if< / span >  (M_aligned) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00382"  name = "l00382" > < / a > < span  class = "lineno" >   382< / span >           loader_a.load_unsafe();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00383"  name = "l00383" > < / a > < span  class = "lineno" >   383< / span >         } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00384"  name = "l00384" > < / a > < span  class = "lineno" >   384< / span >           loader_a.load_safe(tile_dims_A);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00385"  name = "l00385" > < / a > < span  class = "lineno" >   385< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00386"  name = "l00386" > < / a > < span  class = "lineno" >   386< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00387"  name = "l00387" > < / a > < span  class = "lineno" >   387< / span >         < span  class = "keywordflow" > if< / span >  (N_aligned) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00388"  name = "l00388" > < / a > < span  class = "lineno" >   388< / span >           loader_b.load_unsafe();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00389"  name = "l00389" > < / a > < span  class = "lineno" >   389< / span >         } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00390"  name = "l00390" > < / a > < span  class = "lineno" >   390< / span >           loader_b.load_safe(tile_dims_B);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00391"  name = "l00391" > < / a > < span  class = "lineno" >   391< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00392"  name = "l00392" > < / a > < span  class = "lineno" >   392< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00393"  name = "l00393" > < / a > < span  class = "lineno" >   393< / span >         < span  class = "keywordflow" > if< / span >  (has_mul_operand_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00394"  name = "l00394" > < / a > < span  class = "lineno" >   394< / span >           loader_a.apply_inplace_op(lhs_mask_op);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00395"  name = "l00395" > < / a > < span  class = "lineno" >   395< / span >           loader_b.apply_inplace_op(rhs_mask_op);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00396"  name = "l00396" > < / a > < span  class = "lineno" >   396< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00397"  name = "l00397" > < / a > < span  class = "lineno" >   397< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00398"  name = "l00398" > < / a > < span  class = "lineno" >   398< / span >         threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00399"  name = "l00399" > < / a > < span  class = "lineno" >   399< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00400"  name = "l00400" > < / a > < span  class = "lineno" >   400< / span >         < span  class = "comment" > // Multiply and accumulate threadgroup elements< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00401"  name = "l00401" > < / a > < span  class = "lineno" >   401< / span >         mma_op.mma(As, Bs);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00402"  name = "l00402" > < / a > < span  class = "lineno" >   402< / span >       }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00403"  name = "l00403" > < / a > < span  class = "lineno" >   403< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00404"  name = "l00404" > < / a > < span  class = "lineno" >   404< / span >       < span  class = "comment" > // Prepare for next iteration< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00405"  name = "l00405" > < / a > < span  class = "lineno" >   405< / span >       loader_a.next();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00406"  name = "l00406" > < / a > < span  class = "lineno" >   406< / span >       loader_b.next();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00407"  name = "l00407" > < / a > < span  class = "lineno" >   407< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00408"  name = "l00408" > < / a > < span  class = "lineno" >   408< / span >       k_factor_cnt--;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00409"  name = "l00409" > < / a > < span  class = "lineno" >   409< / span >       lhs_mask_offset += k_factor_cnt == 0 ? lhs_mask_step : 0;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00410"  name = "l00410" > < / a > < span  class = "lineno" >   410< / span >       rhs_mask_offset += k_factor_cnt == 0 ? rhs_mask_step : 0;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00411"  name = "l00411" > < / a > < span  class = "lineno" >   411< / span >       k_factor_cnt = k_factor_cnt == 0 ? k_mask_factor : k_factor_cnt;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00412"  name = "l00412" > < / a > < span  class = "lineno" >   412< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00413"  name = "l00413" > < / a > < span  class = "lineno" >   413< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00414"  name = "l00414" > < / a > < span  class = "lineno" >   414< / span >     < span  class = "keywordflow" > if< / span >  (has_mul_output_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00415"  name = "l00415" > < / a > < span  class = "lineno" >   415< / span >       mma_op.apply_epilogue(out_mask_op);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00416"  name = "l00416" > < / a > < span  class = "lineno" >   416< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00417"  name = "l00417" > < / a > < span  class = "lineno" >   417< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00418"  name = "l00418" > < / a > < span  class = "lineno" >   418< / span >     < span  class = "keywordflow" > if< / span >  (M_aligned & &  N_aligned) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00419"  name = "l00419" > < / a > < span  class = "lineno" >   419< / span >       mma_op.store_result(D, params-> ldd);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00420"  name = "l00420" > < / a > < span  class = "lineno" >   420< / span >     } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00421"  name = "l00421" > < / a > < span  class = "lineno" >   421< / span >       mma_op.store_result_safe(D, params-> ldd, short2(tgp_bn, tgp_bm));< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00422"  name = "l00422" > < / a > < span  class = "lineno" >   422< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00423"  name = "l00423" > < / a > < span  class = "lineno" >   423< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00424"  name = "l00424" > < / a > < span  class = "lineno" >   424< / span > }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00425"  name = "l00425" > < / a > < span  class = "lineno" >   425< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00426"  name = "l00426" > < / a > < span  class = "lineno" >   426< / span > < span  class = "keyword" > template< / span >  < < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00427"  name = "l00427" > < / a > < span  class = "lineno" >   427< / span >     < span  class = "keyword" > typename< / span >  T,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00428"  name = "l00428" > < / a > < span  class = "lineno" >   428< / span >     < span  class = "keywordtype" > int< / span >  BM,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00429"  name = "l00429" > < / a > < span  class = "lineno" >   429< / span >     < span  class = "keywordtype" > int< / span >  BN,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00430"  name = "l00430" > < / a > < span  class = "lineno" >   430< / span >     < span  class = "keywordtype" > int< / span >  BK,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00431"  name = "l00431" > < / a > < span  class = "lineno" >   431< / span >     < span  class = "keywordtype" > int< / span >  WM,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00432"  name = "l00432" > < / a > < span  class = "lineno" >   432< / span >     < span  class = "keywordtype" > int< / span >  WN,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00433"  name = "l00433" > < / a > < span  class = "lineno" >   433< / span >     < span  class = "keywordtype" > bool< / span >  transpose_a,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00434"  name = "l00434" > < / a > < span  class = "lineno" >   434< / span >     < span  class = "keywordtype" > bool< / span >  transpose_b,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00435"  name = "l00435" > < / a > < span  class = "lineno" >   435< / span >     < span  class = "keywordtype" > bool< / span >  MN_aligned,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00436"  name = "l00436" > < / a > < span  class = "lineno" >   436< / span >     < span  class = "keywordtype" > bool< / span >  K_aligned,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00437"  name = "l00437" > < / a > < span  class = "lineno" >   437< / span >     < span  class = "keywordtype" > bool< / span >  has_operand_mask = < span  class = "keyword" > false< / span > > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00438"  name = "l00438" > < / a > < span  class = "lineno" >   438< / span > [[kernel, max_total_threads_per_threadgroup(WM* WN * 32)]] < span  class = "keywordtype" > void< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "foldopen"  id = "foldopen00439"  data-start = "{"  data-end = "}" > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00439"  name = "l00439" > < / a > < span  class = "lineno" > < a  class = "line"  href = "steel__gemm__masked_8h.html#a477932e2ae9d49366f7ede6db63f9cac" >   439< / a > < / span > < a  class = "code hl_function"  href = "steel__gemm__masked_8h.html#af805e998b2046ee30c2b4be813e3af97" > block_masked_gemm< / a > (< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00440"  name = "l00440" > < / a > < span  class = "lineno" >   440< / span >     < span  class = "keyword" > const< / span >  device T* A [[buffer(0)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00441"  name = "l00441" > < / a > < span  class = "lineno" >   441< / span >     < span  class = "keyword" > const< / span >  device T* B [[buffer(1)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00442"  name = "l00442" > < / a > < span  class = "lineno" >   442< / span >     device T* D [[buffer(3)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00443"  name = "l00443" > < / a > < span  class = "lineno" >   443< / span >     < span  class = "keyword" > const< / span >  constant < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_g_e_m_m_params.html" > GEMMParams< / a > * params [[buffer(4)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00444"  name = "l00444" > < / a > < span  class = "lineno" >   444< / span >     < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > int< / span > * batch_shape [[buffer(6)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00445"  name = "l00445" > < / a > < span  class = "lineno" >   445< / span >     < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > size_t< / span > * batch_strides [[buffer(7)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00446"  name = "l00446" > < / a > < span  class = "lineno" >   446< / span >     < span  class = "keyword" > const< / span >  device < span  class = "keywordtype" > bool< / span > * out_mask [[buffer(10)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00447"  name = "l00447" > < / a > < span  class = "lineno" >   447< / span >     < span  class = "keyword" > const< / span >  device < span  class = "keywordtype" > bool< / span > * lhs_mask [[buffer(11)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00448"  name = "l00448" > < / a > < span  class = "lineno" >   448< / span >     < span  class = "keyword" > const< / span >  device < span  class = "keywordtype" > bool< / span > * rhs_mask [[buffer(12)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00449"  name = "l00449" > < / a > < span  class = "lineno" >   449< / span >     < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > int< / span > * mask_strides [[buffer(13)]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00450"  name = "l00450" > < / a > < span  class = "lineno" >   450< / span >     uint simd_lane_id [[thread_index_in_simdgroup]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00451"  name = "l00451" > < / a > < span  class = "lineno" >   451< / span >     uint simd_group_id [[simdgroup_index_in_threadgroup]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00452"  name = "l00452" > < / a > < span  class = "lineno" >   452< / span >     uint3 tid [[threadgroup_position_in_grid]],< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00453"  name = "l00453" > < / a > < span  class = "lineno" >   453< / span >     uint3 lid [[thread_position_in_threadgroup]]) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00454"  name = "l00454" > < / a > < span  class = "lineno" >   454< / span >   < span  class = "comment" > // Appease the compiler< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00455"  name = "l00455" > < / a > < span  class = "lineno" >   455< / span >   (void)lid;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00456"  name = "l00456" > < / a > < span  class = "lineno" >   456< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00457"  name = "l00457" > < / a > < span  class = "lineno" >   457< / span >   < span  class = "keyword" > using < / span > gemm_kernel = < a  class = "code hl_struct"  href = "structmlx_1_1steel_1_1_g_e_m_m_kernel.html" > GEMMKernel< / a > < < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00458"  name = "l00458" > < / a > < span  class = "lineno" >   458< / span >       T,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00459"  name = "l00459" > < / a > < span  class = "lineno" >   459< / span >       T,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00460"  name = "l00460" > < / a > < span  class = "lineno" >   460< / span >       BM,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00461"  name = "l00461" > < / a > < span  class = "lineno" >   461< / span >       BN,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00462"  name = "l00462" > < / a > < span  class = "lineno" >   462< / span >       BK,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00463"  name = "l00463" > < / a > < span  class = "lineno" >   463< / span >       WM,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00464"  name = "l00464" > < / a > < span  class = "lineno" >   464< / span >       WN,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00465"  name = "l00465" > < / a > < span  class = "lineno" >   465< / span >       transpose_a,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00466"  name = "l00466" > < / a > < span  class = "lineno" >   466< / span >       transpose_b,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00467"  name = "l00467" > < / a > < span  class = "lineno" >   467< / span >       MN_aligned,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00468"  name = "l00468" > < / a > < span  class = "lineno" >   468< / span >       K_aligned> ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00469"  name = "l00469" > < / a > < span  class = "lineno" >   469< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00470"  name = "l00470" > < / a > < span  class = "lineno" >   470< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  tid_y = ((tid.y) < <  params-> swizzle_log) +< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00471"  name = "l00471" > < / a > < span  class = "lineno" >   471< / span >       ((tid.x) &  ((1 < <  params-> swizzle_log) - 1));< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00472"  name = "l00472" > < / a > < span  class = "lineno" >   472< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  tid_x = (tid.x) > >  params-> swizzle_log;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00473"  name = "l00473" > < / a > < span  class = "lineno" >   473< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00474"  name = "l00474" > < / a > < span  class = "lineno" >   474< / span >   < span  class = "keywordflow" > if< / span >  (params-> tiles_n < = tid_x || params-> tiles_m < = tid_y) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00475"  name = "l00475" > < / a > < span  class = "lineno" >   475< / span >     < span  class = "keywordflow" > return< / span > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00476"  name = "l00476" > < / a > < span  class = "lineno" >   476< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00477"  name = "l00477" > < / a > < span  class = "lineno" >   477< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00478"  name = "l00478" > < / a > < span  class = "lineno" >   478< / span >   < span  class = "keywordflow" > if< / span >  (params-> batch_ndim >  1) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00479"  name = "l00479" > < / a > < span  class = "lineno" >   479< / span >     < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > size_t< / span > * mask_batch_strides =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00480"  name = "l00480" > < / a > < span  class = "lineno" >   480< / span >         batch_strides + 2 * params-> batch_ndim;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00481"  name = "l00481" > < / a > < span  class = "lineno" >   481< / span >     out_mask +=< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00482"  name = "l00482" > < / a > < span  class = "lineno" >   482< / span >         < a  class = "code hl_function"  href = "backend_2metal_2kernels_2utils_8h.html#a2e49fa7ab8f6348543455c6c45d7e2a9" > elem_to_loc< / a > (tid.z, batch_shape, mask_batch_strides, params-> batch_ndim);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00483"  name = "l00483" > < / a > < span  class = "lineno" >   483< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00484"  name = "l00484" > < / a > < span  class = "lineno" >   484< / span >     < span  class = "keywordflow" > if< / span >  (has_operand_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00485"  name = "l00485" > < / a > < span  class = "lineno" >   485< / span >       < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > size_t< / span > * mask_strides_lhs =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00486"  name = "l00486" > < / a > < span  class = "lineno" >   486< / span >           mask_batch_strides + params-> batch_ndim;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00487"  name = "l00487" > < / a > < span  class = "lineno" >   487< / span >       < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > size_t< / span > * mask_strides_rhs =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00488"  name = "l00488" > < / a > < span  class = "lineno" >   488< / span >           mask_strides_lhs + params-> batch_ndim;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00489"  name = "l00489" > < / a > < span  class = "lineno" >   489< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00490"  name = "l00490" > < / a > < span  class = "lineno" >   490< / span >       ulong2 batch_offsets = < a  class = "code hl_function"  href = "backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f" > elem_to_loc_broadcast< / a > (< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00491"  name = "l00491" > < / a > < span  class = "lineno" >   491< / span >           tid.z,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00492"  name = "l00492" > < / a > < span  class = "lineno" >   492< / span >           batch_shape,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00493"  name = "l00493" > < / a > < span  class = "lineno" >   493< / span >           mask_strides_lhs,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00494"  name = "l00494" > < / a > < span  class = "lineno" >   494< / span >           mask_strides_rhs,< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00495"  name = "l00495" > < / a > < span  class = "lineno" >   495< / span >           params-> batch_ndim);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00496"  name = "l00496" > < / a > < span  class = "lineno" >   496< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00497"  name = "l00497" > < / a > < span  class = "lineno" >   497< / span >       lhs_mask += batch_offsets.x;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00498"  name = "l00498" > < / a > < span  class = "lineno" >   498< / span >       rhs_mask += batch_offsets.y;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00499"  name = "l00499" > < / a > < span  class = "lineno" >   499< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00500"  name = "l00500" > < / a > < span  class = "lineno" >   500< / span >   } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00501"  name = "l00501" > < / a > < span  class = "lineno" >   501< / span >     out_mask += tid.z * batch_strides[2 * params-> batch_ndim];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00502"  name = "l00502" > < / a > < span  class = "lineno" >   502< / span >     < span  class = "keywordflow" > if< / span >  (has_operand_mask) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00503"  name = "l00503" > < / a > < span  class = "lineno" >   503< / span >       lhs_mask += tid.z * batch_strides[3 * params-> batch_ndim];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00504"  name = "l00504" > < / a > < span  class = "lineno" >   504< / span >       rhs_mask += tid.z * batch_strides[4 * params-> batch_ndim];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00505"  name = "l00505" > < / a > < span  class = "lineno" >   505< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00506"  name = "l00506" > < / a > < span  class = "lineno" >   506< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00507"  name = "l00507" > < / a > < span  class = "lineno" >   507< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00508"  name = "l00508" > < / a > < span  class = "lineno" >   508< / span >   < span  class = "comment" > // Adjust for batch< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00509"  name = "l00509" > < / a > < span  class = "lineno" >   509< / span >   < span  class = "keywordflow" > if< / span >  (params-> batch_ndim >  1) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00510"  name = "l00510" > < / a > < span  class = "lineno" >   510< / span >     < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > size_t< / span > * A_bstrides = batch_strides;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00511"  name = "l00511" > < / a > < span  class = "lineno" >   511< / span >     < span  class = "keyword" > const< / span >  constant < span  class = "keywordtype" > size_t< / span > * B_bstrides = batch_strides + params-> batch_ndim;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00512"  name = "l00512" > < / a > < span  class = "lineno" >   512< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00513"  name = "l00513" > < / a > < span  class = "lineno" >   513< / span >     ulong2 batch_offsets = < a  class = "code hl_function"  href = "backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f" > elem_to_loc_broadcast< / a > (< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00514"  name = "l00514" > < / a > < span  class = "lineno" >   514< / span >         tid.z, batch_shape, A_bstrides, B_bstrides, params-> batch_ndim);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00515"  name = "l00515" > < / a > < span  class = "lineno" >   515< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00516"  name = "l00516" > < / a > < span  class = "lineno" >   516< / span >     A += batch_offsets.x;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00517"  name = "l00517" > < / a > < span  class = "lineno" >   517< / span >     B += batch_offsets.y;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00518"  name = "l00518" > < / a > < span  class = "lineno" >   518< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00519"  name = "l00519" > < / a > < span  class = "lineno" >   519< / span >   } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00520"  name = "l00520" > < / a > < span  class = "lineno" >   520< / span >     A += params-> batch_stride_a * tid.z;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00521"  name = "l00521" > < / a > < span  class = "lineno" >   521< / span >     B += params-> batch_stride_b * tid.z;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00522"  name = "l00522" > < / a > < span  class = "lineno" >   522< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00523"  name = "l00523" > < / a > < span  class = "lineno" >   523< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00524"  name = "l00524" > < / a > < span  class = "lineno" >   524< / span >   D += params-> batch_stride_d * tid.z;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00525"  name = "l00525" > < / a > < span  class = "lineno" >   525< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00526"  name = "l00526" > < / a > < span  class = "lineno" >   526< / span >   < span  class = "comment" > // Find block in A, B, D< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00527"  name = "l00527" > < / a > < span  class = "lineno" >   527< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  c_row = tid_y * BM;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00528"  name = "l00528" > < / a > < span  class = "lineno" >   528< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > int< / span >  c_col = tid_x * BN;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00529"  name = "l00529" > < / a > < span  class = "lineno" >   529< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > size_t< / span >  c_row_long = size_t(c_row);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00530"  name = "l00530" > < / a > < span  class = "lineno" >   530< / span >   < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > size_t< / span >  c_col_long = size_t(c_col);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00531"  name = "l00531" > < / a > < span  class = "lineno" >   531< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00532"  name = "l00532" > < / a > < span  class = "lineno" >   532< / span >   A += transpose_a ? c_row_long : c_row_long * params-> lda;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00533"  name = "l00533" > < / a > < span  class = "lineno" >   533< / span >   B += transpose_b ? c_col_long * params-> ldb : c_col_long;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00534"  name = "l00534" > < / a > < span  class = "lineno" >   534< / span >   D += c_row_long * params-> ldd + c_col_long;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00535"  name = "l00535" > < / a > < span  class = "lineno" >   535< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00536"  name = "l00536" > < / a > < span  class = "lineno" >   536< / span >   < span  class = "keywordtype" > bool< / span >  mask_out = out_mask[tid_y * mask_strides[1] + tid_x * mask_strides[0]];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00537"  name = "l00537" > < / a > < span  class = "lineno" >   537< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00538"  name = "l00538" > < / a > < span  class = "lineno" >   538< / span >   < span  class = "comment" > // Write zeros and return< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00539"  name = "l00539" > < / a > < span  class = "lineno" >   539< / span >   < span  class = "keywordflow" > if< / span >  (!mask_out) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00540"  name = "l00540" > < / a > < span  class = "lineno" >   540< / span >     < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  tgp_size = WM * WN * 32;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00541"  name = "l00541" > < / a > < span  class = "lineno" >   541< / span >     < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  vec_size = 4;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00542"  name = "l00542" > < / a > < span  class = "lineno" >   542< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00543"  name = "l00543" > < / a > < span  class = "lineno" >   543< / span >     < span  class = "comment" > // Tile threads in threadgroup< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00544"  name = "l00544" > < / a > < span  class = "lineno" >   544< / span >     < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  TN = BN / vec_size;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00545"  name = "l00545" > < / a > < span  class = "lineno" >   545< / span >     < span  class = "keyword" > constexpr< / span >  < span  class = "keywordtype" > short< / span >  TM = tgp_size / TN;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00546"  name = "l00546" > < / a > < span  class = "lineno" >   546< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00547"  name = "l00547" > < / a > < span  class = "lineno" >   547< / span >     < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  thread_idx = simd_group_id * 32 + simd_lane_id;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00548"  name = "l00548" > < / a > < span  class = "lineno" >   548< / span >     < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  bi = thread_idx / TN;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00549"  name = "l00549" > < / a > < span  class = "lineno" >   549< / span >     < span  class = "keyword" > const< / span >  < span  class = "keywordtype" > short< / span >  bj = vec_size * (thread_idx % TN);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00550"  name = "l00550" > < / a > < span  class = "lineno" >   550< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00551"  name = "l00551" > < / a > < span  class = "lineno" >   551< / span >     D += bi * params-> ldd + bj;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00552"  name = "l00552" > < / a > < span  class = "lineno" >   552< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00553"  name = "l00553" > < / a > < span  class = "lineno" >   553< / span >     < span  class = "keywordtype" > short< / span >  tgp_bm = < a  class = "code hl_function"  href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > min< / a > (BM, params-> M - c_row);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00554"  name = "l00554" > < / a > < span  class = "lineno" >   554< / span >     < span  class = "keywordtype" > short< / span >  tgp_bn = < a  class = "code hl_function"  href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > min< / a > (BN, params-> N - c_col);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00555"  name = "l00555" > < / a > < span  class = "lineno" >   555< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00556"  name = "l00556" > < / a > < span  class = "lineno" >   556< / span >     < span  class = "keywordflow" > if< / span >  (MN_aligned || (tgp_bm == BM & &  tgp_bn == BN)) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00557"  name = "l00557" > < / a > < span  class = "lineno" >   557< / span >       < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  ti = 0; ti <  BM; ti += TM) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00558"  name = "l00558" > < / a > < span  class = "lineno" >   558< / span >         < a  class = "code hl_define"  href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00559"  name = "l00559" > < / a > < span  class = "lineno" >   559< / span >         < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  j = 0; j <  vec_size; j++) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00560"  name = "l00560" > < / a > < span  class = "lineno" >   560< / span >           D[ti * params-> ldd + j] = T(0.);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00561"  name = "l00561" > < / a > < span  class = "lineno" >   561< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00562"  name = "l00562" > < / a > < span  class = "lineno" >   562< / span >       }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00563"  name = "l00563" > < / a > < span  class = "lineno" >   563< / span >     } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00564"  name = "l00564" > < / a > < span  class = "lineno" >   564< / span >       < span  class = "keywordtype" > short< / span >  jmax = tgp_bn - bj;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00565"  name = "l00565" > < / a > < span  class = "lineno" >   565< / span >       jmax = jmax <  vec_size ? jmax : vec_size;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00566"  name = "l00566" > < / a > < span  class = "lineno" >   566< / span >       < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  ti = 0; (bi + ti) <  tgp_bm; ti += TM) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00567"  name = "l00567" > < / a > < span  class = "lineno" >   567< / span >         < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > short< / span >  j = 0; j <  jmax; j++) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00568"  name = "l00568" > < / a > < span  class = "lineno" >   568< / span >           D[ti * params-> ldd + j] = T(0.);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00569"  name = "l00569" > < / a > < span  class = "lineno" >   569< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00570"  name = "l00570" > < / a > < span  class = "lineno" >   570< / span >       }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00571"  name = "l00571" > < / a > < span  class = "lineno" >   571< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00572"  name = "l00572" > < / a > < span  class = "lineno" >   572< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00573"  name = "l00573" > < / a > < span  class = "lineno" >   573< / span >     < span  class = "keywordflow" > return< / span > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00574"  name = "l00574" > < / a > < span  class = "lineno" >   574< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00575"  name = "l00575" > < / a > < span  class = "lineno" >   575< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00576"  name = "l00576" > < / a > < span  class = "lineno" >   576< / span >   threadgroup_barrier(mem_flags::mem_none);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00577"  name = "l00577" > < / a > < span  class = "lineno" >   577< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00578"  name = "l00578" > < / a > < span  class = "lineno" >   578< / span >   < span  class = "comment" > // Prepare threadgroup mma operation< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00579"  name = "l00579" > < / a > < span  class = "lineno" >   579< / span >   thread < span  class = "keyword" > typename< / span >  gemm_kernel::mma_t mma_op(simd_group_id, simd_lane_id);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00580"  name = "l00580" > < / a > < span  class = "lineno" >   580< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00581"  name = "l00581" > < / a > < span  class = "lineno" >   581< / span >   < span  class = "keywordtype" > int< / span >  gemm_k_iterations = params-> gemm_k_iterations_aligned;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00582"  name = "l00582" > < / a > < span  class = "lineno" >   582< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00583"  name = "l00583" > < / a > < span  class = "lineno" >   583< / span >   threadgroup T As[gemm_kernel::tgp_mem_size_a];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00584"  name = "l00584" > < / a > < span  class = "lineno" >   584< / span >   threadgroup T Bs[gemm_kernel::tgp_mem_size_b];< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00585"  name = "l00585" > < / a > < span  class = "lineno" >   585< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00586"  name = "l00586" > < / a > < span  class = "lineno" >   586< / span >   < span  class = "comment" > // Prepare threadgroup loading operations< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00587"  name = "l00587" > < / a > < span  class = "lineno" >   587< / span >   thread < span  class = "keyword" > typename< / span >  gemm_kernel::loader_a_t loader_a(< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00588"  name = "l00588" > < / a > < span  class = "lineno" >   588< / span >       A, params-> lda, As, simd_group_id, simd_lane_id);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00589"  name = "l00589" > < / a > < span  class = "lineno" >   589< / span >   thread < span  class = "keyword" > typename< / span >  gemm_kernel::loader_b_t loader_b(< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00590"  name = "l00590" > < / a > < span  class = "lineno" >   590< / span >       B, params-> ldb, Bs, simd_group_id, simd_lane_id);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00591"  name = "l00591" > < / a > < span  class = "lineno" >   591< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00593"  name = "l00593" > < / a > < span  class = "lineno" >   593< / span >   < span  class = "comment" > // MNK aligned loop< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00594"  name = "l00594" > < / a > < span  class = "lineno" >   594< / span >   < span  class = "keywordflow" > if< / span >  (MN_aligned) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00595"  name = "l00595" > < / a > < span  class = "lineno" >   595< / span >     < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > int< / span >  k = 0; k <  gemm_k_iterations; k++) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00596"  name = "l00596" > < / a > < span  class = "lineno" >   596< / span >       threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00597"  name = "l00597" > < / a > < span  class = "lineno" >   597< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00598"  name = "l00598" > < / a > < span  class = "lineno" >   598< / span >       < span  class = "keywordflow" > if< / span >  (!has_operand_mask ||< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00599"  name = "l00599" > < / a > < span  class = "lineno" >   599< / span >           (lhs_mask< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00600"  name = "l00600" > < / a > < span  class = "lineno" >   600< / span >                [tid_y * mask_strides[3] + ((k * BK) / BM) * mask_strides[2]] & & < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00601"  name = "l00601" > < / a > < span  class = "lineno" >   601< / span >            rhs_mask< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00602"  name = "l00602" > < / a > < span  class = "lineno" >   602< / span >                [((k * BK) / BM) * mask_strides[5] + tid_x * mask_strides[4]])) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00603"  name = "l00603" > < / a > < span  class = "lineno" >   603< / span >         < span  class = "comment" > // Load elements into threadgroup< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00604"  name = "l00604" > < / a > < span  class = "lineno" >   604< / span >         loader_a.load_unsafe();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00605"  name = "l00605" > < / a > < span  class = "lineno" >   605< / span >         loader_b.load_unsafe();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00606"  name = "l00606" > < / a > < span  class = "lineno" >   606< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00607"  name = "l00607" > < / a > < span  class = "lineno" >   607< / span >         threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00608"  name = "l00608" > < / a > < span  class = "lineno" >   608< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00609"  name = "l00609" > < / a > < span  class = "lineno" >   609< / span >         < span  class = "comment" > // Multiply and accumulate threadgroup elements< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00610"  name = "l00610" > < / a > < span  class = "lineno" >   610< / span >         mma_op.mma(As, Bs);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00611"  name = "l00611" > < / a > < span  class = "lineno" >   611< / span >       }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00612"  name = "l00612" > < / a > < span  class = "lineno" >   612< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00613"  name = "l00613" > < / a > < span  class = "lineno" >   613< / span >       < span  class = "comment" > // Prepare for next iteration< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00614"  name = "l00614" > < / a > < span  class = "lineno" >   614< / span >       loader_a.next();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00615"  name = "l00615" > < / a > < span  class = "lineno" >   615< / span >       loader_b.next();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00616"  name = "l00616" > < / a > < span  class = "lineno" >   616< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00617"  name = "l00617" > < / a > < span  class = "lineno" >   617< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00618"  name = "l00618" > < / a > < span  class = "lineno" >   618< / span >     threadgroup_barrier(mem_flags::mem_none);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00619"  name = "l00619" > < / a > < span  class = "lineno" >   619< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00620"  name = "l00620" > < / a > < span  class = "lineno" >   620< / span >     < span  class = "comment" > // Loop tail< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00621"  name = "l00621" > < / a > < span  class = "lineno" >   621< / span >     < span  class = "keywordflow" > if< / span >  (!K_aligned) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00622"  name = "l00622" > < / a > < span  class = "lineno" >   622< / span >       < span  class = "keywordflow" > if< / span >  (!has_operand_mask ||< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00623"  name = "l00623" > < / a > < span  class = "lineno" >   623< / span >           (lhs_mask< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00624"  name = "l00624" > < / a > < span  class = "lineno" >   624< / span >                [tid_y * mask_strides[3] + (params-> K / BM) * mask_strides[2]] & & < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00625"  name = "l00625" > < / a > < span  class = "lineno" >   625< / span >            rhs_mask< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00626"  name = "l00626" > < / a > < span  class = "lineno" >   626< / span >                [(params-> K / BM) * mask_strides[5] +< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00627"  name = "l00627" > < / a > < span  class = "lineno" >   627< / span >                 tid_x * mask_strides[4]])) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00628"  name = "l00628" > < / a > < span  class = "lineno" >   628< / span >         < span  class = "keywordtype" > int< / span >  lbk = params-> K - params-> gemm_k_iterations_aligned * BK;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00629"  name = "l00629" > < / a > < span  class = "lineno" >   629< / span >         short2 tile_dims_A = transpose_a ? short2(BM, lbk) : short2(lbk, BM);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00630"  name = "l00630" > < / a > < span  class = "lineno" >   630< / span >         short2 tile_dims_B = transpose_b ? short2(lbk, BN) : short2(BN, lbk);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00631"  name = "l00631" > < / a > < span  class = "lineno" >   631< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00632"  name = "l00632" > < / a > < span  class = "lineno" >   632< / span >         loader_a.load_safe(tile_dims_A);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00633"  name = "l00633" > < / a > < span  class = "lineno" >   633< / span >         loader_b.load_safe(tile_dims_B);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00634"  name = "l00634" > < / a > < span  class = "lineno" >   634< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00635"  name = "l00635" > < / a > < span  class = "lineno" >   635< / span >         threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00636"  name = "l00636" > < / a > < span  class = "lineno" >   636< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00637"  name = "l00637" > < / a > < span  class = "lineno" >   637< / span >         mma_op.mma(As, Bs);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00638"  name = "l00638" > < / a > < span  class = "lineno" >   638< / span >       }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00639"  name = "l00639" > < / a > < span  class = "lineno" >   639< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00640"  name = "l00640" > < / a > < span  class = "lineno" >   640< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00641"  name = "l00641" > < / a > < span  class = "lineno" >   641< / span >     < span  class = "comment" > // Store results to device memory< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00642"  name = "l00642" > < / a > < span  class = "lineno" >   642< / span >     mma_op.store_result(D, params-> ldd);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00643"  name = "l00643" > < / a > < span  class = "lineno" >   643< / span >     < span  class = "keywordflow" > return< / span > ;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00644"  name = "l00644" > < / a > < span  class = "lineno" >   644< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00645"  name = "l00645" > < / a > < span  class = "lineno" >   645< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00647"  name = "l00647" > < / a > < span  class = "lineno" >   647< / span >   < span  class = "comment" > // MN unaligned loop< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00648"  name = "l00648" > < / a > < span  class = "lineno" >   648< / span >   < span  class = "keywordflow" > else< / span >  { < span  class = "comment" > // Loop over K - unaligned case< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00649"  name = "l00649" > < / a > < span  class = "lineno" >   649< / span >     < span  class = "keywordtype" > short< / span >  tgp_bm = < a  class = "code hl_function"  href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > min< / a > (BM, params-> M - c_row);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00650"  name = "l00650" > < / a > < span  class = "lineno" >   650< / span >     < span  class = "keywordtype" > short< / span >  tgp_bn = < a  class = "code hl_function"  href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > min< / a > (BN, params-> N - c_col);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00651"  name = "l00651" > < / a > < span  class = "lineno" >   651< / span >     < span  class = "keywordtype" > short< / span >  lbk = params-> K - params-> gemm_k_iterations_aligned * BK;< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00652"  name = "l00652" > < / a > < span  class = "lineno" >   652< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00653"  name = "l00653" > < / a > < span  class = "lineno" >   653< / span >     < span  class = "keywordtype" > bool< / span >  M_aligned = (tgp_bm == BM);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00654"  name = "l00654" > < / a > < span  class = "lineno" >   654< / span >     < span  class = "keywordtype" > bool< / span >  N_aligned = (tgp_bn == BN);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00655"  name = "l00655" > < / a > < span  class = "lineno" >   655< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00656"  name = "l00656" > < / a > < span  class = "lineno" >   656< / span >     short2 tile_dims_A = transpose_a ? short2(tgp_bm, BK) : short2(BK, tgp_bm);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00657"  name = "l00657" > < / a > < span  class = "lineno" >   657< / span >     short2 tile_dims_B = transpose_b ? short2(BK, tgp_bn) : short2(tgp_bn, BK);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00658"  name = "l00658" > < / a > < span  class = "lineno" >   658< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00659"  name = "l00659" > < / a > < span  class = "lineno" >   659< / span >     < span  class = "keywordflow" > for< / span >  (< span  class = "keywordtype" > int< / span >  k = 0; k <  gemm_k_iterations; k++) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00660"  name = "l00660" > < / a > < span  class = "lineno" >   660< / span >       threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00661"  name = "l00661" > < / a > < span  class = "lineno" >   661< / span >       < span  class = "keywordflow" > if< / span >  (!has_operand_mask ||< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00662"  name = "l00662" > < / a > < span  class = "lineno" >   662< / span >           (lhs_mask< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00663"  name = "l00663" > < / a > < span  class = "lineno" >   663< / span >                [tid_y * mask_strides[3] + ((k * BK) / BM) * mask_strides[2]] & & < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00664"  name = "l00664" > < / a > < span  class = "lineno" >   664< / span >            rhs_mask< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00665"  name = "l00665" > < / a > < span  class = "lineno" >   665< / span >                [((k * BK) / BM) * mask_strides[5] + tid_x * mask_strides[4]])) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00666"  name = "l00666" > < / a > < span  class = "lineno" >   666< / span >         < span  class = "comment" > // Load elements into threadgroup< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00667"  name = "l00667" > < / a > < span  class = "lineno" >   667< / span >         < span  class = "keywordflow" > if< / span >  (M_aligned) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00668"  name = "l00668" > < / a > < span  class = "lineno" >   668< / span >           loader_a.load_unsafe();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00669"  name = "l00669" > < / a > < span  class = "lineno" >   669< / span >         } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00670"  name = "l00670" > < / a > < span  class = "lineno" >   670< / span >           loader_a.load_safe(tile_dims_A);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00671"  name = "l00671" > < / a > < span  class = "lineno" >   671< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00672"  name = "l00672" > < / a > < span  class = "lineno" >   672< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00673"  name = "l00673" > < / a > < span  class = "lineno" >   673< / span >         < span  class = "keywordflow" > if< / span >  (N_aligned) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00674"  name = "l00674" > < / a > < span  class = "lineno" >   674< / span >           loader_b.load_unsafe();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00675"  name = "l00675" > < / a > < span  class = "lineno" >   675< / span >         } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00676"  name = "l00676" > < / a > < span  class = "lineno" >   676< / span >           loader_b.load_safe(tile_dims_B);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00677"  name = "l00677" > < / a > < span  class = "lineno" >   677< / span >         }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00678"  name = "l00678" > < / a > < span  class = "lineno" >   678< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00679"  name = "l00679" > < / a > < span  class = "lineno" >   679< / span >         threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00680"  name = "l00680" > < / a > < span  class = "lineno" >   680< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00681"  name = "l00681" > < / a > < span  class = "lineno" >   681< / span >         < span  class = "comment" > // Multiply and accumulate threadgroup elements< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00682"  name = "l00682" > < / a > < span  class = "lineno" >   682< / span >         mma_op.mma(As, Bs);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00683"  name = "l00683" > < / a > < span  class = "lineno" >   683< / span >       }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00684"  name = "l00684" > < / a > < span  class = "lineno" >   684< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00685"  name = "l00685" > < / a > < span  class = "lineno" >   685< / span >       < span  class = "comment" > // Prepare for next iteration< / span > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00686"  name = "l00686" > < / a > < span  class = "lineno" >   686< / span >       loader_a.next();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00687"  name = "l00687" > < / a > < span  class = "lineno" >   687< / span >       loader_b.next();< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00688"  name = "l00688" > < / a > < span  class = "lineno" >   688< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00689"  name = "l00689" > < / a > < span  class = "lineno" >   689< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00690"  name = "l00690" > < / a > < span  class = "lineno" >   690< / span >     < span  class = "keywordflow" > if< / span >  (!K_aligned) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00691"  name = "l00691" > < / a > < span  class = "lineno" >   691< / span >       threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00692"  name = "l00692" > < / a > < span  class = "lineno" >   692< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00693"  name = "l00693" > < / a > < span  class = "lineno" >   693< / span >       < span  class = "keywordflow" > if< / span >  (!has_operand_mask ||< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00694"  name = "l00694" > < / a > < span  class = "lineno" >   694< / span >           (lhs_mask< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00695"  name = "l00695" > < / a > < span  class = "lineno" >   695< / span >                [tid_y * mask_strides[3] + (params-> K / BM) * mask_strides[2]] & & < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00696"  name = "l00696" > < / a > < span  class = "lineno" >   696< / span >            rhs_mask< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00697"  name = "l00697" > < / a > < span  class = "lineno" >   697< / span >                [(params-> K / BM) * mask_strides[5] +< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00698"  name = "l00698" > < / a > < span  class = "lineno" >   698< / span >                 tid_x * mask_strides[4]])) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00699"  name = "l00699" > < / a > < span  class = "lineno" >   699< / span >         short2 tile_dims_A_last =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00700"  name = "l00700" > < / a > < span  class = "lineno" >   700< / span >             transpose_a ? short2(tgp_bm, lbk) : short2(lbk, tgp_bm);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00701"  name = "l00701" > < / a > < span  class = "lineno" >   701< / span >         short2 tile_dims_B_last =< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00702"  name = "l00702" > < / a > < span  class = "lineno" >   702< / span >             transpose_b ? short2(lbk, tgp_bn) : short2(tgp_bn, lbk);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00703"  name = "l00703" > < / a > < span  class = "lineno" >   703< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00704"  name = "l00704" > < / a > < span  class = "lineno" >   704< / span >         loader_a.load_safe(tile_dims_A_last);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00705"  name = "l00705" > < / a > < span  class = "lineno" >   705< / span >         loader_b.load_safe(tile_dims_B_last);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00706"  name = "l00706" > < / a > < span  class = "lineno" >   706< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00707"  name = "l00707" > < / a > < span  class = "lineno" >   707< / span >         threadgroup_barrier(mem_flags::mem_threadgroup);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00708"  name = "l00708" > < / a > < span  class = "lineno" >   708< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00709"  name = "l00709" > < / a > < span  class = "lineno" >   709< / span >         mma_op.mma(As, Bs);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00710"  name = "l00710" > < / a > < span  class = "lineno" >   710< / span >       }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00711"  name = "l00711" > < / a > < span  class = "lineno" >   711< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00712"  name = "l00712" > < / a > < span  class = "lineno" >   712< / span >  < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00713"  name = "l00713" > < / a > < span  class = "lineno" >   713< / span >     < span  class = "keywordflow" > if< / span >  (M_aligned & &  N_aligned) {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00714"  name = "l00714" > < / a > < span  class = "lineno" >   714< / span >       mma_op.store_result(D, params-> ldd);< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00715"  name = "l00715" > < / a > < span  class = "lineno" >   715< / span >     } < span  class = "keywordflow" > else< / span >  {< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00716"  name = "l00716" > < / a > < span  class = "lineno" >   716< / span >       mma_op.store_result_safe(D, params-> ldd, short2(tgp_bn, tgp_bm));< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00717"  name = "l00717" > < / a > < span  class = "lineno" >   717< / span >     }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00718"  name = "l00718" > < / a > < span  class = "lineno" >   718< / span >   }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "line" > < a  id = "l00719"  name = "l00719" > < / a > < span  class = "lineno" >   719< / span > }< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "abackend_2metal_2kernels_2steel_2utils_8h_html_aaf4974425147d6f26d031691e321637f" > < div  class = "ttname" > < a  href = "backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f" > elem_to_loc_broadcast< / a > < / div > < div  class = "ttdeci" > METAL_FUNC ulong2 elem_to_loc_broadcast(uint elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, int ndim)< / div > < div  class = "ttdef" > < b > Definition< / b >  utils.h:7< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "abackend_2metal_2kernels_2utils_8h_html_a2e49fa7ab8f6348543455c6c45d7e2a9" > < div  class = "ttname" > < a  href = "backend_2metal_2kernels_2utils_8h.html#a2e49fa7ab8f6348543455c6c45d7e2a9" > elem_to_loc< / a > < / div > < div  class = "ttdeci" > METAL_FUNC stride_t elem_to_loc(uint elem, device const int *shape, device const stride_t *strides, int ndim)< / div > < div  class = "ttdef" > < b > Definition< / b >  utils.h:77< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "anamespacemetal_html" > < div  class = "ttname" > < a  href = "namespacemetal.html" > metal< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  bf16.h:265< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "anamespacemetal_html_a6653b28c9473087141eddce39878d4d3" > < div  class = "ttname" > < a  href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > metal::min< / a > < / div > < div  class = "ttdeci" > METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)< / div > < div  class = "ttdef" > < b > Definition< / b >  bf16_math.h:234< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "anamespacemlx_1_1steel_html" > < div  class = "ttname" > < a  href = "namespacemlx_1_1steel.html" > mlx::steel< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  loader_channel_l.h:14< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "asteel_2defines_8h_html" > < div  class = "ttname" > < a  href = "steel_2defines_8h.html" > defines.h< / a > < / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "asteel_2defines_8h_html_a5a5c3095b132a7589bc19cd5cb80e2c6" > < div  class = "ttname" > < a  href = "steel_2defines_8h.html#a5a5c3095b132a7589bc19cd5cb80e2c6" > STEEL_PRAGMA_UNROLL< / a > < / div > < div  class = "ttdeci" > #define STEEL_PRAGMA_UNROLL< / div > < div  class = "ttdef" > < b > Definition< / b >  defines.h:4< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "asteel__gemm__masked_8h_html_af805e998b2046ee30c2b4be813e3af97" > < div  class = "ttname" > < a  href = "steel__gemm__masked_8h.html#af805e998b2046ee30c2b4be813e3af97" > block_masked_gemm< / a > < / div > < div  class = "ttdeci" > void block_masked_gemm(const device T *A, const device T *B, device T *D, const constant GEMMParams *params, const constant int *batch_shape, const constant size_t *batch_strides, const device out_mask_t *out_mask, const device op_mask_t *lhs_mask, const device op_mask_t *rhs_mask, const constant int *mask_strides, uint simd_lane_id, uint simd_group_id, uint3 tid, uint3 lid)< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_gemm_masked.h:53< / div > < / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-08-10 09:24:35 -07:00 
										
									 
								 
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct___no_mask_html" > < div  class = "ttname" > < a  href = "struct___no_mask.html" > _NoMask< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  gemv_masked.h:10< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct___no_mask_html_a0c4a4557d5c97ceafe3a2c4e521cdf7e" > < div  class = "ttname" > < a  href = "struct___no_mask.html#a0c4a4557d5c97ceafe3a2c4e521cdf7e" > _NoMask::x< / a > < / div > < div  class = "ttdeci" > char x< / div > < div  class = "ttdef" > < b > Definition< / b >  gemv_masked.h:11< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_scale_op_html" > < div  class = "ttname" > < a  href = "struct_scale_op.html" > ScaleOp< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  gemv_masked.h:30< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_scale_op_html_a02043fac21c68fb8d6863a01f45ede4b" > < div  class = "ttname" > < a  href = "struct_scale_op.html#a02043fac21c68fb8d6863a01f45ede4b" > ScaleOp::scale< / a > < / div > < div  class = "ttdeci" > OutT scale< / div > < div  class = "ttdef" > < b > Definition< / b >  gemv_masked.h:31< / div > < / div > 
							 
						 
					
						
							
								
									
										
										
										
											2024-06-06 20:28:06 -07:00 
										
									 
								 
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astruct_scale_op_html_a69f82bc925843a4e1c14dfe8ad2f3218" > < div  class = "ttname" > < a  href = "struct_scale_op.html#a69f82bc925843a4e1c14dfe8ad2f3218" > ScaleOp::apply< / a > < / div > < div  class = "ttdeci" > METAL_FUNC OutT apply(InT x) const< / div > < div  class = "ttdef" > < b > Definition< / b >  steel_gemm_masked.h:32< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astructmlx_1_1steel_1_1_g_e_m_m_kernel_html" > < div  class = "ttname" > < a  href = "structmlx_1_1steel_1_1_g_e_m_m_kernel.html" > mlx::steel::GEMMKernel< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  gemm.h:37< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< div  class = "ttc"  id = "astructmlx_1_1steel_1_1_g_e_m_m_params_html" > < div  class = "ttname" > < a  href = "structmlx_1_1steel_1_1_g_e_m_m_params.html" > mlx::steel::GEMMParams< / a > < / div > < div  class = "ttdef" > < b > Definition< / b >  params.h:12< / div > < / div > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / div > <!--  fragment  --> < / div > <!--  contents  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								<!--  start footer part  --> 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< hr  class = "footer" / > < address  class = "footer" > < small > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								Generated by  < a  href = "https://www.doxygen.org/index.html" > < img  class = "footer"  src = "doxygen.svg"  width = "104"  height = "31"  alt = "doxygen" / > < / a >  1.10.0
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / small > < / address > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / body > 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								< / html >