2024-10-15 23:12:17 +08:00
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
< html xmlns = "http://www.w3.org/1999/xhtml" lang = "en-US" >
< head >
< meta http-equiv = "Content-Type" content = "text/xhtml;charset=UTF-8" / >
< meta http-equiv = "X-UA-Compatible" content = "IE=11" / >
< meta name = "generator" content = "Doxygen 1.12.0" / >
< meta name = "viewport" content = "width=device-width, initial-scale=1" / >
< title > MLX: mlx/backend/metal/kernels/quantized.h Source File< / title >
< link href = "tabs.css" rel = "stylesheet" type = "text/css" / >
< script type = "text/javascript" src = "jquery.js" > < / script >
< script type = "text/javascript" src = "dynsections.js" > < / script >
< script type = "text/javascript" src = "clipboard.js" > < / script >
< link href = "navtree.css" rel = "stylesheet" type = "text/css" / >
< script type = "text/javascript" src = "resize.js" > < / script >
< script type = "text/javascript" src = "cookie.js" > < / script >
< link href = "search/search.css" rel = "stylesheet" type = "text/css" / >
< script type = "text/javascript" src = "search/searchdata.js" > < / script >
< script type = "text/javascript" src = "search/search.js" > < / script >
< link href = "doxygen.css" rel = "stylesheet" type = "text/css" / >
< / head >
< body >
< div id = "top" > <!-- do not remove this div, it is closed by doxygen! -->
< div id = "titlearea" >
< table cellspacing = "0" cellpadding = "0" >
< tbody >
< tr id = "projectrow" >
< td id = "projectalign" >
< div id = "projectname" > MLX
< / div >
< / td >
< / tr >
< / tbody >
< / table >
< / div >
<!-- end header part -->
<!-- Generated by Doxygen 1.12.0 -->
< script type = "text/javascript" >
/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
var searchBox = new SearchBox("searchBox", "search/",'.html');
/* @license-end */
< / script >
< script type = "text/javascript" >
/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
$(function() { codefold.init(0); });
/* @license-end */
< / script >
< script type = "text/javascript" src = "menudata.js" > < / script >
< script type = "text/javascript" src = "menu.js" > < / script >
< script type = "text/javascript" >
/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
$(function() {
initMenu('',true,false,'search.php','Search',false);
$(function() { init_search(); });
});
/* @license-end */
< / script >
< div id = "main-nav" > < / div >
< script type = "text/javascript" >
/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699& dn=expat.txt MIT */
$(function(){ initResizable(false); });
/* @license-end */
< / script >
<!-- window showing the filter options -->
< div id = "MSearchSelectWindow"
onmouseover="return searchBox.OnSearchSelectShow()"
onmouseout="return searchBox.OnSearchSelectHide()"
onkeydown="return searchBox.OnSearchSelectKey(event)">
< / div >
<!-- iframe showing the search results (closed by default) -->
< div id = "MSearchResultsWindow" >
< div id = "MSearchResults" >
< div class = "SRPage" >
< div id = "SRIndex" >
< div id = "SRResults" > < / div >
< div class = "SRStatus" id = "Loading" > Loading...< / div >
< div class = "SRStatus" id = "Searching" > Searching...< / div >
< div class = "SRStatus" id = "NoMatches" > No Matches< / div >
< / div >
< / div >
< / div >
< / div >
< div id = "nav-path" class = "navpath" >
< ul >
< li class = "navelem" > < a class = "el" href = "dir_938ab0ecf10b8b860ff766c820f665fd.html" > mlx< / a > < / li > < li class = "navelem" > < a class = "el" href = "dir_1d446c9bd3c99228254c9484e0bc5c06.html" > backend< / a > < / li > < li class = "navelem" > < a class = "el" href = "dir_d0c977ea65824390717cdb7efc36c157.html" > metal< / a > < / li > < li class = "navelem" > < a class = "el" href = "dir_70a37effa88bcbd6b791977fa1e64356.html" > kernels< / a > < / li > < / ul >
< / div >
< / div > <!-- top -->
< div id = "doc-content" >
< div class = "header" >
< div class = "headertitle" > < div class = "title" > quantized.h< / div > < / div >
< / div > <!-- header -->
< div class = "contents" >
< a href = "quantized_8h.html" > Go to the documentation of this file.< / a > < div class = "fragment" > < div class = "line" > < a id = "l00001" name = "l00001" > < / a > < span class = "lineno" > 1< / span > < span class = "comment" > // Copyright © 2023-2024 Apple Inc.< / span > < / div >
< div class = "line" > < a id = "l00002" name = "l00002" > < / a > < span class = "lineno" > 2< / span > < / div >
< div class = "line" > < a id = "l00003" name = "l00003" > < / a > < span class = "lineno" > 3< / span > < span class = "preprocessor" > #include < metal_simdgroup> < / span > < / div >
< div class = "line" > < a id = "l00004" name = "l00004" > < / a > < span class = "lineno" > 4< / span > < span class = "preprocessor" > #include < metal_stdlib> < / span > < / div >
< div class = "line" > < a id = "l00005" name = "l00005" > < / a > < span class = "lineno" > 5< / span > < / div >
< div class = "line" > < a id = "l00006" name = "l00006" > < / a > < span class = "lineno" > 6< / span > < span class = "keyword" > using namespace < / span > < a class = "code hl_namespace" href = "namespacemetal.html" > metal< / a > ;< / div >
< div class = "line" > < a id = "l00007" name = "l00007" > < / a > < span class = "lineno" > 7< / span > < / div >
< div class = "line" > < a id = "l00008" name = "l00008" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd" > 8< / a > < / span > < span class = "preprocessor" > #define MLX_MTL_CONST static constant constexpr const< / span > < / div >
< div class = "line" > < a id = "l00009" name = "l00009" > < / a > < span class = "lineno" > 9< / span > < / div >
< div class = "line" > < a id = "l00010" name = "l00010" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > 10< / a > < / span > < a class = "code hl_define" href = "quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd" > MLX_MTL_CONST< / a > < span class = "keywordtype" > int< / span > < a class = "code hl_variable" href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > SIMD_SIZE< / a > = 32;< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00011" name = "l00011" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a803e4d5a1459844ba647aea5b004e133" > 11< / a > < / span > < a class = "code hl_define" href = "quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd" > MLX_MTL_CONST< / a > < span class = "keywordtype" > int< / span > < a class = "code hl_variable" href = "quantized_8h.html#a803e4d5a1459844ba647aea5b004e133" > QUAD_SIZE< / a > = 4;< / div >
< div class = "line" > < a id = "l00012" name = "l00012" > < / a > < span class = "lineno" > 12< / span > < / div >
< div class = "line" > < a id = "l00013" name = "l00013" > < / a > < span class = "lineno" > 13< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, < span class = "keyword" > typename< / span > U, < span class = "keywordtype" > int< / span > values_per_thread, < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "foldopen" id = "foldopen00014" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00014" name = "l00014" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9" > 14< / a > < / span > < span class = "keyword" > inline< / span > U < a class = "code hl_function" href = "quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9" > load_vector< / a > (< span class = "keyword" > const< / span > device T* x, thread U* x_thread) {< / div >
< div class = "line" > < a id = "l00015" name = "l00015" > < / a > < span class = "lineno" > 15< / span > < span class = "keyword" > static_assert< / span > (< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00016" name = "l00016" > < / a > < span class = "lineno" > 16< / span > bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,< / div >
< div class = "line" > < a id = "l00017" name = "l00017" > < / a > < span class = "lineno" > 17< / span > < span class = "stringliteral" > " Template undefined for bits not in {2, 3, 4, 6, 8}" < / span > );< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00018" name = "l00018" > < / a > < span class = "lineno" > 18< / span > < / div >
< div class = "line" > < a id = "l00019" name = "l00019" > < / a > < span class = "lineno" > 19< / span > U sum = 0;< / div >
< div class = "line" > < a id = "l00020" name = "l00020" > < / a > < span class = "lineno" > 20< / span > < / div >
< div class = "line" > < a id = "l00021" name = "l00021" > < / a > < span class = "lineno" > 21< / span > < span class = "keywordflow" > if< / span > (bits == 2) {< / div >
< div class = "line" > < a id = "l00022" name = "l00022" > < / a > < span class = "lineno" > 22< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < values_per_thread; i += 4) {< / div >
< div class = "line" > < a id = "l00023" name = "l00023" > < / a > < span class = "lineno" > 23< / span > sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];< / div >
< div class = "line" > < a id = "l00024" name = "l00024" > < / a > < span class = "lineno" > 24< / span > x_thread[i] = x[i];< / div >
< div class = "line" > < a id = "l00025" name = "l00025" > < / a > < span class = "lineno" > 25< / span > x_thread[i + 1] = x[i + 1] / 4.0f;< / div >
< div class = "line" > < a id = "l00026" name = "l00026" > < / a > < span class = "lineno" > 26< / span > x_thread[i + 2] = x[i + 2] / 16.0f;< / div >
< div class = "line" > < a id = "l00027" name = "l00027" > < / a > < span class = "lineno" > 27< / span > x_thread[i + 3] = x[i + 3] / 64.0f;< / div >
< div class = "line" > < a id = "l00028" name = "l00028" > < / a > < span class = "lineno" > 28< / span > }< / div >
< div class = "line" > < a id = "l00029" name = "l00029" > < / a > < span class = "lineno" > 29< / span > }< / div >
< div class = "line" > < a id = "l00030" name = "l00030" > < / a > < span class = "lineno" > 30< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00031" name = "l00031" > < / a > < span class = "lineno" > 31< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 3) {< / div >
< div class = "line" > < a id = "l00032" name = "l00032" > < / a > < span class = "lineno" > 32< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < values_per_thread; i += 8) {< / div >
< div class = "line" > < a id = "l00033" name = "l00033" > < / a > < span class = "lineno" > 33< / span > sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3] + x[i + 4] + x[i + 5] +< / div >
< div class = "line" > < a id = "l00034" name = "l00034" > < / a > < span class = "lineno" > 34< / span > x[i + 6] + x[i + 7];< / div >
< div class = "line" > < a id = "l00035" name = "l00035" > < / a > < span class = "lineno" > 35< / span > x_thread[i] = x[i];< / div >
< div class = "line" > < a id = "l00036" name = "l00036" > < / a > < span class = "lineno" > 36< / span > x_thread[i + 1] = x[i + 1] / 8.0f;< / div >
< div class = "line" > < a id = "l00037" name = "l00037" > < / a > < span class = "lineno" > 37< / span > x_thread[i + 2] = x[i + 2] / 64.0f;< / div >
< div class = "line" > < a id = "l00038" name = "l00038" > < / a > < span class = "lineno" > 38< / span > x_thread[i + 3] = x[i + 3] / 2.0f;< / div >
< div class = "line" > < a id = "l00039" name = "l00039" > < / a > < span class = "lineno" > 39< / span > x_thread[i + 4] = x[i + 4] / 16.0f;< / div >
< div class = "line" > < a id = "l00040" name = "l00040" > < / a > < span class = "lineno" > 40< / span > x_thread[i + 5] = x[i + 5] / 128.0f;< / div >
< div class = "line" > < a id = "l00041" name = "l00041" > < / a > < span class = "lineno" > 41< / span > x_thread[i + 6] = x[i + 6] / 4.0f;< / div >
< div class = "line" > < a id = "l00042" name = "l00042" > < / a > < span class = "lineno" > 42< / span > x_thread[i + 7] = x[i + 7] / 32.0f;< / div >
< div class = "line" > < a id = "l00043" name = "l00043" > < / a > < span class = "lineno" > 43< / span > }< / div >
< div class = "line" > < a id = "l00044" name = "l00044" > < / a > < span class = "lineno" > 44< / span > }< / div >
< div class = "line" > < a id = "l00045" name = "l00045" > < / a > < span class = "lineno" > 45< / span > < / div >
< div class = "line" > < a id = "l00046" name = "l00046" > < / a > < span class = "lineno" > 46< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 4) {< / div >
< div class = "line" > < a id = "l00047" name = "l00047" > < / a > < span class = "lineno" > 47< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < values_per_thread; i += 4) {< / div >
< div class = "line" > < a id = "l00048" name = "l00048" > < / a > < span class = "lineno" > 48< / span > sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];< / div >
< div class = "line" > < a id = "l00049" name = "l00049" > < / a > < span class = "lineno" > 49< / span > x_thread[i] = x[i];< / div >
< div class = "line" > < a id = "l00050" name = "l00050" > < / a > < span class = "lineno" > 50< / span > x_thread[i + 1] = x[i + 1] / 16.0f;< / div >
< div class = "line" > < a id = "l00051" name = "l00051" > < / a > < span class = "lineno" > 51< / span > x_thread[i + 2] = x[i + 2] / 256.0f;< / div >
< div class = "line" > < a id = "l00052" name = "l00052" > < / a > < span class = "lineno" > 52< / span > x_thread[i + 3] = x[i + 3] / 4096.0f;< / div >
< div class = "line" > < a id = "l00053" name = "l00053" > < / a > < span class = "lineno" > 53< / span > }< / div >
< div class = "line" > < a id = "l00054" name = "l00054" > < / a > < span class = "lineno" > 54< / span > }< / div >
< div class = "line" > < a id = "l00055" name = "l00055" > < / a > < span class = "lineno" > 55< / span > < / div >
< div class = "line" > < a id = "l00056" name = "l00056" > < / a > < span class = "lineno" > 56< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 6) {< / div >
< div class = "line" > < a id = "l00057" name = "l00057" > < / a > < span class = "lineno" > 57< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < values_per_thread; i += 4) {< / div >
< div class = "line" > < a id = "l00058" name = "l00058" > < / a > < span class = "lineno" > 58< / span > sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];< / div >
< div class = "line" > < a id = "l00059" name = "l00059" > < / a > < span class = "lineno" > 59< / span > x_thread[i] = x[i];< / div >
< div class = "line" > < a id = "l00060" name = "l00060" > < / a > < span class = "lineno" > 60< / span > x_thread[i + 1] = x[i + 1] / 64.0f;< / div >
< div class = "line" > < a id = "l00061" name = "l00061" > < / a > < span class = "lineno" > 61< / span > x_thread[i + 2] = x[i + 2] / 16.0f;< / div >
< div class = "line" > < a id = "l00062" name = "l00062" > < / a > < span class = "lineno" > 62< / span > x_thread[i + 3] = x[i + 3] / 4.0f;< / div >
< div class = "line" > < a id = "l00063" name = "l00063" > < / a > < span class = "lineno" > 63< / span > }< / div >
< div class = "line" > < a id = "l00064" name = "l00064" > < / a > < span class = "lineno" > 64< / span > }< / div >
< div class = "line" > < a id = "l00065" name = "l00065" > < / a > < span class = "lineno" > 65< / span > < / div >
< div class = "line" > < a id = "l00066" name = "l00066" > < / a > < span class = "lineno" > 66< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 8) {< / div >
< div class = "line" > < a id = "l00067" name = "l00067" > < / a > < span class = "lineno" > 67< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < values_per_thread; i++) {< / div >
< div class = "line" > < a id = "l00068" name = "l00068" > < / a > < span class = "lineno" > 68< / span > sum += x[i];< / div >
< div class = "line" > < a id = "l00069" name = "l00069" > < / a > < span class = "lineno" > 69< / span > x_thread[i] = x[i];< / div >
< div class = "line" > < a id = "l00070" name = "l00070" > < / a > < span class = "lineno" > 70< / span > }< / div >
< div class = "line" > < a id = "l00071" name = "l00071" > < / a > < span class = "lineno" > 71< / span > }< / div >
< div class = "line" > < a id = "l00072" name = "l00072" > < / a > < span class = "lineno" > 72< / span > < / div >
< div class = "line" > < a id = "l00073" name = "l00073" > < / a > < span class = "lineno" > 73< / span > < span class = "keywordflow" > return< / span > sum;< / div >
< div class = "line" > < a id = "l00074" name = "l00074" > < / a > < span class = "lineno" > 74< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00075" name = "l00075" > < / a > < span class = "lineno" > 75< / span > < / div >
< div class = "line" > < a id = "l00076" name = "l00076" > < / a > < span class = "lineno" > 76< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, < span class = "keyword" > typename< / span > U, < span class = "keywordtype" > int< / span > values_per_thread, < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "foldopen" id = "foldopen00077" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00077" name = "l00077" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7" > 77< / a > < / span > < span class = "keyword" > inline< / span > U < a class = "code hl_function" href = "quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7" > load_vector_safe< / a > (< span class = "keyword" > const< / span > device T* x, thread U* x_thread, < span class = "keywordtype" > int< / span > N) {< / div >
< div class = "line" > < a id = "l00078" name = "l00078" > < / a > < span class = "lineno" > 78< / span > < span class = "keyword" > static_assert< / span > (< / div >
< div class = "line" > < a id = "l00079" name = "l00079" > < / a > < span class = "lineno" > 79< / span > bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,< / div >
< div class = "line" > < a id = "l00080" name = "l00080" > < / a > < span class = "lineno" > 80< / span > < span class = "stringliteral" > " Template undefined for bits not in {2, 3, 4, 6, 8}" < / span > );< / div >
< div class = "line" > < a id = "l00081" name = "l00081" > < / a > < span class = "lineno" > 81< / span > < / div >
< div class = "line" > < a id = "l00082" name = "l00082" > < / a > < span class = "lineno" > 82< / span > U sum = 0;< / div >
< div class = "line" > < a id = "l00083" name = "l00083" > < / a > < span class = "lineno" > 83< / span > < / div >
< div class = "line" > < a id = "l00084" name = "l00084" > < / a > < span class = "lineno" > 84< / span > < span class = "keywordflow" > if< / span > (bits == 2) {< / div >
< div class = "line" > < a id = "l00085" name = "l00085" > < / a > < span class = "lineno" > 85< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < N; i += 4) {< / div >
< div class = "line" > < a id = "l00086" name = "l00086" > < / a > < span class = "lineno" > 86< / span > sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];< / div >
< div class = "line" > < a id = "l00087" name = "l00087" > < / a > < span class = "lineno" > 87< / span > x_thread[i] = x[i];< / div >
< div class = "line" > < a id = "l00088" name = "l00088" > < / a > < span class = "lineno" > 88< / span > x_thread[i + 1] = x[i + 1] / 4.0f;< / div >
< div class = "line" > < a id = "l00089" name = "l00089" > < / a > < span class = "lineno" > 89< / span > x_thread[i + 2] = x[i + 2] / 16.0f;< / div >
< div class = "line" > < a id = "l00090" name = "l00090" > < / a > < span class = "lineno" > 90< / span > x_thread[i + 3] = x[i + 3] / 64.0f;< / div >
< div class = "line" > < a id = "l00091" name = "l00091" > < / a > < span class = "lineno" > 91< / span > }< / div >
< div class = "line" > < a id = "l00092" name = "l00092" > < / a > < span class = "lineno" > 92< / span > }< / div >
< div class = "line" > < a id = "l00093" name = "l00093" > < / a > < span class = "lineno" > 93< / span > < / div >
< div class = "line" > < a id = "l00094" name = "l00094" > < / a > < span class = "lineno" > 94< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 3) {< / div >
< div class = "line" > < a id = "l00095" name = "l00095" > < / a > < span class = "lineno" > 95< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < N; i += 8) {< / div >
< div class = "line" > < a id = "l00096" name = "l00096" > < / a > < span class = "lineno" > 96< / span > sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3] + x[i + 4] + x[i + 5] +< / div >
< div class = "line" > < a id = "l00097" name = "l00097" > < / a > < span class = "lineno" > 97< / span > x[i + 6] + x[i + 7];< / div >
< div class = "line" > < a id = "l00098" name = "l00098" > < / a > < span class = "lineno" > 98< / span > < / div >
< div class = "line" > < a id = "l00099" name = "l00099" > < / a > < span class = "lineno" > 99< / span > x_thread[i] = x[i];< / div >
< div class = "line" > < a id = "l00100" name = "l00100" > < / a > < span class = "lineno" > 100< / span > x_thread[i + 1] = x[i + 1] / 8.0f;< / div >
< div class = "line" > < a id = "l00101" name = "l00101" > < / a > < span class = "lineno" > 101< / span > x_thread[i + 2] = x[i + 2] / 64.0f;< / div >
< div class = "line" > < a id = "l00102" name = "l00102" > < / a > < span class = "lineno" > 102< / span > x_thread[i + 3] = x[i + 3] / 2.0f;< / div >
< div class = "line" > < a id = "l00103" name = "l00103" > < / a > < span class = "lineno" > 103< / span > x_thread[i + 4] = x[i + 4] / 16.0f;< / div >
< div class = "line" > < a id = "l00104" name = "l00104" > < / a > < span class = "lineno" > 104< / span > x_thread[i + 5] = x[i + 5] / 128.0f;< / div >
< div class = "line" > < a id = "l00105" name = "l00105" > < / a > < span class = "lineno" > 105< / span > x_thread[i + 6] = x[i + 6] / 4.0f;< / div >
< div class = "line" > < a id = "l00106" name = "l00106" > < / a > < span class = "lineno" > 106< / span > x_thread[i + 7] = x[i + 7] / 32.0f;< / div >
< div class = "line" > < a id = "l00107" name = "l00107" > < / a > < span class = "lineno" > 107< / span > }< / div >
< div class = "line" > < a id = "l00108" name = "l00108" > < / a > < span class = "lineno" > 108< / span > }< / div >
< div class = "line" > < a id = "l00109" name = "l00109" > < / a > < span class = "lineno" > 109< / span > < / div >
< div class = "line" > < a id = "l00110" name = "l00110" > < / a > < span class = "lineno" > 110< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 4) {< / div >
< div class = "line" > < a id = "l00111" name = "l00111" > < / a > < span class = "lineno" > 111< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < N; i += 4) {< / div >
< div class = "line" > < a id = "l00112" name = "l00112" > < / a > < span class = "lineno" > 112< / span > sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];< / div >
< div class = "line" > < a id = "l00113" name = "l00113" > < / a > < span class = "lineno" > 113< / span > x_thread[i] = x[i];< / div >
< div class = "line" > < a id = "l00114" name = "l00114" > < / a > < span class = "lineno" > 114< / span > x_thread[i + 1] = x[i + 1] / 16.0f;< / div >
< div class = "line" > < a id = "l00115" name = "l00115" > < / a > < span class = "lineno" > 115< / span > x_thread[i + 2] = x[i + 2] / 256.0f;< / div >
< div class = "line" > < a id = "l00116" name = "l00116" > < / a > < span class = "lineno" > 116< / span > x_thread[i + 3] = x[i + 3] / 4096.0f;< / div >
< div class = "line" > < a id = "l00117" name = "l00117" > < / a > < span class = "lineno" > 117< / span > }< / div >
< div class = "line" > < a id = "l00118" name = "l00118" > < / a > < span class = "lineno" > 118< / span > }< / div >
< div class = "line" > < a id = "l00119" name = "l00119" > < / a > < span class = "lineno" > 119< / span > < / div >
< div class = "line" > < a id = "l00120" name = "l00120" > < / a > < span class = "lineno" > 120< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 6) {< / div >
< div class = "line" > < a id = "l00121" name = "l00121" > < / a > < span class = "lineno" > 121< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < N; i += 4) {< / div >
< div class = "line" > < a id = "l00122" name = "l00122" > < / a > < span class = "lineno" > 122< / span > sum += x[i] + x[i + 1] + x[i + 2] + x[i + 3];< / div >
< div class = "line" > < a id = "l00123" name = "l00123" > < / a > < span class = "lineno" > 123< / span > x_thread[i] = x[i];< / div >
< div class = "line" > < a id = "l00124" name = "l00124" > < / a > < span class = "lineno" > 124< / span > x_thread[i + 1] = x[i + 1] / 64.0f;< / div >
< div class = "line" > < a id = "l00125" name = "l00125" > < / a > < span class = "lineno" > 125< / span > x_thread[i + 2] = x[i + 2] / 16.0f;< / div >
< div class = "line" > < a id = "l00126" name = "l00126" > < / a > < span class = "lineno" > 126< / span > x_thread[i + 3] = x[i + 3] / 4.0f;< / div >
< div class = "line" > < a id = "l00127" name = "l00127" > < / a > < span class = "lineno" > 127< / span > }< / div >
< div class = "line" > < a id = "l00128" name = "l00128" > < / a > < span class = "lineno" > 128< / span > }< / div >
< div class = "line" > < a id = "l00129" name = "l00129" > < / a > < span class = "lineno" > 129< / span > < / div >
< div class = "line" > < a id = "l00130" name = "l00130" > < / a > < span class = "lineno" > 130< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 8) {< / div >
< div class = "line" > < a id = "l00131" name = "l00131" > < / a > < span class = "lineno" > 131< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < N; i++) {< / div >
< div class = "line" > < a id = "l00132" name = "l00132" > < / a > < span class = "lineno" > 132< / span > sum += x[i];< / div >
< div class = "line" > < a id = "l00133" name = "l00133" > < / a > < span class = "lineno" > 133< / span > x_thread[i] = x[i];< / div >
< div class = "line" > < a id = "l00134" name = "l00134" > < / a > < span class = "lineno" > 134< / span > }< / div >
< div class = "line" > < a id = "l00135" name = "l00135" > < / a > < span class = "lineno" > 135< / span > }< / div >
< div class = "line" > < a id = "l00136" name = "l00136" > < / a > < span class = "lineno" > 136< / span > < / div >
< div class = "line" > < a id = "l00137" name = "l00137" > < / a > < span class = "lineno" > 137< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = N; i < values_per_thread; i++) {< / div >
< div class = "line" > < a id = "l00138" name = "l00138" > < / a > < span class = "lineno" > 138< / span > x_thread[i] = 0;< / div >
< div class = "line" > < a id = "l00139" name = "l00139" > < / a > < span class = "lineno" > 139< / span > }< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00140" name = "l00140" > < / a > < span class = "lineno" > 140< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00141" name = "l00141" > < / a > < span class = "lineno" > 141< / span > < span class = "keywordflow" > return< / span > sum;< / div >
< div class = "line" > < a id = "l00142" name = "l00142" > < / a > < span class = "lineno" > 142< / span > }< / div >
< / div >
< div class = "line" > < a id = "l00143" name = "l00143" > < / a > < span class = "lineno" > 143< / span > < / div >
< div class = "line" > < a id = "l00144" name = "l00144" > < / a > < span class = "lineno" > 144< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > U, < span class = "keywordtype" > int< / span > values_per_thread, < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "foldopen" id = "foldopen00145" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00145" name = "l00145" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#ab364d58ab652e3ad87a8f80910556071" > 145< / a > < / span > < span class = "keyword" > inline< / span > U < a class = "code hl_function" href = "quantized_8h.html#ab364d58ab652e3ad87a8f80910556071" > qdot< / a > (< / div >
< div class = "line" > < a id = "l00146" name = "l00146" > < / a > < span class = "lineno" > 146< / span > < span class = "keyword" > const< / span > device uint8_t* w,< / div >
< div class = "line" > < a id = "l00147" name = "l00147" > < / a > < span class = "lineno" > 147< / span > < span class = "keyword" > const< / span > thread U* x_thread,< / div >
< div class = "line" > < a id = "l00148" name = "l00148" > < / a > < span class = "lineno" > 148< / span > U scale,< / div >
< div class = "line" > < a id = "l00149" name = "l00149" > < / a > < span class = "lineno" > 149< / span > U bias,< / div >
< div class = "line" > < a id = "l00150" name = "l00150" > < / a > < span class = "lineno" > 150< / span > U sum) {< / div >
< div class = "line" > < a id = "l00151" name = "l00151" > < / a > < span class = "lineno" > 151< / span > < span class = "keyword" > static_assert< / span > (< / div >
< div class = "line" > < a id = "l00152" name = "l00152" > < / a > < span class = "lineno" > 152< / span > bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,< / div >
< div class = "line" > < a id = "l00153" name = "l00153" > < / a > < span class = "lineno" > 153< / span > < span class = "stringliteral" > " Template undefined for bits not in {2, 3, 4, 6, 8}" < / span > );< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00154" name = "l00154" > < / a > < span class = "lineno" > 154< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00155" name = "l00155" > < / a > < span class = "lineno" > 155< / span > U accum = 0;< / div >
< div class = "line" > < a id = "l00156" name = "l00156" > < / a > < span class = "lineno" > 156< / span > < / div >
< div class = "line" > < a id = "l00157" name = "l00157" > < / a > < span class = "lineno" > 157< / span > < span class = "keywordflow" > if< / span > (bits == 2) {< / div >
< div class = "line" > < a id = "l00158" name = "l00158" > < / a > < span class = "lineno" > 158< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (values_per_thread / 4); i++) {< / div >
< div class = "line" > < a id = "l00159" name = "l00159" > < / a > < span class = "lineno" > 159< / span > accum +=< / div >
< div class = "line" > < a id = "l00160" name = "l00160" > < / a > < span class = "lineno" > 160< / span > (x_thread[4 * i] * (w[i] & 0x03) +< / div >
< div class = "line" > < a id = "l00161" name = "l00161" > < / a > < span class = "lineno" > 161< / span > x_thread[4 * i + 1] * (w[i] & 0x0c) +< / div >
< div class = "line" > < a id = "l00162" name = "l00162" > < / a > < span class = "lineno" > 162< / span > x_thread[4 * i + 2] * (w[i] & 0x30) +< / div >
< div class = "line" > < a id = "l00163" name = "l00163" > < / a > < span class = "lineno" > 163< / span > x_thread[4 * i + 3] * (w[i] & 0xc0));< / div >
< div class = "line" > < a id = "l00164" name = "l00164" > < / a > < span class = "lineno" > 164< / span > }< / div >
< div class = "line" > < a id = "l00165" name = "l00165" > < / a > < span class = "lineno" > 165< / span > }< / div >
< div class = "line" > < a id = "l00166" name = "l00166" > < / a > < span class = "lineno" > 166< / span > < / div >
< div class = "line" > < a id = "l00167" name = "l00167" > < / a > < span class = "lineno" > 167< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 3) {< / div >
< div class = "line" > < a id = "l00168" name = "l00168" > < / a > < span class = "lineno" > 168< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (values_per_thread / 8); i++) {< / div >
< div class = "line" > < a id = "l00169" name = "l00169" > < / a > < span class = "lineno" > 169< / span > x_thread += 8 * i;< / div >
< div class = "line" > < a id = "l00170" name = "l00170" > < / a > < span class = "lineno" > 170< / span > w += 3 * i;< / div >
< div class = "line" > < a id = "l00171" name = "l00171" > < / a > < span class = "lineno" > 171< / span > < / div >
< div class = "line" > < a id = "l00172" name = "l00172" > < / a > < span class = "lineno" > 172< / span > accum += (w[0] & 0x07) * x_thread[0];< / div >
< div class = "line" > < a id = "l00173" name = "l00173" > < / a > < span class = "lineno" > 173< / span > accum += (w[0] & 0x38) * x_thread[1];< / div >
< div class = "line" > < a id = "l00174" name = "l00174" > < / a > < span class = "lineno" > 174< / span > accum += (w[0] & 0xc0) * x_thread[2];< / div >
< div class = "line" > < a id = "l00175" name = "l00175" > < / a > < span class = "lineno" > 175< / span > accum += (w[1] & 0x01) * (x_thread[2] * 256.0f);< / div >
< div class = "line" > < a id = "l00176" name = "l00176" > < / a > < span class = "lineno" > 176< / span > < / div >
< div class = "line" > < a id = "l00177" name = "l00177" > < / a > < span class = "lineno" > 177< / span > accum += (w[1] & 0x0e) * x_thread[3];< / div >
< div class = "line" > < a id = "l00178" name = "l00178" > < / a > < span class = "lineno" > 178< / span > accum += (w[1] & 0x70) * x_thread[4];< / div >
< div class = "line" > < a id = "l00179" name = "l00179" > < / a > < span class = "lineno" > 179< / span > accum += (w[1] & 0x80) * x_thread[5];< / div >
< div class = "line" > < a id = "l00180" name = "l00180" > < / a > < span class = "lineno" > 180< / span > accum += (w[2] & 0x03) * (x_thread[5] * 256.0f);< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00181" name = "l00181" > < / a > < span class = "lineno" > 181< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00182" name = "l00182" > < / a > < span class = "lineno" > 182< / span > accum += (w[2] & 0x1c) * x_thread[6];< / div >
< div class = "line" > < a id = "l00183" name = "l00183" > < / a > < span class = "lineno" > 183< / span > accum += (w[2] & 0xe0) * x_thread[7];< / div >
< div class = "line" > < a id = "l00184" name = "l00184" > < / a > < span class = "lineno" > 184< / span > }< / div >
< div class = "line" > < a id = "l00185" name = "l00185" > < / a > < span class = "lineno" > 185< / span > }< / div >
< div class = "line" > < a id = "l00186" name = "l00186" > < / a > < span class = "lineno" > 186< / span > < / div >
< div class = "line" > < a id = "l00187" name = "l00187" > < / a > < span class = "lineno" > 187< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 4) {< / div >
< div class = "line" > < a id = "l00188" name = "l00188" > < / a > < span class = "lineno" > 188< / span > < span class = "keyword" > const< / span > device uint16_t* ws = (< span class = "keyword" > const< / span > device uint16_t*)w;< / div >
< div class = "line" > < a id = "l00189" name = "l00189" > < / a > < span class = "lineno" > 189< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (values_per_thread / 4); i++) {< / div >
< div class = "line" > < a id = "l00190" name = "l00190" > < / a > < span class = "lineno" > 190< / span > accum +=< / div >
< div class = "line" > < a id = "l00191" name = "l00191" > < / a > < span class = "lineno" > 191< / span > (x_thread[4 * i] * (ws[i] & 0x000f) +< / div >
< div class = "line" > < a id = "l00192" name = "l00192" > < / a > < span class = "lineno" > 192< / span > x_thread[4 * i + 1] * (ws[i] & 0x00f0) +< / div >
< div class = "line" > < a id = "l00193" name = "l00193" > < / a > < span class = "lineno" > 193< / span > x_thread[4 * i + 2] * (ws[i] & 0x0f00) +< / div >
< div class = "line" > < a id = "l00194" name = "l00194" > < / a > < span class = "lineno" > 194< / span > x_thread[4 * i + 3] * (ws[i] & 0xf000));< / div >
< div class = "line" > < a id = "l00195" name = "l00195" > < / a > < span class = "lineno" > 195< / span > }< / div >
< div class = "line" > < a id = "l00196" name = "l00196" > < / a > < span class = "lineno" > 196< / span > }< / div >
< div class = "line" > < a id = "l00197" name = "l00197" > < / a > < span class = "lineno" > 197< / span > < / div >
< div class = "line" > < a id = "l00198" name = "l00198" > < / a > < span class = "lineno" > 198< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 6) {< / div >
< div class = "line" > < a id = "l00199" name = "l00199" > < / a > < span class = "lineno" > 199< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (values_per_thread / 4); i++) {< / div >
< div class = "line" > < a id = "l00200" name = "l00200" > < / a > < span class = "lineno" > 200< / span > x_thread += 4 * i;< / div >
< div class = "line" > < a id = "l00201" name = "l00201" > < / a > < span class = "lineno" > 201< / span > w += 3 * i;< / div >
< div class = "line" > < a id = "l00202" name = "l00202" > < / a > < span class = "lineno" > 202< / span > < / div >
< div class = "line" > < a id = "l00203" name = "l00203" > < / a > < span class = "lineno" > 203< / span > accum += (w[0] & 0x3f) * x_thread[0];< / div >
< div class = "line" > < a id = "l00204" name = "l00204" > < / a > < span class = "lineno" > 204< / span > < / div >
< div class = "line" > < a id = "l00205" name = "l00205" > < / a > < span class = "lineno" > 205< / span > accum += (w[0] & 0xc0) * x_thread[1];< / div >
< div class = "line" > < a id = "l00206" name = "l00206" > < / a > < span class = "lineno" > 206< / span > accum += (w[1] & 0x0f) * (x_thread[1] * 256.0f);< / div >
< div class = "line" > < a id = "l00207" name = "l00207" > < / a > < span class = "lineno" > 207< / span > < / div >
< div class = "line" > < a id = "l00208" name = "l00208" > < / a > < span class = "lineno" > 208< / span > accum += (w[1] & 0xf0) * x_thread[2];< / div >
< div class = "line" > < a id = "l00209" name = "l00209" > < / a > < span class = "lineno" > 209< / span > accum += (w[2] & 0x03) * (x_thread[2] * 256.0f);< / div >
< div class = "line" > < a id = "l00210" name = "l00210" > < / a > < span class = "lineno" > 210< / span > < / div >
< div class = "line" > < a id = "l00211" name = "l00211" > < / a > < span class = "lineno" > 211< / span > accum += (w[2] & 0xfc) * x_thread[3];< / div >
< div class = "line" > < a id = "l00212" name = "l00212" > < / a > < span class = "lineno" > 212< / span > }< / div >
< div class = "line" > < a id = "l00213" name = "l00213" > < / a > < span class = "lineno" > 213< / span > }< / div >
< div class = "line" > < a id = "l00214" name = "l00214" > < / a > < span class = "lineno" > 214< / span > < / div >
< div class = "line" > < a id = "l00215" name = "l00215" > < / a > < span class = "lineno" > 215< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 8) {< / div >
< div class = "line" > < a id = "l00216" name = "l00216" > < / a > < span class = "lineno" > 216< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < values_per_thread; i++) {< / div >
< div class = "line" > < a id = "l00217" name = "l00217" > < / a > < span class = "lineno" > 217< / span > accum += x_thread[i] * w[i];< / div >
< div class = "line" > < a id = "l00218" name = "l00218" > < / a > < span class = "lineno" > 218< / span > }< / div >
< div class = "line" > < a id = "l00219" name = "l00219" > < / a > < span class = "lineno" > 219< / span > }< / div >
< div class = "line" > < a id = "l00220" name = "l00220" > < / a > < span class = "lineno" > 220< / span > < / div >
< div class = "line" > < a id = "l00221" name = "l00221" > < / a > < span class = "lineno" > 221< / span > < span class = "keywordflow" > return< / span > scale * accum + sum * bias;< / div >
< div class = "line" > < a id = "l00222" name = "l00222" > < / a > < span class = "lineno" > 222< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00223" name = "l00223" > < / a > < span class = "lineno" > 223< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00224" name = "l00224" > < / a > < span class = "lineno" > 224< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > U, < span class = "keywordtype" > int< / span > values_per_thread, < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "foldopen" id = "foldopen00225" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00225" name = "l00225" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42" > 225< / a > < / span > < span class = "keyword" > inline< / span > U < a class = "code hl_function" href = "quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42" > qdot_safe< / a > (< / div >
< div class = "line" > < a id = "l00226" name = "l00226" > < / a > < span class = "lineno" > 226< / span > < span class = "keyword" > const< / span > device uint8_t* w,< / div >
< div class = "line" > < a id = "l00227" name = "l00227" > < / a > < span class = "lineno" > 227< / span > < span class = "keyword" > const< / span > thread U* x_thread,< / div >
< div class = "line" > < a id = "l00228" name = "l00228" > < / a > < span class = "lineno" > 228< / span > U scale,< / div >
< div class = "line" > < a id = "l00229" name = "l00229" > < / a > < span class = "lineno" > 229< / span > U bias,< / div >
< div class = "line" > < a id = "l00230" name = "l00230" > < / a > < span class = "lineno" > 230< / span > U sum,< / div >
< div class = "line" > < a id = "l00231" name = "l00231" > < / a > < span class = "lineno" > 231< / span > < span class = "keywordtype" > int< / span > N) {< / div >
< div class = "line" > < a id = "l00232" name = "l00232" > < / a > < span class = "lineno" > 232< / span > < span class = "keyword" > static_assert< / span > (< / div >
< div class = "line" > < a id = "l00233" name = "l00233" > < / a > < span class = "lineno" > 233< / span > bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,< / div >
< div class = "line" > < a id = "l00234" name = "l00234" > < / a > < span class = "lineno" > 234< / span > < span class = "stringliteral" > " Template undefined for bits not in {2, 3, 4, 6, 8}" < / span > );< / div >
< div class = "line" > < a id = "l00235" name = "l00235" > < / a > < span class = "lineno" > 235< / span > < / div >
< div class = "line" > < a id = "l00236" name = "l00236" > < / a > < span class = "lineno" > 236< / span > U accum = 0;< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00237" name = "l00237" > < / a > < span class = "lineno" > 237< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00238" name = "l00238" > < / a > < span class = "lineno" > 238< / span > < span class = "keywordflow" > if< / span > (bits == 2) {< / div >
< div class = "line" > < a id = "l00239" name = "l00239" > < / a > < span class = "lineno" > 239< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (N / 4); i++) {< / div >
< div class = "line" > < a id = "l00240" name = "l00240" > < / a > < span class = "lineno" > 240< / span > accum +=< / div >
< div class = "line" > < a id = "l00241" name = "l00241" > < / a > < span class = "lineno" > 241< / span > (x_thread[4 * i] * (w[i] & 0x03) +< / div >
< div class = "line" > < a id = "l00242" name = "l00242" > < / a > < span class = "lineno" > 242< / span > x_thread[4 * i + 1] * (w[i] & 0x0c) +< / div >
< div class = "line" > < a id = "l00243" name = "l00243" > < / a > < span class = "lineno" > 243< / span > x_thread[4 * i + 2] * (w[i] & 0x30) +< / div >
< div class = "line" > < a id = "l00244" name = "l00244" > < / a > < span class = "lineno" > 244< / span > x_thread[4 * i + 3] * (w[i] & 0xc0));< / div >
< div class = "line" > < a id = "l00245" name = "l00245" > < / a > < span class = "lineno" > 245< / span > }< / div >
< div class = "line" > < a id = "l00246" name = "l00246" > < / a > < span class = "lineno" > 246< / span > }< / div >
< div class = "line" > < a id = "l00247" name = "l00247" > < / a > < span class = "lineno" > 247< / span > < / div >
< div class = "line" > < a id = "l00248" name = "l00248" > < / a > < span class = "lineno" > 248< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 3) {< / div >
< div class = "line" > < a id = "l00249" name = "l00249" > < / a > < span class = "lineno" > 249< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (N / 8); i++) {< / div >
< div class = "line" > < a id = "l00250" name = "l00250" > < / a > < span class = "lineno" > 250< / span > x_thread += 8 * i;< / div >
< div class = "line" > < a id = "l00251" name = "l00251" > < / a > < span class = "lineno" > 251< / span > w += 3 * i;< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00252" name = "l00252" > < / a > < span class = "lineno" > 252< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00253" name = "l00253" > < / a > < span class = "lineno" > 253< / span > accum += (w[0] & 0x07) * x_thread[0];< / div >
< div class = "line" > < a id = "l00254" name = "l00254" > < / a > < span class = "lineno" > 254< / span > accum += (w[0] & 0x38) * x_thread[1];< / div >
< div class = "line" > < a id = "l00255" name = "l00255" > < / a > < span class = "lineno" > 255< / span > accum += (w[0] & 0xc0) * x_thread[2];< / div >
< div class = "line" > < a id = "l00256" name = "l00256" > < / a > < span class = "lineno" > 256< / span > accum += (w[1] & 0x01) * (x_thread[2] * 256.0f);< / div >
< div class = "line" > < a id = "l00257" name = "l00257" > < / a > < span class = "lineno" > 257< / span > < / div >
< div class = "line" > < a id = "l00258" name = "l00258" > < / a > < span class = "lineno" > 258< / span > accum += (w[1] & 0x0e) * x_thread[3];< / div >
< div class = "line" > < a id = "l00259" name = "l00259" > < / a > < span class = "lineno" > 259< / span > accum += (w[1] & 0x70) * x_thread[4];< / div >
< div class = "line" > < a id = "l00260" name = "l00260" > < / a > < span class = "lineno" > 260< / span > accum += (w[1] & 0x80) * x_thread[5];< / div >
< div class = "line" > < a id = "l00261" name = "l00261" > < / a > < span class = "lineno" > 261< / span > accum += (w[2] & 0x03) * (x_thread[5] * 256.0f);< / div >
< div class = "line" > < a id = "l00262" name = "l00262" > < / a > < span class = "lineno" > 262< / span > < / div >
< div class = "line" > < a id = "l00263" name = "l00263" > < / a > < span class = "lineno" > 263< / span > accum += (w[2] & 0x1c) * x_thread[6];< / div >
< div class = "line" > < a id = "l00264" name = "l00264" > < / a > < span class = "lineno" > 264< / span > accum += (w[2] & 0xe0) * x_thread[7];< / div >
< div class = "line" > < a id = "l00265" name = "l00265" > < / a > < span class = "lineno" > 265< / span > }< / div >
< div class = "line" > < a id = "l00266" name = "l00266" > < / a > < span class = "lineno" > 266< / span > }< / div >
< div class = "line" > < a id = "l00267" name = "l00267" > < / a > < span class = "lineno" > 267< / span > < / div >
< div class = "line" > < a id = "l00268" name = "l00268" > < / a > < span class = "lineno" > 268< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 4) {< / div >
< div class = "line" > < a id = "l00269" name = "l00269" > < / a > < span class = "lineno" > 269< / span > < span class = "keyword" > const< / span > device uint16_t* ws = (< span class = "keyword" > const< / span > device uint16_t*)w;< / div >
< div class = "line" > < a id = "l00270" name = "l00270" > < / a > < span class = "lineno" > 270< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (N / 4); i++) {< / div >
< div class = "line" > < a id = "l00271" name = "l00271" > < / a > < span class = "lineno" > 271< / span > accum +=< / div >
< div class = "line" > < a id = "l00272" name = "l00272" > < / a > < span class = "lineno" > 272< / span > (x_thread[4 * i] * (ws[i] & 0x000f) +< / div >
< div class = "line" > < a id = "l00273" name = "l00273" > < / a > < span class = "lineno" > 273< / span > x_thread[4 * i + 1] * (ws[i] & 0x00f0) +< / div >
< div class = "line" > < a id = "l00274" name = "l00274" > < / a > < span class = "lineno" > 274< / span > x_thread[4 * i + 2] * (ws[i] & 0x0f00) +< / div >
< div class = "line" > < a id = "l00275" name = "l00275" > < / a > < span class = "lineno" > 275< / span > x_thread[4 * i + 3] * (ws[i] & 0xf000));< / div >
< div class = "line" > < a id = "l00276" name = "l00276" > < / a > < span class = "lineno" > 276< / span > }< / div >
< div class = "line" > < a id = "l00277" name = "l00277" > < / a > < span class = "lineno" > 277< / span > }< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00278" name = "l00278" > < / a > < span class = "lineno" > 278< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00279" name = "l00279" > < / a > < span class = "lineno" > 279< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 6) {< / div >
< div class = "line" > < a id = "l00280" name = "l00280" > < / a > < span class = "lineno" > 280< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (N / 4); i++) {< / div >
< div class = "line" > < a id = "l00281" name = "l00281" > < / a > < span class = "lineno" > 281< / span > x_thread += 4 * i;< / div >
< div class = "line" > < a id = "l00282" name = "l00282" > < / a > < span class = "lineno" > 282< / span > w += 3 * i;< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00283" name = "l00283" > < / a > < span class = "lineno" > 283< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00284" name = "l00284" > < / a > < span class = "lineno" > 284< / span > accum += (w[0] & 0x3f) * x_thread[0];< / div >
< div class = "line" > < a id = "l00285" name = "l00285" > < / a > < span class = "lineno" > 285< / span > < / div >
< div class = "line" > < a id = "l00286" name = "l00286" > < / a > < span class = "lineno" > 286< / span > accum += (w[0] & 0xc0) * x_thread[1];< / div >
< div class = "line" > < a id = "l00287" name = "l00287" > < / a > < span class = "lineno" > 287< / span > accum += (w[1] & 0x0f) * (x_thread[1] * 256.0f);< / div >
< div class = "line" > < a id = "l00288" name = "l00288" > < / a > < span class = "lineno" > 288< / span > < / div >
< div class = "line" > < a id = "l00289" name = "l00289" > < / a > < span class = "lineno" > 289< / span > accum += (w[1] & 0xf0) * x_thread[2];< / div >
< div class = "line" > < a id = "l00290" name = "l00290" > < / a > < span class = "lineno" > 290< / span > accum += (w[2] & 0x03) * (x_thread[2] * 256.0f);< / div >
< div class = "line" > < a id = "l00291" name = "l00291" > < / a > < span class = "lineno" > 291< / span > < / div >
< div class = "line" > < a id = "l00292" name = "l00292" > < / a > < span class = "lineno" > 292< / span > accum += (w[2] & 0xfc) * x_thread[3];< / div >
< div class = "line" > < a id = "l00293" name = "l00293" > < / a > < span class = "lineno" > 293< / span > }< / div >
< div class = "line" > < a id = "l00294" name = "l00294" > < / a > < span class = "lineno" > 294< / span > }< / div >
< div class = "line" > < a id = "l00295" name = "l00295" > < / a > < span class = "lineno" > 295< / span > < / div >
< div class = "line" > < a id = "l00296" name = "l00296" > < / a > < span class = "lineno" > 296< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 8) {< / div >
< div class = "line" > < a id = "l00297" name = "l00297" > < / a > < span class = "lineno" > 297< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < N; i++) {< / div >
< div class = "line" > < a id = "l00298" name = "l00298" > < / a > < span class = "lineno" > 298< / span > accum += x_thread[i] * w[i];< / div >
< div class = "line" > < a id = "l00299" name = "l00299" > < / a > < span class = "lineno" > 299< / span > }< / div >
< div class = "line" > < a id = "l00300" name = "l00300" > < / a > < span class = "lineno" > 300< / span > }< / div >
< div class = "line" > < a id = "l00301" name = "l00301" > < / a > < span class = "lineno" > 301< / span > < / div >
< div class = "line" > < a id = "l00302" name = "l00302" > < / a > < span class = "lineno" > 302< / span > < span class = "keywordflow" > return< / span > scale * accum + sum * bias;< / div >
< div class = "line" > < a id = "l00303" name = "l00303" > < / a > < span class = "lineno" > 303< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00304" name = "l00304" > < / a > < span class = "lineno" > 304< / span > < / div >
< div class = "line" > < a id = "l00305" name = "l00305" > < / a > < span class = "lineno" > 305< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > U, < span class = "keywordtype" > int< / span > values_per_thread, < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "line" > < a id = "l00306" name = "l00306" > < / a > < span class = "lineno" > 306< / span > < span class = "keyword" > inline< / span > < span class = "keywordtype" > void< / span > < / div >
< div class = "foldopen" id = "foldopen00307" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00307" name = "l00307" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58" > 307< / a > < / span > < a class = "code hl_function" href = "quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58" > qouter< / a > (< span class = "keyword" > const< / span > thread uint8_t* w, U x, U scale, U bias, thread U* result) {< / div >
< div class = "line" > < a id = "l00308" name = "l00308" > < / a > < span class = "lineno" > 308< / span > < span class = "keyword" > static_assert< / span > (< / div >
< div class = "line" > < a id = "l00309" name = "l00309" > < / a > < span class = "lineno" > 309< / span > bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,< / div >
< div class = "line" > < a id = "l00310" name = "l00310" > < / a > < span class = "lineno" > 310< / span > < span class = "stringliteral" > " Template undefined for bits not in {2, 3, 4, 6, 8}" < / span > );< / div >
< div class = "line" > < a id = "l00311" name = "l00311" > < / a > < span class = "lineno" > 311< / span > < / div >
< div class = "line" > < a id = "l00312" name = "l00312" > < / a > < span class = "lineno" > 312< / span > < span class = "keywordflow" > if< / span > (bits == 2) {< / div >
< div class = "line" > < a id = "l00313" name = "l00313" > < / a > < span class = "lineno" > 313< / span > U s[4] = {scale, scale / 4.0f, scale / 16.0f, scale / 64.0f};< / div >
< div class = "line" > < a id = "l00314" name = "l00314" > < / a > < span class = "lineno" > 314< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (values_per_thread / 4); i++) {< / div >
< div class = "line" > < a id = "l00315" name = "l00315" > < / a > < span class = "lineno" > 315< / span > result[4 * i] += x * (s[0] * (w[i] & 0x03) + bias);< / div >
< div class = "line" > < a id = "l00316" name = "l00316" > < / a > < span class = "lineno" > 316< / span > result[4 * i + 1] += x * (s[1] * (w[i] & 0x0c) + bias);< / div >
< div class = "line" > < a id = "l00317" name = "l00317" > < / a > < span class = "lineno" > 317< / span > result[4 * i + 2] += x * (s[2] * (w[i] & 0x30) + bias);< / div >
< div class = "line" > < a id = "l00318" name = "l00318" > < / a > < span class = "lineno" > 318< / span > result[4 * i + 3] += x * (s[3] * (w[i] & 0xc0) + bias);< / div >
< div class = "line" > < a id = "l00319" name = "l00319" > < / a > < span class = "lineno" > 319< / span > }< / div >
< div class = "line" > < a id = "l00320" name = "l00320" > < / a > < span class = "lineno" > 320< / span > }< / div >
< div class = "line" > < a id = "l00321" name = "l00321" > < / a > < span class = "lineno" > 321< / span > < / div >
< div class = "line" > < a id = "l00322" name = "l00322" > < / a > < span class = "lineno" > 322< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 3) {< / div >
< div class = "line" > < a id = "l00323" name = "l00323" > < / a > < span class = "lineno" > 323< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (values_per_thread / 8); i++) {< / div >
< div class = "line" > < a id = "l00324" name = "l00324" > < / a > < span class = "lineno" > 324< / span > uint8_t w0 = w[3 * i];< / div >
< div class = "line" > < a id = "l00325" name = "l00325" > < / a > < span class = "lineno" > 325< / span > uint8_t w1 = w[3 * i + 1];< / div >
< div class = "line" > < a id = "l00326" name = "l00326" > < / a > < span class = "lineno" > 326< / span > uint8_t w2 = w[3 * i + 2];< / div >
< div class = "line" > < a id = "l00327" name = "l00327" > < / a > < span class = "lineno" > 327< / span > < / div >
< div class = "line" > < a id = "l00328" name = "l00328" > < / a > < span class = "lineno" > 328< / span > result[8 * i] += x * ((w0 & 0x7) * scale + bias);< / div >
< div class = "line" > < a id = "l00329" name = "l00329" > < / a > < span class = "lineno" > 329< / span > result[8 * i + 1] += x * (((w0 & 0x38) > > 3) * scale + bias);< / div >
< div class = "line" > < a id = "l00330" name = "l00330" > < / a > < span class = "lineno" > 330< / span > result[8 * i + 2] +=< / div >
< div class = "line" > < a id = "l00331" name = "l00331" > < / a > < span class = "lineno" > 331< / span > x * ((((w0 & 0xc0) > > 6) + ((w1 & 0x1) < < 2)) * scale + bias);< / div >
< div class = "line" > < a id = "l00332" name = "l00332" > < / a > < span class = "lineno" > 332< / span > result[8 * i + 3] += x * (((w1 & 0xe) > > 1) * scale + bias);< / div >
< div class = "line" > < a id = "l00333" name = "l00333" > < / a > < span class = "lineno" > 333< / span > result[8 * i + 4] += x * (((w1 & 0x70) > > 4) * scale + bias);< / div >
< div class = "line" > < a id = "l00334" name = "l00334" > < / a > < span class = "lineno" > 334< / span > result[8 * i + 5] +=< / div >
< div class = "line" > < a id = "l00335" name = "l00335" > < / a > < span class = "lineno" > 335< / span > x * ((((w1 & 0x80) > > 7) + ((w2 & 0x3) < < 1)) * scale + bias);< / div >
< div class = "line" > < a id = "l00336" name = "l00336" > < / a > < span class = "lineno" > 336< / span > result[8 * i + 6] += x * (((w2 & 0x1c) > > 2) * scale + bias);< / div >
< div class = "line" > < a id = "l00337" name = "l00337" > < / a > < span class = "lineno" > 337< / span > result[8 * i + 7] += x * (((w2 & 0xe0) > > 5) * scale + bias);< / div >
< div class = "line" > < a id = "l00338" name = "l00338" > < / a > < span class = "lineno" > 338< / span > }< / div >
< div class = "line" > < a id = "l00339" name = "l00339" > < / a > < span class = "lineno" > 339< / span > }< / div >
< div class = "line" > < a id = "l00340" name = "l00340" > < / a > < span class = "lineno" > 340< / span > < / div >
< div class = "line" > < a id = "l00341" name = "l00341" > < / a > < span class = "lineno" > 341< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 4) {< / div >
< div class = "line" > < a id = "l00342" name = "l00342" > < / a > < span class = "lineno" > 342< / span > U s[2] = {scale, scale / 16.0f};< / div >
< div class = "line" > < a id = "l00343" name = "l00343" > < / a > < span class = "lineno" > 343< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (values_per_thread / 2); i++) {< / div >
< div class = "line" > < a id = "l00344" name = "l00344" > < / a > < span class = "lineno" > 344< / span > result[2 * i] += x * (s[0] * (w[i] & 0x0f) + bias);< / div >
< div class = "line" > < a id = "l00345" name = "l00345" > < / a > < span class = "lineno" > 345< / span > result[2 * i + 1] += x * (s[1] * (w[i] & 0xf0) + bias);< / div >
< div class = "line" > < a id = "l00346" name = "l00346" > < / a > < span class = "lineno" > 346< / span > }< / div >
< div class = "line" > < a id = "l00347" name = "l00347" > < / a > < span class = "lineno" > 347< / span > < / div >
< div class = "line" > < a id = "l00348" name = "l00348" > < / a > < span class = "lineno" > 348< / span > } < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 6) {< / div >
< div class = "line" > < a id = "l00349" name = "l00349" > < / a > < span class = "lineno" > 349< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (values_per_thread / 4); i++) {< / div >
< div class = "line" > < a id = "l00350" name = "l00350" > < / a > < span class = "lineno" > 350< / span > uint8_t w0 = w[3 * i];< / div >
< div class = "line" > < a id = "l00351" name = "l00351" > < / a > < span class = "lineno" > 351< / span > uint8_t w1 = w[3 * i + 1];< / div >
< div class = "line" > < a id = "l00352" name = "l00352" > < / a > < span class = "lineno" > 352< / span > uint8_t w2 = w[3 * i + 2];< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00353" name = "l00353" > < / a > < span class = "lineno" > 353< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00354" name = "l00354" > < / a > < span class = "lineno" > 354< / span > result[4 * i] += x * ((w0 & 0x3f) * scale + bias);< / div >
< div class = "line" > < a id = "l00355" name = "l00355" > < / a > < span class = "lineno" > 355< / span > result[4 * i + 1] +=< / div >
< div class = "line" > < a id = "l00356" name = "l00356" > < / a > < span class = "lineno" > 356< / span > x * ((((w0 > > 6) & 0x03) + ((w1 & 0x0f) < < 2)) * scale + bias);< / div >
< div class = "line" > < a id = "l00357" name = "l00357" > < / a > < span class = "lineno" > 357< / span > result[4 * i + 2] +=< / div >
< div class = "line" > < a id = "l00358" name = "l00358" > < / a > < span class = "lineno" > 358< / span > x * ((((w1 > > 4) & 0x0f) + ((w2 & 0x03) < < 4)) * scale + bias);< / div >
< div class = "line" > < a id = "l00359" name = "l00359" > < / a > < span class = "lineno" > 359< / span > result[4 * i + 3] += x * (((w2 > > 2) & 0x3f) * scale + bias);< / div >
< div class = "line" > < a id = "l00360" name = "l00360" > < / a > < span class = "lineno" > 360< / span > }< / div >
< div class = "line" > < a id = "l00361" name = "l00361" > < / a > < span class = "lineno" > 361< / span > }< / div >
< div class = "line" > < a id = "l00362" name = "l00362" > < / a > < span class = "lineno" > 362< / span > < / div >
< div class = "line" > < a id = "l00363" name = "l00363" > < / a > < span class = "lineno" > 363< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 8) {< / div >
< div class = "line" > < a id = "l00364" name = "l00364" > < / a > < span class = "lineno" > 364< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < values_per_thread; i++) {< / div >
< div class = "line" > < a id = "l00365" name = "l00365" > < / a > < span class = "lineno" > 365< / span > result[i] += x * (scale * w[i] + bias);< / div >
< div class = "line" > < a id = "l00366" name = "l00366" > < / a > < span class = "lineno" > 366< / span > }< / div >
< div class = "line" > < a id = "l00367" name = "l00367" > < / a > < span class = "lineno" > 367< / span > }< / div >
< div class = "line" > < a id = "l00368" name = "l00368" > < / a > < span class = "lineno" > 368< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00369" name = "l00369" > < / a > < span class = "lineno" > 369< / span > < / div >
< div class = "line" > < a id = "l00370" name = "l00370" > < / a > < span class = "lineno" > 370< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > U, < span class = "keywordtype" > int< / span > N, < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "line" > < a id = "l00371" name = "l00371" > < / a > < span class = "lineno" > 371< / span > < span class = "keyword" > inline< / span > < span class = "keywordtype" > void< / span > < / div >
< div class = "foldopen" id = "foldopen00372" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00372" name = "l00372" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2" > 372< / a > < / span > < a class = "code hl_function" href = "quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2" > dequantize< / a > (< span class = "keyword" > const< / span > device uint8_t* w, U scale, U bias, threadgroup U* w_local) {< / div >
< div class = "line" > < a id = "l00373" name = "l00373" > < / a > < span class = "lineno" > 373< / span > < span class = "keyword" > static_assert< / span > (< / div >
< div class = "line" > < a id = "l00374" name = "l00374" > < / a > < span class = "lineno" > 374< / span > bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,< / div >
< div class = "line" > < a id = "l00375" name = "l00375" > < / a > < span class = "lineno" > 375< / span > < span class = "stringliteral" > " Template undefined for bits not in {2, 3, 4, 6, 8}" < / span > );< / div >
< div class = "line" > < a id = "l00376" name = "l00376" > < / a > < span class = "lineno" > 376< / span > < / div >
< div class = "line" > < a id = "l00377" name = "l00377" > < / a > < span class = "lineno" > 377< / span > < span class = "keywordflow" > if< / span > (bits == 2) {< / div >
< div class = "line" > < a id = "l00378" name = "l00378" > < / a > < span class = "lineno" > 378< / span > U s[4] = {< / div >
< div class = "line" > < a id = "l00379" name = "l00379" > < / a > < span class = "lineno" > 379< / span > scale,< / div >
< div class = "line" > < a id = "l00380" name = "l00380" > < / a > < span class = "lineno" > 380< / span > scale / < span class = "keyword" > static_cast< < / span > U< span class = "keyword" > > < / span > (4.0f),< / div >
< div class = "line" > < a id = "l00381" name = "l00381" > < / a > < span class = "lineno" > 381< / span > scale / < span class = "keyword" > static_cast< < / span > U< span class = "keyword" > > < / span > (16.0f),< / div >
< div class = "line" > < a id = "l00382" name = "l00382" > < / a > < span class = "lineno" > 382< / span > scale / < span class = "keyword" > static_cast< < / span > U< span class = "keyword" > > < / span > (64.0f)};< / div >
< div class = "line" > < a id = "l00383" name = "l00383" > < / a > < span class = "lineno" > 383< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (N / 4); i++) {< / div >
< div class = "line" > < a id = "l00384" name = "l00384" > < / a > < span class = "lineno" > 384< / span > w_local[4 * i] = s[0] * (w[i] & 0x03) + bias;< / div >
< div class = "line" > < a id = "l00385" name = "l00385" > < / a > < span class = "lineno" > 385< / span > w_local[4 * i + 1] = s[1] * (w[i] & 0x0c) + bias;< / div >
< div class = "line" > < a id = "l00386" name = "l00386" > < / a > < span class = "lineno" > 386< / span > w_local[4 * i + 2] = s[2] * (w[i] & 0x30) + bias;< / div >
< div class = "line" > < a id = "l00387" name = "l00387" > < / a > < span class = "lineno" > 387< / span > w_local[4 * i + 3] = s[3] * (w[i] & 0xc0) + bias;< / div >
< div class = "line" > < a id = "l00388" name = "l00388" > < / a > < span class = "lineno" > 388< / span > }< / div >
< div class = "line" > < a id = "l00389" name = "l00389" > < / a > < span class = "lineno" > 389< / span > }< / div >
< div class = "line" > < a id = "l00390" name = "l00390" > < / a > < span class = "lineno" > 390< / span > < / div >
< div class = "line" > < a id = "l00391" name = "l00391" > < / a > < span class = "lineno" > 391< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 3) {< / div >
< div class = "line" > < a id = "l00392" name = "l00392" > < / a > < span class = "lineno" > 392< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (N / 8); i++) {< / div >
< div class = "line" > < a id = "l00393" name = "l00393" > < / a > < span class = "lineno" > 393< / span > w_local += 8 * i;< / div >
< div class = "line" > < a id = "l00394" name = "l00394" > < / a > < span class = "lineno" > 394< / span > w += 3 * i;< / div >
2024-10-15 23:12:17 +08:00
< div class = "line" > < a id = "l00395" name = "l00395" > < / a > < span class = "lineno" > 395< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00396" name = "l00396" > < / a > < span class = "lineno" > 396< / span > w_local[0] = (w[0] & 0x7) * scale + bias;< / div >
< div class = "line" > < a id = "l00397" name = "l00397" > < / a > < span class = "lineno" > 397< / span > w_local[1] = ((w[0] & 0x38) > > 3) * scale + bias;< / div >
< div class = "line" > < a id = "l00398" name = "l00398" > < / a > < span class = "lineno" > 398< / span > w_local[2] = (((w[0] & 0xc0) > > 6) + ((w[1] & 0x1) < < 2)) * scale + bias;< / div >
< div class = "line" > < a id = "l00399" name = "l00399" > < / a > < span class = "lineno" > 399< / span > w_local[3] = ((w[1] & 0xe) > > 1) * scale + bias;< / div >
< div class = "line" > < a id = "l00400" name = "l00400" > < / a > < span class = "lineno" > 400< / span > w_local[4] = ((w[1] & 0x70) > > 4) * scale + bias;< / div >
< div class = "line" > < a id = "l00401" name = "l00401" > < / a > < span class = "lineno" > 401< / span > w_local[5] = (((w[1] & 0x80) > > 7) + ((w[2] & 0x3) < < 1)) * scale + bias;< / div >
< div class = "line" > < a id = "l00402" name = "l00402" > < / a > < span class = "lineno" > 402< / span > w_local[6] = ((w[2] & 0x1c) > > 2) * scale + bias;< / div >
< div class = "line" > < a id = "l00403" name = "l00403" > < / a > < span class = "lineno" > 403< / span > w_local[7] = ((w[2] & 0xe0) > > 5) * scale + bias;< / div >
< div class = "line" > < a id = "l00404" name = "l00404" > < / a > < span class = "lineno" > 404< / span > }< / div >
< div class = "line" > < a id = "l00405" name = "l00405" > < / a > < span class = "lineno" > 405< / span > }< / div >
< div class = "line" > < a id = "l00406" name = "l00406" > < / a > < span class = "lineno" > 406< / span > < / div >
< div class = "line" > < a id = "l00407" name = "l00407" > < / a > < span class = "lineno" > 407< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 4) {< / div >
< div class = "line" > < a id = "l00408" name = "l00408" > < / a > < span class = "lineno" > 408< / span > U s[2] = {scale, scale / < span class = "keyword" > static_cast< < / span > U< span class = "keyword" > > < / span > (16.0f)};< / div >
< div class = "line" > < a id = "l00409" name = "l00409" > < / a > < span class = "lineno" > 409< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (N / 2); i++) {< / div >
< div class = "line" > < a id = "l00410" name = "l00410" > < / a > < span class = "lineno" > 410< / span > w_local[2 * i] = s[0] * (w[i] & 0x0f) + bias;< / div >
< div class = "line" > < a id = "l00411" name = "l00411" > < / a > < span class = "lineno" > 411< / span > w_local[2 * i + 1] = s[1] * (w[i] & 0xf0) + bias;< / div >
< div class = "line" > < a id = "l00412" name = "l00412" > < / a > < span class = "lineno" > 412< / span > }< / div >
< div class = "line" > < a id = "l00413" name = "l00413" > < / a > < span class = "lineno" > 413< / span > }< / div >
< div class = "line" > < a id = "l00414" name = "l00414" > < / a > < span class = "lineno" > 414< / span > < / div >
< div class = "line" > < a id = "l00415" name = "l00415" > < / a > < span class = "lineno" > 415< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 6) {< / div >
< div class = "line" > < a id = "l00416" name = "l00416" > < / a > < span class = "lineno" > 416< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < (N / 4); i++) {< / div >
< div class = "line" > < a id = "l00417" name = "l00417" > < / a > < span class = "lineno" > 417< / span > w_local += 4 * i;< / div >
< div class = "line" > < a id = "l00418" name = "l00418" > < / a > < span class = "lineno" > 418< / span > w += 3 * i;< / div >
< div class = "line" > < a id = "l00419" name = "l00419" > < / a > < span class = "lineno" > 419< / span > < / div >
< div class = "line" > < a id = "l00420" name = "l00420" > < / a > < span class = "lineno" > 420< / span > w_local[0] = (w[0] & 0x3f) * scale + bias;< / div >
< div class = "line" > < a id = "l00421" name = "l00421" > < / a > < span class = "lineno" > 421< / span > w_local[1] = (((w[0] > > 6) & 0x03) + ((w[1] & 0x0f) < < 2)) * scale + bias;< / div >
< div class = "line" > < a id = "l00422" name = "l00422" > < / a > < span class = "lineno" > 422< / span > w_local[2] = (((w[1] > > 4) & 0x0f) + ((w[2] & 0x03) < < 4)) * scale + bias;< / div >
< div class = "line" > < a id = "l00423" name = "l00423" > < / a > < span class = "lineno" > 423< / span > w_local[3] = ((w[2] > > 2) & 0x3f) * scale + bias;< / div >
< div class = "line" > < a id = "l00424" name = "l00424" > < / a > < span class = "lineno" > 424< / span > }< / div >
< div class = "line" > < a id = "l00425" name = "l00425" > < / a > < span class = "lineno" > 425< / span > }< / div >
< div class = "line" > < a id = "l00426" name = "l00426" > < / a > < span class = "lineno" > 426< / span > < / div >
< div class = "line" > < a id = "l00427" name = "l00427" > < / a > < span class = "lineno" > 427< / span > < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 8) {< / div >
< div class = "line" > < a id = "l00428" name = "l00428" > < / a > < span class = "lineno" > 428< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < N; i++) {< / div >
< div class = "line" > < a id = "l00429" name = "l00429" > < / a > < span class = "lineno" > 429< / span > w_local[i] = scale * w[i] + bias;< / div >
< div class = "line" > < a id = "l00430" name = "l00430" > < / a > < span class = "lineno" > 430< / span > }< / div >
< div class = "line" > < a id = "l00431" name = "l00431" > < / a > < span class = "lineno" > 431< / span > }< / div >
< div class = "line" > < a id = "l00432" name = "l00432" > < / a > < span class = "lineno" > 432< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00433" name = "l00433" > < / a > < span class = "lineno" > 433< / span > < / div >
< div class = "line" > < a id = "l00434" name = "l00434" > < / a > < span class = "lineno" > 434< / span > < span class = "keyword" > template< / span > < < / div >
< div class = "line" > < a id = "l00435" name = "l00435" > < / a > < span class = "lineno" > 435< / span > < span class = "keyword" > typename< / span > T,< / div >
< div class = "line" > < a id = "l00436" name = "l00436" > < / a > < span class = "lineno" > 436< / span > < span class = "keywordtype" > short< / span > BROWS,< / div >
< div class = "line" > < a id = "l00437" name = "l00437" > < / a > < span class = "lineno" > 437< / span > < span class = "keywordtype" > short< / span > BCOLS,< / div >
< div class = "line" > < a id = "l00438" name = "l00438" > < / a > < span class = "lineno" > 438< / span > < span class = "keywordtype" > short< / span > dst_ld,< / div >
< div class = "line" > < a id = "l00439" name = "l00439" > < / a > < span class = "lineno" > 439< / span > < span class = "keywordtype" > short< / span > reduction_dim,< / div >
< div class = "line" > < a id = "l00440" name = "l00440" > < / a > < span class = "lineno" > 440< / span > < span class = "keywordtype" > short< / span > tgp_size,< / div >
< div class = "line" > < a id = "l00441" name = "l00441" > < / a > < span class = "lineno" > 441< / span > < span class = "keywordtype" > short< / span > group_size,< / div >
< div class = "line" > < a id = "l00442" name = "l00442" > < / a > < span class = "lineno" > 442< / span > < span class = "keywordtype" > short< / span > < a class = "code hl_function" href = "namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825" > bits< / a > > < / div >
< div class = "foldopen" id = "foldopen00443" data-start = "{" data-end = "};" >
< div class = "line" > < a id = "l00443" name = "l00443" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html" > 443< / a > < / span > < span class = "keyword" > struct < / span > < a class = "code hl_struct" href = "struct_quantized_block_loader.html" > QuantizedBlockLoader< / a > {< / div >
< div class = "line" > < a id = "l00444" name = "l00444" > < / a > < span class = "lineno" > 444< / span > < span class = "keyword" > static_assert< / span > (< / div >
< div class = "line" > < a id = "l00445" name = "l00445" > < / a > < span class = "lineno" > 445< / span > BCOLS < = group_size,< / div >
< div class = "line" > < a id = "l00446" name = "l00446" > < / a > < span class = "lineno" > 446< / span > < span class = "stringliteral" > " The group size should be larger than the columns" < / span > );< / div >
< div class = "line" > < a id = "l00447" name = "l00447" > < / a > < span class = "lineno" > 447< / span > < span class = "keyword" > static_assert< / span > (< / div >
< div class = "line" > < a id = "l00448" name = "l00448" > < / a > < span class = "lineno" > 448< / span > group_size % BCOLS == 0,< / div >
< div class = "line" > < a id = "l00449" name = "l00449" > < / a > < span class = "lineno" > 449< / span > < span class = "stringliteral" > " The group size should be divisible by the columns" < / span > );< / div >
< div class = "line" > < a id = "l00450" name = "l00450" > < / a > < span class = "lineno" > 450< / span > < span class = "keyword" > static_assert< / span > (< / div >
< div class = "line" > < a id = "l00451" name = "l00451" > < / a > < span class = "lineno" > 451< / span > bits == 2 || bits == 3 || bits == 4 || bits == 6 || bits == 8,< / div >
< div class = "line" > < a id = "l00452" name = "l00452" > < / a > < span class = "lineno" > 452< / span > < span class = "stringliteral" > " Template undefined for bits not in {2, 3, 4, 6, 8}" < / span > );< / div >
< div class = "line" > < a id = "l00453" name = "l00453" > < / a > < span class = "lineno" > 453< / span > < / div >
< div class = "line" > < a id = "l00454" name = "l00454" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d" > 454< / a > < / span > < a class = "code hl_define" href = "quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd" > MLX_MTL_CONST< / a > < span class = "keywordtype" > short< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d" > pack_factor< / a > = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;< / div >
< div class = "line" > < a id = "l00455" name = "l00455" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db" > 455< / a > < / span > < a class = "code hl_define" href = "quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd" > MLX_MTL_CONST< / a > < span class = "keywordtype" > short< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db" > bytes_per_pack< / a > = (bits == 3 || bits == 6) ? 3 : 1;< / div >
< div class = "line" > < a id = "l00456" name = "l00456" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb" > 456< / a > < / span > < a class = "code hl_define" href = "quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd" > MLX_MTL_CONST< / a > < span class = "keywordtype" > short< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb" > BCOLS_PACKED< / a > = BCOLS / < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d" > pack_factor< / a > ;< / div >
< div class = "line" > < a id = "l00457" name = "l00457" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb" > 457< / a > < / span > < a class = "code hl_define" href = "quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd" > MLX_MTL_CONST< / a > < span class = "keywordtype" > short< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb" > n_reads< / a > =< / div >
< div class = "line" > < a id = "l00458" name = "l00458" > < / a > < span class = "lineno" > 458< / span > (< a class = "code hl_variable" href = "struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb" > BCOLS_PACKED< / a > * BROWS < tgp_size) ? 1 : (< a class = "code hl_variable" href = "struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb" > BCOLS_PACKED< / a > * BROWS) / tgp_size;< / div >
< div class = "line" > < a id = "l00459" name = "l00459" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba" > 459< / a > < / span > < a class = "code hl_define" href = "quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd" > MLX_MTL_CONST< / a > < span class = "keywordtype" > short< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba" > group_steps< / a > = group_size / BCOLS;< / div >
< div class = "line" > < a id = "l00460" name = "l00460" > < / a > < span class = "lineno" > 460< / span > < / div >
< div class = "line" > < a id = "l00461" name = "l00461" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e" > 461< / a > < / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e" > src_ld< / a > ;< / div >
< div class = "line" > < a id = "l00462" name = "l00462" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320" > 462< / a > < / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320" > tile_stride< / a > ;< / div >
< div class = "line" > < a id = "l00463" name = "l00463" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6" > 463< / a > < / span > < span class = "keywordtype" > short< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6" > group_step_cnt< / a > ;< / div >
< div class = "line" > < a id = "l00464" name = "l00464" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab" > 464< / a > < / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab" > group_stride< / a > ;< / div >
< div class = "line" > < a id = "l00465" name = "l00465" > < / a > < span class = "lineno" > 465< / span > < / div >
< div class = "line" > < a id = "l00466" name = "l00466" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475" > 466< / a > < / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475" > thread_idx< / a > ;< / div >
< div class = "line" > < a id = "l00467" name = "l00467" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906" > 467< / a > < / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906" > bi< / a > ;< / div >
< div class = "line" > < a id = "l00468" name = "l00468" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00" > 468< / a > < / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00" > bj< / a > ;< / div >
< div class = "line" > < a id = "l00469" name = "l00469" > < / a > < span class = "lineno" > 469< / span > < / div >
< div class = "line" > < a id = "l00470" name = "l00470" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83" > 470< / a > < / span > threadgroup T* < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83" > dst< / a > ;< / div >
< div class = "line" > < a id = "l00471" name = "l00471" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76" > 471< / a > < / span > < span class = "keyword" > const< / span > device uint8_t* < a class = "code hl_variable" href = "struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76" > src< / a > ;< / div >
< div class = "line" > < a id = "l00472" name = "l00472" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf" > 472< / a > < / span > < span class = "keyword" > const< / span > device T* < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf" > scales< / a > ;< / div >
< div class = "line" > < a id = "l00473" name = "l00473" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd" > 473< / a > < / span > < span class = "keyword" > const< / span > device T* < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd" > biases< / a > ;< / div >
< div class = "line" > < a id = "l00474" name = "l00474" > < / a > < span class = "lineno" > 474< / span > < / div >
< div class = "foldopen" id = "foldopen00475" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00475" name = "l00475" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a60713ce7498aa683cbb2a0f19ab16589" > 475< / a > < / span > < a class = "code hl_function" href = "struct_quantized_block_loader.html#a60713ce7498aa683cbb2a0f19ab16589" > QuantizedBlockLoader< / a > (< / div >
< div class = "line" > < a id = "l00476" name = "l00476" > < / a > < span class = "lineno" > 476< / span > < span class = "keyword" > const< / span > device uint8_t* src_,< / div >
< div class = "line" > < a id = "l00477" name = "l00477" > < / a > < span class = "lineno" > 477< / span > < span class = "keyword" > const< / span > device T* scales_,< / div >
< div class = "line" > < a id = "l00478" name = "l00478" > < / a > < span class = "lineno" > 478< / span > < span class = "keyword" > const< / span > device T* biases_,< / div >
< div class = "line" > < a id = "l00479" name = "l00479" > < / a > < span class = "lineno" > 479< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > src_ld_,< / div >
< div class = "line" > < a id = "l00480" name = "l00480" > < / a > < span class = "lineno" > 480< / span > threadgroup T* dst_,< / div >
< div class = "line" > < a id = "l00481" name = "l00481" > < / a > < span class = "lineno" > 481< / span > ushort simd_group_id [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l00482" name = "l00482" > < / a > < span class = "lineno" > 482< / span > ushort simd_lane_id [[thread_index_in_simdgroup]])< / div >
< div class = "line" > < a id = "l00483" name = "l00483" > < / a > < span class = "lineno" > 483< / span > : < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e" > src_ld< / a > (src_ld_),< / div >
< div class = "line" > < a id = "l00484" name = "l00484" > < / a > < span class = "lineno" > 484< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320" > tile_stride< / a > (< / div >
< div class = "line" > < a id = "l00485" name = "l00485" > < / a > < span class = "lineno" > 485< / span > reduction_dim ? < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb" > BCOLS_PACKED< / a > * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db" > bytes_per_pack< / a > < / div >
< div class = "line" > < a id = "l00486" name = "l00486" > < / a > < span class = "lineno" > 486< / span > : BROWS * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e" > src_ld< / a > * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db" > bytes_per_pack< / a > / < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d" > pack_factor< / a > ),< / div >
< div class = "line" > < a id = "l00487" name = "l00487" > < / a > < span class = "lineno" > 487< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6" > group_step_cnt< / a > (0),< / div >
< div class = "line" > < a id = "l00488" name = "l00488" > < / a > < span class = "lineno" > 488< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab" > group_stride< / a > (BROWS * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e" > src_ld< / a > / group_size),< / div >
< div class = "line" > < a id = "l00489" name = "l00489" > < / a > < span class = "lineno" > 489< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475" > thread_idx< / a > (simd_group_id * 32 + simd_lane_id),< / div >
< div class = "line" > < a id = "l00490" name = "l00490" > < / a > < span class = "lineno" > 490< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906" > bi< / a > (< a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb" > n_reads< / a > * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475" > thread_idx< / a > / < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb" > BCOLS_PACKED< / a > ),< / div >
< div class = "line" > < a id = "l00491" name = "l00491" > < / a > < span class = "lineno" > 491< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00" > bj< / a > ((< a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb" > n_reads< / a > * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475" > thread_idx< / a > ) % < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb" > BCOLS_PACKED< / a > ),< / div >
< div class = "line" > < a id = "l00492" name = "l00492" > < / a > < span class = "lineno" > 492< / span > dst(dst_ + < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906" > bi< / a > * dst_ld + < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00" > bj< / a > * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d" > pack_factor< / a > ),< / div >
< div class = "line" > < a id = "l00493" name = "l00493" > < / a > < span class = "lineno" > 493< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76" > src< / a > (src_ + < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906" > bi< / a > * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e" > src_ld< / a > * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db" > bytes_per_pack< / a > / < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d" > pack_factor< / a > +< / div >
< div class = "line" > < a id = "l00494" name = "l00494" > < / a > < span class = "lineno" > 494< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00" > bj< / a > * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db" > bytes_per_pack< / a > ),< / div >
< div class = "line" > < a id = "l00495" name = "l00495" > < / a > < span class = "lineno" > 495< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf" > scales< / a > (scales_ + < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906" > bi< / a > * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e" > src_ld< / a > / group_size),< / div >
< div class = "line" > < a id = "l00496" name = "l00496" > < / a > < span class = "lineno" > 496< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd" > biases< / a > (biases_ + < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906" > bi< / a > * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e" > src_ld< / a > / group_size) {}< / div >
2024-10-26 04:23:45 +08:00
< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00497" name = "l00497" > < / a > < span class = "lineno" > 497< / span > < / div >
2024-10-26 04:23:45 +08:00
< div class = "foldopen" id = "foldopen00498" data-start = "{" data-end = "}" >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00498" name = "l00498" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc" > 498< / a > < / span > < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc" > load_unsafe< / a > ()< span class = "keyword" > const < / span > {< / div >
< div class = "line" > < a id = "l00499" name = "l00499" > < / a > < span class = "lineno" > 499< / span > < span class = "keywordflow" > if< / span > (< a class = "code hl_variable" href = "struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb" > BCOLS_PACKED< / a > * BROWS < tgp_size & & bi > = BROWS) {< / div >
< div class = "line" > < a id = "l00500" name = "l00500" > < / a > < span class = "lineno" > 500< / span > < span class = "keywordflow" > return< / span > ;< / div >
< div class = "line" > < a id = "l00501" name = "l00501" > < / a > < span class = "lineno" > 501< / span > }< / div >
< div class = "line" > < a id = "l00502" name = "l00502" > < / a > < span class = "lineno" > 502< / span > < / div >
< div class = "line" > < a id = "l00503" name = "l00503" > < / a > < span class = "lineno" > 503< / span > T scale = *< a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf" > scales< / a > ;< / div >
< div class = "line" > < a id = "l00504" name = "l00504" > < / a > < span class = "lineno" > 504< / span > T bias = *< a class = "code hl_variable" href = "struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd" > biases< / a > ;< / div >
< div class = "line" > < a id = "l00505" name = "l00505" > < / a > < span class = "lineno" > 505< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb" > n_reads< / a > ; i++) {< / div >
< div class = "line" > < a id = "l00506" name = "l00506" > < / a > < span class = "lineno" > 506< / span > < a class = "code hl_function" href = "quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2" > dequantize< T, pack_factor, bits> < / a > (< / div >
< div class = "line" > < a id = "l00507" name = "l00507" > < / a > < span class = "lineno" > 507< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76" > src< / a > + i * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db" > bytes_per_pack< / a > , scale, bias, dst + i * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d" > pack_factor< / a > );< / div >
< div class = "line" > < a id = "l00508" name = "l00508" > < / a > < span class = "lineno" > 508< / span > }< / div >
< div class = "line" > < a id = "l00509" name = "l00509" > < / a > < span class = "lineno" > 509< / span > }< / div >
< / div >
< div class = "line" > < a id = "l00510" name = "l00510" > < / a > < span class = "lineno" > 510< / span > < / div >
< div class = "foldopen" id = "foldopen00511" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00511" name = "l00511" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b" > 511< / a > < / span > < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b" > load_safe< / a > (short2 src_tile_dim)< span class = "keyword" > const < / span > {< / div >
< div class = "line" > < a id = "l00512" name = "l00512" > < / a > < span class = "lineno" > 512< / span > < span class = "keywordflow" > if< / span > (< a class = "code hl_variable" href = "struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb" > BCOLS_PACKED< / a > * BROWS < tgp_size & & bi > = BROWS) {< / div >
< div class = "line" > < a id = "l00513" name = "l00513" > < / a > < span class = "lineno" > 513< / span > < span class = "keywordflow" > return< / span > ;< / div >
< div class = "line" > < a id = "l00514" name = "l00514" > < / a > < span class = "lineno" > 514< / span > }< / div >
< div class = "line" > < a id = "l00515" name = "l00515" > < / a > < span class = "lineno" > 515< / span > < / div >
< div class = "line" > < a id = "l00516" name = "l00516" > < / a > < span class = "lineno" > 516< / span > < span class = "keywordflow" > if< / span > (reduction_dim == 1 & & < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906" > bi< / a > > = src_tile_dim.y) {< / div >
< div class = "line" > < a id = "l00517" name = "l00517" > < / a > < span class = "lineno" > 517< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb" > n_reads< / a > * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d" > pack_factor< / a > ; i++) {< / div >
< div class = "line" > < a id = "l00518" name = "l00518" > < / a > < span class = "lineno" > 518< / span > dst[i] = T(0);< / div >
< div class = "line" > < a id = "l00519" name = "l00519" > < / a > < span class = "lineno" > 519< / span > }< / div >
< div class = "line" > < a id = "l00520" name = "l00520" > < / a > < span class = "lineno" > 520< / span > < span class = "keywordflow" > return< / span > ;< / div >
< div class = "line" > < a id = "l00521" name = "l00521" > < / a > < span class = "lineno" > 521< / span > }< / div >
< div class = "line" > < a id = "l00522" name = "l00522" > < / a > < span class = "lineno" > 522< / span > < / div >
< div class = "line" > < a id = "l00523" name = "l00523" > < / a > < span class = "lineno" > 523< / span > < span class = "keywordflow" > if< / span > (reduction_dim == 0 & & < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906" > bi< / a > > = src_tile_dim.x) {< / div >
< div class = "line" > < a id = "l00524" name = "l00524" > < / a > < span class = "lineno" > 524< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb" > n_reads< / a > * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d" > pack_factor< / a > ; i++) {< / div >
< div class = "line" > < a id = "l00525" name = "l00525" > < / a > < span class = "lineno" > 525< / span > dst[i] = T(0);< / div >
< div class = "line" > < a id = "l00526" name = "l00526" > < / a > < span class = "lineno" > 526< / span > }< / div >
< div class = "line" > < a id = "l00527" name = "l00527" > < / a > < span class = "lineno" > 527< / span > < span class = "keywordflow" > return< / span > ;< / div >
< div class = "line" > < a id = "l00528" name = "l00528" > < / a > < span class = "lineno" > 528< / span > }< / div >
< div class = "line" > < a id = "l00529" name = "l00529" > < / a > < span class = "lineno" > 529< / span > < / div >
< div class = "line" > < a id = "l00530" name = "l00530" > < / a > < span class = "lineno" > 530< / span > T scale = *< a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf" > scales< / a > ;< / div >
< div class = "line" > < a id = "l00531" name = "l00531" > < / a > < span class = "lineno" > 531< / span > T bias = *< a class = "code hl_variable" href = "struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd" > biases< / a > ;< / div >
< div class = "line" > < a id = "l00532" name = "l00532" > < / a > < span class = "lineno" > 532< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb" > n_reads< / a > ; i++) {< / div >
< div class = "line" > < a id = "l00533" name = "l00533" > < / a > < span class = "lineno" > 533< / span > < a class = "code hl_function" href = "quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2" > dequantize< T, pack_factor, bits> < / a > (< / div >
< div class = "line" > < a id = "l00534" name = "l00534" > < / a > < span class = "lineno" > 534< / span > (device uint8_t*)(< a class = "code hl_variable" href = "struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76" > src< / a > + i * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db" > bytes_per_pack< / a > ),< / div >
< div class = "line" > < a id = "l00535" name = "l00535" > < / a > < span class = "lineno" > 535< / span > scale,< / div >
< div class = "line" > < a id = "l00536" name = "l00536" > < / a > < span class = "lineno" > 536< / span > bias,< / div >
< div class = "line" > < a id = "l00537" name = "l00537" > < / a > < span class = "lineno" > 537< / span > dst + i * < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d" > pack_factor< / a > );< / div >
< div class = "line" > < a id = "l00538" name = "l00538" > < / a > < span class = "lineno" > 538< / span > }< / div >
< div class = "line" > < a id = "l00539" name = "l00539" > < / a > < span class = "lineno" > 539< / span > }< / div >
< / div >
< div class = "line" > < a id = "l00540" name = "l00540" > < / a > < span class = "lineno" > 540< / span > < / div >
< div class = "foldopen" id = "foldopen00541" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00541" name = "l00541" > < / a > < span class = "lineno" > < a class = "line" href = "struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9" > 541< / a > < / span > < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9" > next< / a > () {< / div >
< div class = "line" > < a id = "l00542" name = "l00542" > < / a > < span class = "lineno" > 542< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76" > src< / a > += < a class = "code hl_variable" href = "struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320" > tile_stride< / a > ;< / div >
< div class = "line" > < a id = "l00543" name = "l00543" > < / a > < span class = "lineno" > 543< / span > < span class = "keywordflow" > if< / span > (reduction_dim == 1) {< / div >
< div class = "line" > < a id = "l00544" name = "l00544" > < / a > < span class = "lineno" > 544< / span > < span class = "keywordflow" > if< / span > (< a class = "code hl_variable" href = "struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba" > group_steps< / a > > 1) {< / div >
< div class = "line" > < a id = "l00545" name = "l00545" > < / a > < span class = "lineno" > 545< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6" > group_step_cnt< / a > ++;< / div >
< div class = "line" > < a id = "l00546" name = "l00546" > < / a > < span class = "lineno" > 546< / span > < span class = "keywordflow" > if< / span > (< a class = "code hl_variable" href = "struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6" > group_step_cnt< / a > == < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba" > group_steps< / a > ) {< / div >
< div class = "line" > < a id = "l00547" name = "l00547" > < / a > < span class = "lineno" > 547< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6" > group_step_cnt< / a > = 0;< / div >
< div class = "line" > < a id = "l00548" name = "l00548" > < / a > < span class = "lineno" > 548< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf" > scales< / a > ++;< / div >
< div class = "line" > < a id = "l00549" name = "l00549" > < / a > < span class = "lineno" > 549< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd" > biases< / a > ++;< / div >
< div class = "line" > < a id = "l00550" name = "l00550" > < / a > < span class = "lineno" > 550< / span > }< / div >
< div class = "line" > < a id = "l00551" name = "l00551" > < / a > < span class = "lineno" > 551< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l00552" name = "l00552" > < / a > < span class = "lineno" > 552< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf" > scales< / a > ++;< / div >
< div class = "line" > < a id = "l00553" name = "l00553" > < / a > < span class = "lineno" > 553< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd" > biases< / a > ++;< / div >
< div class = "line" > < a id = "l00554" name = "l00554" > < / a > < span class = "lineno" > 554< / span > }< / div >
< div class = "line" > < a id = "l00555" name = "l00555" > < / a > < span class = "lineno" > 555< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l00556" name = "l00556" > < / a > < span class = "lineno" > 556< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf" > scales< / a > += < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab" > group_stride< / a > ;< / div >
< div class = "line" > < a id = "l00557" name = "l00557" > < / a > < span class = "lineno" > 557< / span > < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd" > biases< / a > += < a class = "code hl_variable" href = "struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab" > group_stride< / a > ;< / div >
< div class = "line" > < a id = "l00558" name = "l00558" > < / a > < span class = "lineno" > 558< / span > }< / div >
< div class = "line" > < a id = "l00559" name = "l00559" > < / a > < span class = "lineno" > 559< / span > }< / div >
< / div >
< div class = "line" > < a id = "l00560" name = "l00560" > < / a > < span class = "lineno" > 560< / span > };< / div >
< / div >
< div class = "line" > < a id = "l00561" name = "l00561" > < / a > < span class = "lineno" > 561< / span > < / div >
< div class = "line" > < a id = "l00562" name = "l00562" > < / a > < span class = "lineno" > 562< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, < span class = "keywordtype" > int< / span > group_size, < span class = "keywordtype" > int< / span > bits, < span class = "keywordtype" > int< / span > D> < / div >
< div class = "foldopen" id = "foldopen00563" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00563" name = "l00563" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef" > 563< / a > < / span > METAL_FUNC < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef" > qmv_quad_impl< / a > (< / div >
< div class = "line" > < a id = "l00564" name = "l00564" > < / a > < span class = "lineno" > 564< / span > < span class = "keyword" > const< / span > device uint32_t* w,< / div >
< div class = "line" > < a id = "l00565" name = "l00565" > < / a > < span class = "lineno" > 565< / span > < span class = "keyword" > const< / span > device T* scales,< / div >
< div class = "line" > < a id = "l00566" name = "l00566" > < / a > < span class = "lineno" > 566< / span > < span class = "keyword" > const< / span > device T* biases,< / div >
< div class = "line" > < a id = "l00567" name = "l00567" > < / a > < span class = "lineno" > 567< / span > < span class = "keyword" > const< / span > device T* x,< / div >
< div class = "line" > < a id = "l00568" name = "l00568" > < / a > < span class = "lineno" > 568< / span > device T* y,< / div >
< div class = "line" > < a id = "l00569" name = "l00569" > < / a > < span class = "lineno" > 569< / span > constant < span class = "keywordtype" > int< / span > & in_vec_size,< / div >
< div class = "line" > < a id = "l00570" name = "l00570" > < / a > < span class = "lineno" > 570< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & out_vec_size,< / div >
< div class = "line" > < a id = "l00571" name = "l00571" > < / a > < span class = "lineno" > 571< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l00572" name = "l00572" > < / a > < span class = "lineno" > 572< / span > uint quad_gid [[quadgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l00573" name = "l00573" > < / a > < span class = "lineno" > 573< / span > uint quad_lid [[thread_index_in_quadgroup]]) {< / div >
< div class = "line" > < a id = "l00574" name = "l00574" > < / a > < span class = "lineno" > 574< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > quads_per_simd = < a class = "code hl_variable" href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > SIMD_SIZE< / a > / < a class = "code hl_variable" href = "quantized_8h.html#a803e4d5a1459844ba647aea5b004e133" > QUAD_SIZE< / a > ;< / div >
< div class = "line" > < a id = "l00575" name = "l00575" > < / a > < span class = "lineno" > 575< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > pack_factor = 32 / bits;< / div >
< div class = "line" > < a id = "l00576" name = "l00576" > < / a > < span class = "lineno" > 576< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > values_per_thread = D / < a class = "code hl_variable" href = "quantized_8h.html#a803e4d5a1459844ba647aea5b004e133" > QUAD_SIZE< / a > ;< / div >
< div class = "line" > < a id = "l00577" name = "l00577" > < / a > < span class = "lineno" > 577< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > packs_per_thread = values_per_thread / pack_factor;< / div >
< div class = "line" > < a id = "l00578" name = "l00578" > < / a > < span class = "lineno" > 578< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > scale_step_per_thread = group_size / values_per_thread;< / div >
< div class = "line" > < a id = "l00579" name = "l00579" > < / a > < span class = "lineno" > 579< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > results_per_quadgroup = 8;< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00580" name = "l00580" > < / a > < span class = "lineno" > 580< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00581" name = "l00581" > < / a > < span class = "lineno" > 581< / span > < span class = "keyword" > typedef< / span > < span class = "keywordtype" > float< / span > U;< / div >
< div class = "line" > < a id = "l00582" name = "l00582" > < / a > < span class = "lineno" > 582< / span > < / div >
< div class = "line" > < a id = "l00583" name = "l00583" > < / a > < span class = "lineno" > 583< / span > thread U x_thread[values_per_thread];< / div >
< div class = "line" > < a id = "l00584" name = "l00584" > < / a > < span class = "lineno" > 584< / span > thread U result[results_per_quadgroup] = {0};< / div >
< div class = "line" > < a id = "l00585" name = "l00585" > < / a > < span class = "lineno" > 585< / span > < / div >
< div class = "line" > < a id = "l00586" name = "l00586" > < / a > < span class = "lineno" > 586< / span > < span class = "comment" > // Adjust positions< / span > < / div >
< div class = "line" > < a id = "l00587" name = "l00587" > < / a > < span class = "lineno" > 587< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > in_vec_size_w = in_vec_size / pack_factor;< / div >
< div class = "line" > < a id = "l00588" name = "l00588" > < / a > < span class = "lineno" > 588< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > in_vec_size_g = in_vec_size / group_size;< / div >
< div class = "line" > < a id = "l00589" name = "l00589" > < / a > < span class = "lineno" > 589< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > out_row = tid.x * quads_per_simd * results_per_quadgroup + quad_gid;< / div >
< div class = "line" > < a id = "l00590" name = "l00590" > < / a > < span class = "lineno" > 590< / span > < / div >
< div class = "line" > < a id = "l00591" name = "l00591" > < / a > < span class = "lineno" > 591< / span > w += out_row * in_vec_size_w + quad_lid * packs_per_thread;< / div >
< div class = "line" > < a id = "l00592" name = "l00592" > < / a > < span class = "lineno" > 592< / span > scales += out_row * in_vec_size_g + quad_lid / scale_step_per_thread;< / div >
< div class = "line" > < a id = "l00593" name = "l00593" > < / a > < span class = "lineno" > 593< / span > biases += out_row * in_vec_size_g + quad_lid / scale_step_per_thread;< / div >
< div class = "line" > < a id = "l00594" name = "l00594" > < / a > < span class = "lineno" > 594< / span > x += tid.y * in_vec_size + quad_lid * values_per_thread;< / div >
< div class = "line" > < a id = "l00595" name = "l00595" > < / a > < span class = "lineno" > 595< / span > y += tid.y * out_vec_size + out_row;< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00596" name = "l00596" > < / a > < span class = "lineno" > 596< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00597" name = "l00597" > < / a > < span class = "lineno" > 597< / span > U sum = < a class = "code hl_function" href = "quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9" > load_vector< T, U, values_per_thread, bits> < / a > (x, x_thread);< / div >
< div class = "line" > < a id = "l00598" name = "l00598" > < / a > < span class = "lineno" > 598< / span > < / div >
< div class = "line" > < a id = "l00599" name = "l00599" > < / a > < span class = "lineno" > 599< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > row = 0; row < results_per_quadgroup; row++) {< / div >
< div class = "line" > < a id = "l00600" name = "l00600" > < / a > < span class = "lineno" > 600< / span > < span class = "keyword" > auto< / span > wl = (< span class = "keyword" > const< / span > device uint8_t*)(w + row * in_vec_size_w * quads_per_simd);< / div >
< div class = "line" > < a id = "l00601" name = "l00601" > < / a > < span class = "lineno" > 601< / span > < span class = "keyword" > const< / span > device T* sl = scales + row * in_vec_size_g * quads_per_simd;< / div >
< div class = "line" > < a id = "l00602" name = "l00602" > < / a > < span class = "lineno" > 602< / span > < span class = "keyword" > const< / span > device T* bl = biases + row * in_vec_size_g * quads_per_simd;< / div >
< div class = "line" > < a id = "l00603" name = "l00603" > < / a > < span class = "lineno" > 603< / span > < / div >
< div class = "line" > < a id = "l00604" name = "l00604" > < / a > < span class = "lineno" > 604< / span > U s = sl[0];< / div >
< div class = "line" > < a id = "l00605" name = "l00605" > < / a > < span class = "lineno" > 605< / span > U b = bl[0];< / div >
< div class = "line" > < a id = "l00606" name = "l00606" > < / a > < span class = "lineno" > 606< / span > < span class = "keywordflow" > if< / span > (row * quads_per_simd + out_row < out_vec_size) {< / div >
< div class = "line" > < a id = "l00607" name = "l00607" > < / a > < span class = "lineno" > 607< / span > result[row] += < a class = "code hl_function" href = "quantized_8h.html#ab364d58ab652e3ad87a8f80910556071" > qdot< U, values_per_thread, bits> < / a > (wl, x_thread, s, b, sum);< / div >
< div class = "line" > < a id = "l00608" name = "l00608" > < / a > < span class = "lineno" > 608< / span > }< / div >
< div class = "line" > < a id = "l00609" name = "l00609" > < / a > < span class = "lineno" > 609< / span > }< / div >
< div class = "line" > < a id = "l00610" name = "l00610" > < / a > < span class = "lineno" > 610< / span > < / div >
< div class = "line" > < a id = "l00611" name = "l00611" > < / a > < span class = "lineno" > 611< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > row = 0; row < results_per_quadgroup; row++) {< / div >
< div class = "line" > < a id = "l00612" name = "l00612" > < / a > < span class = "lineno" > 612< / span > result[row] = quad_sum(result[row]);< / div >
< div class = "line" > < a id = "l00613" name = "l00613" > < / a > < span class = "lineno" > 613< / span > < span class = "keywordflow" > if< / span > (quad_lid == 0 & & row * quads_per_simd + out_row < out_vec_size) {< / div >
< div class = "line" > < a id = "l00614" name = "l00614" > < / a > < span class = "lineno" > 614< / span > y[row * quads_per_simd] = < span class = "keyword" > static_cast< < / span > T< span class = "keyword" > > < / span > (result[row]);< / div >
< div class = "line" > < a id = "l00615" name = "l00615" > < / a > < span class = "lineno" > 615< / span > }< / div >
< div class = "line" > < a id = "l00616" name = "l00616" > < / a > < span class = "lineno" > 616< / span > }< / div >
< div class = "line" > < a id = "l00617" name = "l00617" > < / a > < span class = "lineno" > 617< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00618" name = "l00618" > < / a > < span class = "lineno" > 618< / span > < / div >
< div class = "line" > < a id = "l00619" name = "l00619" > < / a > < span class = "lineno" > 619< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, < span class = "keywordtype" > int< / span > group_size, < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "foldopen" id = "foldopen00620" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00620" name = "l00620" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81" > 620< / a > < / span > METAL_FUNC < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81" > qmv_fast_impl< / a > (< / div >
< div class = "line" > < a id = "l00621" name = "l00621" > < / a > < span class = "lineno" > 621< / span > < span class = "keyword" > const< / span > device uint32_t* w,< / div >
< div class = "line" > < a id = "l00622" name = "l00622" > < / a > < span class = "lineno" > 622< / span > < span class = "keyword" > const< / span > device T* scales,< / div >
< div class = "line" > < a id = "l00623" name = "l00623" > < / a > < span class = "lineno" > 623< / span > < span class = "keyword" > const< / span > device T* biases,< / div >
< div class = "line" > < a id = "l00624" name = "l00624" > < / a > < span class = "lineno" > 624< / span > < span class = "keyword" > const< / span > device T* x,< / div >
< div class = "line" > < a id = "l00625" name = "l00625" > < / a > < span class = "lineno" > 625< / span > device T* y,< / div >
< div class = "line" > < a id = "l00626" name = "l00626" > < / a > < span class = "lineno" > 626< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & in_vec_size,< / div >
< div class = "line" > < a id = "l00627" name = "l00627" > < / a > < span class = "lineno" > 627< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & out_vec_size,< / div >
< div class = "line" > < a id = "l00628" name = "l00628" > < / a > < span class = "lineno" > 628< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l00629" name = "l00629" > < / a > < span class = "lineno" > 629< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l00630" name = "l00630" > < / a > < span class = "lineno" > 630< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l00631" name = "l00631" > < / a > < span class = "lineno" > 631< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > power_of_2_bits = (bits & (bits - 1)) == 0;< / div >
< div class = "line" > < a id = "l00632" name = "l00632" > < / a > < span class = "lineno" > 632< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > packs_per_thread = bits == 2 ? 1 : 2;< / div >
< div class = "line" > < a id = "l00633" name = "l00633" > < / a > < span class = "lineno" > 633< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > num_simdgroups = 2;< / div >
< div class = "line" > < a id = "l00634" name = "l00634" > < / a > < span class = "lineno" > 634< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > results_per_simdgroup = 4;< / div >
< div class = "line" > < a id = "l00635" name = "l00635" > < / a > < span class = "lineno" > 635< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 32 / bits;< / div >
< div class = "line" > < a id = "l00636" name = "l00636" > < / a > < span class = "lineno" > 636< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > bytes_per_pack = power_of_2_bits ? 4 : 3;< / div >
< div class = "line" > < a id = "l00637" name = "l00637" > < / a > < span class = "lineno" > 637< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > values_per_thread = pack_factor * packs_per_thread;< / div >
< div class = "line" > < a id = "l00638" name = "l00638" > < / a > < span class = "lineno" > 638< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > block_size = values_per_thread * < a class = "code hl_variable" href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > SIMD_SIZE< / a > ;< / div >
< div class = "line" > < a id = "l00639" name = "l00639" > < / a > < span class = "lineno" > 639< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > scale_step_per_thread = group_size / values_per_thread;< / div >
< div class = "line" > < a id = "l00640" name = "l00640" > < / a > < span class = "lineno" > 640< / span > < / div >
< div class = "line" > < a id = "l00641" name = "l00641" > < / a > < span class = "lineno" > 641< / span > < span class = "keyword" > const< / span > device uint8_t* ws = (< span class = "keyword" > const< / span > device uint8_t*)w;< / div >
< div class = "line" > < a id = "l00642" name = "l00642" > < / a > < span class = "lineno" > 642< / span > < / div >
< div class = "line" > < a id = "l00643" name = "l00643" > < / a > < span class = "lineno" > 643< / span > < span class = "keyword" > typedef< / span > < span class = "keywordtype" > float< / span > U;< / div >
< div class = "line" > < a id = "l00644" name = "l00644" > < / a > < span class = "lineno" > 644< / span > < / div >
< div class = "line" > < a id = "l00645" name = "l00645" > < / a > < span class = "lineno" > 645< / span > thread U x_thread[values_per_thread];< / div >
< div class = "line" > < a id = "l00646" name = "l00646" > < / a > < span class = "lineno" > 646< / span > thread U result[results_per_simdgroup] = {0};< / div >
< div class = "line" > < a id = "l00647" name = "l00647" > < / a > < span class = "lineno" > 647< / span > < / div >
< div class = "line" > < a id = "l00648" name = "l00648" > < / a > < span class = "lineno" > 648< / span > < span class = "comment" > // Adjust positions< / span > < / div >
< div class = "line" > < a id = "l00649" name = "l00649" > < / a > < span class = "lineno" > 649< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > in_vec_size_w = in_vec_size * bytes_per_pack / pack_factor;< / div >
< div class = "line" > < a id = "l00650" name = "l00650" > < / a > < span class = "lineno" > 650< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > in_vec_size_g = in_vec_size / group_size;< / div >
< div class = "line" > < a id = "l00651" name = "l00651" > < / a > < span class = "lineno" > 651< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > out_row = tid.x * (num_simdgroups * results_per_simdgroup) +< / div >
< div class = "line" > < a id = "l00652" name = "l00652" > < / a > < span class = "lineno" > 652< / span > simd_gid * results_per_simdgroup;< / div >
< div class = "line" > < a id = "l00653" name = "l00653" > < / a > < span class = "lineno" > 653< / span > < / div >
< div class = "line" > < a id = "l00654" name = "l00654" > < / a > < span class = "lineno" > 654< / span > ws += out_row * in_vec_size_w + simd_lid * packs_per_thread * bytes_per_pack;< / div >
< div class = "line" > < a id = "l00655" name = "l00655" > < / a > < span class = "lineno" > 655< / span > scales += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;< / div >
< div class = "line" > < a id = "l00656" name = "l00656" > < / a > < span class = "lineno" > 656< / span > biases += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;< / div >
< div class = "line" > < a id = "l00657" name = "l00657" > < / a > < span class = "lineno" > 657< / span > x += tid.y * in_vec_size + simd_lid * values_per_thread;< / div >
< div class = "line" > < a id = "l00658" name = "l00658" > < / a > < span class = "lineno" > 658< / span > y += tid.y * out_vec_size + out_row;< / div >
< div class = "line" > < a id = "l00659" name = "l00659" > < / a > < span class = "lineno" > 659< / span > < / div >
< div class = "line" > < a id = "l00660" name = "l00660" > < / a > < span class = "lineno" > 660< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > k = 0; k < in_vec_size; k += block_size) {< / div >
< div class = "line" > < a id = "l00661" name = "l00661" > < / a > < span class = "lineno" > 661< / span > U sum = < a class = "code hl_function" href = "quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9" > load_vector< T, U, values_per_thread, bits> < / a > (x, x_thread);< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00662" name = "l00662" > < / a > < span class = "lineno" > 662< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00663" name = "l00663" > < / a > < span class = "lineno" > 663< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > row = 0; row < results_per_simdgroup; row++) {< / div >
< div class = "line" > < a id = "l00664" name = "l00664" > < / a > < span class = "lineno" > 664< / span > < span class = "keyword" > auto< / span > wl = (< span class = "keyword" > const< / span > device uint8_t*)(ws + row * in_vec_size_w);< / div >
< div class = "line" > < a id = "l00665" name = "l00665" > < / a > < span class = "lineno" > 665< / span > < span class = "keyword" > const< / span > device T* sl = scales + row * in_vec_size_g;< / div >
< div class = "line" > < a id = "l00666" name = "l00666" > < / a > < span class = "lineno" > 666< / span > < span class = "keyword" > const< / span > device T* bl = biases + row * in_vec_size_g;< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00667" name = "l00667" > < / a > < span class = "lineno" > 667< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00668" name = "l00668" > < / a > < span class = "lineno" > 668< / span > U s = sl[0];< / div >
< div class = "line" > < a id = "l00669" name = "l00669" > < / a > < span class = "lineno" > 669< / span > U b = bl[0];< / div >
< div class = "line" > < a id = "l00670" name = "l00670" > < / a > < span class = "lineno" > 670< / span > result[row] += < a class = "code hl_function" href = "quantized_8h.html#ab364d58ab652e3ad87a8f80910556071" > qdot< U, values_per_thread, bits> < / a > (wl, x_thread, s, b, sum);< / div >
< div class = "line" > < a id = "l00671" name = "l00671" > < / a > < span class = "lineno" > 671< / span > }< / div >
< div class = "line" > < a id = "l00672" name = "l00672" > < / a > < span class = "lineno" > 672< / span > < / div >
< div class = "line" > < a id = "l00673" name = "l00673" > < / a > < span class = "lineno" > 673< / span > ws += block_size * bytes_per_pack / pack_factor;< / div >
< div class = "line" > < a id = "l00674" name = "l00674" > < / a > < span class = "lineno" > 674< / span > scales += block_size / group_size;< / div >
< div class = "line" > < a id = "l00675" name = "l00675" > < / a > < span class = "lineno" > 675< / span > biases += block_size / group_size;< / div >
< div class = "line" > < a id = "l00676" name = "l00676" > < / a > < span class = "lineno" > 676< / span > x += block_size;< / div >
< div class = "line" > < a id = "l00677" name = "l00677" > < / a > < span class = "lineno" > 677< / span > }< / div >
< div class = "line" > < a id = "l00678" name = "l00678" > < / a > < span class = "lineno" > 678< / span > < / div >
< div class = "line" > < a id = "l00679" name = "l00679" > < / a > < span class = "lineno" > 679< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > row = 0; row < results_per_simdgroup; row++) {< / div >
< div class = "line" > < a id = "l00680" name = "l00680" > < / a > < span class = "lineno" > 680< / span > result[row] = < a class = "code hl_function" href = "namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5" > simd_sum< / a > (result[row]);< / div >
< div class = "line" > < a id = "l00681" name = "l00681" > < / a > < span class = "lineno" > 681< / span > < span class = "keywordflow" > if< / span > (simd_lid == 0) {< / div >
< div class = "line" > < a id = "l00682" name = "l00682" > < / a > < span class = "lineno" > 682< / span > y[row] = < span class = "keyword" > static_cast< < / span > T< span class = "keyword" > > < / span > (result[row]);< / div >
< div class = "line" > < a id = "l00683" name = "l00683" > < / a > < span class = "lineno" > 683< / span > }< / div >
< div class = "line" > < a id = "l00684" name = "l00684" > < / a > < span class = "lineno" > 684< / span > }< / div >
< div class = "line" > < a id = "l00685" name = "l00685" > < / a > < span class = "lineno" > 685< / span > }< / div >
< / div >
< div class = "line" > < a id = "l00686" name = "l00686" > < / a > < span class = "lineno" > 686< / span > < / div >
< div class = "line" > < a id = "l00687" name = "l00687" > < / a > < span class = "lineno" > 687< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, < span class = "keywordtype" > int< / span > group_size, < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "foldopen" id = "foldopen00688" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00688" name = "l00688" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd" > 688< / a > < / span > METAL_FUNC < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd" > qmv_impl< / a > (< / div >
< div class = "line" > < a id = "l00689" name = "l00689" > < / a > < span class = "lineno" > 689< / span > < span class = "keyword" > const< / span > device uint32_t* w,< / div >
< div class = "line" > < a id = "l00690" name = "l00690" > < / a > < span class = "lineno" > 690< / span > < span class = "keyword" > const< / span > device T* scales,< / div >
< div class = "line" > < a id = "l00691" name = "l00691" > < / a > < span class = "lineno" > 691< / span > < span class = "keyword" > const< / span > device T* biases,< / div >
< div class = "line" > < a id = "l00692" name = "l00692" > < / a > < span class = "lineno" > 692< / span > < span class = "keyword" > const< / span > device T* x,< / div >
< div class = "line" > < a id = "l00693" name = "l00693" > < / a > < span class = "lineno" > 693< / span > device T* y,< / div >
< div class = "line" > < a id = "l00694" name = "l00694" > < / a > < span class = "lineno" > 694< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & in_vec_size,< / div >
< div class = "line" > < a id = "l00695" name = "l00695" > < / a > < span class = "lineno" > 695< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & out_vec_size,< / div >
< div class = "line" > < a id = "l00696" name = "l00696" > < / a > < span class = "lineno" > 696< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l00697" name = "l00697" > < / a > < span class = "lineno" > 697< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l00698" name = "l00698" > < / a > < span class = "lineno" > 698< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l00699" name = "l00699" > < / a > < span class = "lineno" > 699< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > power_of_2_bits = (bits & (bits - 1)) == 0;< / div >
< div class = "line" > < a id = "l00700" name = "l00700" > < / a > < span class = "lineno" > 700< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > num_simdgroups = 2;< / div >
< div class = "line" > < a id = "l00701" name = "l00701" > < / a > < span class = "lineno" > 701< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > results_per_simdgroup = 4;< / div >
< div class = "line" > < a id = "l00702" name = "l00702" > < / a > < span class = "lineno" > 702< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > packs_per_thread = 1;< / div >
< div class = "line" > < a id = "l00703" name = "l00703" > < / a > < span class = "lineno" > 703< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 32 / bits;< / div >
< div class = "line" > < a id = "l00704" name = "l00704" > < / a > < span class = "lineno" > 704< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > bytes_per_pack = power_of_2_bits ? 4 : 3;< / div >
< div class = "line" > < a id = "l00705" name = "l00705" > < / a > < span class = "lineno" > 705< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > values_per_thread = pack_factor * packs_per_thread;< / div >
< div class = "line" > < a id = "l00706" name = "l00706" > < / a > < span class = "lineno" > 706< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > block_size = values_per_thread * < a class = "code hl_variable" href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > SIMD_SIZE< / a > ;< / div >
< div class = "line" > < a id = "l00707" name = "l00707" > < / a > < span class = "lineno" > 707< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > scale_step_per_thread = group_size / values_per_thread;< / div >
< div class = "line" > < a id = "l00708" name = "l00708" > < / a > < span class = "lineno" > 708< / span > < / div >
< div class = "line" > < a id = "l00709" name = "l00709" > < / a > < span class = "lineno" > 709< / span > < span class = "keyword" > const< / span > device uint8_t* ws = (< span class = "keyword" > const< / span > device uint8_t*)w;< / div >
< div class = "line" > < a id = "l00710" name = "l00710" > < / a > < span class = "lineno" > 710< / span > < / div >
< div class = "line" > < a id = "l00711" name = "l00711" > < / a > < span class = "lineno" > 711< / span > < span class = "keyword" > typedef< / span > < span class = "keywordtype" > float< / span > U;< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00712" name = "l00712" > < / a > < span class = "lineno" > 712< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00713" name = "l00713" > < / a > < span class = "lineno" > 713< / span > thread U x_thread[values_per_thread];< / div >
< div class = "line" > < a id = "l00714" name = "l00714" > < / a > < span class = "lineno" > 714< / span > thread U result[results_per_simdgroup] = {0};< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00715" name = "l00715" > < / a > < span class = "lineno" > 715< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00716" name = "l00716" > < / a > < span class = "lineno" > 716< / span > < span class = "comment" > // Adjust positions< / span > < / div >
< div class = "line" > < a id = "l00717" name = "l00717" > < / a > < span class = "lineno" > 717< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > in_vec_size_w = in_vec_size * bytes_per_pack / pack_factor;< / div >
< div class = "line" > < a id = "l00718" name = "l00718" > < / a > < span class = "lineno" > 718< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > in_vec_size_g = in_vec_size / group_size;< / div >
< div class = "line" > < a id = "l00719" name = "l00719" > < / a > < span class = "lineno" > 719< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > out_row = tid.x * (num_simdgroups * results_per_simdgroup) +< / div >
< div class = "line" > < a id = "l00720" name = "l00720" > < / a > < span class = "lineno" > 720< / span > simd_gid * results_per_simdgroup;< / div >
< div class = "line" > < a id = "l00721" name = "l00721" > < / a > < span class = "lineno" > 721< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > used_out_row = < a class = "code hl_function" href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > min< / a > (out_vec_size - results_per_simdgroup, out_row);< / div >
< div class = "line" > < a id = "l00722" name = "l00722" > < / a > < span class = "lineno" > 722< / span > < / div >
< div class = "line" > < a id = "l00723" name = "l00723" > < / a > < span class = "lineno" > 723< / span > < span class = "keywordflow" > if< / span > (out_row > = out_vec_size) {< / div >
< div class = "line" > < a id = "l00724" name = "l00724" > < / a > < span class = "lineno" > 724< / span > < span class = "keywordflow" > return< / span > ;< / div >
< div class = "line" > < a id = "l00725" name = "l00725" > < / a > < span class = "lineno" > 725< / span > }< / div >
< div class = "line" > < a id = "l00726" name = "l00726" > < / a > < span class = "lineno" > 726< / span > < / div >
< div class = "line" > < a id = "l00727" name = "l00727" > < / a > < span class = "lineno" > 727< / span > < span class = "comment" > // In this case we need to properly guard all our reads because there isn't< / span > < / div >
< div class = "line" > < a id = "l00728" name = "l00728" > < / a > < span class = "lineno" > 728< / span > < span class = "comment" > // even 1 tile in the matrix< / span > < / div >
< div class = "line" > < a id = "l00729" name = "l00729" > < / a > < span class = "lineno" > 729< / span > < span class = "keywordflow" > if< / span > (out_vec_size < (num_simdgroups * results_per_simdgroup)) {< / div >
< div class = "line" > < a id = "l00730" name = "l00730" > < / a > < span class = "lineno" > 730< / span > ws +=< / div >
< div class = "line" > < a id = "l00731" name = "l00731" > < / a > < span class = "lineno" > 731< / span > out_row * in_vec_size_w + simd_lid * packs_per_thread * bytes_per_pack;< / div >
< div class = "line" > < a id = "l00732" name = "l00732" > < / a > < span class = "lineno" > 732< / span > scales += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;< / div >
< div class = "line" > < a id = "l00733" name = "l00733" > < / a > < span class = "lineno" > 733< / span > biases += out_row * in_vec_size_g + simd_lid / scale_step_per_thread;< / div >
< div class = "line" > < a id = "l00734" name = "l00734" > < / a > < span class = "lineno" > 734< / span > x += tid.y * in_vec_size + simd_lid * values_per_thread;< / div >
< div class = "line" > < a id = "l00735" name = "l00735" > < / a > < span class = "lineno" > 735< / span > y += tid.y * out_vec_size + out_row;< / div >
< div class = "line" > < a id = "l00736" name = "l00736" > < / a > < span class = "lineno" > 736< / span > < / div >
< div class = "line" > < a id = "l00737" name = "l00737" > < / a > < span class = "lineno" > 737< / span > < span class = "keywordtype" > int< / span > k = 0;< / div >
< div class = "line" > < a id = "l00738" name = "l00738" > < / a > < span class = "lineno" > 738< / span > < span class = "keywordflow" > for< / span > (; k < in_vec_size - block_size; k += block_size) {< / div >
< div class = "line" > < a id = "l00739" name = "l00739" > < / a > < span class = "lineno" > 739< / span > U sum = < a class = "code hl_function" href = "quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9" > load_vector< T, U, values_per_thread, bits> < / a > (x, x_thread);< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00740" name = "l00740" > < / a > < span class = "lineno" > 740< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00741" name = "l00741" > < / a > < span class = "lineno" > 741< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > row = 0; out_row + row < out_vec_size; row++) {< / div >
< div class = "line" > < a id = "l00742" name = "l00742" > < / a > < span class = "lineno" > 742< / span > < span class = "keyword" > auto< / span > wl = (< span class = "keyword" > const< / span > device uint8_t*)(ws + row * in_vec_size_w);< / div >
< div class = "line" > < a id = "l00743" name = "l00743" > < / a > < span class = "lineno" > 743< / span > < span class = "keyword" > const< / span > device T* sl = scales + row * in_vec_size_g;< / div >
< div class = "line" > < a id = "l00744" name = "l00744" > < / a > < span class = "lineno" > 744< / span > < span class = "keyword" > const< / span > device T* bl = biases + row * in_vec_size_g;< / div >
< div class = "line" > < a id = "l00745" name = "l00745" > < / a > < span class = "lineno" > 745< / span > < / div >
< div class = "line" > < a id = "l00746" name = "l00746" > < / a > < span class = "lineno" > 746< / span > U s = sl[0];< / div >
< div class = "line" > < a id = "l00747" name = "l00747" > < / a > < span class = "lineno" > 747< / span > U b = bl[0];< / div >
< div class = "line" > < a id = "l00748" name = "l00748" > < / a > < span class = "lineno" > 748< / span > result[row] +=< / div >
< div class = "line" > < a id = "l00749" name = "l00749" > < / a > < span class = "lineno" > 749< / span > < a class = "code hl_function" href = "quantized_8h.html#ab364d58ab652e3ad87a8f80910556071" > qdot< U, values_per_thread, bits> < / a > (wl, x_thread, s, b, sum);< / div >
< div class = "line" > < a id = "l00750" name = "l00750" > < / a > < span class = "lineno" > 750< / span > }< / div >
< div class = "line" > < a id = "l00751" name = "l00751" > < / a > < span class = "lineno" > 751< / span > < / div >
< div class = "line" > < a id = "l00752" name = "l00752" > < / a > < span class = "lineno" > 752< / span > ws += block_size * bytes_per_pack / pack_factor;< / div >
< div class = "line" > < a id = "l00753" name = "l00753" > < / a > < span class = "lineno" > 753< / span > scales += block_size / group_size;< / div >
< div class = "line" > < a id = "l00754" name = "l00754" > < / a > < span class = "lineno" > 754< / span > biases += block_size / group_size;< / div >
< div class = "line" > < a id = "l00755" name = "l00755" > < / a > < span class = "lineno" > 755< / span > x += block_size;< / div >
< div class = "line" > < a id = "l00756" name = "l00756" > < / a > < span class = "lineno" > 756< / span > }< / div >
< div class = "line" > < a id = "l00757" name = "l00757" > < / a > < span class = "lineno" > 757< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > remaining = clamp(< / div >
< div class = "line" > < a id = "l00758" name = "l00758" > < / a > < span class = "lineno" > 758< / span > < span class = "keyword" > static_cast< < / span > < span class = "keywordtype" > int< / span > < span class = "keyword" > > < / span > (in_vec_size - k - simd_lid * values_per_thread),< / div >
< div class = "line" > < a id = "l00759" name = "l00759" > < / a > < span class = "lineno" > 759< / span > 0,< / div >
< div class = "line" > < a id = "l00760" name = "l00760" > < / a > < span class = "lineno" > 760< / span > values_per_thread);< / div >
< div class = "line" > < a id = "l00761" name = "l00761" > < / a > < span class = "lineno" > 761< / span > < span class = "keywordflow" > if< / span > (remaining > 0) {< / div >
< div class = "line" > < a id = "l00762" name = "l00762" > < / a > < span class = "lineno" > 762< / span > U sum = < a class = "code hl_function" href = "quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7" > load_vector_safe< T, U, values_per_thread, bits> < / a > (< / div >
< div class = "line" > < a id = "l00763" name = "l00763" > < / a > < span class = "lineno" > 763< / span > x, x_thread, remaining);< / div >
< div class = "line" > < a id = "l00764" name = "l00764" > < / a > < span class = "lineno" > 764< / span > < / div >
< div class = "line" > < a id = "l00765" name = "l00765" > < / a > < span class = "lineno" > 765< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > row = 0; out_row + row < out_vec_size; row++) {< / div >
< div class = "line" > < a id = "l00766" name = "l00766" > < / a > < span class = "lineno" > 766< / span > < span class = "keyword" > auto< / span > wl = (< span class = "keyword" > const< / span > device uint8_t*)(ws + row * in_vec_size_w);< / div >
< div class = "line" > < a id = "l00767" name = "l00767" > < / a > < span class = "lineno" > 767< / span > < span class = "keyword" > const< / span > device T* sl = scales + row * in_vec_size_g;< / div >
< div class = "line" > < a id = "l00768" name = "l00768" > < / a > < span class = "lineno" > 768< / span > < span class = "keyword" > const< / span > device T* bl = biases + row * in_vec_size_g;< / div >
< div class = "line" > < a id = "l00769" name = "l00769" > < / a > < span class = "lineno" > 769< / span > < / div >
< div class = "line" > < a id = "l00770" name = "l00770" > < / a > < span class = "lineno" > 770< / span > U s = sl[0];< / div >
< div class = "line" > < a id = "l00771" name = "l00771" > < / a > < span class = "lineno" > 771< / span > U b = bl[0];< / div >
< div class = "line" > < a id = "l00772" name = "l00772" > < / a > < span class = "lineno" > 772< / span > result[row] +=< / div >
< div class = "line" > < a id = "l00773" name = "l00773" > < / a > < span class = "lineno" > 773< / span > < a class = "code hl_function" href = "quantized_8h.html#ab364d58ab652e3ad87a8f80910556071" > qdot< U, values_per_thread, bits> < / a > (wl, x_thread, s, b, sum);< / div >
< div class = "line" > < a id = "l00774" name = "l00774" > < / a > < span class = "lineno" > 774< / span > }< / div >
< div class = "line" > < a id = "l00775" name = "l00775" > < / a > < span class = "lineno" > 775< / span > }< / div >
< div class = "line" > < a id = "l00776" name = "l00776" > < / a > < span class = "lineno" > 776< / span > < / div >
< div class = "line" > < a id = "l00777" name = "l00777" > < / a > < span class = "lineno" > 777< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > row = 0; out_row + row < out_vec_size; row++) {< / div >
< div class = "line" > < a id = "l00778" name = "l00778" > < / a > < span class = "lineno" > 778< / span > result[row] = < a class = "code hl_function" href = "namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5" > simd_sum< / a > (result[row]);< / div >
< div class = "line" > < a id = "l00779" name = "l00779" > < / a > < span class = "lineno" > 779< / span > < span class = "keywordflow" > if< / span > (simd_lid == 0) {< / div >
< div class = "line" > < a id = "l00780" name = "l00780" > < / a > < span class = "lineno" > 780< / span > y[row] = < span class = "keyword" > static_cast< < / span > T< span class = "keyword" > > < / span > (result[row]);< / div >
< div class = "line" > < a id = "l00781" name = "l00781" > < / a > < span class = "lineno" > 781< / span > }< / div >
< div class = "line" > < a id = "l00782" name = "l00782" > < / a > < span class = "lineno" > 782< / span > }< / div >
< div class = "line" > < a id = "l00783" name = "l00783" > < / a > < span class = "lineno" > 783< / span > }< / div >
< div class = "line" > < a id = "l00784" name = "l00784" > < / a > < span class = "lineno" > 784< / span > < / div >
< div class = "line" > < a id = "l00785" name = "l00785" > < / a > < span class = "lineno" > 785< / span > < span class = "comment" > // In this case the last tile is moved back to redo some output values< / span > < / div >
< div class = "line" > < a id = "l00786" name = "l00786" > < / a > < span class = "lineno" > 786< / span > < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l00787" name = "l00787" > < / a > < span class = "lineno" > 787< / span > ws += used_out_row * in_vec_size_w +< / div >
< div class = "line" > < a id = "l00788" name = "l00788" > < / a > < span class = "lineno" > 788< / span > simd_lid * packs_per_thread * bytes_per_pack;< / div >
< div class = "line" > < a id = "l00789" name = "l00789" > < / a > < span class = "lineno" > 789< / span > scales += used_out_row * in_vec_size_g + simd_lid / scale_step_per_thread;< / div >
< div class = "line" > < a id = "l00790" name = "l00790" > < / a > < span class = "lineno" > 790< / span > biases += used_out_row * in_vec_size_g + simd_lid / scale_step_per_thread;< / div >
< div class = "line" > < a id = "l00791" name = "l00791" > < / a > < span class = "lineno" > 791< / span > x += tid.y * in_vec_size + simd_lid * values_per_thread;< / div >
< div class = "line" > < a id = "l00792" name = "l00792" > < / a > < span class = "lineno" > 792< / span > y += tid.y * out_vec_size + used_out_row;< / div >
< div class = "line" > < a id = "l00793" name = "l00793" > < / a > < span class = "lineno" > 793< / span > < / div >
< div class = "line" > < a id = "l00794" name = "l00794" > < / a > < span class = "lineno" > 794< / span > < span class = "keywordtype" > int< / span > k = 0;< / div >
< div class = "line" > < a id = "l00795" name = "l00795" > < / a > < span class = "lineno" > 795< / span > < span class = "keywordflow" > for< / span > (; k < in_vec_size - block_size; k += block_size) {< / div >
< div class = "line" > < a id = "l00796" name = "l00796" > < / a > < span class = "lineno" > 796< / span > U sum = < a class = "code hl_function" href = "quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9" > load_vector< T, U, values_per_thread, bits> < / a > (x, x_thread);< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00797" name = "l00797" > < / a > < span class = "lineno" > 797< / span > < / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00798" name = "l00798" > < / a > < span class = "lineno" > 798< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > row = 0; row < results_per_simdgroup; row++) {< / div >
< div class = "line" > < a id = "l00799" name = "l00799" > < / a > < span class = "lineno" > 799< / span > < span class = "keyword" > auto< / span > wl = (< span class = "keyword" > const< / span > device uint8_t*)(ws + row * in_vec_size_w);< / div >
< div class = "line" > < a id = "l00800" name = "l00800" > < / a > < span class = "lineno" > 800< / span > < span class = "keyword" > const< / span > device T* sl = scales + row * in_vec_size_g;< / div >
< div class = "line" > < a id = "l00801" name = "l00801" > < / a > < span class = "lineno" > 801< / span > < span class = "keyword" > const< / span > device T* bl = biases + row * in_vec_size_g;< / div >
< div class = "line" > < a id = "l00802" name = "l00802" > < / a > < span class = "lineno" > 802< / span > < / div >
< div class = "line" > < a id = "l00803" name = "l00803" > < / a > < span class = "lineno" > 803< / span > U s = sl[0];< / div >
< div class = "line" > < a id = "l00804" name = "l00804" > < / a > < span class = "lineno" > 804< / span > U b = bl[0];< / div >
< div class = "line" > < a id = "l00805" name = "l00805" > < / a > < span class = "lineno" > 805< / span > result[row] +=< / div >
< div class = "line" > < a id = "l00806" name = "l00806" > < / a > < span class = "lineno" > 806< / span > < a class = "code hl_function" href = "quantized_8h.html#ab364d58ab652e3ad87a8f80910556071" > qdot< U, values_per_thread, bits> < / a > (wl, x_thread, s, b, sum);< / div >
< div class = "line" > < a id = "l00807" name = "l00807" > < / a > < span class = "lineno" > 807< / span > }< / div >
< div class = "line" > < a id = "l00808" name = "l00808" > < / a > < span class = "lineno" > 808< / span > < / div >
< div class = "line" > < a id = "l00809" name = "l00809" > < / a > < span class = "lineno" > 809< / span > ws += block_size * bytes_per_pack / pack_factor;< / div >
< div class = "line" > < a id = "l00810" name = "l00810" > < / a > < span class = "lineno" > 810< / span > scales += block_size / group_size;< / div >
< div class = "line" > < a id = "l00811" name = "l00811" > < / a > < span class = "lineno" > 811< / span > biases += block_size / group_size;< / div >
< div class = "line" > < a id = "l00812" name = "l00812" > < / a > < span class = "lineno" > 812< / span > x += block_size;< / div >
< div class = "line" > < a id = "l00813" name = "l00813" > < / a > < span class = "lineno" > 813< / span > }< / div >
< div class = "line" > < a id = "l00814" name = "l00814" > < / a > < span class = "lineno" > 814< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > remaining = clamp(< / div >
< div class = "line" > < a id = "l00815" name = "l00815" > < / a > < span class = "lineno" > 815< / span > < span class = "keyword" > static_cast< < / span > < span class = "keywordtype" > int< / span > < span class = "keyword" > > < / span > (in_vec_size - k - simd_lid * values_per_thread),< / div >
< div class = "line" > < a id = "l00816" name = "l00816" > < / a > < span class = "lineno" > 816< / span > 0,< / div >
< div class = "line" > < a id = "l00817" name = "l00817" > < / a > < span class = "lineno" > 817< / span > values_per_thread);< / div >
< div class = "line" > < a id = "l00818" name = "l00818" > < / a > < span class = "lineno" > 818< / span > < span class = "keywordflow" > if< / span > (remaining > 0) {< / div >
< div class = "line" > < a id = "l00819" name = "l00819" > < / a > < span class = "lineno" > 819< / span > U sum = < a class = "code hl_function" href = "quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7" > load_vector_safe< T, U, values_per_thread, bits> < / a > (< / div >
< div class = "line" > < a id = "l00820" name = "l00820" > < / a > < span class = "lineno" > 820< / span > x, x_thread, remaining);< / div >
< div class = "line" > < a id = "l00821" name = "l00821" > < / a > < span class = "lineno" > 821< / span > < / div >
< div class = "line" > < a id = "l00822" name = "l00822" > < / a > < span class = "lineno" > 822< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > row = 0; row < results_per_simdgroup; row++) {< / div >
< div class = "line" > < a id = "l00823" name = "l00823" > < / a > < span class = "lineno" > 823< / span > < span class = "keyword" > auto< / span > wl = (< span class = "keyword" > const< / span > device uint8_t*)(ws + row * in_vec_size_w);< / div >
< div class = "line" > < a id = "l00824" name = "l00824" > < / a > < span class = "lineno" > 824< / span > < span class = "keyword" > const< / span > device T* sl = scales + row * in_vec_size_g;< / div >
< div class = "line" > < a id = "l00825" name = "l00825" > < / a > < span class = "lineno" > 825< / span > < span class = "keyword" > const< / span > device T* bl = biases + row * in_vec_size_g;< / div >
< div class = "line" > < a id = "l00826" name = "l00826" > < / a > < span class = "lineno" > 826< / span > < / div >
< div class = "line" > < a id = "l00827" name = "l00827" > < / a > < span class = "lineno" > 827< / span > U s = sl[0];< / div >
< div class = "line" > < a id = "l00828" name = "l00828" > < / a > < span class = "lineno" > 828< / span > U b = bl[0];< / div >
< div class = "line" > < a id = "l00829" name = "l00829" > < / a > < span class = "lineno" > 829< / span > result[row] += < a class = "code hl_function" href = "quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42" > qdot_safe< U, values_per_thread, bits> < / a > (< / div >
< div class = "line" > < a id = "l00830" name = "l00830" > < / a > < span class = "lineno" > 830< / span > wl, x_thread, s, b, sum, remaining);< / div >
< div class = "line" > < a id = "l00831" name = "l00831" > < / a > < span class = "lineno" > 831< / span > }< / div >
< div class = "line" > < a id = "l00832" name = "l00832" > < / a > < span class = "lineno" > 832< / span > }< / div >
< div class = "line" > < a id = "l00833" name = "l00833" > < / a > < span class = "lineno" > 833< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > row = 0; row < results_per_simdgroup; row++) {< / div >
< div class = "line" > < a id = "l00834" name = "l00834" > < / a > < span class = "lineno" > 834< / span > result[row] = < a class = "code hl_function" href = "namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5" > simd_sum< / a > (result[row]);< / div >
< div class = "line" > < a id = "l00835" name = "l00835" > < / a > < span class = "lineno" > 835< / span > < span class = "keywordflow" > if< / span > (simd_lid == 0) {< / div >
< div class = "line" > < a id = "l00836" name = "l00836" > < / a > < span class = "lineno" > 836< / span > y[row] = < span class = "keyword" > static_cast< < / span > T< span class = "keyword" > > < / span > (result[row]);< / div >
2024-10-26 04:23:45 +08:00
< div class = "line" > < a id = "l00837" name = "l00837" > < / a > < span class = "lineno" > 837< / span > }< / div >
< div class = "line" > < a id = "l00838" name = "l00838" > < / a > < span class = "lineno" > 838< / span > }< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00839" name = "l00839" > < / a > < span class = "lineno" > 839< / span > }< / div >
< div class = "line" > < a id = "l00840" name = "l00840" > < / a > < span class = "lineno" > 840< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00841" name = "l00841" > < / a > < span class = "lineno" > 841< / span > < / div >
< div class = "line" > < a id = "l00842" name = "l00842" > < / a > < span class = "lineno" > 842< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, const < span class = "keywordtype" > int< / span > group_size, const < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "foldopen" id = "foldopen00843" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00843" name = "l00843" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a" > 843< / a > < / span > METAL_FUNC < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a" > qvm_impl< / a > (< / div >
< div class = "line" > < a id = "l00844" name = "l00844" > < / a > < span class = "lineno" > 844< / span > < span class = "keyword" > const< / span > device uint32_t* w,< / div >
< div class = "line" > < a id = "l00845" name = "l00845" > < / a > < span class = "lineno" > 845< / span > < span class = "keyword" > const< / span > device T* scales,< / div >
< div class = "line" > < a id = "l00846" name = "l00846" > < / a > < span class = "lineno" > 846< / span > < span class = "keyword" > const< / span > device T* biases,< / div >
< div class = "line" > < a id = "l00847" name = "l00847" > < / a > < span class = "lineno" > 847< / span > < span class = "keyword" > const< / span > device T* x,< / div >
< div class = "line" > < a id = "l00848" name = "l00848" > < / a > < span class = "lineno" > 848< / span > device T* y,< / div >
< div class = "line" > < a id = "l00849" name = "l00849" > < / a > < span class = "lineno" > 849< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > in_vec_size,< / div >
< div class = "line" > < a id = "l00850" name = "l00850" > < / a > < span class = "lineno" > 850< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l00851" name = "l00851" > < / a > < span class = "lineno" > 851< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l00852" name = "l00852" > < / a > < span class = "lineno" > 852< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l00853" name = "l00853" > < / a > < span class = "lineno" > 853< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l00854" name = "l00854" > < / a > < span class = "lineno" > 854< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > power_of_2_bits = (bits & (bits - 1)) == 0;< / div >
< div class = "line" > < a id = "l00855" name = "l00855" > < / a > < span class = "lineno" > 855< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > num_simdgroups = 2;< / div >
< div class = "line" > < a id = "l00856" name = "l00856" > < / a > < span class = "lineno" > 856< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 32 / bits;< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l00857" name = "l00857" > < / a > < span class = "lineno" > 857< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > bytes_per_pack = power_of_2_bits ? 1 : 3;< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00858" name = "l00858" > < / a > < span class = "lineno" > 858< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > tn = 32 / pack_factor;< / div >
< div class = "line" > < a id = "l00859" name = "l00859" > < / a > < span class = "lineno" > 859< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > block_size = < a class = "code hl_variable" href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > SIMD_SIZE< / a > ;< / div >
< div class = "line" > < a id = "l00860" name = "l00860" > < / a > < span class = "lineno" > 860< / span > < / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l00861" name = "l00861" > < / a > < span class = "lineno" > 861< / span > < span class = "keyword" > using < / span > W_T =< / div >
< div class = "line" > < a id = "l00862" name = "l00862" > < / a > < span class = "lineno" > 862< / span > < span class = "keyword" > typename< / span > < a class = "code hl_typedef" href = "struct_conditional_type.html#a00bac71c43763817c4422bf0363dc92b" > ConditionalType< power_of_2_bits, uint32_t, uint8_t> ::type< / a > ;< / div >
< div class = "line" > < a id = "l00863" name = "l00863" > < / a > < span class = "lineno" > 863< / span > < span class = "keyword" > const< / span > device W_T* ws = (< span class = "keyword" > const< / span > device W_T*)w;< / div >
< div class = "line" > < a id = "l00864" name = "l00864" > < / a > < span class = "lineno" > 864< / span > < / div >
< div class = "line" > < a id = "l00865" name = "l00865" > < / a > < span class = "lineno" > 865< / span > < span class = "keyword" > typedef< / span > < span class = "keywordtype" > float< / span > U;< / div >
< div class = "line" > < a id = "l00866" name = "l00866" > < / a > < span class = "lineno" > 866< / span > < span class = "keyword" > typedef< / span > < span class = "keyword" > struct < / span > {< / div >
< div class = "line" > < a id = "l00867" name = "l00867" > < / a > < span class = "lineno" > 867< / span > W_T wi[tn * bytes_per_pack];< / div >
< div class = "line" > < a id = "l00868" name = "l00868" > < / a > < span class = "lineno" > 868< / span > } vec_w;< / div >
< div class = "line" > < a id = "l00869" name = "l00869" > < / a > < span class = "lineno" > 869< / span > < / div >
< div class = "line" > < a id = "l00870" name = "l00870" > < / a > < span class = "lineno" > 870< / span > thread vec_w w_local;< / div >
< div class = "line" > < a id = "l00871" name = "l00871" > < / a > < span class = "lineno" > 871< / span > thread U result[tn * pack_factor] = {0};< / div >
< div class = "line" > < a id = "l00872" name = "l00872" > < / a > < span class = "lineno" > 872< / span > thread U scale = 1;< / div >
< div class = "line" > < a id = "l00873" name = "l00873" > < / a > < span class = "lineno" > 873< / span > thread U bias = 0;< / div >
< div class = "line" > < a id = "l00874" name = "l00874" > < / a > < span class = "lineno" > 874< / span > thread U x_local = 0;< / div >
< div class = "line" > < a id = "l00875" name = "l00875" > < / a > < span class = "lineno" > 875< / span > < / div >
< div class = "line" > < a id = "l00876" name = "l00876" > < / a > < span class = "lineno" > 876< / span > < span class = "comment" > // Adjust positions< / span > < / div >
< div class = "line" > < a id = "l00877" name = "l00877" > < / a > < span class = "lineno" > 877< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > out_vec_size_w = out_vec_size * bytes_per_pack / pack_factor;< / div >
< div class = "line" > < a id = "l00878" name = "l00878" > < / a > < span class = "lineno" > 878< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > out_vec_size_g = out_vec_size / group_size;< / div >
< div class = "line" > < a id = "l00879" name = "l00879" > < / a > < span class = "lineno" > 879< / span > < span class = "keywordtype" > int< / span > out_col = pack_factor * tn * (tid.x * num_simdgroups + simd_gid);< / div >
< div class = "line" > < a id = "l00880" name = "l00880" > < / a > < span class = "lineno" > 880< / span > ws += out_col * bytes_per_pack / pack_factor + simd_lid * out_vec_size_w;< / div >
< div class = "line" > < a id = "l00881" name = "l00881" > < / a > < span class = "lineno" > 881< / span > scales += out_col / group_size + simd_lid * out_vec_size_g;< / div >
< div class = "line" > < a id = "l00882" name = "l00882" > < / a > < span class = "lineno" > 882< / span > biases += out_col / group_size + simd_lid * out_vec_size_g;< / div >
< div class = "line" > < a id = "l00883" name = "l00883" > < / a > < span class = "lineno" > 883< / span > x += tid.y * in_vec_size + simd_lid;< / div >
< div class = "line" > < a id = "l00884" name = "l00884" > < / a > < span class = "lineno" > 884< / span > y += tid.y * out_vec_size + out_col;< / div >
< div class = "line" > < a id = "l00885" name = "l00885" > < / a > < span class = "lineno" > 885< / span > < / div >
< div class = "line" > < a id = "l00886" name = "l00886" > < / a > < span class = "lineno" > 886< / span > < span class = "keywordflow" > if< / span > (out_col > = out_vec_size) {< / div >
< div class = "line" > < a id = "l00887" name = "l00887" > < / a > < span class = "lineno" > 887< / span > < span class = "keywordflow" > return< / span > ;< / div >
< div class = "line" > < a id = "l00888" name = "l00888" > < / a > < span class = "lineno" > 888< / span > }< / div >
< div class = "line" > < a id = "l00889" name = "l00889" > < / a > < span class = "lineno" > 889< / span > < / div >
< div class = "line" > < a id = "l00890" name = "l00890" > < / a > < span class = "lineno" > 890< / span > < span class = "comment" > // Loop over in_vec in blocks of block_size< / span > < / div >
< div class = "line" > < a id = "l00891" name = "l00891" > < / a > < span class = "lineno" > 891< / span > < span class = "keywordtype" > int< / span > remaining = in_vec_size % block_size;< / div >
< div class = "line" > < a id = "l00892" name = "l00892" > < / a > < span class = "lineno" > 892< / span > < span class = "keywordflow" > if< / span > (remaining == 0) {< / div >
< div class = "line" > < a id = "l00893" name = "l00893" > < / a > < span class = "lineno" > 893< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < in_vec_size; i += block_size) {< / div >
< div class = "line" > < a id = "l00894" name = "l00894" > < / a > < span class = "lineno" > 894< / span > x_local = *x;< / div >
< div class = "line" > < a id = "l00895" name = "l00895" > < / a > < span class = "lineno" > 895< / span > scale = *scales;< / div >
< div class = "line" > < a id = "l00896" name = "l00896" > < / a > < span class = "lineno" > 896< / span > bias = *biases;< / div >
< div class = "line" > < a id = "l00897" name = "l00897" > < / a > < span class = "lineno" > 897< / span > w_local = *((device vec_w*)ws);< / div >
< div class = "line" > < a id = "l00898" name = "l00898" > < / a > < span class = "lineno" > 898< / span > < a class = "code hl_function" href = "quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58" > qouter< U, tn * pack_factor, bits> < / a > (< / div >
< div class = "line" > < a id = "l00899" name = "l00899" > < / a > < span class = "lineno" > 899< / span > (thread uint8_t*)& w_local, x_local, scale, bias, result);< / div >
< div class = "line" > < a id = "l00900" name = "l00900" > < / a > < span class = "lineno" > 900< / span > < / div >
< div class = "line" > < a id = "l00901" name = "l00901" > < / a > < span class = "lineno" > 901< / span > x += block_size;< / div >
< div class = "line" > < a id = "l00902" name = "l00902" > < / a > < span class = "lineno" > 902< / span > scales += block_size * out_vec_size_g;< / div >
< div class = "line" > < a id = "l00903" name = "l00903" > < / a > < span class = "lineno" > 903< / span > biases += block_size * out_vec_size_g;< / div >
< div class = "line" > < a id = "l00904" name = "l00904" > < / a > < span class = "lineno" > 904< / span > ws += block_size * out_vec_size_w;< / div >
< div class = "line" > < a id = "l00905" name = "l00905" > < / a > < span class = "lineno" > 905< / span > }< / div >
< div class = "line" > < a id = "l00906" name = "l00906" > < / a > < span class = "lineno" > 906< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l00907" name = "l00907" > < / a > < span class = "lineno" > 907< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = block_size; i < in_vec_size; i += block_size) {< / div >
< div class = "line" > < a id = "l00908" name = "l00908" > < / a > < span class = "lineno" > 908< / span > x_local = *x;< / div >
< div class = "line" > < a id = "l00909" name = "l00909" > < / a > < span class = "lineno" > 909< / span > scale = *scales;< / div >
< div class = "line" > < a id = "l00910" name = "l00910" > < / a > < span class = "lineno" > 910< / span > bias = *biases;< / div >
< div class = "line" > < a id = "l00911" name = "l00911" > < / a > < span class = "lineno" > 911< / span > w_local = *((device vec_w*)ws);< / div >
< div class = "line" > < a id = "l00912" name = "l00912" > < / a > < span class = "lineno" > 912< / span > < / div >
< div class = "line" > < a id = "l00913" name = "l00913" > < / a > < span class = "lineno" > 913< / span > < a class = "code hl_function" href = "quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58" > qouter< U, tn * pack_factor, bits> < / a > (< / div >
< div class = "line" > < a id = "l00914" name = "l00914" > < / a > < span class = "lineno" > 914< / span > (thread uint8_t*)& w_local, x_local, scale, bias, result);< / div >
< div class = "line" > < a id = "l00915" name = "l00915" > < / a > < span class = "lineno" > 915< / span > < / div >
< div class = "line" > < a id = "l00916" name = "l00916" > < / a > < span class = "lineno" > 916< / span > x += block_size;< / div >
< div class = "line" > < a id = "l00917" name = "l00917" > < / a > < span class = "lineno" > 917< / span > scales += block_size * out_vec_size_g;< / div >
< div class = "line" > < a id = "l00918" name = "l00918" > < / a > < span class = "lineno" > 918< / span > biases += block_size * out_vec_size_g;< / div >
< div class = "line" > < a id = "l00919" name = "l00919" > < / a > < span class = "lineno" > 919< / span > ws += block_size * out_vec_size_w;< / div >
< div class = "line" > < a id = "l00920" name = "l00920" > < / a > < span class = "lineno" > 920< / span > }< / div >
< div class = "line" > < a id = "l00921" name = "l00921" > < / a > < span class = "lineno" > 921< / span > < span class = "keywordflow" > if< / span > (< span class = "keyword" > static_cast< < / span > < span class = "keywordtype" > int< / span > < span class = "keyword" > > < / span > (simd_lid) < remaining) {< / div >
< div class = "line" > < a id = "l00922" name = "l00922" > < / a > < span class = "lineno" > 922< / span > x_local = *x;< / div >
< div class = "line" > < a id = "l00923" name = "l00923" > < / a > < span class = "lineno" > 923< / span > scale = *scales;< / div >
< div class = "line" > < a id = "l00924" name = "l00924" > < / a > < span class = "lineno" > 924< / span > bias = *biases;< / div >
< div class = "line" > < a id = "l00925" name = "l00925" > < / a > < span class = "lineno" > 925< / span > w_local = *((device vec_w*)ws);< / div >
< div class = "line" > < a id = "l00926" name = "l00926" > < / a > < span class = "lineno" > 926< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l00927" name = "l00927" > < / a > < span class = "lineno" > 927< / span > x_local = 0;< / div >
< div class = "line" > < a id = "l00928" name = "l00928" > < / a > < span class = "lineno" > 928< / span > scale = 0;< / div >
< div class = "line" > < a id = "l00929" name = "l00929" > < / a > < span class = "lineno" > 929< / span > bias = 0;< / div >
< div class = "line" > < a id = "l00930" name = "l00930" > < / a > < span class = "lineno" > 930< / span > }< / div >
< div class = "line" > < a id = "l00931" name = "l00931" > < / a > < span class = "lineno" > 931< / span > < a class = "code hl_function" href = "quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58" > qouter< U, tn * pack_factor, bits> < / a > (< / div >
< div class = "line" > < a id = "l00932" name = "l00932" > < / a > < span class = "lineno" > 932< / span > (thread uint8_t*)& w_local, x_local, scale, bias, result);< / div >
< div class = "line" > < a id = "l00933" name = "l00933" > < / a > < span class = "lineno" > 933< / span > }< / div >
< div class = "line" > < a id = "l00934" name = "l00934" > < / a > < span class = "lineno" > 934< / span > < / div >
< div class = "line" > < a id = "l00935" name = "l00935" > < / a > < span class = "lineno" > 935< / span > < span class = "comment" > // Accumulate in the simdgroup< / span > < / div >
< div class = "line" > < a id = "l00936" name = "l00936" > < / a > < span class = "lineno" > 936< / span > < span class = "preprocessor" > #pragma clang loop unroll(full)< / span > < / div >
< div class = "line" > < a id = "l00937" name = "l00937" > < / a > < span class = "lineno" > 937< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > k = 0; k < tn * pack_factor; k++) {< / div >
< div class = "line" > < a id = "l00938" name = "l00938" > < / a > < span class = "lineno" > 938< / span > result[k] = < a class = "code hl_function" href = "namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5" > simd_sum< / a > (result[k]);< / div >
< div class = "line" > < a id = "l00939" name = "l00939" > < / a > < span class = "lineno" > 939< / span > }< / div >
< div class = "line" > < a id = "l00940" name = "l00940" > < / a > < span class = "lineno" > 940< / span > < / div >
< div class = "line" > < a id = "l00941" name = "l00941" > < / a > < span class = "lineno" > 941< / span > < span class = "comment" > // Store the result< / span > < / div >
< div class = "line" > < a id = "l00942" name = "l00942" > < / a > < span class = "lineno" > 942< / span > < span class = "keywordflow" > if< / span > (simd_lid == 0) {< / div >
< div class = "line" > < a id = "l00943" name = "l00943" > < / a > < span class = "lineno" > 943< / span > < span class = "preprocessor" > #pragma clang loop unroll(full)< / span > < / div >
< div class = "line" > < a id = "l00944" name = "l00944" > < / a > < span class = "lineno" > 944< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > k = 0; k < tn * pack_factor; k++) {< / div >
< div class = "line" > < a id = "l00945" name = "l00945" > < / a > < span class = "lineno" > 945< / span > y[k] = < span class = "keyword" > static_cast< < / span > T< span class = "keyword" > > < / span > (result[k]);< / div >
< div class = "line" > < a id = "l00946" name = "l00946" > < / a > < span class = "lineno" > 946< / span > }< / div >
< div class = "line" > < a id = "l00947" name = "l00947" > < / a > < span class = "lineno" > 947< / span > }< / div >
< div class = "line" > < a id = "l00948" name = "l00948" > < / a > < span class = "lineno" > 948< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l00949" name = "l00949" > < / a > < span class = "lineno" > 949< / span > < / div >
< div class = "line" > < a id = "l00950" name = "l00950" > < / a > < span class = "lineno" > 950< / span > < span class = "keyword" > template< / span > < < / div >
< div class = "line" > < a id = "l00951" name = "l00951" > < / a > < span class = "lineno" > 951< / span > < span class = "keyword" > typename< / span > T,< / div >
< div class = "line" > < a id = "l00952" name = "l00952" > < / a > < span class = "lineno" > 952< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > group_size,< / div >
< div class = "line" > < a id = "l00953" name = "l00953" > < / a > < span class = "lineno" > 953< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > < a class = "code hl_function" href = "namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825" > bits< / a > ,< / div >
< div class = "line" > < a id = "l00954" name = "l00954" > < / a > < span class = "lineno" > 954< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > bool< / span > aligned_N,< / div >
< div class = "line" > < a id = "l00955" name = "l00955" > < / a > < span class = "lineno" > 955< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BM = 32,< / div >
< div class = "line" > < a id = "l00956" name = "l00956" > < / a > < span class = "lineno" > 956< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BK = 32,< / div >
< div class = "line" > < a id = "l00957" name = "l00957" > < / a > < span class = "lineno" > 957< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BN = 32> < / div >
< div class = "foldopen" id = "foldopen00958" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l00958" name = "l00958" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#af5750a35e8f5462218effba719f7f5b8" > 958< / a > < / span > METAL_FUNC < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#af5750a35e8f5462218effba719f7f5b8" > qmm_t_impl< / a > (< / div >
< div class = "line" > < a id = "l00959" name = "l00959" > < / a > < span class = "lineno" > 959< / span > < span class = "keyword" > const< / span > device uint32_t* w,< / div >
< div class = "line" > < a id = "l00960" name = "l00960" > < / a > < span class = "lineno" > 960< / span > < span class = "keyword" > const< / span > device T* scales,< / div >
< div class = "line" > < a id = "l00961" name = "l00961" > < / a > < span class = "lineno" > 961< / span > < span class = "keyword" > const< / span > device T* biases,< / div >
< div class = "line" > < a id = "l00962" name = "l00962" > < / a > < span class = "lineno" > 962< / span > < span class = "keyword" > const< / span > device T* x,< / div >
< div class = "line" > < a id = "l00963" name = "l00963" > < / a > < span class = "lineno" > 963< / span > device T* y,< / div >
< div class = "line" > < a id = "l00964" name = "l00964" > < / a > < span class = "lineno" > 964< / span > threadgroup T* Xs,< / div >
< div class = "line" > < a id = "l00965" name = "l00965" > < / a > < span class = "lineno" > 965< / span > threadgroup T* Ws,< / div >
< div class = "line" > < a id = "l00966" name = "l00966" > < / a > < span class = "lineno" > 966< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & K,< / div >
< div class = "line" > < a id = "l00967" name = "l00967" > < / a > < span class = "lineno" > 967< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & N,< / div >
< div class = "line" > < a id = "l00968" name = "l00968" > < / a > < span class = "lineno" > 968< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & M,< / div >
< div class = "line" > < a id = "l00969" name = "l00969" > < / a > < span class = "lineno" > 969< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l00970" name = "l00970" > < / a > < span class = "lineno" > 970< / span > uint lid [[thread_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l00971" name = "l00971" > < / a > < span class = "lineno" > 971< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l00972" name = "l00972" > < / a > < span class = "lineno" > 972< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l00973" name = "l00973" > < / a > < span class = "lineno" > 973< / span > < span class = "keyword" > static_assert< / span > (BK > = < a class = "code hl_variable" href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > SIMD_SIZE< / a > , < span class = "stringliteral" > " BK should be larger than SIMD_SIZE" < / span > );< / div >
< div class = "line" > < a id = "l00974" name = "l00974" > < / a > < span class = "lineno" > 974< / span > < span class = "keyword" > static_assert< / span > (BK % < a class = "code hl_variable" href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > SIMD_SIZE< / a > == 0, < span class = "stringliteral" > " BK should be divisible by SIMD_SIZE" < / span > );< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l00975" name = "l00975" > < / a > < span class = "lineno" > 975< / span > < / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l00976" name = "l00976" > < / a > < span class = "lineno" > 976< / span > (void)lid;< / div >
< div class = "line" > < a id = "l00977" name = "l00977" > < / a > < span class = "lineno" > 977< / span > < / div >
< div class = "line" > < a id = "l00978" name = "l00978" > < / a > < span class = "lineno" > 978< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > WM = 2;< / div >
< div class = "line" > < a id = "l00979" name = "l00979" > < / a > < span class = "lineno" > 979< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > WN = 2;< / div >
< div class = "line" > < a id = "l00980" name = "l00980" > < / a > < span class = "lineno" > 980< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;< / div >
< div class = "line" > < a id = "l00981" name = "l00981" > < / a > < span class = "lineno" > 981< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > BK_padded = (BK + 16 / < span class = "keyword" > sizeof< / span > (T));< / div >
< div class = "line" > < a id = "l00982" name = "l00982" > < / a > < span class = "lineno" > 982< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > bytes_per_pack = (bits == 3 || bits == 6) ? 3 : 1;< / div >
< div class = "line" > < a id = "l00983" name = "l00983" > < / a > < span class = "lineno" > 983< / span > < / div >
< div class = "line" > < a id = "l00984" name = "l00984" > < / a > < span class = "lineno" > 984< / span > < span class = "comment" > // Instantiate the appropriate BlockMMA and Loader< / span > < / div >
< div class = "line" > < a id = "l00985" name = "l00985" > < / a > < span class = "lineno" > 985< / span > < span class = "keyword" > using < / span > mma_t = mlx::steel::< / div >
< div class = "line" > < a id = "l00986" name = "l00986" > < / a > < span class = "lineno" > 986< / span > BlockMMA< T, T, BM, BN, BK, WM, WN, false, true, BK_padded, BK_padded> ;< / div >
< div class = "line" > < a id = "l00987" name = "l00987" > < / a > < span class = "lineno" > 987< / span > < span class = "keyword" > using < / span > loader_x_t =< / div >
< div class = "line" > < a id = "l00988" name = "l00988" > < / a > < span class = "lineno" > 988< / span > < a class = "code hl_struct" href = "structmlx_1_1steel_1_1_block_loader.html" > mlx::steel::BlockLoader< T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE> < / a > ;< / div >
< div class = "line" > < a id = "l00989" name = "l00989" > < / a > < span class = "lineno" > 989< / span > < span class = "keyword" > using < / span > loader_w_t = < a class = "code hl_struct" href = "struct_quantized_block_loader.html" > QuantizedBlockLoader< / a > < < / div >
< div class = "line" > < a id = "l00990" name = "l00990" > < / a > < span class = "lineno" > 990< / span > T,< / div >
< div class = "line" > < a id = "l00991" name = "l00991" > < / a > < span class = "lineno" > 991< / span > BN,< / div >
< div class = "line" > < a id = "l00992" name = "l00992" > < / a > < span class = "lineno" > 992< / span > BK,< / div >
< div class = "line" > < a id = "l00993" name = "l00993" > < / a > < span class = "lineno" > 993< / span > BK_padded,< / div >
< div class = "line" > < a id = "l00994" name = "l00994" > < / a > < span class = "lineno" > 994< / span > 1,< / div >
< div class = "line" > < a id = "l00995" name = "l00995" > < / a > < span class = "lineno" > 995< / span > WM * WN * < a class = "code hl_variable" href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > SIMD_SIZE< / a > ,< / div >
< div class = "line" > < a id = "l00996" name = "l00996" > < / a > < span class = "lineno" > 996< / span > group_size,< / div >
< div class = "line" > < a id = "l00997" name = "l00997" > < / a > < span class = "lineno" > 997< / span > bits> ;< / div >
< div class = "line" > < a id = "l00998" name = "l00998" > < / a > < span class = "lineno" > 998< / span > < / div >
< div class = "line" > < a id = "l00999" name = "l00999" > < / a > < span class = "lineno" > 999< / span > < span class = "comment" > // Set the block< / span > < / div >
< div class = "line" > < a id = "l01000" name = "l01000" > < / a > < span class = "lineno" > 1000< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > K_w = K * bytes_per_pack / pack_factor;< / div >
< div class = "line" > < a id = "l01001" name = "l01001" > < / a > < span class = "lineno" > 1001< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > K_g = K / group_size;< / div >
< div class = "line" > < a id = "l01002" name = "l01002" > < / a > < span class = "lineno" > 1002< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > y_row = tid.y * BM;< / div >
< div class = "line" > < a id = "l01003" name = "l01003" > < / a > < span class = "lineno" > 1003< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > y_col = tid.x * BN;< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l01004" name = "l01004" > < / a > < span class = "lineno" > 1004< / span > < / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01005" name = "l01005" > < / a > < span class = "lineno" > 1005< / span > < span class = "keyword" > auto< / span > wl = (< span class = "keyword" > const< / span > device uint8_t*)w;< / div >
< div class = "line" > < a id = "l01006" name = "l01006" > < / a > < span class = "lineno" > 1006< / span > < / div >
< div class = "line" > < a id = "l01007" name = "l01007" > < / a > < span class = "lineno" > 1007< / span > x += y_row * K;< / div >
< div class = "line" > < a id = "l01008" name = "l01008" > < / a > < span class = "lineno" > 1008< / span > wl += y_col * K_w;< / div >
< div class = "line" > < a id = "l01009" name = "l01009" > < / a > < span class = "lineno" > 1009< / span > scales += y_col * K_g;< / div >
< div class = "line" > < a id = "l01010" name = "l01010" > < / a > < span class = "lineno" > 1010< / span > biases += y_col * K_g;< / div >
< div class = "line" > < a id = "l01011" name = "l01011" > < / a > < span class = "lineno" > 1011< / span > y += y_row * N + y_col;< / div >
< div class = "line" > < a id = "l01012" name = "l01012" > < / a > < span class = "lineno" > 1012< / span > < / div >
< div class = "line" > < a id = "l01013" name = "l01013" > < / a > < span class = "lineno" > 1013< / span > < span class = "comment" > // Make the x loader and mma operation< / span > < / div >
< div class = "line" > < a id = "l01014" name = "l01014" > < / a > < span class = "lineno" > 1014< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > num_els = < a class = "code hl_function" href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > min< / a > (BM, M - y_row);< / div >
< div class = "line" > < a id = "l01015" name = "l01015" > < / a > < span class = "lineno" > 1015< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > num_outs = < a class = "code hl_function" href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > min< / a > (BN, N - y_col);< / div >
< div class = "line" > < a id = "l01016" name = "l01016" > < / a > < span class = "lineno" > 1016< / span > loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);< / div >
< div class = "line" > < a id = "l01017" name = "l01017" > < / a > < span class = "lineno" > 1017< / span > loader_w_t loader_w(wl, scales, biases, K, Ws, simd_gid, simd_lid);< / div >
< div class = "line" > < a id = "l01018" name = "l01018" > < / a > < span class = "lineno" > 1018< / span > mma_t mma_op(simd_gid, simd_lid);< / div >
< div class = "line" > < a id = "l01019" name = "l01019" > < / a > < span class = "lineno" > 1019< / span > < / div >
< div class = "line" > < a id = "l01020" name = "l01020" > < / a > < span class = "lineno" > 1020< / span > < span class = "keywordflow" > if< / span > (num_els < BM) {< / div >
< div class = "line" > < a id = "l01021" name = "l01021" > < / a > < span class = "lineno" > 1021< / span > < span class = "keywordflow" > if< / span > (!aligned_N & & num_outs < BN) {< / div >
< div class = "line" > < a id = "l01022" name = "l01022" > < / a > < span class = "lineno" > 1022< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > k = 0; k < K; k += BK) {< / div >
< div class = "line" > < a id = "l01023" name = "l01023" > < / a > < span class = "lineno" > 1023< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01024" name = "l01024" > < / a > < span class = "lineno" > 1024< / span > loader_x.load_safe(short2(BK, num_els));< / div >
< div class = "line" > < a id = "l01025" name = "l01025" > < / a > < span class = "lineno" > 1025< / span > loader_w.load_safe(short2(BK, num_outs));< / div >
< div class = "line" > < a id = "l01026" name = "l01026" > < / a > < span class = "lineno" > 1026< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01027" name = "l01027" > < / a > < span class = "lineno" > 1027< / span > mma_op.mma(Xs, Ws);< / div >
< div class = "line" > < a id = "l01028" name = "l01028" > < / a > < span class = "lineno" > 1028< / span > loader_x.next();< / div >
< div class = "line" > < a id = "l01029" name = "l01029" > < / a > < span class = "lineno" > 1029< / span > loader_w.next();< / div >
< div class = "line" > < a id = "l01030" name = "l01030" > < / a > < span class = "lineno" > 1030< / span > }< / div >
< div class = "line" > < a id = "l01031" name = "l01031" > < / a > < span class = "lineno" > 1031< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01032" name = "l01032" > < / a > < span class = "lineno" > 1032< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > k = 0; k < K; k += BK) {< / div >
< div class = "line" > < a id = "l01033" name = "l01033" > < / a > < span class = "lineno" > 1033< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01034" name = "l01034" > < / a > < span class = "lineno" > 1034< / span > loader_x.load_safe(short2(BK, num_els));< / div >
< div class = "line" > < a id = "l01035" name = "l01035" > < / a > < span class = "lineno" > 1035< / span > loader_w.load_unsafe();< / div >
< div class = "line" > < a id = "l01036" name = "l01036" > < / a > < span class = "lineno" > 1036< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01037" name = "l01037" > < / a > < span class = "lineno" > 1037< / span > mma_op.mma(Xs, Ws);< / div >
< div class = "line" > < a id = "l01038" name = "l01038" > < / a > < span class = "lineno" > 1038< / span > loader_x.next();< / div >
< div class = "line" > < a id = "l01039" name = "l01039" > < / a > < span class = "lineno" > 1039< / span > loader_w.next();< / div >
< div class = "line" > < a id = "l01040" name = "l01040" > < / a > < span class = "lineno" > 1040< / span > }< / div >
< div class = "line" > < a id = "l01041" name = "l01041" > < / a > < span class = "lineno" > 1041< / span > }< / div >
< div class = "line" > < a id = "l01042" name = "l01042" > < / a > < span class = "lineno" > 1042< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01043" name = "l01043" > < / a > < span class = "lineno" > 1043< / span > < span class = "keywordflow" > if< / span > (!aligned_N & & num_outs < BN) {< / div >
< div class = "line" > < a id = "l01044" name = "l01044" > < / a > < span class = "lineno" > 1044< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > k = 0; k < K; k += BK) {< / div >
< div class = "line" > < a id = "l01045" name = "l01045" > < / a > < span class = "lineno" > 1045< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01046" name = "l01046" > < / a > < span class = "lineno" > 1046< / span > loader_x.load_unsafe();< / div >
< div class = "line" > < a id = "l01047" name = "l01047" > < / a > < span class = "lineno" > 1047< / span > loader_w.load_safe(short2(BK, num_outs));< / div >
< div class = "line" > < a id = "l01048" name = "l01048" > < / a > < span class = "lineno" > 1048< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01049" name = "l01049" > < / a > < span class = "lineno" > 1049< / span > mma_op.mma(Xs, Ws);< / div >
< div class = "line" > < a id = "l01050" name = "l01050" > < / a > < span class = "lineno" > 1050< / span > loader_x.next();< / div >
< div class = "line" > < a id = "l01051" name = "l01051" > < / a > < span class = "lineno" > 1051< / span > loader_w.next();< / div >
< div class = "line" > < a id = "l01052" name = "l01052" > < / a > < span class = "lineno" > 1052< / span > }< / div >
< div class = "line" > < a id = "l01053" name = "l01053" > < / a > < span class = "lineno" > 1053< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01054" name = "l01054" > < / a > < span class = "lineno" > 1054< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > k = 0; k < K; k += BK) {< / div >
< div class = "line" > < a id = "l01055" name = "l01055" > < / a > < span class = "lineno" > 1055< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01056" name = "l01056" > < / a > < span class = "lineno" > 1056< / span > loader_x.load_unsafe();< / div >
< div class = "line" > < a id = "l01057" name = "l01057" > < / a > < span class = "lineno" > 1057< / span > loader_w.load_unsafe();< / div >
< div class = "line" > < a id = "l01058" name = "l01058" > < / a > < span class = "lineno" > 1058< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01059" name = "l01059" > < / a > < span class = "lineno" > 1059< / span > < / div >
< div class = "line" > < a id = "l01060" name = "l01060" > < / a > < span class = "lineno" > 1060< / span > mma_op.mma(Xs, Ws);< / div >
< div class = "line" > < a id = "l01061" name = "l01061" > < / a > < span class = "lineno" > 1061< / span > loader_x.next();< / div >
< div class = "line" > < a id = "l01062" name = "l01062" > < / a > < span class = "lineno" > 1062< / span > loader_w.next();< / div >
< div class = "line" > < a id = "l01063" name = "l01063" > < / a > < span class = "lineno" > 1063< / span > }< / div >
< div class = "line" > < a id = "l01064" name = "l01064" > < / a > < span class = "lineno" > 1064< / span > }< / div >
< div class = "line" > < a id = "l01065" name = "l01065" > < / a > < span class = "lineno" > 1065< / span > }< / div >
< div class = "line" > < a id = "l01066" name = "l01066" > < / a > < span class = "lineno" > 1066< / span > < / div >
< div class = "line" > < a id = "l01067" name = "l01067" > < / a > < span class = "lineno" > 1067< / span > < span class = "comment" > // Store results to device memory< / span > < / div >
< div class = "line" > < a id = "l01068" name = "l01068" > < / a > < span class = "lineno" > 1068< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01069" name = "l01069" > < / a > < span class = "lineno" > 1069< / span > < span class = "keywordflow" > if< / span > (num_els < BM || num_outs < BN) {< / div >
< div class = "line" > < a id = "l01070" name = "l01070" > < / a > < span class = "lineno" > 1070< / span > mma_op.store_result_safe(y, N, short2(num_outs, num_els));< / div >
< div class = "line" > < a id = "l01071" name = "l01071" > < / a > < span class = "lineno" > 1071< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01072" name = "l01072" > < / a > < span class = "lineno" > 1072< / span > mma_op.store_result(y, N);< / div >
< div class = "line" > < a id = "l01073" name = "l01073" > < / a > < span class = "lineno" > 1073< / span > }< / div >
< div class = "line" > < a id = "l01074" name = "l01074" > < / a > < span class = "lineno" > 1074< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01075" name = "l01075" > < / a > < span class = "lineno" > 1075< / span > < / div >
< div class = "line" > < a id = "l01076" name = "l01076" > < / a > < span class = "lineno" > 1076< / span > < span class = "keyword" > template< / span > < < / div >
< div class = "line" > < a id = "l01077" name = "l01077" > < / a > < span class = "lineno" > 1077< / span > < span class = "keyword" > typename< / span > T,< / div >
< div class = "line" > < a id = "l01078" name = "l01078" > < / a > < span class = "lineno" > 1078< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > group_size,< / div >
< div class = "line" > < a id = "l01079" name = "l01079" > < / a > < span class = "lineno" > 1079< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > < a class = "code hl_function" href = "namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825" > bits< / a > ,< / div >
< div class = "line" > < a id = "l01080" name = "l01080" > < / a > < span class = "lineno" > 1080< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BM = 32,< / div >
< div class = "line" > < a id = "l01081" name = "l01081" > < / a > < span class = "lineno" > 1081< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BK = 32,< / div >
< div class = "line" > < a id = "l01082" name = "l01082" > < / a > < span class = "lineno" > 1082< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BN = 32> < / div >
< div class = "foldopen" id = "foldopen01083" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01083" name = "l01083" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a0ba59096494f1001c195312571523ae9" > 1083< / a > < / span > METAL_FUNC < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a0ba59096494f1001c195312571523ae9" > qmm_n_impl< / a > (< / div >
< div class = "line" > < a id = "l01084" name = "l01084" > < / a > < span class = "lineno" > 1084< / span > < span class = "keyword" > const< / span > device uint32_t* w,< / div >
< div class = "line" > < a id = "l01085" name = "l01085" > < / a > < span class = "lineno" > 1085< / span > < span class = "keyword" > const< / span > device T* scales,< / div >
< div class = "line" > < a id = "l01086" name = "l01086" > < / a > < span class = "lineno" > 1086< / span > < span class = "keyword" > const< / span > device T* biases,< / div >
< div class = "line" > < a id = "l01087" name = "l01087" > < / a > < span class = "lineno" > 1087< / span > < span class = "keyword" > const< / span > device T* x,< / div >
< div class = "line" > < a id = "l01088" name = "l01088" > < / a > < span class = "lineno" > 1088< / span > device T* y,< / div >
< div class = "line" > < a id = "l01089" name = "l01089" > < / a > < span class = "lineno" > 1089< / span > threadgroup T* Xs,< / div >
< div class = "line" > < a id = "l01090" name = "l01090" > < / a > < span class = "lineno" > 1090< / span > threadgroup T* Ws,< / div >
< div class = "line" > < a id = "l01091" name = "l01091" > < / a > < span class = "lineno" > 1091< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & K,< / div >
< div class = "line" > < a id = "l01092" name = "l01092" > < / a > < span class = "lineno" > 1092< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & N,< / div >
< div class = "line" > < a id = "l01093" name = "l01093" > < / a > < span class = "lineno" > 1093< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & M,< / div >
< div class = "line" > < a id = "l01094" name = "l01094" > < / a > < span class = "lineno" > 1094< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01095" name = "l01095" > < / a > < span class = "lineno" > 1095< / span > uint lid [[thread_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01096" name = "l01096" > < / a > < span class = "lineno" > 1096< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01097" name = "l01097" > < / a > < span class = "lineno" > 1097< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l01098" name = "l01098" > < / a > < span class = "lineno" > 1098< / span > < span class = "keyword" > static_assert< / span > (BK > = < a class = "code hl_variable" href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > SIMD_SIZE< / a > , < span class = "stringliteral" > " BK should be larger than SIMD_SIZE" < / span > );< / div >
< div class = "line" > < a id = "l01099" name = "l01099" > < / a > < span class = "lineno" > 1099< / span > < span class = "keyword" > static_assert< / span > (BK % < a class = "code hl_variable" href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > SIMD_SIZE< / a > == 0, < span class = "stringliteral" > " BK should be divisible by SIMD_SIZE" < / span > );< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l01100" name = "l01100" > < / a > < span class = "lineno" > 1100< / span > < / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01101" name = "l01101" > < / a > < span class = "lineno" > 1101< / span > (void)lid;< / div >
< div class = "line" > < a id = "l01102" name = "l01102" > < / a > < span class = "lineno" > 1102< / span > < / div >
< div class = "line" > < a id = "l01103" name = "l01103" > < / a > < span class = "lineno" > 1103< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > WM = 2;< / div >
< div class = "line" > < a id = "l01104" name = "l01104" > < / a > < span class = "lineno" > 1104< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > WN = 2;< / div >
< div class = "line" > < a id = "l01105" name = "l01105" > < / a > < span class = "lineno" > 1105< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > pack_factor = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;< / div >
< div class = "line" > < a id = "l01106" name = "l01106" > < / a > < span class = "lineno" > 1106< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > BK_padded = (BK + 16 / < span class = "keyword" > sizeof< / span > (T));< / div >
< div class = "line" > < a id = "l01107" name = "l01107" > < / a > < span class = "lineno" > 1107< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > BN_padded = (BN + 16 / < span class = "keyword" > sizeof< / span > (T));< / div >
< div class = "line" > < a id = "l01108" name = "l01108" > < / a > < span class = "lineno" > 1108< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > power_of_2_bits = (bits & (bits - 1)) == 0;< / div >
< div class = "line" > < a id = "l01109" name = "l01109" > < / a > < span class = "lineno" > 1109< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > bytes_per_pack = power_of_2_bits ? 1 : 3;< / div >
< div class = "line" > < a id = "l01110" name = "l01110" > < / a > < span class = "lineno" > 1110< / span > < / div >
< div class = "line" > < a id = "l01111" name = "l01111" > < / a > < span class = "lineno" > 1111< / span > < span class = "comment" > // Instantiate the appropriate BlockMMA and Loader< / span > < / div >
< div class = "line" > < a id = "l01112" name = "l01112" > < / a > < span class = "lineno" > 1112< / span > < span class = "keyword" > using < / span > mma_t = mlx::steel::< / div >
< div class = "line" > < a id = "l01113" name = "l01113" > < / a > < span class = "lineno" > 1113< / span > BlockMMA< T, T, BM, BN, BK, WM, WN, false, false, BK_padded, BN_padded> ;< / div >
< div class = "line" > < a id = "l01114" name = "l01114" > < / a > < span class = "lineno" > 1114< / span > < span class = "keyword" > using < / span > loader_x_t = mlx::steel::< / div >
< div class = "line" > < a id = "l01115" name = "l01115" > < / a > < span class = "lineno" > 1115< / span > BlockLoader< T, BM, BK, BK_padded, 1, WM * WN * SIMD_SIZE, 1, 4> ;< / div >
< div class = "line" > < a id = "l01116" name = "l01116" > < / a > < span class = "lineno" > 1116< / span > < span class = "keyword" > using < / span > loader_w_t = < a class = "code hl_struct" href = "struct_quantized_block_loader.html" > QuantizedBlockLoader< / a > < < / div >
< div class = "line" > < a id = "l01117" name = "l01117" > < / a > < span class = "lineno" > 1117< / span > T,< / div >
< div class = "line" > < a id = "l01118" name = "l01118" > < / a > < span class = "lineno" > 1118< / span > BK,< / div >
< div class = "line" > < a id = "l01119" name = "l01119" > < / a > < span class = "lineno" > 1119< / span > BN,< / div >
< div class = "line" > < a id = "l01120" name = "l01120" > < / a > < span class = "lineno" > 1120< / span > BN_padded,< / div >
< div class = "line" > < a id = "l01121" name = "l01121" > < / a > < span class = "lineno" > 1121< / span > 0,< / div >
< div class = "line" > < a id = "l01122" name = "l01122" > < / a > < span class = "lineno" > 1122< / span > WM * WN * < a class = "code hl_variable" href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > SIMD_SIZE< / a > ,< / div >
< div class = "line" > < a id = "l01123" name = "l01123" > < / a > < span class = "lineno" > 1123< / span > group_size,< / div >
< div class = "line" > < a id = "l01124" name = "l01124" > < / a > < span class = "lineno" > 1124< / span > bits> ;< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l01125" name = "l01125" > < / a > < span class = "lineno" > 1125< / span > < / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01126" name = "l01126" > < / a > < span class = "lineno" > 1126< / span > < span class = "keyword" > auto< / span > wl = (< span class = "keyword" > const< / span > device uint8_t*)w;< / div >
< div class = "line" > < a id = "l01127" name = "l01127" > < / a > < span class = "lineno" > 1127< / span > < / div >
< div class = "line" > < a id = "l01128" name = "l01128" > < / a > < span class = "lineno" > 1128< / span > < span class = "comment" > // Set the block< / span > < / div >
< div class = "line" > < a id = "l01129" name = "l01129" > < / a > < span class = "lineno" > 1129< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > y_row = tid.y * BM;< / div >
< div class = "line" > < a id = "l01130" name = "l01130" > < / a > < span class = "lineno" > 1130< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > y_col = tid.x * BN;< / div >
< div class = "line" > < a id = "l01131" name = "l01131" > < / a > < span class = "lineno" > 1131< / span > x += y_row * K;< / div >
< div class = "line" > < a id = "l01132" name = "l01132" > < / a > < span class = "lineno" > 1132< / span > wl += y_col * bytes_per_pack / pack_factor;< / div >
< div class = "line" > < a id = "l01133" name = "l01133" > < / a > < span class = "lineno" > 1133< / span > scales += y_col / group_size;< / div >
< div class = "line" > < a id = "l01134" name = "l01134" > < / a > < span class = "lineno" > 1134< / span > biases += y_col / group_size;< / div >
< div class = "line" > < a id = "l01135" name = "l01135" > < / a > < span class = "lineno" > 1135< / span > y += y_row * N + y_col;< / div >
< div class = "line" > < a id = "l01136" name = "l01136" > < / a > < span class = "lineno" > 1136< / span > < / div >
< div class = "line" > < a id = "l01137" name = "l01137" > < / a > < span class = "lineno" > 1137< / span > < span class = "comment" > // Make the x loader and mma operation< / span > < / div >
< div class = "line" > < a id = "l01138" name = "l01138" > < / a > < span class = "lineno" > 1138< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > num_els = < a class = "code hl_function" href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > min< / a > (BM, M - y_row);< / div >
< div class = "line" > < a id = "l01139" name = "l01139" > < / a > < span class = "lineno" > 1139< / span > loader_x_t loader_x(x, K, Xs, simd_gid, simd_lid);< / div >
< div class = "line" > < a id = "l01140" name = "l01140" > < / a > < span class = "lineno" > 1140< / span > loader_w_t loader_w(wl, scales, biases, N, Ws, simd_gid, simd_lid);< / div >
< div class = "line" > < a id = "l01141" name = "l01141" > < / a > < span class = "lineno" > 1141< / span > mma_t mma_op(simd_gid, simd_lid);< / div >
< div class = "line" > < a id = "l01142" name = "l01142" > < / a > < span class = "lineno" > 1142< / span > < / div >
< div class = "line" > < a id = "l01143" name = "l01143" > < / a > < span class = "lineno" > 1143< / span > < span class = "keywordflow" > if< / span > (num_els < BM) {< / div >
< div class = "line" > < a id = "l01144" name = "l01144" > < / a > < span class = "lineno" > 1144< / span > < span class = "keywordflow" > if< / span > ((K % BK) != 0) {< / div >
< div class = "line" > < a id = "l01145" name = "l01145" > < / a > < span class = "lineno" > 1145< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > k_blocks = K / BK;< / div >
< div class = "line" > < a id = "l01146" name = "l01146" > < / a > < span class = "lineno" > 1146< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > k = 0; k < k_blocks; k++) {< / div >
< div class = "line" > < a id = "l01147" name = "l01147" > < / a > < span class = "lineno" > 1147< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01148" name = "l01148" > < / a > < span class = "lineno" > 1148< / span > loader_x.load_safe(short2(BK, num_els));< / div >
< div class = "line" > < a id = "l01149" name = "l01149" > < / a > < span class = "lineno" > 1149< / span > loader_w.load_unsafe();< / div >
< div class = "line" > < a id = "l01150" name = "l01150" > < / a > < span class = "lineno" > 1150< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01151" name = "l01151" > < / a > < span class = "lineno" > 1151< / span > mma_op.mma(Xs, Ws);< / div >
< div class = "line" > < a id = "l01152" name = "l01152" > < / a > < span class = "lineno" > 1152< / span > loader_x.next();< / div >
< div class = "line" > < a id = "l01153" name = "l01153" > < / a > < span class = "lineno" > 1153< / span > loader_w.next();< / div >
< div class = "line" > < a id = "l01154" name = "l01154" > < / a > < span class = "lineno" > 1154< / span > }< / div >
< div class = "line" > < a id = "l01155" name = "l01155" > < / a > < span class = "lineno" > 1155< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > num_k = K - k_blocks * BK;< / div >
< div class = "line" > < a id = "l01156" name = "l01156" > < / a > < span class = "lineno" > 1156< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01157" name = "l01157" > < / a > < span class = "lineno" > 1157< / span > loader_x.load_safe(short2(num_k, num_els));< / div >
< div class = "line" > < a id = "l01158" name = "l01158" > < / a > < span class = "lineno" > 1158< / span > loader_w.load_safe(short2(BN, num_k));< / div >
< div class = "line" > < a id = "l01159" name = "l01159" > < / a > < span class = "lineno" > 1159< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01160" name = "l01160" > < / a > < span class = "lineno" > 1160< / span > mma_op.mma(Xs, Ws);< / div >
< div class = "line" > < a id = "l01161" name = "l01161" > < / a > < span class = "lineno" > 1161< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01162" name = "l01162" > < / a > < span class = "lineno" > 1162< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > k = 0; k < K; k += BK) {< / div >
< div class = "line" > < a id = "l01163" name = "l01163" > < / a > < span class = "lineno" > 1163< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01164" name = "l01164" > < / a > < span class = "lineno" > 1164< / span > loader_x.load_safe(short2(BK, num_els));< / div >
< div class = "line" > < a id = "l01165" name = "l01165" > < / a > < span class = "lineno" > 1165< / span > loader_w.load_unsafe();< / div >
< div class = "line" > < a id = "l01166" name = "l01166" > < / a > < span class = "lineno" > 1166< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01167" name = "l01167" > < / a > < span class = "lineno" > 1167< / span > mma_op.mma(Xs, Ws);< / div >
< div class = "line" > < a id = "l01168" name = "l01168" > < / a > < span class = "lineno" > 1168< / span > loader_x.next();< / div >
< div class = "line" > < a id = "l01169" name = "l01169" > < / a > < span class = "lineno" > 1169< / span > loader_w.next();< / div >
< div class = "line" > < a id = "l01170" name = "l01170" > < / a > < span class = "lineno" > 1170< / span > }< / div >
< div class = "line" > < a id = "l01171" name = "l01171" > < / a > < span class = "lineno" > 1171< / span > }< / div >
< div class = "line" > < a id = "l01172" name = "l01172" > < / a > < span class = "lineno" > 1172< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01173" name = "l01173" > < / a > < span class = "lineno" > 1173< / span > < span class = "keywordflow" > if< / span > ((K % BK) != 0) {< / div >
< div class = "line" > < a id = "l01174" name = "l01174" > < / a > < span class = "lineno" > 1174< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > k_blocks = K / BK;< / div >
< div class = "line" > < a id = "l01175" name = "l01175" > < / a > < span class = "lineno" > 1175< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > k = 0; k < k_blocks; k++) {< / div >
< div class = "line" > < a id = "l01176" name = "l01176" > < / a > < span class = "lineno" > 1176< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01177" name = "l01177" > < / a > < span class = "lineno" > 1177< / span > loader_x.load_unsafe();< / div >
< div class = "line" > < a id = "l01178" name = "l01178" > < / a > < span class = "lineno" > 1178< / span > loader_w.load_unsafe();< / div >
< div class = "line" > < a id = "l01179" name = "l01179" > < / a > < span class = "lineno" > 1179< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01180" name = "l01180" > < / a > < span class = "lineno" > 1180< / span > mma_op.mma(Xs, Ws);< / div >
< div class = "line" > < a id = "l01181" name = "l01181" > < / a > < span class = "lineno" > 1181< / span > loader_x.next();< / div >
< div class = "line" > < a id = "l01182" name = "l01182" > < / a > < span class = "lineno" > 1182< / span > loader_w.next();< / div >
< div class = "line" > < a id = "l01183" name = "l01183" > < / a > < span class = "lineno" > 1183< / span > }< / div >
< div class = "line" > < a id = "l01184" name = "l01184" > < / a > < span class = "lineno" > 1184< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > short< / span > num_k = K - k_blocks * BK;< / div >
< div class = "line" > < a id = "l01185" name = "l01185" > < / a > < span class = "lineno" > 1185< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01186" name = "l01186" > < / a > < span class = "lineno" > 1186< / span > loader_x.load_safe(short2(num_k, BM));< / div >
< div class = "line" > < a id = "l01187" name = "l01187" > < / a > < span class = "lineno" > 1187< / span > loader_w.load_safe(short2(BN, num_k));< / div >
< div class = "line" > < a id = "l01188" name = "l01188" > < / a > < span class = "lineno" > 1188< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01189" name = "l01189" > < / a > < span class = "lineno" > 1189< / span > mma_op.mma(Xs, Ws);< / div >
< div class = "line" > < a id = "l01190" name = "l01190" > < / a > < span class = "lineno" > 1190< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01191" name = "l01191" > < / a > < span class = "lineno" > 1191< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > k = 0; k < K; k += BK) {< / div >
< div class = "line" > < a id = "l01192" name = "l01192" > < / a > < span class = "lineno" > 1192< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01193" name = "l01193" > < / a > < span class = "lineno" > 1193< / span > loader_x.load_unsafe();< / div >
< div class = "line" > < a id = "l01194" name = "l01194" > < / a > < span class = "lineno" > 1194< / span > loader_w.load_unsafe();< / div >
< div class = "line" > < a id = "l01195" name = "l01195" > < / a > < span class = "lineno" > 1195< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01196" name = "l01196" > < / a > < span class = "lineno" > 1196< / span > mma_op.mma(Xs, Ws);< / div >
< div class = "line" > < a id = "l01197" name = "l01197" > < / a > < span class = "lineno" > 1197< / span > loader_x.next();< / div >
< div class = "line" > < a id = "l01198" name = "l01198" > < / a > < span class = "lineno" > 1198< / span > loader_w.next();< / div >
< div class = "line" > < a id = "l01199" name = "l01199" > < / a > < span class = "lineno" > 1199< / span > }< / div >
< div class = "line" > < a id = "l01200" name = "l01200" > < / a > < span class = "lineno" > 1200< / span > }< / div >
< div class = "line" > < a id = "l01201" name = "l01201" > < / a > < span class = "lineno" > 1201< / span > }< / div >
< div class = "line" > < a id = "l01202" name = "l01202" > < / a > < span class = "lineno" > 1202< / span > < / div >
< div class = "line" > < a id = "l01203" name = "l01203" > < / a > < span class = "lineno" > 1203< / span > < span class = "comment" > // Store results to device memory< / span > < / div >
< div class = "line" > < a id = "l01204" name = "l01204" > < / a > < span class = "lineno" > 1204< / span > threadgroup_barrier(mem_flags::mem_threadgroup);< / div >
< div class = "line" > < a id = "l01205" name = "l01205" > < / a > < span class = "lineno" > 1205< / span > < span class = "keywordflow" > if< / span > (num_els < BM) {< / div >
< div class = "line" > < a id = "l01206" name = "l01206" > < / a > < span class = "lineno" > 1206< / span > mma_op.store_result_safe(y, N, short2(BN, num_els));< / div >
< div class = "line" > < a id = "l01207" name = "l01207" > < / a > < span class = "lineno" > 1207< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01208" name = "l01208" > < / a > < span class = "lineno" > 1208< / span > mma_op.store_result(y, N);< / div >
< div class = "line" > < a id = "l01209" name = "l01209" > < / a > < span class = "lineno" > 1209< / span > }< / div >
< div class = "line" > < a id = "l01210" name = "l01210" > < / a > < span class = "lineno" > 1210< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01211" name = "l01211" > < / a > < span class = "lineno" > 1211< / span > < / div >
<!--
  Source listing, lines 1212 to 1249: the adjust_matrix_offsets overload that takes no index arrays.
  What the listed code does:
    * uses tid.z as both x_idx and w_idx;
    * advances x by x_idx * x_strides[0] when x_batch_ndims == 1, otherwise by
      elem_to_loc(x_idx, x_shape, x_strides, x_batch_ndims);
    * advances w, scales and biases by w_idx times w_strides[0], s_strides[0] and b_strides[0]
      when w_batch_ndims == 1, otherwise by the x, y and z components of the ulong3 obtained from
      elem_to_loc_broadcast(w_idx, w_shape, w_strides, s_strides, b_strides, w_batch_ndims);
    * advances y by tid.z * output_stride.
  x, w, scales, biases and y are taken by reference, so the caller's pointers are moved in place.
  The fold region below (id foldopen01213) wraps the function from its signature to its closing brace.
-->
< div class = "line" > < a id = "l01212" name = "l01212" > < / a > < span class = "lineno" > 1212< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T> < / div >
< div class = "foldopen" id = "foldopen01213" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01213" name = "l01213" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > 1213< / a > < / span > METAL_FUNC < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< / a > (< / div >
< div class = "line" > < a id = "l01214" name = "l01214" > < / a > < span class = "lineno" > 1214< / span > < span class = "keyword" > const< / span > device T*& x,< / div >
< div class = "line" > < a id = "l01215" name = "l01215" > < / a > < span class = "lineno" > 1215< / span > < span class = "keyword" > const< / span > device uint32_t*& w,< / div >
< div class = "line" > < a id = "l01216" name = "l01216" > < / a > < span class = "lineno" > 1216< / span > < span class = "keyword" > const< / span > device T*& scales,< / div >
< div class = "line" > < a id = "l01217" name = "l01217" > < / a > < span class = "lineno" > 1217< / span > < span class = "keyword" > const< / span > device T*& biases,< / div >
< div class = "line" > < a id = "l01218" name = "l01218" > < / a > < span class = "lineno" > 1218< / span > device T*& y,< / div >
< div class = "line" > < a id = "l01219" name = "l01219" > < / a > < span class = "lineno" > 1219< / span > < span class = "keywordtype" > int< / span > output_stride,< / div >
< div class = "line" > < a id = "l01220" name = "l01220" > < / a > < span class = "lineno" > 1220< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims,< / div >
< div class = "line" > < a id = "l01221" name = "l01221" > < / a > < span class = "lineno" > 1221< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape,< / div >
< div class = "line" > < a id = "l01222" name = "l01222" > < / a > < span class = "lineno" > 1222< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides,< / div >
< div class = "line" > < a id = "l01223" name = "l01223" > < / a > < span class = "lineno" > 1223< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims,< / div >
< div class = "line" > < a id = "l01224" name = "l01224" > < / a > < span class = "lineno" > 1224< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape,< / div >
< div class = "line" > < a id = "l01225" name = "l01225" > < / a > < span class = "lineno" > 1225< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides,< / div >
< div class = "line" > < a id = "l01226" name = "l01226" > < / a > < span class = "lineno" > 1226< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides,< / div >
< div class = "line" > < a id = "l01227" name = "l01227" > < / a > < span class = "lineno" > 1227< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides,< / div >
< div class = "line" > < a id = "l01228" name = "l01228" > < / a > < span class = "lineno" > 1228< / span > uint3 tid [[threadgroup_position_in_grid]]) {< / div >
< div class = "line" > < a id = "l01229" name = "l01229" > < / a > < span class = "lineno" > 1229< / span > < span class = "comment" > // Set the input/output matrices< / span > < / div >
< div class = "line" > < a id = "l01230" name = "l01230" > < / a > < span class = "lineno" > 1230< / span > uint32_t x_idx = tid.z;< / div >
< div class = "line" > < a id = "l01231" name = "l01231" > < / a > < span class = "lineno" > 1231< / span > uint32_t w_idx = tid.z;< / div >
< div class = "line" > < a id = "l01232" name = "l01232" > < / a > < span class = "lineno" > 1232< / span > < span class = "keywordflow" > if< / span > (x_batch_ndims == 1) {< / div >
< div class = "line" > < a id = "l01233" name = "l01233" > < / a > < span class = "lineno" > 1233< / span > x += x_idx * x_strides[0];< / div >
< div class = "line" > < a id = "l01234" name = "l01234" > < / a > < span class = "lineno" > 1234< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01235" name = "l01235" > < / a > < span class = "lineno" > 1235< / span > x += < a class = "code hl_function" href = "backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5" > elem_to_loc< / a > (x_idx, x_shape, x_strides, x_batch_ndims);< / div >
< div class = "line" > < a id = "l01236" name = "l01236" > < / a > < span class = "lineno" > 1236< / span > }< / div >
< div class = "line" > < a id = "l01237" name = "l01237" > < / a > < span class = "lineno" > 1237< / span > < span class = "keywordflow" > if< / span > (w_batch_ndims == 1) {< / div >
< div class = "line" > < a id = "l01238" name = "l01238" > < / a > < span class = "lineno" > 1238< / span > w += w_idx * w_strides[0];< / div >
< div class = "line" > < a id = "l01239" name = "l01239" > < / a > < span class = "lineno" > 1239< / span > scales += w_idx * s_strides[0];< / div >
< div class = "line" > < a id = "l01240" name = "l01240" > < / a > < span class = "lineno" > 1240< / span > biases += w_idx * b_strides[0];< / div >
< div class = "line" > < a id = "l01241" name = "l01241" > < / a > < span class = "lineno" > 1241< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01242" name = "l01242" > < / a > < span class = "lineno" > 1242< / span > ulong3 idx = < a class = "code hl_function" href = "backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f" > elem_to_loc_broadcast< / a > (< / div >
< div class = "line" > < a id = "l01243" name = "l01243" > < / a > < span class = "lineno" > 1243< / span > w_idx, w_shape, w_strides, s_strides, b_strides, w_batch_ndims);< / div >
< div class = "line" > < a id = "l01244" name = "l01244" > < / a > < span class = "lineno" > 1244< / span > w += idx.x;< / div >
< div class = "line" > < a id = "l01245" name = "l01245" > < / a > < span class = "lineno" > 1245< / span > scales += idx.y;< / div >
< div class = "line" > < a id = "l01246" name = "l01246" > < / a > < span class = "lineno" > 1246< / span > biases += idx.z;< / div >
< div class = "line" > < a id = "l01247" name = "l01247" > < / a > < span class = "lineno" > 1247< / span > }< / div >
< div class = "line" > < a id = "l01248" name = "l01248" > < / a > < span class = "lineno" > 1248< / span > y += tid.z * output_stride;< / div >
< div class = "line" > < a id = "l01249" name = "l01249" > < / a > < span class = "lineno" > 1249< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01250" name = "l01250" > < / a > < span class = "lineno" > 1250< / span > < / div >
<!--
  Source listing, lines 1251 to 1303: the adjust_matrix_offsets overload that takes
  lhs_indices and rhs_indices.
  What the listed code does:
    * reads x_idx from lhs_indices and w_idx from rhs_indices; the position read is
      tid.z * lhs_strides[0] and tid.z * rhs_strides[0] when batch_ndims == 1, otherwise the x and y
      components of the ulong2 obtained from
      elem_to_loc_broadcast(tid.z, batch_shape, lhs_strides, rhs_strides, batch_ndims);
    * advances x by x_idx * x_strides[0] when x_batch_ndims == 1, otherwise by
      elem_to_loc(x_idx, x_shape, x_strides, x_batch_ndims);
    * advances w, scales and biases by w_idx times w_strides[0], s_strides[0] and b_strides[0]
      when w_batch_ndims == 1, otherwise by the x, y and z components of the ulong3 obtained from
      elem_to_loc_broadcast(w_idx, w_shape, w_strides, s_strides, b_strides, w_batch_ndims);
    * advances y by tid.z * output_stride.
  x, w, scales, biases and y are taken by reference, so the caller's pointers are moved in place.
  On listing line 1252 both the line number link and the function name link target this overload's
  own anchor (a3ab400746ad77be89c30d25638e01698), not the anchor of the overload defined at line 1213.
-->
< div class = "line" > < a id = "l01251" name = "l01251" > < / a > < span class = "lineno" > 1251< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T> < / div >
< div class = "foldopen" id = "foldopen01252" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01252" name = "l01252" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a3ab400746ad77be89c30d25638e01698" > 1252< / a > < / span > METAL_FUNC < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a3ab400746ad77be89c30d25638e01698" > adjust_matrix_offsets< / a > (< / div >
< div class = "line" > < a id = "l01253" name = "l01253" > < / a > < span class = "lineno" > 1253< / span > < span class = "keyword" > const< / span > device T*& x,< / div >
< div class = "line" > < a id = "l01254" name = "l01254" > < / a > < span class = "lineno" > 1254< / span > < span class = "keyword" > const< / span > device uint32_t*& w,< / div >
< div class = "line" > < a id = "l01255" name = "l01255" > < / a > < span class = "lineno" > 1255< / span > < span class = "keyword" > const< / span > device T*& scales,< / div >
< div class = "line" > < a id = "l01256" name = "l01256" > < / a > < span class = "lineno" > 1256< / span > < span class = "keyword" > const< / span > device T*& biases,< / div >
< div class = "line" > < a id = "l01257" name = "l01257" > < / a > < span class = "lineno" > 1257< / span > < span class = "keyword" > const< / span > device uint32_t* lhs_indices,< / div >
< div class = "line" > < a id = "l01258" name = "l01258" > < / a > < span class = "lineno" > 1258< / span > < span class = "keyword" > const< / span > device uint32_t* rhs_indices,< / div >
< div class = "line" > < a id = "l01259" name = "l01259" > < / a > < span class = "lineno" > 1259< / span > device T*& y,< / div >
< div class = "line" > < a id = "l01260" name = "l01260" > < / a > < span class = "lineno" > 1260< / span > < span class = "keywordtype" > int< / span > output_stride,< / div >
< div class = "line" > < a id = "l01261" name = "l01261" > < / a > < span class = "lineno" > 1261< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & batch_ndims,< / div >
< div class = "line" > < a id = "l01262" name = "l01262" > < / a > < span class = "lineno" > 1262< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * batch_shape,< / div >
< div class = "line" > < a id = "l01263" name = "l01263" > < / a > < span class = "lineno" > 1263< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * lhs_strides,< / div >
< div class = "line" > < a id = "l01264" name = "l01264" > < / a > < span class = "lineno" > 1264< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * rhs_strides,< / div >
< div class = "line" > < a id = "l01265" name = "l01265" > < / a > < span class = "lineno" > 1265< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims,< / div >
< div class = "line" > < a id = "l01266" name = "l01266" > < / a > < span class = "lineno" > 1266< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape,< / div >
< div class = "line" > < a id = "l01267" name = "l01267" > < / a > < span class = "lineno" > 1267< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides,< / div >
< div class = "line" > < a id = "l01268" name = "l01268" > < / a > < span class = "lineno" > 1268< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims,< / div >
< div class = "line" > < a id = "l01269" name = "l01269" > < / a > < span class = "lineno" > 1269< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape,< / div >
< div class = "line" > < a id = "l01270" name = "l01270" > < / a > < span class = "lineno" > 1270< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides,< / div >
< div class = "line" > < a id = "l01271" name = "l01271" > < / a > < span class = "lineno" > 1271< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides,< / div >
< div class = "line" > < a id = "l01272" name = "l01272" > < / a > < span class = "lineno" > 1272< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides,< / div >
< div class = "line" > < a id = "l01273" name = "l01273" > < / a > < span class = "lineno" > 1273< / span > uint3 tid [[threadgroup_position_in_grid]]) {< / div >
< div class = "line" > < a id = "l01274" name = "l01274" > < / a > < span class = "lineno" > 1274< / span > < span class = "comment" > // Set the input/output matrices< / span > < / div >
< div class = "line" > < a id = "l01275" name = "l01275" > < / a > < span class = "lineno" > 1275< / span > uint32_t x_idx;< / div >
< div class = "line" > < a id = "l01276" name = "l01276" > < / a > < span class = "lineno" > 1276< / span > uint32_t w_idx;< / div >
< div class = "line" > < a id = "l01277" name = "l01277" > < / a > < span class = "lineno" > 1277< / span > < span class = "keywordflow" > if< / span > (batch_ndims == 1) {< / div >
< div class = "line" > < a id = "l01278" name = "l01278" > < / a > < span class = "lineno" > 1278< / span > x_idx = lhs_indices[tid.z * lhs_strides[0]];< / div >
< div class = "line" > < a id = "l01279" name = "l01279" > < / a > < span class = "lineno" > 1279< / span > w_idx = rhs_indices[tid.z * rhs_strides[0]];< / div >
< div class = "line" > < a id = "l01280" name = "l01280" > < / a > < span class = "lineno" > 1280< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01281" name = "l01281" > < / a > < span class = "lineno" > 1281< / span > ulong2 idx = < a class = "code hl_function" href = "backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f" > elem_to_loc_broadcast< / a > (< / div >
< div class = "line" > < a id = "l01282" name = "l01282" > < / a > < span class = "lineno" > 1282< / span > tid.z, batch_shape, lhs_strides, rhs_strides, batch_ndims);< / div >
< div class = "line" > < a id = "l01283" name = "l01283" > < / a > < span class = "lineno" > 1283< / span > x_idx = lhs_indices[idx.x];< / div >
< div class = "line" > < a id = "l01284" name = "l01284" > < / a > < span class = "lineno" > 1284< / span > w_idx = rhs_indices[idx.y];< / div >
< div class = "line" > < a id = "l01285" name = "l01285" > < / a > < span class = "lineno" > 1285< / span > }< / div >
< div class = "line" > < a id = "l01286" name = "l01286" > < / a > < span class = "lineno" > 1286< / span > < span class = "keywordflow" > if< / span > (x_batch_ndims == 1) {< / div >
< div class = "line" > < a id = "l01287" name = "l01287" > < / a > < span class = "lineno" > 1287< / span > x += x_idx * x_strides[0];< / div >
< div class = "line" > < a id = "l01288" name = "l01288" > < / a > < span class = "lineno" > 1288< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01289" name = "l01289" > < / a > < span class = "lineno" > 1289< / span > x += < a class = "code hl_function" href = "backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5" > elem_to_loc< / a > (x_idx, x_shape, x_strides, x_batch_ndims);< / div >
< div class = "line" > < a id = "l01290" name = "l01290" > < / a > < span class = "lineno" > 1290< / span > }< / div >
< div class = "line" > < a id = "l01291" name = "l01291" > < / a > < span class = "lineno" > 1291< / span > < span class = "keywordflow" > if< / span > (w_batch_ndims == 1) {< / div >
< div class = "line" > < a id = "l01292" name = "l01292" > < / a > < span class = "lineno" > 1292< / span > w += w_idx * w_strides[0];< / div >
< div class = "line" > < a id = "l01293" name = "l01293" > < / a > < span class = "lineno" > 1293< / span > scales += w_idx * s_strides[0];< / div >
< div class = "line" > < a id = "l01294" name = "l01294" > < / a > < span class = "lineno" > 1294< / span > biases += w_idx * b_strides[0];< / div >
< div class = "line" > < a id = "l01295" name = "l01295" > < / a > < span class = "lineno" > 1295< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l01296" name = "l01296" > < / a > < span class = "lineno" > 1296< / span > ulong3 idx = < a class = "code hl_function" href = "backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f" > elem_to_loc_broadcast< / a > (< / div >
< div class = "line" > < a id = "l01297" name = "l01297" > < / a > < span class = "lineno" > 1297< / span > w_idx, w_shape, w_strides, s_strides, b_strides, w_batch_ndims);< / div >
< div class = "line" > < a id = "l01298" name = "l01298" > < / a > < span class = "lineno" > 1298< / span > w += idx.x;< / div >
< div class = "line" > < a id = "l01299" name = "l01299" > < / a > < span class = "lineno" > 1299< / span > scales += idx.y;< / div >
< div class = "line" > < a id = "l01300" name = "l01300" > < / a > < span class = "lineno" > 1300< / span > biases += idx.z;< / div >
< div class = "line" > < a id = "l01301" name = "l01301" > < / a > < span class = "lineno" > 1301< / span > }< / div >
< div class = "line" > < a id = "l01302" name = "l01302" > < / a > < span class = "lineno" > 1302< / span > y += tid.z * output_stride;< / div >
< div class = "line" > < a id = "l01303" name = "l01303" > < / a > < span class = "lineno" > 1303< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01304" name = "l01304" > < / a > < span class = "lineno" > 1304< / span > < / div >
<!--
  Source listing, lines 1305 to 1354: the qmv_quad kernel entry point, templated on
  T, group_size, bits, D and batched.
  What the listed code does:
    * declares buffer bindings 0 to 14 in this order: w, scales, biases, x, y, in_vec_size,
      out_vec_size, x_batch_ndims, x_shape, x_strides, w_batch_ndims, w_shape, w_strides,
      s_strides, b_strides;
    * when the template parameter batched is true, calls adjust_matrix_offsets with
      out_vec_size as the output_stride argument and no index arrays, which is why the link
      targets the overload defined at listing line 1213;
    * then forwards w, scales, biases, x, y, in_vec_size, out_vec_size, tid, quad_gid and quad_lid
      to qmv_quad_impl instantiated with T, group_size, bits and D.
  The fold region below (id foldopen01306) wraps the kernel from its signature to its closing brace.
-->
< div class = "line" > < a id = "l01305" name = "l01305" > < / a > < span class = "lineno" > 1305< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, < span class = "keywordtype" > int< / span > group_size, < span class = "keywordtype" > int< / span > bits, < span class = "keywordtype" > int< / span > D, < span class = "keywordtype" > bool< / span > batched> < / div >
< div class = "foldopen" id = "foldopen01306" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01306" name = "l01306" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a7ce5f53a4d6d1555e9402d545408d0ad" > 1306< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a7ce5f53a4d6d1555e9402d545408d0ad" > qmv_quad< / a > (< / div >
< div class = "line" > < a id = "l01307" name = "l01307" > < / a > < span class = "lineno" > 1307< / span > < span class = "keyword" > const< / span > device uint32_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l01308" name = "l01308" > < / a > < span class = "lineno" > 1308< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l01309" name = "l01309" > < / a > < span class = "lineno" > 1309< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l01310" name = "l01310" > < / a > < span class = "lineno" > 1310< / span > < span class = "keyword" > const< / span > device T* x [[buffer(3)]],< / div >
< div class = "line" > < a id = "l01311" name = "l01311" > < / a > < span class = "lineno" > 1311< / span > device T* y [[buffer(4)]],< / div >
< div class = "line" > < a id = "l01312" name = "l01312" > < / a > < span class = "lineno" > 1312< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & in_vec_size [[buffer(5)]],< / div >
< div class = "line" > < a id = "l01313" name = "l01313" > < / a > < span class = "lineno" > 1313< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & out_vec_size [[buffer(6)]],< / div >
< div class = "line" > < a id = "l01314" name = "l01314" > < / a > < span class = "lineno" > 1314< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims [[buffer(7)]],< / div >
< div class = "line" > < a id = "l01315" name = "l01315" > < / a > < span class = "lineno" > 1315< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape [[buffer(8)]],< / div >
< div class = "line" > < a id = "l01316" name = "l01316" > < / a > < span class = "lineno" > 1316< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides [[buffer(9)]],< / div >
< div class = "line" > < a id = "l01317" name = "l01317" > < / a > < span class = "lineno" > 1317< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims [[buffer(10)]],< / div >
< div class = "line" > < a id = "l01318" name = "l01318" > < / a > < span class = "lineno" > 1318< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape [[buffer(11)]],< / div >
< div class = "line" > < a id = "l01319" name = "l01319" > < / a > < span class = "lineno" > 1319< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides [[buffer(12)]],< / div >
< div class = "line" > < a id = "l01320" name = "l01320" > < / a > < span class = "lineno" > 1320< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides [[buffer(13)]],< / div >
< div class = "line" > < a id = "l01321" name = "l01321" > < / a > < span class = "lineno" > 1321< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides [[buffer(14)]],< / div >
< div class = "line" > < a id = "l01322" name = "l01322" > < / a > < span class = "lineno" > 1322< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01323" name = "l01323" > < / a > < span class = "lineno" > 1323< / span > uint quad_gid [[quadgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01324" name = "l01324" > < / a > < span class = "lineno" > 1324< / span > uint quad_lid [[thread_index_in_quadgroup]]) {< / div >
< div class = "line" > < a id = "l01325" name = "l01325" > < / a > < span class = "lineno" > 1325< / span > < span class = "keywordflow" > if< / span > (batched) {< / div >
< div class = "line" > < a id = "l01326" name = "l01326" > < / a > < span class = "lineno" > 1326< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< T> < / a > (< / div >
< div class = "line" > < a id = "l01327" name = "l01327" > < / a > < span class = "lineno" > 1327< / span > x,< / div >
< div class = "line" > < a id = "l01328" name = "l01328" > < / a > < span class = "lineno" > 1328< / span > w,< / div >
< div class = "line" > < a id = "l01329" name = "l01329" > < / a > < span class = "lineno" > 1329< / span > scales,< / div >
< div class = "line" > < a id = "l01330" name = "l01330" > < / a > < span class = "lineno" > 1330< / span > biases,< / div >
< div class = "line" > < a id = "l01331" name = "l01331" > < / a > < span class = "lineno" > 1331< / span > y,< / div >
< div class = "line" > < a id = "l01332" name = "l01332" > < / a > < span class = "lineno" > 1332< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01333" name = "l01333" > < / a > < span class = "lineno" > 1333< / span > x_batch_ndims,< / div >
< div class = "line" > < a id = "l01334" name = "l01334" > < / a > < span class = "lineno" > 1334< / span > x_shape,< / div >
< div class = "line" > < a id = "l01335" name = "l01335" > < / a > < span class = "lineno" > 1335< / span > x_strides,< / div >
< div class = "line" > < a id = "l01336" name = "l01336" > < / a > < span class = "lineno" > 1336< / span > w_batch_ndims,< / div >
< div class = "line" > < a id = "l01337" name = "l01337" > < / a > < span class = "lineno" > 1337< / span > w_shape,< / div >
< div class = "line" > < a id = "l01338" name = "l01338" > < / a > < span class = "lineno" > 1338< / span > w_strides,< / div >
< div class = "line" > < a id = "l01339" name = "l01339" > < / a > < span class = "lineno" > 1339< / span > s_strides,< / div >
< div class = "line" > < a id = "l01340" name = "l01340" > < / a > < span class = "lineno" > 1340< / span > b_strides,< / div >
< div class = "line" > < a id = "l01341" name = "l01341" > < / a > < span class = "lineno" > 1341< / span > tid);< / div >
< div class = "line" > < a id = "l01342" name = "l01342" > < / a > < span class = "lineno" > 1342< / span > }< / div >
< div class = "line" > < a id = "l01343" name = "l01343" > < / a > < span class = "lineno" > 1343< / span > < a class = "code hl_function" href = "quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef" > qmv_quad_impl< T, group_size, bits, D> < / a > (< / div >
< div class = "line" > < a id = "l01344" name = "l01344" > < / a > < span class = "lineno" > 1344< / span > w,< / div >
< div class = "line" > < a id = "l01345" name = "l01345" > < / a > < span class = "lineno" > 1345< / span > scales,< / div >
< div class = "line" > < a id = "l01346" name = "l01346" > < / a > < span class = "lineno" > 1346< / span > biases,< / div >
< div class = "line" > < a id = "l01347" name = "l01347" > < / a > < span class = "lineno" > 1347< / span > x,< / div >
< div class = "line" > < a id = "l01348" name = "l01348" > < / a > < span class = "lineno" > 1348< / span > y,< / div >
< div class = "line" > < a id = "l01349" name = "l01349" > < / a > < span class = "lineno" > 1349< / span > in_vec_size,< / div >
< div class = "line" > < a id = "l01350" name = "l01350" > < / a > < span class = "lineno" > 1350< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01351" name = "l01351" > < / a > < span class = "lineno" > 1351< / span > tid,< / div >
< div class = "line" > < a id = "l01352" name = "l01352" > < / a > < span class = "lineno" > 1352< / span > quad_gid,< / div >
< div class = "line" > < a id = "l01353" name = "l01353" > < / a > < span class = "lineno" > 1353< / span > quad_lid);< / div >
< div class = "line" > < a id = "l01354" name = "l01354" > < / a > < span class = "lineno" > 1354< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01355" name = "l01355" > < / a > < span class = "lineno" > 1355< / span > < / div >
< div class = "line" > < a id = "l01356" name = "l01356" > < / a > < span class = "lineno" > 1356< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, < span class = "keywordtype" > int< / span > group_size, < span class = "keywordtype" > int< / span > bits, < span class = "keywordtype" > bool< / span > batched> < / div >
< div class = "foldopen" id = "foldopen01357" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01357" name = "l01357" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a7bd1d9f17c86c8fd34ec13678cff755f" > 1357< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a7bd1d9f17c86c8fd34ec13678cff755f" > qmv_fast< / a > (< / div >
< div class = "line" > < a id = "l01358" name = "l01358" > < / a > < span class = "lineno" > 1358< / span > < span class = "keyword" > const< / span > device uint32_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l01359" name = "l01359" > < / a > < span class = "lineno" > 1359< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l01360" name = "l01360" > < / a > < span class = "lineno" > 1360< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l01361" name = "l01361" > < / a > < span class = "lineno" > 1361< / span > < span class = "keyword" > const< / span > device T* x [[buffer(3)]],< / div >
< div class = "line" > < a id = "l01362" name = "l01362" > < / a > < span class = "lineno" > 1362< / span > device T* y [[buffer(4)]],< / div >
< div class = "line" > < a id = "l01363" name = "l01363" > < / a > < span class = "lineno" > 1363< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & in_vec_size [[buffer(5)]],< / div >
< div class = "line" > < a id = "l01364" name = "l01364" > < / a > < span class = "lineno" > 1364< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & out_vec_size [[buffer(6)]],< / div >
< div class = "line" > < a id = "l01365" name = "l01365" > < / a > < span class = "lineno" > 1365< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims [[buffer(7)]],< / div >
< div class = "line" > < a id = "l01366" name = "l01366" > < / a > < span class = "lineno" > 1366< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape [[buffer(8)]],< / div >
< div class = "line" > < a id = "l01367" name = "l01367" > < / a > < span class = "lineno" > 1367< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides [[buffer(9)]],< / div >
< div class = "line" > < a id = "l01368" name = "l01368" > < / a > < span class = "lineno" > 1368< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims [[buffer(10)]],< / div >
< div class = "line" > < a id = "l01369" name = "l01369" > < / a > < span class = "lineno" > 1369< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape [[buffer(11)]],< / div >
< div class = "line" > < a id = "l01370" name = "l01370" > < / a > < span class = "lineno" > 1370< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides [[buffer(12)]],< / div >
< div class = "line" > < a id = "l01371" name = "l01371" > < / a > < span class = "lineno" > 1371< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides [[buffer(13)]],< / div >
< div class = "line" > < a id = "l01372" name = "l01372" > < / a > < span class = "lineno" > 1372< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides [[buffer(14)]],< / div >
< div class = "line" > < a id = "l01373" name = "l01373" > < / a > < span class = "lineno" > 1373< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01374" name = "l01374" > < / a > < span class = "lineno" > 1374< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01375" name = "l01375" > < / a > < span class = "lineno" > 1375< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l01376" name = "l01376" > < / a > < span class = "lineno" > 1376< / span > < span class = "keywordflow" > if< / span > (batched) {< / div >
< div class = "line" > < a id = "l01377" name = "l01377" > < / a > < span class = "lineno" > 1377< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< T> < / a > (< / div >
< div class = "line" > < a id = "l01378" name = "l01378" > < / a > < span class = "lineno" > 1378< / span > x,< / div >
< div class = "line" > < a id = "l01379" name = "l01379" > < / a > < span class = "lineno" > 1379< / span > w,< / div >
< div class = "line" > < a id = "l01380" name = "l01380" > < / a > < span class = "lineno" > 1380< / span > scales,< / div >
< div class = "line" > < a id = "l01381" name = "l01381" > < / a > < span class = "lineno" > 1381< / span > biases,< / div >
< div class = "line" > < a id = "l01382" name = "l01382" > < / a > < span class = "lineno" > 1382< / span > y,< / div >
< div class = "line" > < a id = "l01383" name = "l01383" > < / a > < span class = "lineno" > 1383< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01384" name = "l01384" > < / a > < span class = "lineno" > 1384< / span > x_batch_ndims,< / div >
< div class = "line" > < a id = "l01385" name = "l01385" > < / a > < span class = "lineno" > 1385< / span > x_shape,< / div >
< div class = "line" > < a id = "l01386" name = "l01386" > < / a > < span class = "lineno" > 1386< / span > x_strides,< / div >
< div class = "line" > < a id = "l01387" name = "l01387" > < / a > < span class = "lineno" > 1387< / span > w_batch_ndims,< / div >
< div class = "line" > < a id = "l01388" name = "l01388" > < / a > < span class = "lineno" > 1388< / span > w_shape,< / div >
< div class = "line" > < a id = "l01389" name = "l01389" > < / a > < span class = "lineno" > 1389< / span > w_strides,< / div >
< div class = "line" > < a id = "l01390" name = "l01390" > < / a > < span class = "lineno" > 1390< / span > s_strides,< / div >
< div class = "line" > < a id = "l01391" name = "l01391" > < / a > < span class = "lineno" > 1391< / span > b_strides,< / div >
< div class = "line" > < a id = "l01392" name = "l01392" > < / a > < span class = "lineno" > 1392< / span > tid);< / div >
< div class = "line" > < a id = "l01393" name = "l01393" > < / a > < span class = "lineno" > 1393< / span > }< / div >
< div class = "line" > < a id = "l01394" name = "l01394" > < / a > < span class = "lineno" > 1394< / span > < a class = "code hl_function" href = "quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81" > qmv_fast_impl< T, group_size, bits> < / a > (< / div >
< div class = "line" > < a id = "l01395" name = "l01395" > < / a > < span class = "lineno" > 1395< / span > w,< / div >
< div class = "line" > < a id = "l01396" name = "l01396" > < / a > < span class = "lineno" > 1396< / span > scales,< / div >
< div class = "line" > < a id = "l01397" name = "l01397" > < / a > < span class = "lineno" > 1397< / span > biases,< / div >
< div class = "line" > < a id = "l01398" name = "l01398" > < / a > < span class = "lineno" > 1398< / span > x,< / div >
< div class = "line" > < a id = "l01399" name = "l01399" > < / a > < span class = "lineno" > 1399< / span > y,< / div >
< div class = "line" > < a id = "l01400" name = "l01400" > < / a > < span class = "lineno" > 1400< / span > in_vec_size,< / div >
< div class = "line" > < a id = "l01401" name = "l01401" > < / a > < span class = "lineno" > 1401< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01402" name = "l01402" > < / a > < span class = "lineno" > 1402< / span > tid,< / div >
< div class = "line" > < a id = "l01403" name = "l01403" > < / a > < span class = "lineno" > 1403< / span > simd_gid,< / div >
< div class = "line" > < a id = "l01404" name = "l01404" > < / a > < span class = "lineno" > 1404< / span > simd_lid);< / div >
< div class = "line" > < a id = "l01405" name = "l01405" > < / a > < span class = "lineno" > 1405< / span > }< / div >
2024-10-26 04:23:45 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01406" name = "l01406" > < / a > < span class = "lineno" > 1406< / span > < / div >
< div class = "line" > < a id = "l01407" name = "l01407" > < / a > < span class = "lineno" > 1407< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, const < span class = "keywordtype" > int< / span > group_size, const < span class = "keywordtype" > int< / span > bits, < span class = "keywordtype" > bool< / span > batched> < / div >
< div class = "foldopen" id = "foldopen01408" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01408" name = "l01408" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a639c50a08b5cf57e8be5279a116274bd" > 1408< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a639c50a08b5cf57e8be5279a116274bd" > qmv< / a > (< / div >
< div class = "line" > < a id = "l01409" name = "l01409" > < / a > < span class = "lineno" > 1409< / span > < span class = "keyword" > const< / span > device uint32_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l01410" name = "l01410" > < / a > < span class = "lineno" > 1410< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l01411" name = "l01411" > < / a > < span class = "lineno" > 1411< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l01412" name = "l01412" > < / a > < span class = "lineno" > 1412< / span > < span class = "keyword" > const< / span > device T* x [[buffer(3)]],< / div >
< div class = "line" > < a id = "l01413" name = "l01413" > < / a > < span class = "lineno" > 1413< / span > device T* y [[buffer(4)]],< / div >
< div class = "line" > < a id = "l01414" name = "l01414" > < / a > < span class = "lineno" > 1414< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & in_vec_size [[buffer(5)]],< / div >
< div class = "line" > < a id = "l01415" name = "l01415" > < / a > < span class = "lineno" > 1415< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & out_vec_size [[buffer(6)]],< / div >
< div class = "line" > < a id = "l01416" name = "l01416" > < / a > < span class = "lineno" > 1416< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims [[buffer(7)]],< / div >
< div class = "line" > < a id = "l01417" name = "l01417" > < / a > < span class = "lineno" > 1417< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape [[buffer(8)]],< / div >
< div class = "line" > < a id = "l01418" name = "l01418" > < / a > < span class = "lineno" > 1418< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides [[buffer(9)]],< / div >
< div class = "line" > < a id = "l01419" name = "l01419" > < / a > < span class = "lineno" > 1419< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims [[buffer(10)]],< / div >
< div class = "line" > < a id = "l01420" name = "l01420" > < / a > < span class = "lineno" > 1420< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape [[buffer(11)]],< / div >
< div class = "line" > < a id = "l01421" name = "l01421" > < / a > < span class = "lineno" > 1421< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides [[buffer(12)]],< / div >
< div class = "line" > < a id = "l01422" name = "l01422" > < / a > < span class = "lineno" > 1422< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides [[buffer(13)]],< / div >
< div class = "line" > < a id = "l01423" name = "l01423" > < / a > < span class = "lineno" > 1423< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides [[buffer(14)]],< / div >
< div class = "line" > < a id = "l01424" name = "l01424" > < / a > < span class = "lineno" > 1424< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01425" name = "l01425" > < / a > < span class = "lineno" > 1425< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01426" name = "l01426" > < / a > < span class = "lineno" > 1426< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l01427" name = "l01427" > < / a > < span class = "lineno" > 1427< / span > < span class = "keywordflow" > if< / span > (batched) {< / div >
< div class = "line" > < a id = "l01428" name = "l01428" > < / a > < span class = "lineno" > 1428< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< T> < / a > (< / div >
< div class = "line" > < a id = "l01429" name = "l01429" > < / a > < span class = "lineno" > 1429< / span > x,< / div >
< div class = "line" > < a id = "l01430" name = "l01430" > < / a > < span class = "lineno" > 1430< / span > w,< / div >
< div class = "line" > < a id = "l01431" name = "l01431" > < / a > < span class = "lineno" > 1431< / span > scales,< / div >
< div class = "line" > < a id = "l01432" name = "l01432" > < / a > < span class = "lineno" > 1432< / span > biases,< / div >
< div class = "line" > < a id = "l01433" name = "l01433" > < / a > < span class = "lineno" > 1433< / span > y,< / div >
< div class = "line" > < a id = "l01434" name = "l01434" > < / a > < span class = "lineno" > 1434< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01435" name = "l01435" > < / a > < span class = "lineno" > 1435< / span > x_batch_ndims,< / div >
< div class = "line" > < a id = "l01436" name = "l01436" > < / a > < span class = "lineno" > 1436< / span > x_shape,< / div >
< div class = "line" > < a id = "l01437" name = "l01437" > < / a > < span class = "lineno" > 1437< / span > x_strides,< / div >
< div class = "line" > < a id = "l01438" name = "l01438" > < / a > < span class = "lineno" > 1438< / span > w_batch_ndims,< / div >
< div class = "line" > < a id = "l01439" name = "l01439" > < / a > < span class = "lineno" > 1439< / span > w_shape,< / div >
< div class = "line" > < a id = "l01440" name = "l01440" > < / a > < span class = "lineno" > 1440< / span > w_strides,< / div >
< div class = "line" > < a id = "l01441" name = "l01441" > < / a > < span class = "lineno" > 1441< / span > s_strides,< / div >
< div class = "line" > < a id = "l01442" name = "l01442" > < / a > < span class = "lineno" > 1442< / span > b_strides,< / div >
< div class = "line" > < a id = "l01443" name = "l01443" > < / a > < span class = "lineno" > 1443< / span > tid);< / div >
< div class = "line" > < a id = "l01444" name = "l01444" > < / a > < span class = "lineno" > 1444< / span > }< / div >
< div class = "line" > < a id = "l01445" name = "l01445" > < / a > < span class = "lineno" > 1445< / span > < a class = "code hl_function" href = "quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd" > qmv_impl< T, group_size, bits> < / a > (< / div >
< div class = "line" > < a id = "l01446" name = "l01446" > < / a > < span class = "lineno" > 1446< / span > w,< / div >
< div class = "line" > < a id = "l01447" name = "l01447" > < / a > < span class = "lineno" > 1447< / span > scales,< / div >
< div class = "line" > < a id = "l01448" name = "l01448" > < / a > < span class = "lineno" > 1448< / span > biases,< / div >
< div class = "line" > < a id = "l01449" name = "l01449" > < / a > < span class = "lineno" > 1449< / span > x,< / div >
< div class = "line" > < a id = "l01450" name = "l01450" > < / a > < span class = "lineno" > 1450< / span > y,< / div >
< div class = "line" > < a id = "l01451" name = "l01451" > < / a > < span class = "lineno" > 1451< / span > in_vec_size,< / div >
< div class = "line" > < a id = "l01452" name = "l01452" > < / a > < span class = "lineno" > 1452< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01453" name = "l01453" > < / a > < span class = "lineno" > 1453< / span > tid,< / div >
< div class = "line" > < a id = "l01454" name = "l01454" > < / a > < span class = "lineno" > 1454< / span > simd_gid,< / div >
< div class = "line" > < a id = "l01455" name = "l01455" > < / a > < span class = "lineno" > 1455< / span > simd_lid);< / div >
< div class = "line" > < a id = "l01456" name = "l01456" > < / a > < span class = "lineno" > 1456< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01457" name = "l01457" > < / a > < span class = "lineno" > 1457< / span > < / div >
< div class = "line" > < a id = "l01458" name = "l01458" > < / a > < span class = "lineno" > 1458< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, const < span class = "keywordtype" > int< / span > group_size, const < span class = "keywordtype" > int< / span > bits, < span class = "keywordtype" > bool< / span > batched> < / div >
< div class = "foldopen" id = "foldopen01459" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01459" name = "l01459" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#ad84f7d5ab9e32dbbe3ca759ae5d5d5c5" > 1459< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#ad84f7d5ab9e32dbbe3ca759ae5d5d5c5" > qvm< / a > (< / div >
< div class = "line" > < a id = "l01460" name = "l01460" > < / a > < span class = "lineno" > 1460< / span > < span class = "keyword" > const< / span > device uint32_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l01461" name = "l01461" > < / a > < span class = "lineno" > 1461< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l01462" name = "l01462" > < / a > < span class = "lineno" > 1462< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l01463" name = "l01463" > < / a > < span class = "lineno" > 1463< / span > < span class = "keyword" > const< / span > device T* x [[buffer(3)]],< / div >
< div class = "line" > < a id = "l01464" name = "l01464" > < / a > < span class = "lineno" > 1464< / span > device T* y [[buffer(4)]],< / div >
< div class = "line" > < a id = "l01465" name = "l01465" > < / a > < span class = "lineno" > 1465< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & in_vec_size [[buffer(5)]],< / div >
< div class = "line" > < a id = "l01466" name = "l01466" > < / a > < span class = "lineno" > 1466< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & out_vec_size [[buffer(6)]],< / div >
< div class = "line" > < a id = "l01467" name = "l01467" > < / a > < span class = "lineno" > 1467< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims [[buffer(7)]],< / div >
< div class = "line" > < a id = "l01468" name = "l01468" > < / a > < span class = "lineno" > 1468< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape [[buffer(8)]],< / div >
< div class = "line" > < a id = "l01469" name = "l01469" > < / a > < span class = "lineno" > 1469< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides [[buffer(9)]],< / div >
< div class = "line" > < a id = "l01470" name = "l01470" > < / a > < span class = "lineno" > 1470< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims [[buffer(10)]],< / div >
< div class = "line" > < a id = "l01471" name = "l01471" > < / a > < span class = "lineno" > 1471< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape [[buffer(11)]],< / div >
< div class = "line" > < a id = "l01472" name = "l01472" > < / a > < span class = "lineno" > 1472< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides [[buffer(12)]],< / div >
< div class = "line" > < a id = "l01473" name = "l01473" > < / a > < span class = "lineno" > 1473< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides [[buffer(13)]],< / div >
< div class = "line" > < a id = "l01474" name = "l01474" > < / a > < span class = "lineno" > 1474< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides [[buffer(14)]],< / div >
< div class = "line" > < a id = "l01475" name = "l01475" > < / a > < span class = "lineno" > 1475< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01476" name = "l01476" > < / a > < span class = "lineno" > 1476< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01477" name = "l01477" > < / a > < span class = "lineno" > 1477< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l01478" name = "l01478" > < / a > < span class = "lineno" > 1478< / span > < span class = "keywordflow" > if< / span > (batched) {< / div >
< div class = "line" > < a id = "l01479" name = "l01479" > < / a > < span class = "lineno" > 1479< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< T> < / a > (< / div >
< div class = "line" > < a id = "l01480" name = "l01480" > < / a > < span class = "lineno" > 1480< / span > x,< / div >
< div class = "line" > < a id = "l01481" name = "l01481" > < / a > < span class = "lineno" > 1481< / span > w,< / div >
< div class = "line" > < a id = "l01482" name = "l01482" > < / a > < span class = "lineno" > 1482< / span > scales,< / div >
< div class = "line" > < a id = "l01483" name = "l01483" > < / a > < span class = "lineno" > 1483< / span > biases,< / div >
< div class = "line" > < a id = "l01484" name = "l01484" > < / a > < span class = "lineno" > 1484< / span > y,< / div >
< div class = "line" > < a id = "l01485" name = "l01485" > < / a > < span class = "lineno" > 1485< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01486" name = "l01486" > < / a > < span class = "lineno" > 1486< / span > x_batch_ndims,< / div >
< div class = "line" > < a id = "l01487" name = "l01487" > < / a > < span class = "lineno" > 1487< / span > x_shape,< / div >
< div class = "line" > < a id = "l01488" name = "l01488" > < / a > < span class = "lineno" > 1488< / span > x_strides,< / div >
< div class = "line" > < a id = "l01489" name = "l01489" > < / a > < span class = "lineno" > 1489< / span > w_batch_ndims,< / div >
< div class = "line" > < a id = "l01490" name = "l01490" > < / a > < span class = "lineno" > 1490< / span > w_shape,< / div >
< div class = "line" > < a id = "l01491" name = "l01491" > < / a > < span class = "lineno" > 1491< / span > w_strides,< / div >
< div class = "line" > < a id = "l01492" name = "l01492" > < / a > < span class = "lineno" > 1492< / span > s_strides,< / div >
< div class = "line" > < a id = "l01493" name = "l01493" > < / a > < span class = "lineno" > 1493< / span > b_strides,< / div >
< div class = "line" > < a id = "l01494" name = "l01494" > < / a > < span class = "lineno" > 1494< / span > tid);< / div >
< div class = "line" > < a id = "l01495" name = "l01495" > < / a > < span class = "lineno" > 1495< / span > }< / div >
< div class = "line" > < a id = "l01496" name = "l01496" > < / a > < span class = "lineno" > 1496< / span > < a class = "code hl_function" href = "quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a" > qvm_impl< T, group_size, bits> < / a > (< / div >
< div class = "line" > < a id = "l01497" name = "l01497" > < / a > < span class = "lineno" > 1497< / span > w,< / div >
< div class = "line" > < a id = "l01498" name = "l01498" > < / a > < span class = "lineno" > 1498< / span > scales,< / div >
< div class = "line" > < a id = "l01499" name = "l01499" > < / a > < span class = "lineno" > 1499< / span > biases,< / div >
< div class = "line" > < a id = "l01500" name = "l01500" > < / a > < span class = "lineno" > 1500< / span > x,< / div >
< div class = "line" > < a id = "l01501" name = "l01501" > < / a > < span class = "lineno" > 1501< / span > y,< / div >
< div class = "line" > < a id = "l01502" name = "l01502" > < / a > < span class = "lineno" > 1502< / span > in_vec_size,< / div >
< div class = "line" > < a id = "l01503" name = "l01503" > < / a > < span class = "lineno" > 1503< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01504" name = "l01504" > < / a > < span class = "lineno" > 1504< / span > tid,< / div >
< div class = "line" > < a id = "l01505" name = "l01505" > < / a > < span class = "lineno" > 1505< / span > simd_gid,< / div >
< div class = "line" > < a id = "l01506" name = "l01506" > < / a > < span class = "lineno" > 1506< / span > simd_lid);< / div >
< div class = "line" > < a id = "l01507" name = "l01507" > < / a > < span class = "lineno" > 1507< / span > }< / div >
2024-10-26 04:23:45 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01508" name = "l01508" > < / a > < span class = "lineno" > 1508< / span > < / div >
< div class = "line" > < a id = "l01509" name = "l01509" > < / a > < span class = "lineno" > 1509< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, const < span class = "keywordtype" > int< / span > group_size, const < span class = "keywordtype" > int< / span > bits, < span class = "keywordtype" > int< / span > split_k = 32> < / div >
< div class = "foldopen" id = "foldopen01510" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01510" name = "l01510" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#ab8243818512d6078d23e6ffb65fd7bb8" > 1510< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#ab8243818512d6078d23e6ffb65fd7bb8" > qvm_split_k< / a > (< / div >
< div class = "line" > < a id = "l01511" name = "l01511" > < / a > < span class = "lineno" > 1511< / span > < span class = "keyword" > const< / span > device uint32_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l01512" name = "l01512" > < / a > < span class = "lineno" > 1512< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l01513" name = "l01513" > < / a > < span class = "lineno" > 1513< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l01514" name = "l01514" > < / a > < span class = "lineno" > 1514< / span > < span class = "keyword" > const< / span > device T* x [[buffer(3)]],< / div >
< div class = "line" > < a id = "l01515" name = "l01515" > < / a > < span class = "lineno" > 1515< / span > device T* y [[buffer(4)]],< / div >
< div class = "line" > < a id = "l01516" name = "l01516" > < / a > < span class = "lineno" > 1516< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & in_vec_size [[buffer(5)]],< / div >
< div class = "line" > < a id = "l01517" name = "l01517" > < / a > < span class = "lineno" > 1517< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & out_vec_size [[buffer(6)]],< / div >
< div class = "line" > < a id = "l01518" name = "l01518" > < / a > < span class = "lineno" > 1518< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims [[buffer(7)]],< / div >
< div class = "line" > < a id = "l01519" name = "l01519" > < / a > < span class = "lineno" > 1519< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape [[buffer(8)]],< / div >
< div class = "line" > < a id = "l01520" name = "l01520" > < / a > < span class = "lineno" > 1520< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides [[buffer(9)]],< / div >
< div class = "line" > < a id = "l01521" name = "l01521" > < / a > < span class = "lineno" > 1521< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims [[buffer(10)]],< / div >
< div class = "line" > < a id = "l01522" name = "l01522" > < / a > < span class = "lineno" > 1522< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape [[buffer(11)]],< / div >
< div class = "line" > < a id = "l01523" name = "l01523" > < / a > < span class = "lineno" > 1523< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides [[buffer(12)]],< / div >
< div class = "line" > < a id = "l01524" name = "l01524" > < / a > < span class = "lineno" > 1524< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides [[buffer(13)]],< / div >
< div class = "line" > < a id = "l01525" name = "l01525" > < / a > < span class = "lineno" > 1525< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides [[buffer(14)]],< / div >
< div class = "line" > < a id = "l01526" name = "l01526" > < / a > < span class = "lineno" > 1526< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & final_block_size [[buffer(15)]],< / div >
< div class = "line" > < a id = "l01527" name = "l01527" > < / a > < span class = "lineno" > 1527< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01528" name = "l01528" > < / a > < span class = "lineno" > 1528< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01529" name = "l01529" > < / a > < span class = "lineno" > 1529< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l01530" name = "l01530" > < / a > < span class = "lineno" > 1530< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< T> < / a > (< / div >
< div class = "line" > < a id = "l01531" name = "l01531" > < / a > < span class = "lineno" > 1531< / span > x,< / div >
< div class = "line" > < a id = "l01532" name = "l01532" > < / a > < span class = "lineno" > 1532< / span > w,< / div >
< div class = "line" > < a id = "l01533" name = "l01533" > < / a > < span class = "lineno" > 1533< / span > scales,< / div >
< div class = "line" > < a id = "l01534" name = "l01534" > < / a > < span class = "lineno" > 1534< / span > biases,< / div >
< div class = "line" > < a id = "l01535" name = "l01535" > < / a > < span class = "lineno" > 1535< / span > y,< / div >
< div class = "line" > < a id = "l01536" name = "l01536" > < / a > < span class = "lineno" > 1536< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01537" name = "l01537" > < / a > < span class = "lineno" > 1537< / span > x_batch_ndims,< / div >
< div class = "line" > < a id = "l01538" name = "l01538" > < / a > < span class = "lineno" > 1538< / span > x_shape,< / div >
< div class = "line" > < a id = "l01539" name = "l01539" > < / a > < span class = "lineno" > 1539< / span > x_strides,< / div >
< div class = "line" > < a id = "l01540" name = "l01540" > < / a > < span class = "lineno" > 1540< / span > w_batch_ndims,< / div >
< div class = "line" > < a id = "l01541" name = "l01541" > < / a > < span class = "lineno" > 1541< / span > w_shape,< / div >
< div class = "line" > < a id = "l01542" name = "l01542" > < / a > < span class = "lineno" > 1542< / span > w_strides,< / div >
< div class = "line" > < a id = "l01543" name = "l01543" > < / a > < span class = "lineno" > 1543< / span > s_strides,< / div >
< div class = "line" > < a id = "l01544" name = "l01544" > < / a > < span class = "lineno" > 1544< / span > b_strides,< / div >
< div class = "line" > < a id = "l01545" name = "l01545" > < / a > < span class = "lineno" > 1545< / span > tid);< / div >
< div class = "line" > < a id = "l01546" name = "l01546" > < / a > < span class = "lineno" > 1546< / span > < / div >
< div class = "line" > < a id = "l01547" name = "l01547" > < / a > < span class = "lineno" > 1547< / span > < span class = "comment" > // When (in_vec_size % split_k != 0) the final block needs to be smaller< / span > < / div >
< div class = "line" > < a id = "l01548" name = "l01548" > < / a > < span class = "lineno" > 1548< / span > < span class = "keywordtype" > int< / span > in_vec_size_adj =< / div >
< div class = "line" > < a id = "l01549" name = "l01549" > < / a > < span class = "lineno" > 1549< / span > tid.z % split_k == split_k - 1 ? final_block_size : in_vec_size;< / div >
< div class = "line" > < a id = "l01550" name = "l01550" > < / a > < span class = "lineno" > 1550< / span > < / div >
< div class = "line" > < a id = "l01551" name = "l01551" > < / a > < span class = "lineno" > 1551< / span > < a class = "code hl_function" href = "quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a" > qvm_impl< T, group_size, bits> < / a > (< / div >
< div class = "line" > < a id = "l01552" name = "l01552" > < / a > < span class = "lineno" > 1552< / span > w,< / div >
< div class = "line" > < a id = "l01553" name = "l01553" > < / a > < span class = "lineno" > 1553< / span > scales,< / div >
< div class = "line" > < a id = "l01554" name = "l01554" > < / a > < span class = "lineno" > 1554< / span > biases,< / div >
< div class = "line" > < a id = "l01555" name = "l01555" > < / a > < span class = "lineno" > 1555< / span > x,< / div >
< div class = "line" > < a id = "l01556" name = "l01556" > < / a > < span class = "lineno" > 1556< / span > y,< / div >
< div class = "line" > < a id = "l01557" name = "l01557" > < / a > < span class = "lineno" > 1557< / span > in_vec_size_adj,< / div >
< div class = "line" > < a id = "l01558" name = "l01558" > < / a > < span class = "lineno" > 1558< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01559" name = "l01559" > < / a > < span class = "lineno" > 1559< / span > tid,< / div >
< div class = "line" > < a id = "l01560" name = "l01560" > < / a > < span class = "lineno" > 1560< / span > simd_gid,< / div >
< div class = "line" > < a id = "l01561" name = "l01561" > < / a > < span class = "lineno" > 1561< / span > simd_lid);< / div >
< div class = "line" > < a id = "l01562" name = "l01562" > < / a > < span class = "lineno" > 1562< / span > }< / div >
2024-10-26 04:23:45 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01563" name = "l01563" > < / a > < span class = "lineno" > 1563< / span > < / div >
< div class = "line" > < a id = "l01564" name = "l01564" > < / a > < span class = "lineno" > 1564< / span > < span class = "keyword" > template< / span > < < / div >
< div class = "line" > < a id = "l01565" name = "l01565" > < / a > < span class = "lineno" > 1565< / span > < span class = "keyword" > typename< / span > T,< / div >
< div class = "line" > < a id = "l01566" name = "l01566" > < / a > < span class = "lineno" > 1566< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > group_size,< / div >
< div class = "line" > < a id = "l01567" name = "l01567" > < / a > < span class = "lineno" > 1567< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > < a class = "code hl_function" href = "namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825" > bits< / a > ,< / div >
< div class = "line" > < a id = "l01568" name = "l01568" > < / a > < span class = "lineno" > 1568< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > bool< / span > aligned_N,< / div >
< div class = "line" > < a id = "l01569" name = "l01569" > < / a > < span class = "lineno" > 1569< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > bool< / span > batched,< / div >
< div class = "line" > < a id = "l01570" name = "l01570" > < / a > < span class = "lineno" > 1570< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BM = 32,< / div >
< div class = "line" > < a id = "l01571" name = "l01571" > < / a > < span class = "lineno" > 1571< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BK = 32,< / div >
< div class = "line" > < a id = "l01572" name = "l01572" > < / a > < span class = "lineno" > 1572< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BN = 32> < / div >
< div class = "foldopen" id = "foldopen01573" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01573" name = "l01573" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#abe2e3ef0ee4ec2cb61dc5330ad463d10" > 1573< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#abe2e3ef0ee4ec2cb61dc5330ad463d10" > qmm_t< / a > (< / div >
< div class = "line" > < a id = "l01574" name = "l01574" > < / a > < span class = "lineno" > 1574< / span > < span class = "keyword" > const< / span > device uint32_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l01575" name = "l01575" > < / a > < span class = "lineno" > 1575< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l01576" name = "l01576" > < / a > < span class = "lineno" > 1576< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l01577" name = "l01577" > < / a > < span class = "lineno" > 1577< / span > < span class = "keyword" > const< / span > device T* x [[buffer(3)]],< / div >
< div class = "line" > < a id = "l01578" name = "l01578" > < / a > < span class = "lineno" > 1578< / span > device T* y [[buffer(4)]],< / div >
< div class = "line" > < a id = "l01579" name = "l01579" > < / a > < span class = "lineno" > 1579< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & K [[buffer(5)]],< / div >
< div class = "line" > < a id = "l01580" name = "l01580" > < / a > < span class = "lineno" > 1580< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & N [[buffer(6)]],< / div >
< div class = "line" > < a id = "l01581" name = "l01581" > < / a > < span class = "lineno" > 1581< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & M [[buffer(7)]],< / div >
< div class = "line" > < a id = "l01582" name = "l01582" > < / a > < span class = "lineno" > 1582< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims [[buffer(8)]],< / div >
< div class = "line" > < a id = "l01583" name = "l01583" > < / a > < span class = "lineno" > 1583< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape [[buffer(9)]],< / div >
< div class = "line" > < a id = "l01584" name = "l01584" > < / a > < span class = "lineno" > 1584< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides [[buffer(10)]],< / div >
< div class = "line" > < a id = "l01585" name = "l01585" > < / a > < span class = "lineno" > 1585< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims [[buffer(11)]],< / div >
< div class = "line" > < a id = "l01586" name = "l01586" > < / a > < span class = "lineno" > 1586< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape [[buffer(12)]],< / div >
< div class = "line" > < a id = "l01587" name = "l01587" > < / a > < span class = "lineno" > 1587< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides [[buffer(13)]],< / div >
< div class = "line" > < a id = "l01588" name = "l01588" > < / a > < span class = "lineno" > 1588< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides [[buffer(14)]],< / div >
< div class = "line" > < a id = "l01589" name = "l01589" > < / a > < span class = "lineno" > 1589< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides [[buffer(15)]],< / div >
< div class = "line" > < a id = "l01590" name = "l01590" > < / a > < span class = "lineno" > 1590< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01591" name = "l01591" > < / a > < span class = "lineno" > 1591< / span > uint lid [[thread_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01592" name = "l01592" > < / a > < span class = "lineno" > 1592< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01593" name = "l01593" > < / a > < span class = "lineno" > 1593< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l01594" name = "l01594" > < / a > < span class = "lineno" > 1594< / span > (void)lid;< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l01595" name = "l01595" > < / a > < span class = "lineno" > 1595< / span > < / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01596" name = "l01596" > < / a > < span class = "lineno" > 1596< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > BK_padded = (BK + 16 / < span class = "keyword" > sizeof< / span > (T));< / div >
< div class = "line" > < a id = "l01597" name = "l01597" > < / a > < span class = "lineno" > 1597< / span > < / div >
< div class = "line" > < a id = "l01598" name = "l01598" > < / a > < span class = "lineno" > 1598< / span > threadgroup T Xs[BM * BK_padded];< / div >
< div class = "line" > < a id = "l01599" name = "l01599" > < / a > < span class = "lineno" > 1599< / span > threadgroup T Ws[BN * BK_padded];< / div >
< div class = "line" > < a id = "l01600" name = "l01600" > < / a > < span class = "lineno" > 1600< / span > < / div >
< div class = "line" > < a id = "l01601" name = "l01601" > < / a > < span class = "lineno" > 1601< / span > < span class = "keywordflow" > if< / span > (batched) {< / div >
< div class = "line" > < a id = "l01602" name = "l01602" > < / a > < span class = "lineno" > 1602< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< T> < / a > (< / div >
< div class = "line" > < a id = "l01603" name = "l01603" > < / a > < span class = "lineno" > 1603< / span > x,< / div >
< div class = "line" > < a id = "l01604" name = "l01604" > < / a > < span class = "lineno" > 1604< / span > w,< / div >
< div class = "line" > < a id = "l01605" name = "l01605" > < / a > < span class = "lineno" > 1605< / span > scales,< / div >
< div class = "line" > < a id = "l01606" name = "l01606" > < / a > < span class = "lineno" > 1606< / span > biases,< / div >
< div class = "line" > < a id = "l01607" name = "l01607" > < / a > < span class = "lineno" > 1607< / span > y,< / div >
< div class = "line" > < a id = "l01608" name = "l01608" > < / a > < span class = "lineno" > 1608< / span > M * N,< / div >
< div class = "line" > < a id = "l01609" name = "l01609" > < / a > < span class = "lineno" > 1609< / span > x_batch_ndims,< / div >
< div class = "line" > < a id = "l01610" name = "l01610" > < / a > < span class = "lineno" > 1610< / span > x_shape,< / div >
< div class = "line" > < a id = "l01611" name = "l01611" > < / a > < span class = "lineno" > 1611< / span > x_strides,< / div >
< div class = "line" > < a id = "l01612" name = "l01612" > < / a > < span class = "lineno" > 1612< / span > w_batch_ndims,< / div >
< div class = "line" > < a id = "l01613" name = "l01613" > < / a > < span class = "lineno" > 1613< / span > w_shape,< / div >
< div class = "line" > < a id = "l01614" name = "l01614" > < / a > < span class = "lineno" > 1614< / span > w_strides,< / div >
< div class = "line" > < a id = "l01615" name = "l01615" > < / a > < span class = "lineno" > 1615< / span > s_strides,< / div >
< div class = "line" > < a id = "l01616" name = "l01616" > < / a > < span class = "lineno" > 1616< / span > b_strides,< / div >
< div class = "line" > < a id = "l01617" name = "l01617" > < / a > < span class = "lineno" > 1617< / span > tid);< / div >
< div class = "line" > < a id = "l01618" name = "l01618" > < / a > < span class = "lineno" > 1618< / span > }< / div >
< div class = "line" > < a id = "l01619" name = "l01619" > < / a > < span class = "lineno" > 1619< / span > < a class = "code hl_function" href = "quantized_8h.html#af5750a35e8f5462218effba719f7f5b8" > qmm_t_impl< T, group_size, bits, aligned_N, BM, BK, BN> < / a > (< / div >
< div class = "line" > < a id = "l01620" name = "l01620" > < / a > < span class = "lineno" > 1620< / span > w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);< / div >
< div class = "line" > < a id = "l01621" name = "l01621" > < / a > < span class = "lineno" > 1621< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01622" name = "l01622" > < / a > < span class = "lineno" > 1622< / span > < / div >
< div class = "line" > < a id = "l01623" name = "l01623" > < / a > < span class = "lineno" > 1623< / span > < span class = "keyword" > template< / span > < < / div >
< div class = "line" > < a id = "l01624" name = "l01624" > < / a > < span class = "lineno" > 1624< / span > < span class = "keyword" > typename< / span > T,< / div >
< div class = "line" > < a id = "l01625" name = "l01625" > < / a > < span class = "lineno" > 1625< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > group_size,< / div >
< div class = "line" > < a id = "l01626" name = "l01626" > < / a > < span class = "lineno" > 1626< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > < a class = "code hl_function" href = "namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825" > bits< / a > ,< / div >
< div class = "line" > < a id = "l01627" name = "l01627" > < / a > < span class = "lineno" > 1627< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > bool< / span > batched,< / div >
< div class = "line" > < a id = "l01628" name = "l01628" > < / a > < span class = "lineno" > 1628< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BM = 32,< / div >
< div class = "line" > < a id = "l01629" name = "l01629" > < / a > < span class = "lineno" > 1629< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BK = 32,< / div >
< div class = "line" > < a id = "l01630" name = "l01630" > < / a > < span class = "lineno" > 1630< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BN = 32> < / div >
< div class = "foldopen" id = "foldopen01631" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01631" name = "l01631" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a2ce135e392dbf9a3e5180fb083792ed7" > 1631< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a2ce135e392dbf9a3e5180fb083792ed7" > qmm_n< / a > (< / div >
< div class = "line" > < a id = "l01632" name = "l01632" > < / a > < span class = "lineno" > 1632< / span > < span class = "keyword" > const< / span > device uint32_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l01633" name = "l01633" > < / a > < span class = "lineno" > 1633< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l01634" name = "l01634" > < / a > < span class = "lineno" > 1634< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l01635" name = "l01635" > < / a > < span class = "lineno" > 1635< / span > < span class = "keyword" > const< / span > device T* x [[buffer(3)]],< / div >
< div class = "line" > < a id = "l01636" name = "l01636" > < / a > < span class = "lineno" > 1636< / span > device T* y [[buffer(4)]],< / div >
< div class = "line" > < a id = "l01637" name = "l01637" > < / a > < span class = "lineno" > 1637< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & K [[buffer(5)]],< / div >
< div class = "line" > < a id = "l01638" name = "l01638" > < / a > < span class = "lineno" > 1638< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & N [[buffer(6)]],< / div >
< div class = "line" > < a id = "l01639" name = "l01639" > < / a > < span class = "lineno" > 1639< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & M [[buffer(7)]],< / div >
< div class = "line" > < a id = "l01640" name = "l01640" > < / a > < span class = "lineno" > 1640< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims [[buffer(8)]],< / div >
< div class = "line" > < a id = "l01641" name = "l01641" > < / a > < span class = "lineno" > 1641< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape [[buffer(9)]],< / div >
< div class = "line" > < a id = "l01642" name = "l01642" > < / a > < span class = "lineno" > 1642< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides [[buffer(10)]],< / div >
< div class = "line" > < a id = "l01643" name = "l01643" > < / a > < span class = "lineno" > 1643< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims [[buffer(11)]],< / div >
< div class = "line" > < a id = "l01644" name = "l01644" > < / a > < span class = "lineno" > 1644< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape [[buffer(12)]],< / div >
< div class = "line" > < a id = "l01645" name = "l01645" > < / a > < span class = "lineno" > 1645< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides [[buffer(13)]],< / div >
< div class = "line" > < a id = "l01646" name = "l01646" > < / a > < span class = "lineno" > 1646< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides [[buffer(14)]],< / div >
< div class = "line" > < a id = "l01647" name = "l01647" > < / a > < span class = "lineno" > 1647< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides [[buffer(15)]],< / div >
< div class = "line" > < a id = "l01648" name = "l01648" > < / a > < span class = "lineno" > 1648< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01649" name = "l01649" > < / a > < span class = "lineno" > 1649< / span > uint lid [[thread_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01650" name = "l01650" > < / a > < span class = "lineno" > 1650< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01651" name = "l01651" > < / a > < span class = "lineno" > 1651< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l01652" name = "l01652" > < / a > < span class = "lineno" > 1652< / span > (void)lid;< / div >
< div class = "line" > < a id = "l01653" name = "l01653" > < / a > < span class = "lineno" > 1653< / span > < / div >
< div class = "line" > < a id = "l01654" name = "l01654" > < / a > < span class = "lineno" > 1654< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > BK_padded = (BK + 16 / < span class = "keyword" > sizeof< / span > (T));< / div >
< div class = "line" > < a id = "l01655" name = "l01655" > < / a > < span class = "lineno" > 1655< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > BN_padded = (BN + 16 / < span class = "keyword" > sizeof< / span > (T));< / div >
< div class = "line" > < a id = "l01656" name = "l01656" > < / a > < span class = "lineno" > 1656< / span > < / div >
< div class = "line" > < a id = "l01657" name = "l01657" > < / a > < span class = "lineno" > 1657< / span > threadgroup T Xs[BM * BK_padded];< / div >
< div class = "line" > < a id = "l01658" name = "l01658" > < / a > < span class = "lineno" > 1658< / span > threadgroup T Ws[BK * BN_padded];< / div >
< div class = "line" > < a id = "l01659" name = "l01659" > < / a > < span class = "lineno" > 1659< / span > < / div >
< div class = "line" > < a id = "l01660" name = "l01660" > < / a > < span class = "lineno" > 1660< / span > < span class = "keywordflow" > if< / span > (batched) {< / div >
< div class = "line" > < a id = "l01661" name = "l01661" > < / a > < span class = "lineno" > 1661< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< T> < / a > (< / div >
< div class = "line" > < a id = "l01662" name = "l01662" > < / a > < span class = "lineno" > 1662< / span > x,< / div >
< div class = "line" > < a id = "l01663" name = "l01663" > < / a > < span class = "lineno" > 1663< / span > w,< / div >
< div class = "line" > < a id = "l01664" name = "l01664" > < / a > < span class = "lineno" > 1664< / span > scales,< / div >
< div class = "line" > < a id = "l01665" name = "l01665" > < / a > < span class = "lineno" > 1665< / span > biases,< / div >
< div class = "line" > < a id = "l01666" name = "l01666" > < / a > < span class = "lineno" > 1666< / span > y,< / div >
< div class = "line" > < a id = "l01667" name = "l01667" > < / a > < span class = "lineno" > 1667< / span > M * N,< / div >
< div class = "line" > < a id = "l01668" name = "l01668" > < / a > < span class = "lineno" > 1668< / span > x_batch_ndims,< / div >
< div class = "line" > < a id = "l01669" name = "l01669" > < / a > < span class = "lineno" > 1669< / span > x_shape,< / div >
< div class = "line" > < a id = "l01670" name = "l01670" > < / a > < span class = "lineno" > 1670< / span > x_strides,< / div >
< div class = "line" > < a id = "l01671" name = "l01671" > < / a > < span class = "lineno" > 1671< / span > w_batch_ndims,< / div >
< div class = "line" > < a id = "l01672" name = "l01672" > < / a > < span class = "lineno" > 1672< / span > w_shape,< / div >
< div class = "line" > < a id = "l01673" name = "l01673" > < / a > < span class = "lineno" > 1673< / span > w_strides,< / div >
< div class = "line" > < a id = "l01674" name = "l01674" > < / a > < span class = "lineno" > 1674< / span > s_strides,< / div >
< div class = "line" > < a id = "l01675" name = "l01675" > < / a > < span class = "lineno" > 1675< / span > b_strides,< / div >
< div class = "line" > < a id = "l01676" name = "l01676" > < / a > < span class = "lineno" > 1676< / span > tid);< / div >
< div class = "line" > < a id = "l01677" name = "l01677" > < / a > < span class = "lineno" > 1677< / span > }< / div >
< div class = "line" > < a id = "l01678" name = "l01678" > < / a > < span class = "lineno" > 1678< / span > < / div >
< div class = "line" > < a id = "l01679" name = "l01679" > < / a > < span class = "lineno" > 1679< / span > < a class = "code hl_function" href = "quantized_8h.html#a0ba59096494f1001c195312571523ae9" > qmm_n_impl< T, group_size, bits, BM, BK, BN> < / a > (< / div >
< div class = "line" > < a id = "l01680" name = "l01680" > < / a > < span class = "lineno" > 1680< / span > w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);< / div >
< div class = "line" > < a id = "l01681" name = "l01681" > < / a > < span class = "lineno" > 1681< / span > }< / div >
2024-11-23 04:24:16 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01682" name = "l01682" > < / a > < span class = "lineno" > 1682< / span > < / div >
< div class = "line" > < a id = "l01683" name = "l01683" > < / a > < span class = "lineno" > 1683< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, < span class = "keywordtype" > int< / span > group_size, < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "foldopen" id = "foldopen01684" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01684" name = "l01684" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7" > 1684< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7" > bs_qmv_fast< / a > (< / div >
< div class = "line" > < a id = "l01685" name = "l01685" > < / a > < span class = "lineno" > 1685< / span > < span class = "keyword" > const< / span > device uint32_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l01686" name = "l01686" > < / a > < span class = "lineno" > 1686< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l01687" name = "l01687" > < / a > < span class = "lineno" > 1687< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l01688" name = "l01688" > < / a > < span class = "lineno" > 1688< / span > < span class = "keyword" > const< / span > device T* x [[buffer(3)]],< / div >
< div class = "line" > < a id = "l01689" name = "l01689" > < / a > < span class = "lineno" > 1689< / span > device T* y [[buffer(4)]],< / div >
< div class = "line" > < a id = "l01690" name = "l01690" > < / a > < span class = "lineno" > 1690< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & in_vec_size [[buffer(5)]],< / div >
< div class = "line" > < a id = "l01691" name = "l01691" > < / a > < span class = "lineno" > 1691< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & out_vec_size [[buffer(6)]],< / div >
< div class = "line" > < a id = "l01692" name = "l01692" > < / a > < span class = "lineno" > 1692< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims [[buffer(7)]],< / div >
< div class = "line" > < a id = "l01693" name = "l01693" > < / a > < span class = "lineno" > 1693< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape [[buffer(8)]],< / div >
< div class = "line" > < a id = "l01694" name = "l01694" > < / a > < span class = "lineno" > 1694< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides [[buffer(9)]],< / div >
< div class = "line" > < a id = "l01695" name = "l01695" > < / a > < span class = "lineno" > 1695< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims [[buffer(10)]],< / div >
< div class = "line" > < a id = "l01696" name = "l01696" > < / a > < span class = "lineno" > 1696< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape [[buffer(11)]],< / div >
< div class = "line" > < a id = "l01697" name = "l01697" > < / a > < span class = "lineno" > 1697< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides [[buffer(12)]],< / div >
< div class = "line" > < a id = "l01698" name = "l01698" > < / a > < span class = "lineno" > 1698< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides [[buffer(13)]],< / div >
< div class = "line" > < a id = "l01699" name = "l01699" > < / a > < span class = "lineno" > 1699< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides [[buffer(14)]],< / div >
< div class = "line" > < a id = "l01700" name = "l01700" > < / a > < span class = "lineno" > 1700< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & batch_ndims [[buffer(15)]],< / div >
< div class = "line" > < a id = "l01701" name = "l01701" > < / a > < span class = "lineno" > 1701< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * batch_shape [[buffer(16)]],< / div >
< div class = "line" > < a id = "l01702" name = "l01702" > < / a > < span class = "lineno" > 1702< / span > < span class = "keyword" > const< / span > device uint32_t* lhs_indices [[buffer(17)]],< / div >
< div class = "line" > < a id = "l01703" name = "l01703" > < / a > < span class = "lineno" > 1703< / span > < span class = "keyword" > const< / span > device uint32_t* rhs_indices [[buffer(18)]],< / div >
< div class = "line" > < a id = "l01704" name = "l01704" > < / a > < span class = "lineno" > 1704< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * lhs_strides [[buffer(19)]],< / div >
< div class = "line" > < a id = "l01705" name = "l01705" > < / a > < span class = "lineno" > 1705< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * rhs_strides [[buffer(20)]],< / div >
< div class = "line" > < a id = "l01706" name = "l01706" > < / a > < span class = "lineno" > 1706< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01707" name = "l01707" > < / a > < span class = "lineno" > 1707< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01708" name = "l01708" > < / a > < span class = "lineno" > 1708< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l01709" name = "l01709" > < / a > < span class = "lineno" > 1709< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< T> < / a > (< / div >
< div class = "line" > < a id = "l01710" name = "l01710" > < / a > < span class = "lineno" > 1710< / span > x,< / div >
< div class = "line" > < a id = "l01711" name = "l01711" > < / a > < span class = "lineno" > 1711< / span > w,< / div >
< div class = "line" > < a id = "l01712" name = "l01712" > < / a > < span class = "lineno" > 1712< / span > scales,< / div >
< div class = "line" > < a id = "l01713" name = "l01713" > < / a > < span class = "lineno" > 1713< / span > biases,< / div >
< div class = "line" > < a id = "l01714" name = "l01714" > < / a > < span class = "lineno" > 1714< / span > lhs_indices,< / div >
< div class = "line" > < a id = "l01715" name = "l01715" > < / a > < span class = "lineno" > 1715< / span > rhs_indices,< / div >
< div class = "line" > < a id = "l01716" name = "l01716" > < / a > < span class = "lineno" > 1716< / span > y,< / div >
< div class = "line" > < a id = "l01717" name = "l01717" > < / a > < span class = "lineno" > 1717< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01718" name = "l01718" > < / a > < span class = "lineno" > 1718< / span > batch_ndims,< / div >
< div class = "line" > < a id = "l01719" name = "l01719" > < / a > < span class = "lineno" > 1719< / span > batch_shape,< / div >
< div class = "line" > < a id = "l01720" name = "l01720" > < / a > < span class = "lineno" > 1720< / span > lhs_strides,< / div >
< div class = "line" > < a id = "l01721" name = "l01721" > < / a > < span class = "lineno" > 1721< / span > rhs_strides,< / div >
< div class = "line" > < a id = "l01722" name = "l01722" > < / a > < span class = "lineno" > 1722< / span > x_batch_ndims,< / div >
< div class = "line" > < a id = "l01723" name = "l01723" > < / a > < span class = "lineno" > 1723< / span > x_shape,< / div >
< div class = "line" > < a id = "l01724" name = "l01724" > < / a > < span class = "lineno" > 1724< / span > x_strides,< / div >
< div class = "line" > < a id = "l01725" name = "l01725" > < / a > < span class = "lineno" > 1725< / span > w_batch_ndims,< / div >
< div class = "line" > < a id = "l01726" name = "l01726" > < / a > < span class = "lineno" > 1726< / span > w_shape,< / div >
< div class = "line" > < a id = "l01727" name = "l01727" > < / a > < span class = "lineno" > 1727< / span > w_strides,< / div >
< div class = "line" > < a id = "l01728" name = "l01728" > < / a > < span class = "lineno" > 1728< / span > s_strides,< / div >
< div class = "line" > < a id = "l01729" name = "l01729" > < / a > < span class = "lineno" > 1729< / span > b_strides,< / div >
< div class = "line" > < a id = "l01730" name = "l01730" > < / a > < span class = "lineno" > 1730< / span > tid);< / div >
< div class = "line" > < a id = "l01731" name = "l01731" > < / a > < span class = "lineno" > 1731< / span > < a class = "code hl_function" href = "quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81" > qmv_fast_impl< T, group_size, bits> < / a > (< / div >
< div class = "line" > < a id = "l01732" name = "l01732" > < / a > < span class = "lineno" > 1732< / span > w,< / div >
< div class = "line" > < a id = "l01733" name = "l01733" > < / a > < span class = "lineno" > 1733< / span > scales,< / div >
< div class = "line" > < a id = "l01734" name = "l01734" > < / a > < span class = "lineno" > 1734< / span > biases,< / div >
< div class = "line" > < a id = "l01735" name = "l01735" > < / a > < span class = "lineno" > 1735< / span > x,< / div >
< div class = "line" > < a id = "l01736" name = "l01736" > < / a > < span class = "lineno" > 1736< / span > y,< / div >
< div class = "line" > < a id = "l01737" name = "l01737" > < / a > < span class = "lineno" > 1737< / span > in_vec_size,< / div >
< div class = "line" > < a id = "l01738" name = "l01738" > < / a > < span class = "lineno" > 1738< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01739" name = "l01739" > < / a > < span class = "lineno" > 1739< / span > tid,< / div >
< div class = "line" > < a id = "l01740" name = "l01740" > < / a > < span class = "lineno" > 1740< / span > simd_gid,< / div >
< div class = "line" > < a id = "l01741" name = "l01741" > < / a > < span class = "lineno" > 1741< / span > simd_lid);< / div >
< div class = "line" > < a id = "l01742" name = "l01742" > < / a > < span class = "lineno" > 1742< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01743" name = "l01743" > < / a > < span class = "lineno" > 1743< / span > < / div >
< div class = "line" > < a id = "l01744" name = "l01744" > < / a > < span class = "lineno" > 1744< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, < span class = "keywordtype" > int< / span > group_size, < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "foldopen" id = "foldopen01745" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01745" name = "l01745" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed" > 1745< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed" > bs_qmv< / a > (< / div >
< div class = "line" > < a id = "l01746" name = "l01746" > < / a > < span class = "lineno" > 1746< / span > < span class = "keyword" > const< / span > device uint32_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l01747" name = "l01747" > < / a > < span class = "lineno" > 1747< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l01748" name = "l01748" > < / a > < span class = "lineno" > 1748< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l01749" name = "l01749" > < / a > < span class = "lineno" > 1749< / span > < span class = "keyword" > const< / span > device T* x [[buffer(3)]],< / div >
< div class = "line" > < a id = "l01750" name = "l01750" > < / a > < span class = "lineno" > 1750< / span > device T* y [[buffer(4)]],< / div >
< div class = "line" > < a id = "l01751" name = "l01751" > < / a > < span class = "lineno" > 1751< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & in_vec_size [[buffer(5)]],< / div >
< div class = "line" > < a id = "l01752" name = "l01752" > < / a > < span class = "lineno" > 1752< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & out_vec_size [[buffer(6)]],< / div >
< div class = "line" > < a id = "l01753" name = "l01753" > < / a > < span class = "lineno" > 1753< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims [[buffer(7)]],< / div >
< div class = "line" > < a id = "l01754" name = "l01754" > < / a > < span class = "lineno" > 1754< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape [[buffer(8)]],< / div >
< div class = "line" > < a id = "l01755" name = "l01755" > < / a > < span class = "lineno" > 1755< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides [[buffer(9)]],< / div >
< div class = "line" > < a id = "l01756" name = "l01756" > < / a > < span class = "lineno" > 1756< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims [[buffer(10)]],< / div >
< div class = "line" > < a id = "l01757" name = "l01757" > < / a > < span class = "lineno" > 1757< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape [[buffer(11)]],< / div >
< div class = "line" > < a id = "l01758" name = "l01758" > < / a > < span class = "lineno" > 1758< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides [[buffer(12)]],< / div >
< div class = "line" > < a id = "l01759" name = "l01759" > < / a > < span class = "lineno" > 1759< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides [[buffer(13)]],< / div >
< div class = "line" > < a id = "l01760" name = "l01760" > < / a > < span class = "lineno" > 1760< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides [[buffer(14)]],< / div >
< div class = "line" > < a id = "l01761" name = "l01761" > < / a > < span class = "lineno" > 1761< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & batch_ndims [[buffer(15)]],< / div >
< div class = "line" > < a id = "l01762" name = "l01762" > < / a > < span class = "lineno" > 1762< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * batch_shape [[buffer(16)]],< / div >
< div class = "line" > < a id = "l01763" name = "l01763" > < / a > < span class = "lineno" > 1763< / span > < span class = "keyword" > const< / span > device uint32_t* lhs_indices [[buffer(17)]],< / div >
< div class = "line" > < a id = "l01764" name = "l01764" > < / a > < span class = "lineno" > 1764< / span > < span class = "keyword" > const< / span > device uint32_t* rhs_indices [[buffer(18)]],< / div >
< div class = "line" > < a id = "l01765" name = "l01765" > < / a > < span class = "lineno" > 1765< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * lhs_strides [[buffer(19)]],< / div >
< div class = "line" > < a id = "l01766" name = "l01766" > < / a > < span class = "lineno" > 1766< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * rhs_strides [[buffer(20)]],< / div >
< div class = "line" > < a id = "l01767" name = "l01767" > < / a > < span class = "lineno" > 1767< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01768" name = "l01768" > < / a > < span class = "lineno" > 1768< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01769" name = "l01769" > < / a > < span class = "lineno" > 1769< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l01770" name = "l01770" > < / a > < span class = "lineno" > 1770< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< T> < / a > (< / div >
< div class = "line" > < a id = "l01771" name = "l01771" > < / a > < span class = "lineno" > 1771< / span > x,< / div >
< div class = "line" > < a id = "l01772" name = "l01772" > < / a > < span class = "lineno" > 1772< / span > w,< / div >
< div class = "line" > < a id = "l01773" name = "l01773" > < / a > < span class = "lineno" > 1773< / span > scales,< / div >
< div class = "line" > < a id = "l01774" name = "l01774" > < / a > < span class = "lineno" > 1774< / span > biases,< / div >
< div class = "line" > < a id = "l01775" name = "l01775" > < / a > < span class = "lineno" > 1775< / span > lhs_indices,< / div >
< div class = "line" > < a id = "l01776" name = "l01776" > < / a > < span class = "lineno" > 1776< / span > rhs_indices,< / div >
< div class = "line" > < a id = "l01777" name = "l01777" > < / a > < span class = "lineno" > 1777< / span > y,< / div >
< div class = "line" > < a id = "l01778" name = "l01778" > < / a > < span class = "lineno" > 1778< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01779" name = "l01779" > < / a > < span class = "lineno" > 1779< / span > batch_ndims,< / div >
< div class = "line" > < a id = "l01780" name = "l01780" > < / a > < span class = "lineno" > 1780< / span > batch_shape,< / div >
< div class = "line" > < a id = "l01781" name = "l01781" > < / a > < span class = "lineno" > 1781< / span > lhs_strides,< / div >
< div class = "line" > < a id = "l01782" name = "l01782" > < / a > < span class = "lineno" > 1782< / span > rhs_strides,< / div >
< div class = "line" > < a id = "l01783" name = "l01783" > < / a > < span class = "lineno" > 1783< / span > x_batch_ndims,< / div >
< div class = "line" > < a id = "l01784" name = "l01784" > < / a > < span class = "lineno" > 1784< / span > x_shape,< / div >
< div class = "line" > < a id = "l01785" name = "l01785" > < / a > < span class = "lineno" > 1785< / span > x_strides,< / div >
< div class = "line" > < a id = "l01786" name = "l01786" > < / a > < span class = "lineno" > 1786< / span > w_batch_ndims,< / div >
< div class = "line" > < a id = "l01787" name = "l01787" > < / a > < span class = "lineno" > 1787< / span > w_shape,< / div >
< div class = "line" > < a id = "l01788" name = "l01788" > < / a > < span class = "lineno" > 1788< / span > w_strides,< / div >
< div class = "line" > < a id = "l01789" name = "l01789" > < / a > < span class = "lineno" > 1789< / span > s_strides,< / div >
< div class = "line" > < a id = "l01790" name = "l01790" > < / a > < span class = "lineno" > 1790< / span > b_strides,< / div >
< div class = "line" > < a id = "l01791" name = "l01791" > < / a > < span class = "lineno" > 1791< / span > tid);< / div >
< div class = "line" > < a id = "l01792" name = "l01792" > < / a > < span class = "lineno" > 1792< / span > < a class = "code hl_function" href = "quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd" > qmv_impl< T, group_size, bits> < / a > (< / div >
< div class = "line" > < a id = "l01793" name = "l01793" > < / a > < span class = "lineno" > 1793< / span > w,< / div >
< div class = "line" > < a id = "l01794" name = "l01794" > < / a > < span class = "lineno" > 1794< / span > scales,< / div >
< div class = "line" > < a id = "l01795" name = "l01795" > < / a > < span class = "lineno" > 1795< / span > biases,< / div >
< div class = "line" > < a id = "l01796" name = "l01796" > < / a > < span class = "lineno" > 1796< / span > x,< / div >
< div class = "line" > < a id = "l01797" name = "l01797" > < / a > < span class = "lineno" > 1797< / span > y,< / div >
< div class = "line" > < a id = "l01798" name = "l01798" > < / a > < span class = "lineno" > 1798< / span > in_vec_size,< / div >
< div class = "line" > < a id = "l01799" name = "l01799" > < / a > < span class = "lineno" > 1799< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01800" name = "l01800" > < / a > < span class = "lineno" > 1800< / span > tid,< / div >
< div class = "line" > < a id = "l01801" name = "l01801" > < / a > < span class = "lineno" > 1801< / span > simd_gid,< / div >
< div class = "line" > < a id = "l01802" name = "l01802" > < / a > < span class = "lineno" > 1802< / span > simd_lid);< / div >
< div class = "line" > < a id = "l01803" name = "l01803" > < / a > < span class = "lineno" > 1803< / span > }< / div >
2024-11-23 04:24:16 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01804" name = "l01804" > < / a > < span class = "lineno" > 1804< / span > < / div >
< div class = "line" > < a id = "l01805" name = "l01805" > < / a > < span class = "lineno" > 1805< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, < span class = "keywordtype" > int< / span > group_size, < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "foldopen" id = "foldopen01806" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01806" name = "l01806" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494" > 1806< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494" > bs_qvm< / a > (< / div >
< div class = "line" > < a id = "l01807" name = "l01807" > < / a > < span class = "lineno" > 1807< / span > < span class = "keyword" > const< / span > device uint32_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l01808" name = "l01808" > < / a > < span class = "lineno" > 1808< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l01809" name = "l01809" > < / a > < span class = "lineno" > 1809< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l01810" name = "l01810" > < / a > < span class = "lineno" > 1810< / span > < span class = "keyword" > const< / span > device T* x [[buffer(3)]],< / div >
< div class = "line" > < a id = "l01811" name = "l01811" > < / a > < span class = "lineno" > 1811< / span > device T* y [[buffer(4)]],< / div >
< div class = "line" > < a id = "l01812" name = "l01812" > < / a > < span class = "lineno" > 1812< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & in_vec_size [[buffer(5)]],< / div >
< div class = "line" > < a id = "l01813" name = "l01813" > < / a > < span class = "lineno" > 1813< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & out_vec_size [[buffer(6)]],< / div >
< div class = "line" > < a id = "l01814" name = "l01814" > < / a > < span class = "lineno" > 1814< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims [[buffer(7)]],< / div >
< div class = "line" > < a id = "l01815" name = "l01815" > < / a > < span class = "lineno" > 1815< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape [[buffer(8)]],< / div >
< div class = "line" > < a id = "l01816" name = "l01816" > < / a > < span class = "lineno" > 1816< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides [[buffer(9)]],< / div >
< div class = "line" > < a id = "l01817" name = "l01817" > < / a > < span class = "lineno" > 1817< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims [[buffer(10)]],< / div >
< div class = "line" > < a id = "l01818" name = "l01818" > < / a > < span class = "lineno" > 1818< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape [[buffer(11)]],< / div >
< div class = "line" > < a id = "l01819" name = "l01819" > < / a > < span class = "lineno" > 1819< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides [[buffer(12)]],< / div >
< div class = "line" > < a id = "l01820" name = "l01820" > < / a > < span class = "lineno" > 1820< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides [[buffer(13)]],< / div >
< div class = "line" > < a id = "l01821" name = "l01821" > < / a > < span class = "lineno" > 1821< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides [[buffer(14)]],< / div >
< div class = "line" > < a id = "l01822" name = "l01822" > < / a > < span class = "lineno" > 1822< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & batch_ndims [[buffer(15)]],< / div >
< div class = "line" > < a id = "l01823" name = "l01823" > < / a > < span class = "lineno" > 1823< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * batch_shape [[buffer(16)]],< / div >
< div class = "line" > < a id = "l01824" name = "l01824" > < / a > < span class = "lineno" > 1824< / span > < span class = "keyword" > const< / span > device uint32_t* lhs_indices [[buffer(17)]],< / div >
< div class = "line" > < a id = "l01825" name = "l01825" > < / a > < span class = "lineno" > 1825< / span > < span class = "keyword" > const< / span > device uint32_t* rhs_indices [[buffer(18)]],< / div >
< div class = "line" > < a id = "l01826" name = "l01826" > < / a > < span class = "lineno" > 1826< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * lhs_strides [[buffer(19)]],< / div >
< div class = "line" > < a id = "l01827" name = "l01827" > < / a > < span class = "lineno" > 1827< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * rhs_strides [[buffer(20)]],< / div >
< div class = "line" > < a id = "l01828" name = "l01828" > < / a > < span class = "lineno" > 1828< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01829" name = "l01829" > < / a > < span class = "lineno" > 1829< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01830" name = "l01830" > < / a > < span class = "lineno" > 1830< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l01831" name = "l01831" > < / a > < span class = "lineno" > 1831< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< T> < / a > (< / div >
< div class = "line" > < a id = "l01832" name = "l01832" > < / a > < span class = "lineno" > 1832< / span > x,< / div >
< div class = "line" > < a id = "l01833" name = "l01833" > < / a > < span class = "lineno" > 1833< / span > w,< / div >
< div class = "line" > < a id = "l01834" name = "l01834" > < / a > < span class = "lineno" > 1834< / span > scales,< / div >
< div class = "line" > < a id = "l01835" name = "l01835" > < / a > < span class = "lineno" > 1835< / span > biases,< / div >
< div class = "line" > < a id = "l01836" name = "l01836" > < / a > < span class = "lineno" > 1836< / span > lhs_indices,< / div >
< div class = "line" > < a id = "l01837" name = "l01837" > < / a > < span class = "lineno" > 1837< / span > rhs_indices,< / div >
< div class = "line" > < a id = "l01838" name = "l01838" > < / a > < span class = "lineno" > 1838< / span > y,< / div >
< div class = "line" > < a id = "l01839" name = "l01839" > < / a > < span class = "lineno" > 1839< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01840" name = "l01840" > < / a > < span class = "lineno" > 1840< / span > batch_ndims,< / div >
< div class = "line" > < a id = "l01841" name = "l01841" > < / a > < span class = "lineno" > 1841< / span > batch_shape,< / div >
< div class = "line" > < a id = "l01842" name = "l01842" > < / a > < span class = "lineno" > 1842< / span > lhs_strides,< / div >
< div class = "line" > < a id = "l01843" name = "l01843" > < / a > < span class = "lineno" > 1843< / span > rhs_strides,< / div >
< div class = "line" > < a id = "l01844" name = "l01844" > < / a > < span class = "lineno" > 1844< / span > x_batch_ndims,< / div >
< div class = "line" > < a id = "l01845" name = "l01845" > < / a > < span class = "lineno" > 1845< / span > x_shape,< / div >
< div class = "line" > < a id = "l01846" name = "l01846" > < / a > < span class = "lineno" > 1846< / span > x_strides,< / div >
< div class = "line" > < a id = "l01847" name = "l01847" > < / a > < span class = "lineno" > 1847< / span > w_batch_ndims,< / div >
< div class = "line" > < a id = "l01848" name = "l01848" > < / a > < span class = "lineno" > 1848< / span > w_shape,< / div >
< div class = "line" > < a id = "l01849" name = "l01849" > < / a > < span class = "lineno" > 1849< / span > w_strides,< / div >
< div class = "line" > < a id = "l01850" name = "l01850" > < / a > < span class = "lineno" > 1850< / span > s_strides,< / div >
< div class = "line" > < a id = "l01851" name = "l01851" > < / a > < span class = "lineno" > 1851< / span > b_strides,< / div >
< div class = "line" > < a id = "l01852" name = "l01852" > < / a > < span class = "lineno" > 1852< / span > tid);< / div >
< div class = "line" > < a id = "l01853" name = "l01853" > < / a > < span class = "lineno" > 1853< / span > < a class = "code hl_function" href = "quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a" > qvm_impl< T, group_size, bits> < / a > (< / div >
< div class = "line" > < a id = "l01854" name = "l01854" > < / a > < span class = "lineno" > 1854< / span > w,< / div >
< div class = "line" > < a id = "l01855" name = "l01855" > < / a > < span class = "lineno" > 1855< / span > scales,< / div >
< div class = "line" > < a id = "l01856" name = "l01856" > < / a > < span class = "lineno" > 1856< / span > biases,< / div >
< div class = "line" > < a id = "l01857" name = "l01857" > < / a > < span class = "lineno" > 1857< / span > x,< / div >
< div class = "line" > < a id = "l01858" name = "l01858" > < / a > < span class = "lineno" > 1858< / span > y,< / div >
< div class = "line" > < a id = "l01859" name = "l01859" > < / a > < span class = "lineno" > 1859< / span > in_vec_size,< / div >
< div class = "line" > < a id = "l01860" name = "l01860" > < / a > < span class = "lineno" > 1860< / span > out_vec_size,< / div >
< div class = "line" > < a id = "l01861" name = "l01861" > < / a > < span class = "lineno" > 1861< / span > tid,< / div >
< div class = "line" > < a id = "l01862" name = "l01862" > < / a > < span class = "lineno" > 1862< / span > simd_gid,< / div >
< div class = "line" > < a id = "l01863" name = "l01863" > < / a > < span class = "lineno" > 1863< / span > simd_lid);< / div >
< div class = "line" > < a id = "l01864" name = "l01864" > < / a > < span class = "lineno" > 1864< / span > }< / div >
2024-11-23 04:24:16 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01865" name = "l01865" > < / a > < span class = "lineno" > 1865< / span > < / div >
< div class = "line" > < a id = "l01866" name = "l01866" > < / a > < span class = "lineno" > 1866< / span > < span class = "keyword" > template< / span > < < / div >
< div class = "line" > < a id = "l01867" name = "l01867" > < / a > < span class = "lineno" > 1867< / span > < span class = "keyword" > typename< / span > T,< / div >
< div class = "line" > < a id = "l01868" name = "l01868" > < / a > < span class = "lineno" > 1868< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > group_size,< / div >
< div class = "line" > < a id = "l01869" name = "l01869" > < / a > < span class = "lineno" > 1869< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > < a class = "code hl_function" href = "namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825" > bits< / a > ,< / div >
< div class = "line" > < a id = "l01870" name = "l01870" > < / a > < span class = "lineno" > 1870< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > bool< / span > aligned_N,< / div >
< div class = "line" > < a id = "l01871" name = "l01871" > < / a > < span class = "lineno" > 1871< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BM = 32,< / div >
< div class = "line" > < a id = "l01872" name = "l01872" > < / a > < span class = "lineno" > 1872< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BK = 32,< / div >
< div class = "line" > < a id = "l01873" name = "l01873" > < / a > < span class = "lineno" > 1873< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BN = 32> < / div >
< div class = "foldopen" id = "foldopen01874" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01874" name = "l01874" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84" > 1874< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84" > bs_qmm_t< / a > (< / div >
< div class = "line" > < a id = "l01875" name = "l01875" > < / a > < span class = "lineno" > 1875< / span > < span class = "keyword" > const< / span > device uint32_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l01876" name = "l01876" > < / a > < span class = "lineno" > 1876< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l01877" name = "l01877" > < / a > < span class = "lineno" > 1877< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l01878" name = "l01878" > < / a > < span class = "lineno" > 1878< / span > < span class = "keyword" > const< / span > device T* x [[buffer(3)]],< / div >
< div class = "line" > < a id = "l01879" name = "l01879" > < / a > < span class = "lineno" > 1879< / span > device T* y [[buffer(4)]],< / div >
< div class = "line" > < a id = "l01880" name = "l01880" > < / a > < span class = "lineno" > 1880< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & K [[buffer(5)]],< / div >
< div class = "line" > < a id = "l01881" name = "l01881" > < / a > < span class = "lineno" > 1881< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & N [[buffer(6)]],< / div >
< div class = "line" > < a id = "l01882" name = "l01882" > < / a > < span class = "lineno" > 1882< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & M [[buffer(7)]],< / div >
< div class = "line" > < a id = "l01883" name = "l01883" > < / a > < span class = "lineno" > 1883< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims [[buffer(8)]],< / div >
< div class = "line" > < a id = "l01884" name = "l01884" > < / a > < span class = "lineno" > 1884< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape [[buffer(9)]],< / div >
< div class = "line" > < a id = "l01885" name = "l01885" > < / a > < span class = "lineno" > 1885< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides [[buffer(10)]],< / div >
< div class = "line" > < a id = "l01886" name = "l01886" > < / a > < span class = "lineno" > 1886< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims [[buffer(11)]],< / div >
< div class = "line" > < a id = "l01887" name = "l01887" > < / a > < span class = "lineno" > 1887< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape [[buffer(12)]],< / div >
< div class = "line" > < a id = "l01888" name = "l01888" > < / a > < span class = "lineno" > 1888< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides [[buffer(13)]],< / div >
< div class = "line" > < a id = "l01889" name = "l01889" > < / a > < span class = "lineno" > 1889< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides [[buffer(14)]],< / div >
< div class = "line" > < a id = "l01890" name = "l01890" > < / a > < span class = "lineno" > 1890< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides [[buffer(15)]],< / div >
< div class = "line" > < a id = "l01891" name = "l01891" > < / a > < span class = "lineno" > 1891< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & batch_ndims [[buffer(16)]],< / div >
< div class = "line" > < a id = "l01892" name = "l01892" > < / a > < span class = "lineno" > 1892< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * batch_shape [[buffer(17)]],< / div >
< div class = "line" > < a id = "l01893" name = "l01893" > < / a > < span class = "lineno" > 1893< / span > < span class = "keyword" > const< / span > device uint32_t* lhs_indices [[buffer(18)]],< / div >
< div class = "line" > < a id = "l01894" name = "l01894" > < / a > < span class = "lineno" > 1894< / span > < span class = "keyword" > const< / span > device uint32_t* rhs_indices [[buffer(19)]],< / div >
< div class = "line" > < a id = "l01895" name = "l01895" > < / a > < span class = "lineno" > 1895< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * lhs_strides [[buffer(20)]],< / div >
< div class = "line" > < a id = "l01896" name = "l01896" > < / a > < span class = "lineno" > 1896< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * rhs_strides [[buffer(21)]],< / div >
< div class = "line" > < a id = "l01897" name = "l01897" > < / a > < span class = "lineno" > 1897< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01898" name = "l01898" > < / a > < span class = "lineno" > 1898< / span > uint lid [[thread_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01899" name = "l01899" > < / a > < span class = "lineno" > 1899< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01900" name = "l01900" > < / a > < span class = "lineno" > 1900< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l01901" name = "l01901" > < / a > < span class = "lineno" > 1901< / span > (void)lid;< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l01902" name = "l01902" > < / a > < span class = "lineno" > 1902< / span > < / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01903" name = "l01903" > < / a > < span class = "lineno" > 1903< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > BK_padded = (BK + 16 / < span class = "keyword" > sizeof< / span > (T));< / div >
< div class = "line" > < a id = "l01904" name = "l01904" > < / a > < span class = "lineno" > 1904< / span > < / div >
< div class = "line" > < a id = "l01905" name = "l01905" > < / a > < span class = "lineno" > 1905< / span > threadgroup T Xs[BM * BK_padded];< / div >
< div class = "line" > < a id = "l01906" name = "l01906" > < / a > < span class = "lineno" > 1906< / span > threadgroup T Ws[BN * BK_padded];< / div >
< div class = "line" > < a id = "l01907" name = "l01907" > < / a > < span class = "lineno" > 1907< / span > < / div >
< div class = "line" > < a id = "l01908" name = "l01908" > < / a > < span class = "lineno" > 1908< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< T> < / a > (< / div >
< div class = "line" > < a id = "l01909" name = "l01909" > < / a > < span class = "lineno" > 1909< / span > x,< / div >
< div class = "line" > < a id = "l01910" name = "l01910" > < / a > < span class = "lineno" > 1910< / span > w,< / div >
< div class = "line" > < a id = "l01911" name = "l01911" > < / a > < span class = "lineno" > 1911< / span > scales,< / div >
< div class = "line" > < a id = "l01912" name = "l01912" > < / a > < span class = "lineno" > 1912< / span > biases,< / div >
< div class = "line" > < a id = "l01913" name = "l01913" > < / a > < span class = "lineno" > 1913< / span > lhs_indices,< / div >
< div class = "line" > < a id = "l01914" name = "l01914" > < / a > < span class = "lineno" > 1914< / span > rhs_indices,< / div >
< div class = "line" > < a id = "l01915" name = "l01915" > < / a > < span class = "lineno" > 1915< / span > y,< / div >
< div class = "line" > < a id = "l01916" name = "l01916" > < / a > < span class = "lineno" > 1916< / span > M * N,< / div >
< div class = "line" > < a id = "l01917" name = "l01917" > < / a > < span class = "lineno" > 1917< / span > batch_ndims,< / div >
< div class = "line" > < a id = "l01918" name = "l01918" > < / a > < span class = "lineno" > 1918< / span > batch_shape,< / div >
< div class = "line" > < a id = "l01919" name = "l01919" > < / a > < span class = "lineno" > 1919< / span > lhs_strides,< / div >
< div class = "line" > < a id = "l01920" name = "l01920" > < / a > < span class = "lineno" > 1920< / span > rhs_strides,< / div >
< div class = "line" > < a id = "l01921" name = "l01921" > < / a > < span class = "lineno" > 1921< / span > x_batch_ndims,< / div >
< div class = "line" > < a id = "l01922" name = "l01922" > < / a > < span class = "lineno" > 1922< / span > x_shape,< / div >
< div class = "line" > < a id = "l01923" name = "l01923" > < / a > < span class = "lineno" > 1923< / span > x_strides,< / div >
< div class = "line" > < a id = "l01924" name = "l01924" > < / a > < span class = "lineno" > 1924< / span > w_batch_ndims,< / div >
< div class = "line" > < a id = "l01925" name = "l01925" > < / a > < span class = "lineno" > 1925< / span > w_shape,< / div >
< div class = "line" > < a id = "l01926" name = "l01926" > < / a > < span class = "lineno" > 1926< / span > w_strides,< / div >
< div class = "line" > < a id = "l01927" name = "l01927" > < / a > < span class = "lineno" > 1927< / span > s_strides,< / div >
< div class = "line" > < a id = "l01928" name = "l01928" > < / a > < span class = "lineno" > 1928< / span > b_strides,< / div >
< div class = "line" > < a id = "l01929" name = "l01929" > < / a > < span class = "lineno" > 1929< / span > tid);< / div >
< div class = "line" > < a id = "l01930" name = "l01930" > < / a > < span class = "lineno" > 1930< / span > < a class = "code hl_function" href = "quantized_8h.html#af5750a35e8f5462218effba719f7f5b8" > qmm_t_impl< T, group_size, bits, aligned_N, BM, BK, BN> < / a > (< / div >
< div class = "line" > < a id = "l01931" name = "l01931" > < / a > < span class = "lineno" > 1931< / span > w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);< / div >
< div class = "line" > < a id = "l01932" name = "l01932" > < / a > < span class = "lineno" > 1932< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l01933" name = "l01933" > < / a > < span class = "lineno" > 1933< / span > < / div >
< div class = "line" > < a id = "l01934" name = "l01934" > < / a > < span class = "lineno" > 1934< / span > < span class = "keyword" > template< / span > < < / div >
< div class = "line" > < a id = "l01935" name = "l01935" > < / a > < span class = "lineno" > 1935< / span > < span class = "keyword" > typename< / span > T,< / div >
< div class = "line" > < a id = "l01936" name = "l01936" > < / a > < span class = "lineno" > 1936< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > group_size,< / div >
< div class = "line" > < a id = "l01937" name = "l01937" > < / a > < span class = "lineno" > 1937< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > < a class = "code hl_function" href = "namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825" > bits< / a > ,< / div >
< div class = "line" > < a id = "l01938" name = "l01938" > < / a > < span class = "lineno" > 1938< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BM = 32,< / div >
< div class = "line" > < a id = "l01939" name = "l01939" > < / a > < span class = "lineno" > 1939< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BK = 32,< / div >
< div class = "line" > < a id = "l01940" name = "l01940" > < / a > < span class = "lineno" > 1940< / span > < span class = "keyword" > const< / span > < span class = "keywordtype" > int< / span > BN = 32> < / div >
< div class = "foldopen" id = "foldopen01941" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l01941" name = "l01941" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a1a66b061c46383952a0f067c3848971f" > 1941< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a1a66b061c46383952a0f067c3848971f" > bs_qmm_n< / a > (< / div >
< div class = "line" > < a id = "l01942" name = "l01942" > < / a > < span class = "lineno" > 1942< / span > < span class = "keyword" > const< / span > device uint32_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l01943" name = "l01943" > < / a > < span class = "lineno" > 1943< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l01944" name = "l01944" > < / a > < span class = "lineno" > 1944< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l01945" name = "l01945" > < / a > < span class = "lineno" > 1945< / span > < span class = "keyword" > const< / span > device T* x [[buffer(3)]],< / div >
< div class = "line" > < a id = "l01946" name = "l01946" > < / a > < span class = "lineno" > 1946< / span > device T* y [[buffer(4)]],< / div >
< div class = "line" > < a id = "l01947" name = "l01947" > < / a > < span class = "lineno" > 1947< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & K [[buffer(5)]],< / div >
< div class = "line" > < a id = "l01948" name = "l01948" > < / a > < span class = "lineno" > 1948< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & N [[buffer(6)]],< / div >
< div class = "line" > < a id = "l01949" name = "l01949" > < / a > < span class = "lineno" > 1949< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & M [[buffer(7)]],< / div >
< div class = "line" > < a id = "l01950" name = "l01950" > < / a > < span class = "lineno" > 1950< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & x_batch_ndims [[buffer(8)]],< / div >
< div class = "line" > < a id = "l01951" name = "l01951" > < / a > < span class = "lineno" > 1951< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * x_shape [[buffer(9)]],< / div >
< div class = "line" > < a id = "l01952" name = "l01952" > < / a > < span class = "lineno" > 1952< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * x_strides [[buffer(10)]],< / div >
< div class = "line" > < a id = "l01953" name = "l01953" > < / a > < span class = "lineno" > 1953< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & w_batch_ndims [[buffer(11)]],< / div >
< div class = "line" > < a id = "l01954" name = "l01954" > < / a > < span class = "lineno" > 1954< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * w_shape [[buffer(12)]],< / div >
< div class = "line" > < a id = "l01955" name = "l01955" > < / a > < span class = "lineno" > 1955< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * w_strides [[buffer(13)]],< / div >
< div class = "line" > < a id = "l01956" name = "l01956" > < / a > < span class = "lineno" > 1956< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * s_strides [[buffer(14)]],< / div >
< div class = "line" > < a id = "l01957" name = "l01957" > < / a > < span class = "lineno" > 1957< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * b_strides [[buffer(15)]],< / div >
< div class = "line" > < a id = "l01958" name = "l01958" > < / a > < span class = "lineno" > 1958< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > & batch_ndims [[buffer(16)]],< / div >
< div class = "line" > < a id = "l01959" name = "l01959" > < / a > < span class = "lineno" > 1959< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > int< / span > * batch_shape [[buffer(17)]],< / div >
< div class = "line" > < a id = "l01960" name = "l01960" > < / a > < span class = "lineno" > 1960< / span > < span class = "keyword" > const< / span > device uint32_t* lhs_indices [[buffer(18)]],< / div >
< div class = "line" > < a id = "l01961" name = "l01961" > < / a > < span class = "lineno" > 1961< / span > < span class = "keyword" > const< / span > device uint32_t* rhs_indices [[buffer(19)]],< / div >
< div class = "line" > < a id = "l01962" name = "l01962" > < / a > < span class = "lineno" > 1962< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * lhs_strides [[buffer(20)]],< / div >
< div class = "line" > < a id = "l01963" name = "l01963" > < / a > < span class = "lineno" > 1963< / span > < span class = "keyword" > const< / span > constant < span class = "keywordtype" > size_t< / span > * rhs_strides [[buffer(21)]],< / div >
< div class = "line" > < a id = "l01964" name = "l01964" > < / a > < span class = "lineno" > 1964< / span > uint3 tid [[threadgroup_position_in_grid]],< / div >
< div class = "line" > < a id = "l01965" name = "l01965" > < / a > < span class = "lineno" > 1965< / span > uint lid [[thread_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01966" name = "l01966" > < / a > < span class = "lineno" > 1966< / span > uint simd_gid [[simdgroup_index_in_threadgroup]],< / div >
< div class = "line" > < a id = "l01967" name = "l01967" > < / a > < span class = "lineno" > 1967< / span > uint simd_lid [[thread_index_in_simdgroup]]) {< / div >
< div class = "line" > < a id = "l01968" name = "l01968" > < / a > < span class = "lineno" > 1968< / span > (void)lid;< / div >
< div class = "line" > < a id = "l01969" name = "l01969" > < / a > < span class = "lineno" > 1969< / span > < / div >
< div class = "line" > < a id = "l01970" name = "l01970" > < / a > < span class = "lineno" > 1970< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > BK_padded = (BK + 16 / < span class = "keyword" > sizeof< / span > (T));< / div >
< div class = "line" > < a id = "l01971" name = "l01971" > < / a > < span class = "lineno" > 1971< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > BN_padded = (BN + 16 / < span class = "keyword" > sizeof< / span > (T));< / div >
< div class = "line" > < a id = "l01972" name = "l01972" > < / a > < span class = "lineno" > 1972< / span > < / div >
< div class = "line" > < a id = "l01973" name = "l01973" > < / a > < span class = "lineno" > 1973< / span > threadgroup T Xs[BM * BK_padded];< / div >
< div class = "line" > < a id = "l01974" name = "l01974" > < / a > < span class = "lineno" > 1974< / span > threadgroup T Ws[BK * BN_padded];< / div >
< div class = "line" > < a id = "l01975" name = "l01975" > < / a > < span class = "lineno" > 1975< / span > < / div >
< div class = "line" > < a id = "l01976" name = "l01976" > < / a > < span class = "lineno" > 1976< / span > < a class = "code hl_function" href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< T> < / a > (< / div >
< div class = "line" > < a id = "l01977" name = "l01977" > < / a > < span class = "lineno" > 1977< / span > x,< / div >
< div class = "line" > < a id = "l01978" name = "l01978" > < / a > < span class = "lineno" > 1978< / span > w,< / div >
< div class = "line" > < a id = "l01979" name = "l01979" > < / a > < span class = "lineno" > 1979< / span > scales,< / div >
< div class = "line" > < a id = "l01980" name = "l01980" > < / a > < span class = "lineno" > 1980< / span > biases,< / div >
< div class = "line" > < a id = "l01981" name = "l01981" > < / a > < span class = "lineno" > 1981< / span > lhs_indices,< / div >
< div class = "line" > < a id = "l01982" name = "l01982" > < / a > < span class = "lineno" > 1982< / span > rhs_indices,< / div >
< div class = "line" > < a id = "l01983" name = "l01983" > < / a > < span class = "lineno" > 1983< / span > y,< / div >
< div class = "line" > < a id = "l01984" name = "l01984" > < / a > < span class = "lineno" > 1984< / span > M * N,< / div >
< div class = "line" > < a id = "l01985" name = "l01985" > < / a > < span class = "lineno" > 1985< / span > batch_ndims,< / div >
< div class = "line" > < a id = "l01986" name = "l01986" > < / a > < span class = "lineno" > 1986< / span > batch_shape,< / div >
< div class = "line" > < a id = "l01987" name = "l01987" > < / a > < span class = "lineno" > 1987< / span > lhs_strides,< / div >
< div class = "line" > < a id = "l01988" name = "l01988" > < / a > < span class = "lineno" > 1988< / span > rhs_strides,< / div >
< div class = "line" > < a id = "l01989" name = "l01989" > < / a > < span class = "lineno" > 1989< / span > x_batch_ndims,< / div >
< div class = "line" > < a id = "l01990" name = "l01990" > < / a > < span class = "lineno" > 1990< / span > x_shape,< / div >
< div class = "line" > < a id = "l01991" name = "l01991" > < / a > < span class = "lineno" > 1991< / span > x_strides,< / div >
< div class = "line" > < a id = "l01992" name = "l01992" > < / a > < span class = "lineno" > 1992< / span > w_batch_ndims,< / div >
< div class = "line" > < a id = "l01993" name = "l01993" > < / a > < span class = "lineno" > 1993< / span > w_shape,< / div >
< div class = "line" > < a id = "l01994" name = "l01994" > < / a > < span class = "lineno" > 1994< / span > w_strides,< / div >
< div class = "line" > < a id = "l01995" name = "l01995" > < / a > < span class = "lineno" > 1995< / span > s_strides,< / div >
< div class = "line" > < a id = "l01996" name = "l01996" > < / a > < span class = "lineno" > 1996< / span > b_strides,< / div >
< div class = "line" > < a id = "l01997" name = "l01997" > < / a > < span class = "lineno" > 1997< / span > tid);< / div >
< div class = "line" > < a id = "l01998" name = "l01998" > < / a > < span class = "lineno" > 1998< / span > < a class = "code hl_function" href = "quantized_8h.html#a0ba59096494f1001c195312571523ae9" > qmm_n_impl< T, group_size, bits, BM, BK, BN> < / a > (< / div >
< div class = "line" > < a id = "l01999" name = "l01999" > < / a > < span class = "lineno" > 1999< / span > w, scales, biases, x, y, Xs, Ws, K, N, M, tid, lid, simd_gid, simd_lid);< / div >
< div class = "line" > < a id = "l02000" name = "l02000" > < / a > < span class = "lineno" > 2000< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l02001" name = "l02001" > < / a > < span class = "lineno" > 2001< / span > < / div >
< div class = "line" > < a id = "l02002" name = "l02002" > < / a > < span class = "lineno" > 2002< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, const < span class = "keywordtype" > int< / span > group_size, const < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "foldopen" id = "foldopen02003" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l02003" name = "l02003" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59" > 2003< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59" > affine_quantize< / a > (< / div >
< div class = "line" > < a id = "l02004" name = "l02004" > < / a > < span class = "lineno" > 2004< / span > < span class = "keyword" > const< / span > device T* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l02005" name = "l02005" > < / a > < span class = "lineno" > 2005< / span > device uint8_t* out [[buffer(1)]],< / div >
< div class = "line" > < a id = "l02006" name = "l02006" > < / a > < span class = "lineno" > 2006< / span > device T* scales [[buffer(2)]],< / div >
< div class = "line" > < a id = "l02007" name = "l02007" > < / a > < span class = "lineno" > 2007< / span > device T* biases [[buffer(3)]],< / div >
< div class = "line" > < a id = "l02008" name = "l02008" > < / a > < span class = "lineno" > 2008< / span > uint2 index [[thread_position_in_grid]],< / div >
< div class = "line" > < a id = "l02009" name = "l02009" > < / a > < span class = "lineno" > 2009< / span > uint2 grid_dim [[threads_per_grid]]) {< / div >
< div class = "line" > < a id = "l02010" name = "l02010" > < / a > < span class = "lineno" > 2010< / span > < span class = "keyword" > constexpr< / span > T eps = T(1e-7);< / div >
< div class = "line" > < a id = "l02011" name = "l02011" > < / a > < span class = "lineno" > 2011< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > < a class = "code hl_variable" href = "backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3" > simd_size< / a > = 32;< / div >
< div class = "line" > < a id = "l02012" name = "l02012" > < / a > < span class = "lineno" > 2012< / span > < span class = "keyword" > constexpr< / span > T n_bins = (1 < < bits) - 1;< / div >
< div class = "line" > < a id = "l02013" name = "l02013" > < / a > < span class = "lineno" > 2013< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > packs_per_int = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;< / div >
< div class = "line" > < a id = "l02014" name = "l02014" > < / a > < span class = "lineno" > 2014< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > values_per_reduce = group_size / < a class = "code hl_variable" href = "backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3" > simd_size< / a > ;< / div >
< div class = "line" > < a id = "l02015" name = "l02015" > < / a > < span class = "lineno" > 2015< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > writes_per_reduce = packs_per_int / values_per_reduce;< / div >
< div class = "line" > < a id = "l02016" name = "l02016" > < / a > < span class = "lineno" > 2016< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > writes_per_pack =< / div >
< div class = "line" > < a id = "l02017" name = "l02017" > < / a > < span class = "lineno" > 2017< / span > writes_per_reduce > 1 ? 1 : values_per_reduce / packs_per_int;< / div >
< div class = "line" > < a id = "l02018" name = "l02018" > < / a > < span class = "lineno" > 2018< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > power_of_2_bits = (bits & (bits - 1)) == 0;< / div >
< div class = "line" > < a id = "l02019" name = "l02019" > < / a > < span class = "lineno" > 2019< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > bytes_per_pack = power_of_2_bits ? 1 : 3;< / div >
< div class = "line" > < a id = "l02020" name = "l02020" > < / a > < span class = "lineno" > 2020< / span > < / div >
< div class = "line" > < a id = "l02021" name = "l02021" > < / a > < span class = "lineno" > 2021< / span > < span class = "keyword" > static_assert< / span > (< / div >
< div class = "line" > < a id = "l02022" name = "l02022" > < / a > < span class = "lineno" > 2022< / span > group_size % < a class = "code hl_variable" href = "backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3" > simd_size< / a > == 0,< / div >
< div class = "line" > < a id = "l02023" name = "l02023" > < / a > < span class = "lineno" > 2023< / span > < span class = "stringliteral" > " Group size must be divisible by simd size." < / span > );< / div >
< div class = "line" > < a id = "l02024" name = "l02024" > < / a > < span class = "lineno" > 2024< / span > < / div >
< div class = "line" > < a id = "l02025" name = "l02025" > < / a > < span class = "lineno" > 2025< / span > < span class = "keywordtype" > size_t< / span > offset = index.x + grid_dim.x * size_t(index.y);< / div >
< div class = "line" > < a id = "l02026" name = "l02026" > < / a > < span class = "lineno" > 2026< / span > < span class = "keywordtype" > size_t< / span > in_index = offset * values_per_reduce;< / div >
< div class = "line" > < a id = "l02027" name = "l02027" > < / a > < span class = "lineno" > 2027< / span > < span class = "keywordtype" > size_t< / span > out_index = power_of_2_bits< / div >
< div class = "line" > < a id = "l02028" name = "l02028" > < / a > < span class = "lineno" > 2028< / span > ? offset * writes_per_pack< / div >
< div class = "line" > < a id = "l02029" name = "l02029" > < / a > < span class = "lineno" > 2029< / span > : offset * bytes_per_pack / writes_per_reduce;< / div >
< div class = "line" > < a id = "l02030" name = "l02030" > < / a > < span class = "lineno" > 2030< / span > < / div >
< div class = "line" > < a id = "l02031" name = "l02031" > < / a > < span class = "lineno" > 2031< / span > T w_thread[values_per_reduce];< / div >
< div class = "line" > < a id = "l02032" name = "l02032" > < / a > < span class = "lineno" > 2032< / span > T w_min = < a class = "code hl_struct" href = "struct_limits.html" > Limits< T> ::max< / a > ;< / div >
< div class = "line" > < a id = "l02033" name = "l02033" > < / a > < span class = "lineno" > 2033< / span > T w_max = 0;< / div >
< div class = "line" > < a id = "l02034" name = "l02034" > < / a > < span class = "lineno" > 2034< / span > < / div >
< div class = "line" > < a id = "l02035" name = "l02035" > < / a > < span class = "lineno" > 2035< / span > < span class = "preprocessor" > #pragma clang loop unroll(full)< / span > < / div >
< div class = "line" > < a id = "l02036" name = "l02036" > < / a > < span class = "lineno" > 2036< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < values_per_reduce; i++) {< / div >
< div class = "line" > < a id = "l02037" name = "l02037" > < / a > < span class = "lineno" > 2037< / span > T val = w[in_index + i];< / div >
< div class = "line" > < a id = "l02038" name = "l02038" > < / a > < span class = "lineno" > 2038< / span > w_thread[i] = val;< / div >
< div class = "line" > < a id = "l02039" name = "l02039" > < / a > < span class = "lineno" > 2039< / span > w_min = < a class = "code hl_function" href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > min< / a > (w_min, val);< / div >
< div class = "line" > < a id = "l02040" name = "l02040" > < / a > < span class = "lineno" > 2040< / span > w_max = < a class = "code hl_function" href = "namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b" > max< / a > (w_max, val);< / div >
< div class = "line" > < a id = "l02041" name = "l02041" > < / a > < span class = "lineno" > 2041< / span > }< / div >
< div class = "line" > < a id = "l02042" name = "l02042" > < / a > < span class = "lineno" > 2042< / span > < / div >
< div class = "line" > < a id = "l02043" name = "l02043" > < / a > < span class = "lineno" > 2043< / span > w_min = < a class = "code hl_function" href = "namespacemetal.html#ae9e2a23e00724ba2d7868bc4112b386b" > simd_min< / a > (w_min);< / div >
< div class = "line" > < a id = "l02044" name = "l02044" > < / a > < span class = "lineno" > 2044< / span > w_max = < a class = "code hl_function" href = "namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49" > simd_max< / a > (w_max);< / div >
< div class = "line" > < a id = "l02045" name = "l02045" > < / a > < span class = "lineno" > 2045< / span > < / div >
< div class = "line" > < a id = "l02046" name = "l02046" > < / a > < span class = "lineno" > 2046< / span > T scale = < a class = "code hl_function" href = "namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b" > max< / a > ((w_max - w_min) / n_bins, eps);< / div >
< div class = "line" > < a id = "l02047" name = "l02047" > < / a > < span class = "lineno" > 2047< / span > < span class = "keywordtype" > bool< / span > side = < a class = "code hl_function" href = "namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb" > abs< / a > (w_min) > < a class = "code hl_function" href = "namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb" > abs< / a > (w_max);< / div >
< div class = "line" > < a id = "l02048" name = "l02048" > < / a > < span class = "lineno" > 2048< / span > scale = side ? scale : -scale;< / div >
< div class = "line" > < a id = "l02049" name = "l02049" > < / a > < span class = "lineno" > 2049< / span > T edge = side ? w_min : w_max;< / div >
< div class = "line" > < a id = "l02050" name = "l02050" > < / a > < span class = "lineno" > 2050< / span > T q0 = < a class = "code hl_function" href = "namespacemetal.html#a46c667e169ff9d51a9204a045305442f" > round< / a > (edge / scale);< / div >
< div class = "line" > < a id = "l02051" name = "l02051" > < / a > < span class = "lineno" > 2051< / span > < span class = "keywordtype" > bool< / span > at_zero = q0 == 0.0f;< / div >
< div class = "line" > < a id = "l02052" name = "l02052" > < / a > < span class = "lineno" > 2052< / span > scale = at_zero ? scale : edge / q0;< / div >
< div class = "line" > < a id = "l02053" name = "l02053" > < / a > < span class = "lineno" > 2053< / span > T bias = at_zero ? T(0) : edge;< / div >
< div class = "line" > < a id = "l02054" name = "l02054" > < / a > < span class = "lineno" > 2054< / span > < / div >
< div class = "line" > < a id = "l02055" name = "l02055" > < / a > < span class = "lineno" > 2055< / span > < span class = "comment" > // Write out the scales and biases< / span > < / div >
< div class = "line" > < a id = "l02056" name = "l02056" > < / a > < span class = "lineno" > 2056< / span > < span class = "keywordtype" > size_t< / span > gindex = in_index / group_size;< / div >
< div class = "line" > < a id = "l02057" name = "l02057" > < / a > < span class = "lineno" > 2057< / span > < span class = "keywordflow" > if< / span > (in_index % group_size == 0) {< / div >
< div class = "line" > < a id = "l02058" name = "l02058" > < / a > < span class = "lineno" > 2058< / span > scales[gindex] = scale;< / div >
< div class = "line" > < a id = "l02059" name = "l02059" > < / a > < span class = "lineno" > 2059< / span > biases[gindex] = bias;< / div >
< div class = "line" > < a id = "l02060" name = "l02060" > < / a > < span class = "lineno" > 2060< / span > }< / div >
< div class = "line" > < a id = "l02061" name = "l02061" > < / a > < span class = "lineno" > 2061< / span > < / div >
< div class = "line" > < a id = "l02062" name = "l02062" > < / a > < span class = "lineno" > 2062< / span > < span class = "comment" > // We accumulate 3 bytes worth for 3/6 bit so we need a uint32_t< / span > < / div >
< div class = "line" > < a id = "l02063" name = "l02063" > < / a > < span class = "lineno" > 2063< / span > uint32_t output = 0;< / div >
< div class = "line" > < a id = "l02064" name = "l02064" > < / a > < span class = "lineno" > 2064< / span > < / div >
< div class = "line" > < a id = "l02065" name = "l02065" > < / a > < span class = "lineno" > 2065< / span > < span class = "preprocessor" > #pragma clang loop unroll(full)< / span > < / div >
< div class = "line" > < a id = "l02066" name = "l02066" > < / a > < span class = "lineno" > 2066< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < values_per_reduce; i++) {< / div >
< div class = "line" > < a id = "l02067" name = "l02067" > < / a > < span class = "lineno" > 2067< / span > uint8_t val = < a class = "code hl_function" href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > min< / a > (< a class = "code hl_function" href = "namespacemetal.html#a46c667e169ff9d51a9204a045305442f" > round< / a > ((w_thread[i] - bias) / scale), n_bins);< / div >
< div class = "line" > < a id = "l02068" name = "l02068" > < / a > < span class = "lineno" > 2068< / span > < span class = "keywordflow" > if< / span > (bits == 8) {< / div >
< div class = "line" > < a id = "l02069" name = "l02069" > < / a > < span class = "lineno" > 2069< / span > output = val;< / div >
< div class = "line" > < a id = "l02070" name = "l02070" > < / a > < span class = "lineno" > 2070< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l02071" name = "l02071" > < / a > < span class = "lineno" > 2071< / span > output += val < < (bits * (i % packs_per_int));< / div >
< div class = "line" > < a id = "l02072" name = "l02072" > < / a > < span class = "lineno" > 2072< / span > }< / div >
< div class = "line" > < a id = "l02073" name = "l02073" > < / a > < span class = "lineno" > 2073< / span > < / div >
< div class = "line" > < a id = "l02074" name = "l02074" > < / a > < span class = "lineno" > 2074< / span > < span class = "keywordflow" > if< / span > (packs_per_int < values_per_reduce & & < / div >
< div class = "line" > < a id = "l02075" name = "l02075" > < / a > < span class = "lineno" > 2075< / span > i % packs_per_int == packs_per_int - 1) {< / div >
< div class = "line" > < a id = "l02076" name = "l02076" > < / a > < span class = "lineno" > 2076< / span > out[out_index + i / packs_per_int] = output;< / div >
< div class = "line" > < a id = "l02077" name = "l02077" > < / a > < span class = "lineno" > 2077< / span > output = 0;< / div >
< div class = "line" > < a id = "l02078" name = "l02078" > < / a > < span class = "lineno" > 2078< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l02079" name = "l02079" > < / a > < span class = "lineno" > 2079< / span > < span class = "preprocessor" > #pragma clang loop unroll(full)< / span > < / div >
< div class = "line" > < a id = "l02080" name = "l02080" > < / a > < span class = "lineno" > 2080< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > j = 1; j < writes_per_reduce; j++) {< / div >
< div class = "line" > < a id = "l02081" name = "l02081" > < / a > < span class = "lineno" > 2081< / span > uint8_t sval = < a class = "code hl_function" href = "namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c" > simd_shuffle_down< / a > (val, j);< / div >
< div class = "line" > < a id = "l02082" name = "l02082" > < / a > < span class = "lineno" > 2082< / span > output += sval < < (bits * (j * values_per_reduce + i));< / div >
< div class = "line" > < a id = "l02083" name = "l02083" > < / a > < span class = "lineno" > 2083< / span > }< / div >
< div class = "line" > < a id = "l02084" name = "l02084" > < / a > < span class = "lineno" > 2084< / span > }< / div >
< div class = "line" > < a id = "l02085" name = "l02085" > < / a > < span class = "lineno" > 2085< / span > }< / div >
< div class = "line" > < a id = "l02086" name = "l02086" > < / a > < span class = "lineno" > 2086< / span > < span class = "keywordflow" > if< / span > (bits == 3 || bits == 6) {< / div >
< div class = "line" > < a id = "l02087" name = "l02087" > < / a > < span class = "lineno" > 2087< / span > < span class = "keywordflow" > if< / span > (in_index % packs_per_int == 0 & & out_index % bytes_per_pack == 0) {< / div >
< div class = "line" > < a id = "l02088" name = "l02088" > < / a > < span class = "lineno" > 2088< / span > out[out_index] = output & 0xff;< / div >
< div class = "line" > < a id = "l02089" name = "l02089" > < / a > < span class = "lineno" > 2089< / span > out[out_index + 1] = (output & 0xff00) > > 8;< / div >
< div class = "line" > < a id = "l02090" name = "l02090" > < / a > < span class = "lineno" > 2090< / span > out[out_index + 2] = (output & 0xff0000) > > 16;< / div >
< div class = "line" > < a id = "l02091" name = "l02091" > < / a > < span class = "lineno" > 2091< / span > }< / div >
< div class = "line" > < a id = "l02092" name = "l02092" > < / a > < span class = "lineno" > 2092< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l02093" name = "l02093" > < / a > < span class = "lineno" > 2093< / span > < span class = "keywordflow" > if< / span > (writes_per_reduce > 0 & & out_index % writes_per_reduce == 0) {< / div >
< div class = "line" > < a id = "l02094" name = "l02094" > < / a > < span class = "lineno" > 2094< / span > out[out_index / writes_per_reduce] = output;< / div >
< div class = "line" > < a id = "l02095" name = "l02095" > < / a > < span class = "lineno" > 2095< / span > }< / div >
< div class = "line" > < a id = "l02096" name = "l02096" > < / a > < span class = "lineno" > 2096< / span > }< / div >
< div class = "line" > < a id = "l02097" name = "l02097" > < / a > < span class = "lineno" > 2097< / span > }< / div >
2024-11-06 03:54:16 +08:00
< / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l02098" name = "l02098" > < / a > < span class = "lineno" > 2098< / span > < / div >
< div class = "line" > < a id = "l02099" name = "l02099" > < / a > < span class = "lineno" > 2099< / span > < span class = "keyword" > template< / span > < < span class = "keyword" > typename< / span > T, const < span class = "keywordtype" > int< / span > group_size, const < span class = "keywordtype" > int< / span > bits> < / div >
< div class = "foldopen" id = "foldopen02100" data-start = "{" data-end = "}" >
< div class = "line" > < a id = "l02100" name = "l02100" > < / a > < span class = "lineno" > < a class = "line" href = "quantized_8h.html#a6076203615038eb06816158f7b3869c6" > 2100< / a > < / span > [[kernel]] < span class = "keywordtype" > void< / span > < a class = "code hl_function" href = "quantized_8h.html#a6076203615038eb06816158f7b3869c6" > affine_dequantize< / a > (< / div >
< div class = "line" > < a id = "l02101" name = "l02101" > < / a > < span class = "lineno" > 2101< / span > < span class = "keyword" > const< / span > device uint8_t* w [[buffer(0)]],< / div >
< div class = "line" > < a id = "l02102" name = "l02102" > < / a > < span class = "lineno" > 2102< / span > < span class = "keyword" > const< / span > device T* scales [[buffer(1)]],< / div >
< div class = "line" > < a id = "l02103" name = "l02103" > < / a > < span class = "lineno" > 2103< / span > < span class = "keyword" > const< / span > device T* biases [[buffer(2)]],< / div >
< div class = "line" > < a id = "l02104" name = "l02104" > < / a > < span class = "lineno" > 2104< / span > device T* out [[buffer(3)]],< / div >
< div class = "line" > < a id = "l02105" name = "l02105" > < / a > < span class = "lineno" > 2105< / span > uint2 index [[thread_position_in_grid]],< / div >
< div class = "line" > < a id = "l02106" name = "l02106" > < / a > < span class = "lineno" > 2106< / span > uint2 grid_dim [[threads_per_grid]]) {< / div >
< div class = "line" > < a id = "l02107" name = "l02107" > < / a > < span class = "lineno" > 2107< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > packs_per_int = bits == 3 ? 8 : bits == 6 ? 4 : 8 / bits;< / div >
< div class = "line" > < a id = "l02108" name = "l02108" > < / a > < span class = "lineno" > 2108< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > power_of_2_bits = (bits & (bits - 1)) == 0;< / div >
< div class = "line" > < a id = "l02109" name = "l02109" > < / a > < span class = "lineno" > 2109< / span > < span class = "keyword" > constexpr< / span > < span class = "keywordtype" > int< / span > bytes_per_pack = power_of_2_bits ? 1 : 3;< / div >
< div class = "line" > < a id = "l02110" name = "l02110" > < / a > < span class = "lineno" > 2110< / span > < / div >
< div class = "line" > < a id = "l02111" name = "l02111" > < / a > < span class = "lineno" > 2111< / span > < span class = "keywordtype" > size_t< / span > offset = index.x + grid_dim.x * size_t(index.y);< / div >
< div class = "line" > < a id = "l02112" name = "l02112" > < / a > < span class = "lineno" > 2112< / span > < span class = "keywordtype" > size_t< / span > oindex = offset * packs_per_int;< / div >
< div class = "line" > < a id = "l02113" name = "l02113" > < / a > < span class = "lineno" > 2113< / span > < span class = "keywordtype" > size_t< / span > gindex = oindex / group_size;< / div >
< div class = "line" > < a id = "l02114" name = "l02114" > < / a > < span class = "lineno" > 2114< / span > T scale = scales[gindex];< / div >
< div class = "line" > < a id = "l02115" name = "l02115" > < / a > < span class = "lineno" > 2115< / span > T bias = biases[gindex];< / div >
2024-11-23 04:24:16 +08:00
< div class = "line" > < a id = "l02116" name = "l02116" > < / a > < span class = "lineno" > 2116< / span > < / div >
2024-12-07 05:22:39 +08:00
< div class = "line" > < a id = "l02117" name = "l02117" > < / a > < span class = "lineno" > 2117< / span > out += oindex;< / div >
< div class = "line" > < a id = "l02118" name = "l02118" > < / a > < span class = "lineno" > 2118< / span > < / div >
< div class = "line" > < a id = "l02119" name = "l02119" > < / a > < span class = "lineno" > 2119< / span > < span class = "keywordflow" > if< / span > (bits == 3) {< / div >
< div class = "line" > < a id = "l02120" name = "l02120" > < / a > < span class = "lineno" > 2120< / span > w += offset * bytes_per_pack;< / div >
< div class = "line" > < a id = "l02121" name = "l02121" > < / a > < span class = "lineno" > 2121< / span > out[0] = (w[0] & 0x7) * scale + bias;< / div >
< div class = "line" > < a id = "l02122" name = "l02122" > < / a > < span class = "lineno" > 2122< / span > out[1] = ((w[0] & 0x38) > > 3) * scale + bias;< / div >
< div class = "line" > < a id = "l02123" name = "l02123" > < / a > < span class = "lineno" > 2123< / span > out[2] = (((w[0] & 0xc0) > > 6) + ((w[1] & 0x1) < < 2)) * scale + bias;< / div >
< div class = "line" > < a id = "l02124" name = "l02124" > < / a > < span class = "lineno" > 2124< / span > out[3] = ((w[1] & 0xe) > > 1) * scale + bias;< / div >
< div class = "line" > < a id = "l02125" name = "l02125" > < / a > < span class = "lineno" > 2125< / span > out[4] = ((w[1] & 0x70) > > 4) * scale + bias;< / div >
< div class = "line" > < a id = "l02126" name = "l02126" > < / a > < span class = "lineno" > 2126< / span > out[5] = (((w[1] & 0x80) > > 7) + ((w[2] & 0x3) < < 1)) * scale + bias;< / div >
< div class = "line" > < a id = "l02127" name = "l02127" > < / a > < span class = "lineno" > 2127< / span > out[6] = ((w[2] & 0x1c) > > 2) * scale + bias;< / div >
< div class = "line" > < a id = "l02128" name = "l02128" > < / a > < span class = "lineno" > 2128< / span > out[7] = ((w[2] & 0xe0) > > 5) * scale + bias;< / div >
< div class = "line" > < a id = "l02129" name = "l02129" > < / a > < span class = "lineno" > 2129< / span > < / div >
< div class = "line" > < a id = "l02130" name = "l02130" > < / a > < span class = "lineno" > 2130< / span > } < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 6) {< / div >
< div class = "line" > < a id = "l02131" name = "l02131" > < / a > < span class = "lineno" > 2131< / span > w += offset * bytes_per_pack;< / div >
< div class = "line" > < a id = "l02132" name = "l02132" > < / a > < span class = "lineno" > 2132< / span > out[0] = (w[0] & 0x3f) * scale + bias;< / div >
< div class = "line" > < a id = "l02133" name = "l02133" > < / a > < span class = "lineno" > 2133< / span > out[1] = (((w[0] > > 6) & 0x03) + ((w[1] & 0x0f) < < 2)) * scale + bias;< / div >
< div class = "line" > < a id = "l02134" name = "l02134" > < / a > < span class = "lineno" > 2134< / span > out[2] = (((w[1] > > 4) & 0x0f) + ((w[2] & 0x03) < < 4)) * scale + bias;< / div >
< div class = "line" > < a id = "l02135" name = "l02135" > < / a > < span class = "lineno" > 2135< / span > out[3] = ((w[2] > > 2) & 0x3f) * scale + bias;< / div >
< div class = "line" > < a id = "l02136" name = "l02136" > < / a > < span class = "lineno" > 2136< / span > } < span class = "keywordflow" > else< / span > {< / div >
< div class = "line" > < a id = "l02137" name = "l02137" > < / a > < span class = "lineno" > 2137< / span > uint val = w[offset];< / div >
< div class = "line" > < a id = "l02138" name = "l02138" > < / a > < span class = "lineno" > 2138< / span > < span class = "preprocessor" > #pragma clang loop unroll(full)< / span > < / div >
< div class = "line" > < a id = "l02139" name = "l02139" > < / a > < span class = "lineno" > 2139< / span > < span class = "keywordflow" > for< / span > (< span class = "keywordtype" > int< / span > i = 0; i < packs_per_int; i++) {< / div >
< div class = "line" > < a id = "l02140" name = "l02140" > < / a > < span class = "lineno" > 2140< / span > uint8_t d;< / div >
< div class = "line" > < a id = "l02141" name = "l02141" > < / a > < span class = "lineno" > 2141< / span > < span class = "keywordflow" > if< / span > (bits == 2) {< / div >
< div class = "line" > < a id = "l02142" name = "l02142" > < / a > < span class = "lineno" > 2142< / span > d = (val > > (bits * i)) & 0x03;< / div >
< div class = "line" > < a id = "l02143" name = "l02143" > < / a > < span class = "lineno" > 2143< / span > } < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 4) {< / div >
< div class = "line" > < a id = "l02144" name = "l02144" > < / a > < span class = "lineno" > 2144< / span > d = (val > > (bits * i)) & 0x0f;< / div >
< div class = "line" > < a id = "l02145" name = "l02145" > < / a > < span class = "lineno" > 2145< / span > } < span class = "keywordflow" > else< / span > < span class = "keywordflow" > if< / span > (bits == 8) {< / div >
< div class = "line" > < a id = "l02146" name = "l02146" > < / a > < span class = "lineno" > 2146< / span > d = val;< / div >
< div class = "line" > < a id = "l02147" name = "l02147" > < / a > < span class = "lineno" > 2147< / span > }< / div >
< div class = "line" > < a id = "l02148" name = "l02148" > < / a > < span class = "lineno" > 2148< / span > out[i] = scale * d + bias;< / div >
< div class = "line" > < a id = "l02149" name = "l02149" > < / a > < span class = "lineno" > 2149< / span > }< / div >
< div class = "line" > < a id = "l02150" name = "l02150" > < / a > < span class = "lineno" > 2150< / span > }< / div >
< div class = "line" > < a id = "l02151" name = "l02151" > < / a > < span class = "lineno" > 2151< / span > }< / div >
2024-10-15 23:12:17 +08:00
< / div >
< div class = "ttc" id = "abackend_2metal_2kernels_2reduction_2ops_8h_html_a515b75d563a93d3c09ee677948dc83e3" > < div class = "ttname" > < a href = "backend_2metal_2kernels_2reduction_2ops_8h.html#a515b75d563a93d3c09ee677948dc83e3" > simd_size< / a > < / div > < div class = "ttdeci" > static constant constexpr const uint8_t simd_size< / div > < div class = "ttdef" > < b > Definition< / b > ops.h:22< / div > < / div >
< div class = "ttc" id = "abackend_2metal_2kernels_2steel_2utils_8h_html_aaf4974425147d6f26d031691e321637f" > < div class = "ttname" > < a href = "backend_2metal_2kernels_2steel_2utils_8h.html#aaf4974425147d6f26d031691e321637f" > elem_to_loc_broadcast< / a > < / div > < div class = "ttdeci" > METAL_FUNC ulong2 elem_to_loc_broadcast(uint elem, constant const int *shape, constant const size_t *a_strides, constant const size_t *b_strides, int ndim)< / div > < div class = "ttdef" > < b > Definition< / b > utils.h:7< / div > < / div >
2024-11-23 04:24:16 +08:00
< div class = "ttc" id = "abackend_2metal_2kernels_2utils_8h_html_a22eaa505dbc7dd2a63a895f2e16712f5" > < div class = "ttname" > < a href = "backend_2metal_2kernels_2utils_8h.html#a22eaa505dbc7dd2a63a895f2e16712f5" > elem_to_loc< / a > < / div > < div class = "ttdeci" > METAL_FUNC IdxT elem_to_loc(uint elem, constant const int *shape, constant const StrideT *strides, int ndim)< / div > < div class = "ttdef" > < b > Definition< / b > utils.h:93< / div > < / div >
< div class = "ttc" id = "anamespacemetal_html" > < div class = "ttname" > < a href = "namespacemetal.html" > metal< / a > < / div > < div class = "ttdef" > < b > Definition< / b > bf16_math.h:226< / div > < / div >
< div class = "ttc" id = "anamespacemetal_html_a048cad0aca52cb737ebf103e76bd1c49" > < div class = "ttname" > < a href = "namespacemetal.html#a048cad0aca52cb737ebf103e76bd1c49" > metal::simd_max< / a > < / div > < div class = "ttdeci" > METAL_FUNC bfloat16_t simd_max(bfloat16_t data)< / div > < div class = "ttdef" > < b > Definition< / b > bf16_math.h:378< / div > < / div >
< div class = "ttc" id = "anamespacemetal_html_a46c667e169ff9d51a9204a045305442f" > < div class = "ttname" > < a href = "namespacemetal.html#a46c667e169ff9d51a9204a045305442f" > metal::round< / a > < / div > < div class = "ttdeci" > METAL_FUNC bfloat16_t round(bfloat16_t x)< / div > < div class = "ttdef" > < b > Definition< / b > bf16_math.h:232< / div > < / div >
< div class = "ttc" id = "anamespacemetal_html_a6653b28c9473087141eddce39878d4d3" > < div class = "ttname" > < a href = "namespacemetal.html#a6653b28c9473087141eddce39878d4d3" > metal::min< / a > < / div > < div class = "ttdeci" > METAL_FUNC bfloat16_t min(bfloat16_t x, bfloat16_t y)< / div > < div class = "ttdef" > < b > Definition< / b > bf16_math.h:232< / div > < / div >
< div class = "ttc" id = "anamespacemetal_html_a85181e37a00cb4a4217f1bb25389bce5" > < div class = "ttname" > < a href = "namespacemetal.html#a85181e37a00cb4a4217f1bb25389bce5" > metal::simd_sum< / a > < / div > < div class = "ttdeci" > METAL_FUNC bfloat16_t simd_sum(bfloat16_t data)< / div > < div class = "ttdef" > < b > Definition< / b > bf16_math.h:378< / div > < / div >
< div class = "ttc" id = "anamespacemetal_html_a853c80479ab2264d9c4587c7bcac767b" > < div class = "ttname" > < a href = "namespacemetal.html#a853c80479ab2264d9c4587c7bcac767b" > metal::max< / a > < / div > < div class = "ttdeci" > METAL_FUNC bfloat16_t max(bfloat16_t x, bfloat16_t y)< / div > < div class = "ttdef" > < b > Definition< / b > bf16_math.h:232< / div > < / div >
< div class = "ttc" id = "anamespacemetal_html_a87c5122c60f9a12afceb9925a5b78ffb" > < div class = "ttname" > < a href = "namespacemetal.html#a87c5122c60f9a12afceb9925a5b78ffb" > metal::abs< / a > < / div > < div class = "ttdeci" > METAL_FUNC bfloat16_t abs(bfloat16_t x)< / div > < div class = "ttdef" > < b > Definition< / b > bf16_math.h:232< / div > < / div >
< div class = "ttc" id = "anamespacemetal_html_ae9e2a23e00724ba2d7868bc4112b386b" > < div class = "ttname" > < a href = "namespacemetal.html#ae9e2a23e00724ba2d7868bc4112b386b" > metal::simd_min< / a > < / div > < div class = "ttdeci" > METAL_FUNC bfloat16_t simd_min(bfloat16_t data)< / div > < div class = "ttdef" > < b > Definition< / b > bf16_math.h:378< / div > < / div >
< div class = "ttc" id = "anamespacemetal_html_af6e2dd7ae087aba6abac4f0350b7611c" > < div class = "ttname" > < a href = "namespacemetal.html#af6e2dd7ae087aba6abac4f0350b7611c" > metal::simd_shuffle_down< / a > < / div > < div class = "ttdeci" > METAL_FUNC bfloat16_t simd_shuffle_down(bfloat16_t data, ushort delta)< / div > < div class = "ttdef" > < b > Definition< / b > bf16_math.h:377< / div > < / div >
2024-10-15 23:12:17 +08:00
< div class = "ttc" id = "anamespacemlx_1_1core_1_1random_html_abb895baa477f5a06b5f88e69245f1825" > < div class = "ttname" > < a href = "namespacemlx_1_1core_1_1random.html#abb895baa477f5a06b5f88e69245f1825" > mlx::core::random::bits< / a > < / div > < div class = "ttdeci" > array bits(const std::vector< int > & shape, int width, const std::optional< array > & key=std::nullopt, StreamOrDevice s={})< / div > < div class = "ttdoc" > Generate an array with type uint32 filled with random bits.< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_a0386011c52d03e60885a31e6fbd903dd" > < div class = "ttname" > < a href = "quantized_8h.html#a0386011c52d03e60885a31e6fbd903dd" > MLX_MTL_CONST< / a > < / div > < div class = "ttdeci" > #define MLX_MTL_CONST< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:8< / div > < / div >
2024-11-23 04:24:16 +08:00
< div class = "ttc" id = "aquantized_8h_html_a07b26d2d0b0d65dfe925c452c453fa42" > < div class = "ttname" > < a href = "quantized_8h.html#a07b26d2d0b0d65dfe925c452c453fa42" > qdot_safe< / a > < / div > < div class = "ttdeci" > U qdot_safe(const device uint8_t *w, const thread U *x_thread, U scale, U bias, U sum, int N)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:225< / div > < / div >
2024-12-07 05:22:39 +08:00
< div class = "ttc" id = "aquantized_8h_html_a0ba59096494f1001c195312571523ae9" > < div class = "ttname" > < a href = "quantized_8h.html#a0ba59096494f1001c195312571523ae9" > qmm_n_impl< / a > < / div > < div class = "ttdeci" > METAL_FUNC void qmm_n_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, threadgroup T *Xs, threadgroup T *Ws, const constant int & K, const constant int & N, const constant int & M, uint3 tid, uint lid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1083< / div > < / div >
2024-11-23 04:24:16 +08:00
< div class = "ttc" id = "aquantized_8h_html_a1546533c5b925b2fbb3bec870ec7487a" > < div class = "ttname" > < a href = "quantized_8h.html#a1546533c5b925b2fbb3bec870ec7487a" > qvm_impl< / a > < / div > < div class = "ttdeci" > METAL_FUNC void qvm_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const int in_vec_size, const int out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:843< / div > < / div >
2024-12-07 05:22:39 +08:00
< div class = "ttc" id = "aquantized_8h_html_a1a66b061c46383952a0f067c3848971f" > < div class = "ttname" > < a href = "quantized_8h.html#a1a66b061c46383952a0f067c3848971f" > bs_qmm_n< / a > < / div > < div class = "ttdeci" > void bs_qmm_n(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & K, const constant int & N, const constant int & M, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int & batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1941< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_a2ce135e392dbf9a3e5180fb083792ed7" > < div class = "ttname" > < a href = "quantized_8h.html#a2ce135e392dbf9a3e5180fb083792ed7" > qmm_n< / a > < / div > < div class = "ttdeci" > void qmm_n(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & K, const constant int & N, const constant int & M, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1631< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_a47bcf4a14566e01e14bd3c155811db59" > < div class = "ttname" > < a href = "quantized_8h.html#a47bcf4a14566e01e14bd3c155811db59" > affine_quantize< / a > < / div > < div class = "ttdeci" > void affine_quantize(const device T *w, device uint8_t *out, device T *scales, device T *biases, uint2 index, uint2 grid_dim)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:2003< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_a530b720e123e59d73ea89a0a2d0946b7" > < div class = "ttname" > < a href = "quantized_8h.html#a530b720e123e59d73ea89a0a2d0946b7" > bs_qmv_fast< / a > < / div > < div class = "ttdeci" > void bs_qmv_fast(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & in_vec_size, const constant int & out_vec_size, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int & batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1684< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_a6076203615038eb06816158f7b3869c6" > < div class = "ttname" > < a href = "quantized_8h.html#a6076203615038eb06816158f7b3869c6" > affine_dequantize< / a > < / div > < div class = "ttdeci" > void affine_dequantize(const device uint8_t *w, const device T *scales, const device T *biases, device T *out, uint2 index, uint2 grid_dim)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:2100< / div > < / div >
2024-10-15 23:12:17 +08:00
< div class = "ttc" id = "aquantized_8h_html_a62969a218d93680f5e35d0c61b160b99" > < div class = "ttname" > < a href = "quantized_8h.html#a62969a218d93680f5e35d0c61b160b99" > SIMD_SIZE< / a > < / div > < div class = "ttdeci" > static constant constexpr const int SIMD_SIZE< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:10< / div > < / div >
2024-12-07 05:22:39 +08:00
< div class = "ttc" id = "aquantized_8h_html_a639c50a08b5cf57e8be5279a116274bd" > < div class = "ttname" > < a href = "quantized_8h.html#a639c50a08b5cf57e8be5279a116274bd" > qmv< / a > < / div > < div class = "ttdeci" > void qmv(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & in_vec_size, const constant int & out_vec_size, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1408< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_a6d6e3c31e44f232e58ae9d605e1f4494" > < div class = "ttname" > < a href = "quantized_8h.html#a6d6e3c31e44f232e58ae9d605e1f4494" > bs_qvm< / a > < / div > < div class = "ttdeci" > void bs_qvm(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & in_vec_size, const constant int & out_vec_size, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int & batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1806< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_a7bd1d9f17c86c8fd34ec13678cff755f" > < div class = "ttname" > < a href = "quantized_8h.html#a7bd1d9f17c86c8fd34ec13678cff755f" > qmv_fast< / a > < / div > < div class = "ttdeci" > void qmv_fast(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & in_vec_size, const constant int & out_vec_size, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1357< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_a7ce5f53a4d6d1555e9402d545408d0ad" > < div class = "ttname" > < a href = "quantized_8h.html#a7ce5f53a4d6d1555e9402d545408d0ad" > qmv_quad< / a > < / div > < div class = "ttdeci" > void qmv_quad(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & in_vec_size, const constant int & out_vec_size, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint quad_gid, uint quad_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1306< / div > < / div >
2024-10-26 04:23:45 +08:00
< div class = "ttc" id = "aquantized_8h_html_a803e4d5a1459844ba647aea5b004e133" > < div class = "ttname" > < a href = "quantized_8h.html#a803e4d5a1459844ba647aea5b004e133" > QUAD_SIZE< / a > < / div > < div class = "ttdeci" > static constant constexpr const int QUAD_SIZE< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:11< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_a8dbace41de9e1e21dd59d016db11b3e9" > < div class = "ttname" > < a href = "quantized_8h.html#a8dbace41de9e1e21dd59d016db11b3e9" > load_vector< / a > < / div > < div class = "ttdeci" > U load_vector(const device T *x, thread U *x_thread)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:14< / div > < / div >
2024-11-23 04:24:16 +08:00
< div class = "ttc" id = "aquantized_8h_html_a8e13c7d895624f738d2a6d9893b687fd" > < div class = "ttname" > < a href = "quantized_8h.html#a8e13c7d895624f738d2a6d9893b687fd" > qmv_impl< / a > < / div > < div class = "ttdeci" > METAL_FUNC void qmv_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & in_vec_size, const constant int & out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:688< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_aa69e143d646fad332c1a53e8c9b337b7" > < div class = "ttname" > < a href = "quantized_8h.html#aa69e143d646fad332c1a53e8c9b337b7" > load_vector_safe< / a > < / div > < div class = "ttdeci" > U load_vector_safe(const device T *x, thread U *x_thread, int N)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:77< / div > < / div >
2024-12-07 05:22:39 +08:00
< div class = "ttc" id = "aquantized_8h_html_ab1ae143eba2afceb8df63f38b26f9a84" > < div class = "ttname" > < a href = "quantized_8h.html#ab1ae143eba2afceb8df63f38b26f9a84" > bs_qmm_t< / a > < / div > < div class = "ttdeci" > void bs_qmm_t(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & K, const constant int & N, const constant int & M, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int & batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1874< / div > < / div >
2024-11-23 04:24:16 +08:00
< div class = "ttc" id = "aquantized_8h_html_ab364d58ab652e3ad87a8f80910556071" > < div class = "ttname" > < a href = "quantized_8h.html#ab364d58ab652e3ad87a8f80910556071" > qdot< / a > < / div > < div class = "ttdeci" > U qdot(const device uint8_t *w, const thread U *x_thread, U scale, U bias, U sum)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:145< / div > < / div >
2024-12-07 05:22:39 +08:00
< div class = "ttc" id = "aquantized_8h_html_ab8243818512d6078d23e6ffb65fd7bb8" > < div class = "ttname" > < a href = "quantized_8h.html#ab8243818512d6078d23e6ffb65fd7bb8" > qvm_split_k< / a > < / div > < div class = "ttdeci" > void qvm_split_k(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & in_vec_size, const constant int & out_vec_size, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int & final_block_size, uint3 tid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1510< / div > < / div >
2024-11-23 04:24:16 +08:00
< div class = "ttc" id = "aquantized_8h_html_aba7687e6f8f1d29c0a1b2a3db150bd81" > < div class = "ttname" > < a href = "quantized_8h.html#aba7687e6f8f1d29c0a1b2a3db150bd81" > qmv_fast_impl< / a > < / div > < div class = "ttdeci" > METAL_FUNC void qmv_fast_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & in_vec_size, const constant int & out_vec_size, uint3 tid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:620< / div > < / div >
2024-12-07 05:22:39 +08:00
< div class = "ttc" id = "aquantized_8h_html_abe2e3ef0ee4ec2cb61dc5330ad463d10" > < div class = "ttname" > < a href = "quantized_8h.html#abe2e3ef0ee4ec2cb61dc5330ad463d10" > qmm_t< / a > < / div > < div class = "ttdeci" > void qmm_t(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & K, const constant int & N, const constant int & M, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint lid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1573< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_accab1f9e17a65242347c051f98e4c0be" > < div class = "ttname" > < a href = "quantized_8h.html#accab1f9e17a65242347c051f98e4c0be" > adjust_matrix_offsets< / a > < / div > < div class = "ttdeci" > METAL_FUNC void adjust_matrix_offsets(const device T *& x, const device uint32_t *& w, const device T *& scales, const device T *& biases, device T *& y, int output_stride, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1213< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_acf4c7fc77821a83b31aedfb48443d3ed" > < div class = "ttname" > < a href = "quantized_8h.html#acf4c7fc77821a83b31aedfb48443d3ed" > bs_qmv< / a > < / div > < div class = "ttdeci" > void bs_qmv(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & in_vec_size, const constant int & out_vec_size, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, const constant int & batch_ndims, const constant int *batch_shape, const device uint32_t *lhs_indices, const device uint32_t *rhs_indices, const constant size_t *lhs_strides, const constant size_t *rhs_strides, uint3 tid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1745< / div > < / div >
2024-11-23 04:24:16 +08:00
< div class = "ttc" id = "aquantized_8h_html_ad5cf1cf63656bc1780685d22169cd4ef" > < div class = "ttname" > < a href = "quantized_8h.html#ad5cf1cf63656bc1780685d22169cd4ef" > qmv_quad_impl< / a > < / div > < div class = "ttdeci" > METAL_FUNC void qmv_quad_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, constant int & in_vec_size, const constant int & out_vec_size, uint3 tid, uint quad_gid, uint quad_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:563< / div > < / div >
2024-12-07 05:22:39 +08:00
< div class = "ttc" id = "aquantized_8h_html_ad84f7d5ab9e32dbbe3ca759ae5d5d5c5" > < div class = "ttname" > < a href = "quantized_8h.html#ad84f7d5ab9e32dbbe3ca759ae5d5d5c5" > qvm< / a > < / div > < div class = "ttdeci" > void qvm(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, const constant int & in_vec_size, const constant int & out_vec_size, const constant int & x_batch_ndims, const constant int *x_shape, const constant size_t *x_strides, const constant int & w_batch_ndims, const constant int *w_shape, const constant size_t *w_strides, const constant size_t *s_strides, const constant size_t *b_strides, uint3 tid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:1459< / div > < / div >
2024-11-23 04:24:16 +08:00
< div class = "ttc" id = "aquantized_8h_html_ae756f6817b584c60f5dcdd1d9c6b4f58" > < div class = "ttname" > < a href = "quantized_8h.html#ae756f6817b584c60f5dcdd1d9c6b4f58" > qouter< / a > < / div > < div class = "ttdeci" > void qouter(const thread uint8_t *w, U x, U scale, U bias, thread U *result)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:307< / div > < / div >
< div class = "ttc" id = "aquantized_8h_html_aecff265b63566d0d5689cfc4e5b037d2" > < div class = "ttname" > < a href = "quantized_8h.html#aecff265b63566d0d5689cfc4e5b037d2" > dequantize< / a > < / div > < div class = "ttdeci" > void dequantize(const device uint8_t *w, U scale, U bias, threadgroup U *w_local)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:372< / div > < / div >
2024-12-07 05:22:39 +08:00
< div class = "ttc" id = "aquantized_8h_html_af5750a35e8f5462218effba719f7f5b8" > < div class = "ttname" > < a href = "quantized_8h.html#af5750a35e8f5462218effba719f7f5b8" > qmm_t_impl< / a > < / div > < div class = "ttdeci" > METAL_FUNC void qmm_t_impl(const device uint32_t *w, const device T *scales, const device T *biases, const device T *x, device T *y, threadgroup T *Xs, threadgroup T *Ws, const constant int & K, const constant int & N, const constant int & M, uint3 tid, uint lid, uint simd_gid, uint simd_lid)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:958< / div > < / div >
< div class = "ttc" id = "astruct_conditional_type_html_a00bac71c43763817c4422bf0363dc92b" > < div class = "ttname" > < a href = "struct_conditional_type.html#a00bac71c43763817c4422bf0363dc92b" > ConditionalType::type< / a > < / div > < div class = "ttdeci" > U type< / div > < div class = "ttdef" > < b > Definition< / b > utils.h:428< / div > < / div >
2024-11-23 04:24:16 +08:00
< div class = "ttc" id = "astruct_limits_html" > < div class = "ttname" > < a href = "struct_limits.html" > Limits< / a > < / div > < div class = "ttdef" > < b > Definition< / b > utils.h:23< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html" > QuantizedBlockLoader< / a > < / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:443< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a0ace7e3762ecfa5a4106e7dee7e1b6ab" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a0ace7e3762ecfa5a4106e7dee7e1b6ab" > QuantizedBlockLoader::group_stride< / a > < / div > < div class = "ttdeci" > const int group_stride< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:464< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a1392a5278cf6e090ea80ebe7c4ac5fbb" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a1392a5278cf6e090ea80ebe7c4ac5fbb" > QuantizedBlockLoader::BCOLS_PACKED< / a > < / div > < div class = "ttdeci" > static constant constexpr const short BCOLS_PACKED< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:456< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a17d01a6aba0833b073586ef2c09d0fbd" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a17d01a6aba0833b073586ef2c09d0fbd" > QuantizedBlockLoader::biases< / a > < / div > < div class = "ttdeci" > const device T * biases< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:473< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a234feacde36a4afc0d740332a3769fb6" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a234feacde36a4afc0d740332a3769fb6" > QuantizedBlockLoader::group_step_cnt< / a > < / div > < div class = "ttdeci" > short group_step_cnt< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:463< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a31e14175f3d4902d9fe5ab5a219f61ba" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a31e14175f3d4902d9fe5ab5a219f61ba" > QuantizedBlockLoader::group_steps< / a > < / div > < div class = "ttdeci" > static constant constexpr const short group_steps< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:459< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a50821537ea747bc03295a09bb0eef475" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a50821537ea747bc03295a09bb0eef475" > QuantizedBlockLoader::thread_idx< / a > < / div > < div class = "ttdeci" > const short thread_idx< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:466< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a60713ce7498aa683cbb2a0f19ab16589" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a60713ce7498aa683cbb2a0f19ab16589" > QuantizedBlockLoader::QuantizedBlockLoader< / a > < / div > < div class = "ttdeci" > QuantizedBlockLoader(const device uint8_t *src_, const device T *scales_, const device T *biases_, const int src_ld_, threadgroup T *dst_, ushort simd_group_id, ushort simd_lane_id)< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:475< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a6123e4a9209d6eacb58b2c2344ed1ecf" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a6123e4a9209d6eacb58b2c2344ed1ecf" > QuantizedBlockLoader::scales< / a > < / div > < div class = "ttdeci" > const device T * scales< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:472< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a6213479f7a6d9314d8879f8856b0b6fb" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a6213479f7a6d9314d8879f8856b0b6fb" > QuantizedBlockLoader::n_reads< / a > < / div > < div class = "ttdeci" > static constant constexpr const short n_reads< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:457< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a674138ef7c43cc45586ea9f8fd6f6bd9" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a674138ef7c43cc45586ea9f8fd6f6bd9" > QuantizedBlockLoader::next< / a > < / div > < div class = "ttdeci" > void next()< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:541< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a699dc9aa284b8fbf870310bbb224465b" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a699dc9aa284b8fbf870310bbb224465b" > QuantizedBlockLoader::load_safe< / a > < / div > < div class = "ttdeci" > void load_safe(short2 src_tile_dim) const< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:511< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a8050977d473d1a24fae5c833e609839e" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a8050977d473d1a24fae5c833e609839e" > QuantizedBlockLoader::src_ld< / a > < / div > < div class = "ttdeci" > const int src_ld< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:461< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a85041d72225a2095659c70509291a906" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a85041d72225a2095659c70509291a906" > QuantizedBlockLoader::bi< / a > < / div > < div class = "ttdeci" > const short bi< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:467< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a86009527cb4b53e4c21fd6b1f78cfefc" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a86009527cb4b53e4c21fd6b1f78cfefc" > QuantizedBlockLoader::load_unsafe< / a > < / div > < div class = "ttdeci" > void load_unsafe() const< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:498< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a8eae73a0c04bf1e41fb96131f6aa500d" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a8eae73a0c04bf1e41fb96131f6aa500d" > QuantizedBlockLoader::pack_factor< / a > < / div > < div class = "ttdeci" > static constant constexpr const short pack_factor< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:454< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_a9857214690fe6abad0e19d1045152f83" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#a9857214690fe6abad0e19d1045152f83" > QuantizedBlockLoader::dst< / a > < / div > < div class = "ttdeci" > threadgroup T * dst< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:470< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_abbf8249ca99e3e87b296ddd60a984b76" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#abbf8249ca99e3e87b296ddd60a984b76" > QuantizedBlockLoader::src< / a > < / div > < div class = "ttdeci" > const device uint8_t * src< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:471< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_ac3f651c1a645291d1037a2cc8ded2320" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#ac3f651c1a645291d1037a2cc8ded2320" > QuantizedBlockLoader::tile_stride< / a > < / div > < div class = "ttdeci" > const int tile_stride< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:462< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_ad00fe6d8bd395206a41693a8ed65d4db" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#ad00fe6d8bd395206a41693a8ed65d4db" > QuantizedBlockLoader::bytes_per_pack< / a > < / div > < div class = "ttdeci" > static constant constexpr const short bytes_per_pack< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:455< / div > < / div >
< div class = "ttc" id = "astruct_quantized_block_loader_html_ae2add92b2aaf3414e91f0470b9b0cc00" > < div class = "ttname" > < a href = "struct_quantized_block_loader.html#ae2add92b2aaf3414e91f0470b9b0cc00" > QuantizedBlockLoader::bj< / a > < / div > < div class = "ttdeci" > const short bj< / div > < div class = "ttdef" > < b > Definition< / b > quantized.h:468< / div > < / div >
2024-10-15 23:12:17 +08:00
< div class = "ttc" id = "astructmlx_1_1steel_1_1_block_loader_html" > < div class = "ttname" > < a href = "structmlx_1_1steel_1_1_block_loader.html" > mlx::steel::BlockLoader< / a > < / div > < div class = "ttdef" > < b > Definition< / b > loader.h:25< / div > < / div >
< / div > <!-- fragment --> < / div > <!-- contents -->
<!-- start footer part -->
< hr class = "footer" / > < address class = "footer" > < small >
Generated by  < a href = "https://www.doxygen.org/index.html" > < img class = "footer" src = "doxygen.svg" width = "104" height = "31" alt = "doxygen" / > < / a > 1.12.0
< / small > < / address >
< / div > <!-- doc - content -->
< / body >
< / html >