mirror of
https://github.com/ml-explore/mlx.git
synced 2025-12-16 01:49:05 +08:00
try older image
This commit is contained in:
@@ -201,22 +201,16 @@ jobs:
|
|||||||
|
|
||||||
cuda_build_and_test:
|
cuda_build_and_test:
|
||||||
machine:
|
machine:
|
||||||
image: linux-cuda-12:default
|
image: linux-cuda-12:2023.11.1
|
||||||
resource_class: gpu.nvidia.small.gen2
|
resource_class: gpu.nvidia.small.gen2
|
||||||
steps:
|
steps:
|
||||||
- checkout
|
- checkout
|
||||||
- run:
|
- run:
|
||||||
name: Install Python package
|
name: Install Python package
|
||||||
command: |
|
command: |
|
||||||
wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb
|
|
||||||
sudo dpkg -i libtinfo5_6.3-2ubuntu0.1_amd64.deb
|
|
||||||
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
|
|
||||||
sudo dpkg -i cuda-keyring_1.0-1_all.deb
|
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install cuda-toolkit-12-0
|
|
||||||
sudo update-alternatives --set cuda /usr/local/cuda-12.0
|
|
||||||
sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
|
sudo apt-get install libblas-dev liblapack-dev liblapacke-dev
|
||||||
python -m venv env
|
python3 -m venv env
|
||||||
source env/bin/activate
|
source env/bin/activate
|
||||||
CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
|
CMAKE_ARGS="-DMLX_BUILD_CUDA=ON -DCMAKE_CUDA_COMPILER=`which nvcc`" \
|
||||||
pip install -e ".[dev]"
|
pip install -e ".[dev]"
|
||||||
|
|||||||
@@ -36,7 +36,8 @@ affine_quantize(const T* w, uint8_t* out, T* scales, T* biases, size_t size) {
|
|||||||
auto tidx = block_idx.x * block_size.x + idx_in_block.x;
|
auto tidx = block_idx.x * block_size.x + idx_in_block.x;
|
||||||
auto tidy = block_idx.y * block_size.y + idx_in_block.y;
|
auto tidy = block_idx.y * block_size.y + idx_in_block.y;
|
||||||
|
|
||||||
auto grid_dim = cg::this_grid().dim_threads();
|
auto grid_dim_x =
|
||||||
|
// Grid width in threads along x: number of blocks along x times threads per
// block along x (block_size is the same extent used to form tidx). This is
// the value cg::this_grid().dim_threads().x provided; multiplying by
// block_index().x instead yields 0 for the first block column and a
// different stride in every block.
cg::this_grid().dim_blocks().x * block_size.x;
|
||||||
constexpr float eps = 1e-7;
|
constexpr float eps = 1e-7;
|
||||||
constexpr int simd_size = WARP_SIZE;
|
constexpr int simd_size = WARP_SIZE;
|
||||||
constexpr float n_bins = (1 << bits) - 1;
|
constexpr float n_bins = (1 << bits) - 1;
|
||||||
@@ -48,7 +49,7 @@ affine_quantize(const T* w, uint8_t* out, T* scales, T* biases, size_t size) {
|
|||||||
writes_per_reduce > 1 ? 1 : values_per_reduce / pack_factor;
|
writes_per_reduce > 1 ? 1 : values_per_reduce / pack_factor;
|
||||||
constexpr int power_of_2_bits = (bits & (bits - 1)) == 0;
|
constexpr int power_of_2_bits = (bits & (bits - 1)) == 0;
|
||||||
|
|
||||||
size_t offset = tidx + grid_dim.x * size_t(tidy);
|
size_t offset = tidx + grid_dim_x * size_t(tidy);
|
||||||
size_t in_index = offset * values_per_reduce;
|
size_t in_index = offset * values_per_reduce;
|
||||||
if (in_index >= size) {
|
if (in_index >= size) {
|
||||||
return;
|
return;
|
||||||
@@ -153,12 +154,13 @@ __global__ void affine_dequantize(
|
|||||||
auto tidx = block_idx.x * block_size.x + idx_in_block.x;
|
auto tidx = block_idx.x * block_size.x + idx_in_block.x;
|
||||||
auto tidy = block_idx.y * block_size.y + idx_in_block.y;
|
auto tidy = block_idx.y * block_size.y + idx_in_block.y;
|
||||||
|
|
||||||
auto grid_dim = cg::this_grid().dim_threads();
|
auto grid_dim_x =
|
||||||
|
// Grid width in threads along x: number of blocks along x times threads per
// block along x (block_size is the same extent used to form tidx). This is
// the value cg::this_grid().dim_threads().x provided; multiplying by
// block_index().x instead yields 0 for the first block column and a
// different stride in every block.
cg::this_grid().dim_blocks().x * block_size.x;
|
||||||
|
|
||||||
constexpr int pack_factor = get_pack_factor<bits, 8>();
|
constexpr int pack_factor = get_pack_factor<bits, 8>();
|
||||||
constexpr int bytes_per_pack = get_bytes_per_pack<bits>();
|
constexpr int bytes_per_pack = get_bytes_per_pack<bits>();
|
||||||
|
|
||||||
size_t offset = tidx + grid_dim.x * size_t(tidy);
|
size_t offset = tidx + grid_dim_x * size_t(tidy);
|
||||||
size_t oindex = offset * pack_factor;
|
size_t oindex = offset * pack_factor;
|
||||||
|
|
||||||
if (oindex >= size) {
|
if (oindex >= size) {
|
||||||
@@ -349,7 +351,8 @@ void fast::AffineQuantize::eval_gpu(
|
|||||||
dispatch_bits(bits_, [&](auto bits) {
|
dispatch_bits(bits_, [&](auto bits) {
|
||||||
using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
|
using DataType = cuda_type_t<MLX_GET_TYPE(type_tag)>;
|
||||||
if (dequantize_) {
|
if (dequantize_) {
|
||||||
auto kernel = cu::affine_dequantize<DataType, group_size(), bits()>;
|
auto kernel =
|
||||||
|
cu::affine_dequantize<DataType, group_size.value, bits.value>;
|
||||||
auto [num_blocks, block_dims] =
|
auto [num_blocks, block_dims] =
|
||||||
get_launch_args(kernel, size, grid_shape, w.strides(), large);
|
get_launch_args(kernel, size, grid_shape, w.strides(), large);
|
||||||
enc.add_kernel_node(
|
enc.add_kernel_node(
|
||||||
@@ -362,7 +365,8 @@ void fast::AffineQuantize::eval_gpu(
|
|||||||
out.data<DataType>(),
|
out.data<DataType>(),
|
||||||
out.size());
|
out.size());
|
||||||
} else {
|
} else {
|
||||||
auto kernel = cu::affine_quantize<DataType, group_size(), bits()>;
|
auto kernel =
|
||||||
|
cu::affine_quantize<DataType, group_size.value, bits.value>;
|
||||||
auto [num_blocks, block_dims] =
|
auto [num_blocks, block_dims] =
|
||||||
get_launch_args(kernel, size, grid_shape, w.strides(), large);
|
get_launch_args(kernel, size, grid_shape, w.strides(), large);
|
||||||
enc.add_kernel_node(
|
enc.add_kernel_node(
|
||||||
|
|||||||
Reference in New Issue
Block a user