Merge 8bb8b76ae4 into b8022c578a

2025-06-24 17:31:16 +08:00 · 2025-06-17 19:03:19 +12:00 · 2025-06-17 19:03:19 +12:00 · 6c2c1850a9
commit 6c2c1850a9
parent b8022c578a 8bb8b76ae4
38 changed files with 1044 additions and 2 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -35,6 +35,7 @@ option(MLX_BUILD_PYTHON_BINDINGS "Build python bindings for mlx" OFF)
 option(MLX_BUILD_METAL "Build metal backend" ON)
 option(MLX_BUILD_CPU "Build cpu backend" ON)
 option(MLX_BUILD_CUDA "Build cuda backend" OFF)
 option(MLX_BUILD_ROCM "Build ROCm backend" OFF)
 option(MLX_METAL_DEBUG "Enhance metal debug workflow" OFF)
 option(MLX_ENABLE_X64_MAC "Enable building for x64 macOS" OFF)
 option(MLX_BUILD_GGUF "Include support for GGUF format" ON)
@ -88,6 +89,10 @@ if(MLX_BUILD_CUDA)
  enable_language(CUDA)
 endif()
 if(MLX_BUILD_ROCM)
  enable_language(HIP)
 endif()
 if(MLX_BUILD_METAL AND NOT METAL_LIB)
  message(STATUS "Metal not found. Unable to build GPU")
  set(MLX_BUILD_METAL OFF)
--- a/mlx/CMakeLists.txt
+++ b/mlx/CMakeLists.txt
@ -60,7 +60,16 @@ else()
                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/cuda/no_cuda.cpp)
 endif()
-if(MLX_BUILD_METAL OR MLX_BUILD_CUDA)
+if(MLX_BUILD_ROCM)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/rocm)
 else()
  target_sources(mlx
                 PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/backend/rocm/no_rocm.cpp)
 endif()
 if(MLX_BUILD_METAL
   OR MLX_BUILD_CUDA
   OR MLX_BUILD_ROCM)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/gpu)
 else()
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backend/no_gpu)
--- a/mlx/backend/rocm/CMakeLists.txt
+++ b/mlx/backend/rocm/CMakeLists.txt
@ -0,0 +1,85 @@
 # Filename rules in ROCm backend:
 #
 # * Use .hip/.hpp if code contains device code, and .cpp/.h if not.
 # * Device-only code should be put in device/ subdir.
 # * Files in device/ subdir should not include files outside.
 target_sources(
  mlx
  PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/allocator.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/arg_reduce.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/binary.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/compiled.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/copy.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/eval.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/event.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/fence.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/indexing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/kernel_utils.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/matmul.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/layer_norm.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/logsumexp.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/primitives.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/random.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/reduce.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/rms_norm.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/rope.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/rocm.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/slicing.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/softmax.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/sort.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/ternary.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/unary.hip
          ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp
          ${CMAKE_CURRENT_SOURCE_DIR}/worker.cpp)
 target_compile_definitions(mlx PRIVATE MLX_USE_ROCM)
 # Embed kernel sources in binary for JIT compilation.
 file(
  GLOB MLX_JIT_SOURCES
  RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
  "${CMAKE_CURRENT_SOURCE_DIR}/device/*.h"
  "${CMAKE_CURRENT_SOURCE_DIR}/device/*.hpp")
 string(JOIN ":" MLX_JIT_SOURCES_ARG ${MLX_JIT_SOURCES})
 add_custom_command(
  OUTPUT gen/rocm_jit_sources.h
  COMMAND
    ${CMAKE_COMMAND} -DMLX_SOURCE_ROOT=${CMAKE_CURRENT_SOURCE_DIR}
    -DMLX_JIT_SOURCES=${MLX_JIT_SOURCES_ARG} -P
    "${CMAKE_CURRENT_SOURCE_DIR}/bin2h.cmake"
  DEPENDS bin2h.cmake ${MLX_JIT_SOURCES})
 add_custom_target(rocm_jit_sources DEPENDS gen/rocm_jit_sources.h)
 add_dependencies(mlx rocm_jit_sources)
 target_include_directories(mlx PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/gen")
 # Find ROCm installation
 find_package(hip REQUIRED)
 find_package(rocblas REQUIRED)
 # Link with ROCm libraries
 target_link_libraries(mlx PRIVATE hip::device roc::rocblas)
 # Set GPU architectures for ROCm Common ROCm architectures: gfx900, gfx906,
 # gfx908, gfx90a, gfx1030, gfx1100
 set(MLX_ROCM_ARCHITECTURES
    "gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100"
    CACHE STRING "ROCm GPU architectures")
 message(STATUS "ROCm GPU architectures: ${MLX_ROCM_ARCHITECTURES}")
 # Set GPU targets for HIP compilation
 set_property(TARGET mlx PROPERTY HIP_ARCHITECTURES "${MLX_ROCM_ARCHITECTURES}")
 # Enable HIP language support
 enable_language(HIP)
 # Set HIP compiler flags
 target_compile_options(
  mlx
  PRIVATE "$<$<COMPILE_LANGUAGE:HIP>:-fgpu-rdc>"
          "$<$<COMPILE_LANGUAGE:HIP>:-Xcompiler=-Wall>"
          "$<$<COMPILE_LANGUAGE:HIP>:-Xcompiler=-Wextra>")
 # Add ROCm include directories
 target_include_directories(mlx PRIVATE ${hip_INCLUDE_DIRS})
 target_include_directories(mlx PRIVATE ${rocblas_INCLUDE_DIRS})
--- a/mlx/backend/rocm/allocator.cpp
+++ b/mlx/backend/rocm/allocator.cpp
@ -0,0 +1,20 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/rocm/allocator.h"
 #include "mlx/backend/rocm/utils.h"
 namespace mlx::core::rocm {
 void* allocate(size_t size) {
  void* ptr;
  check_hip_error("hipMalloc", hipMalloc(&ptr, size));
  return ptr;
 }
 void deallocate(void* ptr) {
  if (ptr) {
    check_hip_error("hipFree", hipFree(ptr));
  }
 }
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/allocator.h
+++ b/mlx/backend/rocm/allocator.h
@ -0,0 +1,12 @@
 // Copyright © 2025 Apple Inc.
 #pragma once
 #include <cstddef>
 namespace mlx::core::rocm {
 void* allocate(size_t size);
 void deallocate(void* ptr);
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/arg_reduce.hip
+++ b/mlx/backend/rocm/arg_reduce.hip
@ -0,0 +1,28 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 __global__ void argmax_kernel(float* input, int* output, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Simple argmax placeholder
  if (idx == 0) {
    int max_idx = 0;
    float max_val = input[0];
    for (int i = 1; i < n; i++) {
      if (input[i] > max_val) {
        max_val = input[i];
        max_idx = i;
      }
    }
    output[0] = max_idx;
  }
 }
 void launch_argmax(float* input, int* output, int n, hipStream_t stream) {
  hipLaunchKernelGGL(argmax_kernel, dim3(1), dim3(1), 0, stream, input, output, n);
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/bin2h.cmake
+++ b/mlx/backend/rocm/bin2h.cmake
@ -0,0 +1,47 @@
 # Copyright © 2025 Apple Inc.
 # Script to embed kernel source files as header for JIT compilation
 set(MLX_OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/gen/rocm_jit_sources.h")
 set(MLX_KERNEL_HEADER
    "#pragma once\n\n#include <unordered_map>\n#include <string>\n\nnamespace mlx::core::rocm {\n\n"
 )
 set(MLX_KERNEL_FOOTER "\n} // namespace mlx::core::rocm\n")
 # Create output directory
 get_filename_component(MLX_OUTPUT_DIR ${MLX_OUTPUT_FILE} DIRECTORY)
 file(MAKE_DIRECTORY ${MLX_OUTPUT_DIR})
 # Write header
 file(WRITE ${MLX_OUTPUT_FILE} ${MLX_KERNEL_HEADER})
 # Process JIT sources
 string(REPLACE ":" ";" MLX_JIT_SOURCES_LIST ${MLX_JIT_SOURCES})
 set(MLX_SOURCE_MAP
    "const std::unordered_map<std::string, std::string> kernel_sources = {\n")
 foreach(source IN LISTS MLX_JIT_SOURCES_LIST)
  set(source_file "${MLX_SOURCE_ROOT}/${source}")
  if(EXISTS ${source_file})
    # Read source file
    file(READ ${source_file} source_content)
    # Escape content for C++ string literal
    string(REPLACE "\\" "\\\\" source_content "${source_content}")
    string(REPLACE "\"" "\\\"" source_content "${source_content}")
    string(REPLACE "\n" "\\n\"\n\"" source_content "${source_content}")
    # Add to map
    set(MLX_SOURCE_MAP
        "${MLX_SOURCE_MAP}  {\"${source}\", \"${source_content}\"},\n")
  endif()
 endforeach()
 set(MLX_SOURCE_MAP "${MLX_SOURCE_MAP}};\n")
 # Write source map
 file(APPEND ${MLX_OUTPUT_FILE} ${MLX_SOURCE_MAP})
 # Write footer
 file(APPEND ${MLX_OUTPUT_FILE} ${MLX_KERNEL_FOOTER})
--- a/mlx/backend/rocm/binary.hip
+++ b/mlx/backend/rocm/binary.hip
@ -0,0 +1,36 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 #include "mlx/backend/rocm/utils.h"
 namespace mlx::core::rocm {
 // Basic binary operation kernels will go here
 __global__ void add_kernel(float* a, float* b, float* c, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    c[idx] = a[idx] + b[idx];
  }
 }
 __global__ void multiply_kernel(float* a, float* b, float* c, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    c[idx] = a[idx] * b[idx];
  }
 }
 void launch_add(float* a, float* b, float* c, int n, hipStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  hipLaunchKernelGGL(add_kernel, dim3(blocks), dim3(threads), 0, stream, a, b, c, n);
 }
 void launch_multiply(float* a, float* b, float* c, int n, hipStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  hipLaunchKernelGGL(multiply_kernel, dim3(blocks), dim3(threads), 0, stream, a, b, c, n);
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/compiled.cpp
+++ b/mlx/backend/rocm/compiled.cpp
@ -0,0 +1,9 @@
 // Copyright © 2025 Apple Inc.
 namespace mlx::core::rocm {
 void compile() {
  // Placeholder for ROCm compilation
 }
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/copy.hip
+++ b/mlx/backend/rocm/copy.hip
@ -0,0 +1,20 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 __global__ void copy_kernel(float* src, float* dst, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    dst[idx] = src[idx];
  }
 }
 void launch_copy(float* src, float* dst, int n, hipStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  hipLaunchKernelGGL(copy_kernel, dim3(blocks), dim3(threads), 0, stream, src, dst, n);
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/device.cpp
+++ b/mlx/backend/rocm/device.cpp
@ -0,0 +1,104 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/rocm/device.h"
 #include "mlx/backend/rocm/utils.h"
 namespace mlx::core::rocm {
 DeviceStream::DeviceStream(Device& device) : device_(device) {
  check_hip_error("hipStreamCreate", hipStreamCreate(&stream_));
  encoder_ = std::make_unique<CommandEncoder>(*this);
 }
 void DeviceStream::synchronize() {
  check_hip_error("hipStreamSynchronize", hipStreamSynchronize(stream_));
 }
 hipStream_t DeviceStream::schedule_hip_stream() {
  return stream_;
 }
 hipStream_t DeviceStream::last_hip_stream() {
  return stream_;
 }
 CommandEncoder& DeviceStream::get_encoder() {
  return *encoder_;
 }
 Device::Device(int device) : device_(device) {
  check_hip_error("hipSetDevice", hipSetDevice(device_));
  // Get device properties
  hipDeviceProp_t prop;
  check_hip_error(
      "hipGetDeviceProperties", hipGetDeviceProperties(&prop, device_));
  compute_capability_major_ = prop.major;
  compute_capability_minor_ = prop.minor;
  // Create rocBLAS handle
  check_hip_error(
      "rocblas_create_handle",
      static_cast<hipError_t>(rocblas_create_handle(&rocblas_handle_)));
 }
 Device::~Device() {
  if (rocblas_handle_) {
    rocblas_destroy_handle(rocblas_handle_);
  }
 }
 void Device::make_current() {
  check_hip_error("hipSetDevice", hipSetDevice(device_));
 }
 DeviceStream& Device::get_stream(Stream s) {
  auto it = streams_.find(s.index);
  if (it != streams_.end()) {
    return it->second;
  }
  auto [new_it, inserted] = streams_.emplace(s.index, DeviceStream(*this));
  return new_it->second;
 }
 CommandEncoder::CommandEncoder(DeviceStream& stream)
    : device_(stream.device()), stream_(stream), worker_() {}
 void CommandEncoder::add_completed_handler(std::function<void()> task) {
  worker_.enqueue(task);
 }
 void CommandEncoder::end_encoding() {
  // Implementation for ending encoding
 }
 void CommandEncoder::commit() {
  worker_.commit();
 }
 // Global device management
 static std::unordered_map<int, std::unique_ptr<Device>> devices_;
 Device& device(mlx::core::Device device) {
  auto it = devices_.find(device.index);
  if (it != devices_.end()) {
    return *it->second;
  }
  auto new_device = std::make_unique<Device>(device.index);
  Device& dev_ref = *new_device;
  devices_[device.index] = std::move(new_device);
  return dev_ref;
 }
 DeviceStream& get_stream(Stream s) {
  // Use default device (index 0) for now
  return device(mlx::core::Device{mlx::core::Device::gpu, 0}).get_stream(s);
 }
 CommandEncoder& get_command_encoder(Stream s) {
  return get_stream(s).get_encoder();
 }
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/device.h
+++ b/mlx/backend/rocm/device.h
@ -0,0 +1,141 @@
 // Copyright © 2025 Apple Inc.
 #pragma once
 #include "mlx/array.h"
 #include "mlx/backend/rocm/worker.h"
 #include "mlx/stream.h"
 #include <hip/hip_runtime.h>
 #include <rocblas/rocblas.h>
 #include <unordered_map>
 namespace mlx::core::rocm {
 class Device;
 class CommandEncoder;
 class DeviceStream {
 public:
  explicit DeviceStream(Device& device);
  DeviceStream(const DeviceStream&) = delete;
  DeviceStream& operator=(const DeviceStream&) = delete;
  // Wait until kernels in the stream complete.
  void synchronize();
  // Return a HIP stream for launching kernels.
  hipStream_t schedule_hip_stream();
  // Return the last HIP stream used.
  hipStream_t last_hip_stream();
  CommandEncoder& get_encoder();
  Device& device() {
    return device_;
  }
 private:
  Device& device_;
  HipStream stream_;
  std::unique_ptr<CommandEncoder> encoder_;
 };
 class Device {
 public:
  explicit Device(int device);
  ~Device();
  Device(const Device&) = delete;
  Device& operator=(const Device&) = delete;
  // Make this device the current HIP device, required by some HIP calls.
  void make_current();
  DeviceStream& get_stream(Stream s);
  int hip_device() const {
    return device_;
  }
  int compute_capability_major() const {
    return compute_capability_major_;
  }
  int compute_capability_minor() const {
    return compute_capability_minor_;
  }
  rocblas_handle rocblas_handle() const {
    return rocblas_handle_;
  }
 private:
  int device_;
  int compute_capability_major_;
  int compute_capability_minor_;
  rocblas_handle rocblas_handle_;
  std::unordered_map<int, DeviceStream> streams_;
 };
 class CommandEncoder {
 public:
  explicit CommandEncoder(DeviceStream& stream);
  CommandEncoder(const CommandEncoder&) = delete;
  CommandEncoder& operator=(const CommandEncoder&) = delete;
  void set_input_array(const array& arr) {}
  void set_output_array(const array& arr) {}
  void add_temporary(const array& arr) {
    temporaries_.push_back(arr.data_shared_ptr());
  }
  void add_completed_handler(std::function<void()> task);
  void end_encoding();
  void commit();
  // Schedule a HIP stream for |fun| to launch kernels, and check error
  // afterwards.
  template <typename F>
  void launch_kernel(F&& fun) {
    launch_kernel(stream_.schedule_hip_stream(), std::forward<F>(fun));
  }
  template <typename F>
  void launch_kernel(hipStream_t stream, F&& fun) {
    device_.make_current();
    fun(stream);
    check_hip_error("kernel launch", hipGetLastError());
    has_gpu_work_ = true;
  }
  Device& device() {
    return device_;
  }
  DeviceStream& stream() {
    return stream_;
  }
  bool has_gpu_work() const {
    return has_gpu_work_;
  }
 private:
  Device& device_;
  DeviceStream& stream_;
  Worker worker_;
  bool has_gpu_work_{false};
  std::vector<std::shared_ptr<array::Data>> temporaries_;
 };
 Device& device(mlx::core::Device device);
 DeviceStream& get_stream(Stream s);
 CommandEncoder& get_command_encoder(Stream s);
 // Utility function to check HIP errors
 void check_hip_error(const char* msg, hipError_t error);
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/eval.cpp
+++ b/mlx/backend/rocm/eval.cpp
@ -0,0 +1,11 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/rocm/utils.h"
 namespace mlx::core::rocm {
 void eval() {
  // Placeholder for ROCm evaluation
 }
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/event.hip
+++ b/mlx/backend/rocm/event.hip
@ -0,0 +1,32 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 #include "mlx/backend/rocm/utils.h"
 namespace mlx::core::rocm {
 class Event {
 public:
  Event() {
    check_hip_error("hipEventCreate", hipEventCreate(&event_));
  }
  ~Event() {
    hipEventDestroy(event_);
  }
  void record(hipStream_t stream) {
    check_hip_error("hipEventRecord", hipEventRecord(event_, stream));
  }
  void wait() {
    check_hip_error("hipEventSynchronize", hipEventSynchronize(event_));
  }
  hipEvent_t event() const { return event_; }
 private:
  hipEvent_t event_;
 };
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/fence.cpp
+++ b/mlx/backend/rocm/fence.cpp
@ -0,0 +1,9 @@
 // Copyright © 2025 Apple Inc.
 namespace mlx::core::rocm {
 void fence() {
  // Placeholder for ROCm fence operation
 }
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/indexing.cpp
+++ b/mlx/backend/rocm/indexing.cpp
@ -0,0 +1,9 @@
 // Copyright © 2025 Apple Inc.
 namespace mlx::core::rocm {
 void index() {
  // Placeholder for ROCm indexing operation
 }
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/kernel_utils.hip
+++ b/mlx/backend/rocm/kernel_utils.hip
@ -0,0 +1,29 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 // Utility functions for HIP kernels
 __device__ inline int get_global_id() {
  return blockIdx.x * blockDim.x + threadIdx.x;
 }
 __device__ inline int get_local_id() {
  return threadIdx.x;
 }
 __device__ inline int get_group_id() {
  return blockIdx.x;
 }
 __device__ inline int get_local_size() {
  return blockDim.x;
 }
 __device__ inline int get_num_groups() {
  return gridDim.x;
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/layer_norm.hip
+++ b/mlx/backend/rocm/layer_norm.hip
@ -0,0 +1,37 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 __global__ void layer_norm_kernel(
    float* input, 
    float* output, 
    float* gamma, 
    float* beta, 
    int n, 
    float eps) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    // Simplified layer norm placeholder
    // Real implementation would compute mean and variance
    output[idx] = gamma[idx] * input[idx] + beta[idx];
  }
 }
 void launch_layer_norm(
    float* input, 
    float* output, 
    float* gamma, 
    float* beta, 
    int n, 
    float eps, 
    hipStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  hipLaunchKernelGGL(layer_norm_kernel, dim3(blocks), dim3(threads), 0, stream, 
                     input, output, gamma, beta, n, eps);
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/logsumexp.hip
+++ b/mlx/backend/rocm/logsumexp.hip
@ -0,0 +1,13 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 __global__ void logsumexp_kernel(float* input, float* output, int n) {
  // Placeholder implementation
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  (void)input; (void)output; (void)n; (void)idx;
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/matmul.cpp
+++ b/mlx/backend/rocm/matmul.cpp
@ -0,0 +1,30 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/rocm/device.h"
 #include "mlx/backend/rocm/utils.h"
 namespace mlx::core::rocm {
 void matmul_hip(
    float* a,
    float* b,
    float* c,
    int m,
    int n,
    int k,
    hipStream_t stream) {
  // This is a placeholder - in a real implementation, this would use rocBLAS
  // auto& device = get_current_device();
  // rocblas_sgemm(device.rocblas_handle(), ...);
  // For now, just a placeholder
  (void)a;
  (void)b;
  (void)c;
  (void)m;
  (void)n;
  (void)k;
  (void)stream;
 }
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/no_rocm.cpp
+++ b/mlx/backend/rocm/no_rocm.cpp
@ -0,0 +1,11 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/rocm/rocm.h"
 namespace mlx::core::rocm {
 bool is_available() {
  return false;
 }
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/primitives.hip
+++ b/mlx/backend/rocm/primitives.hip
@ -0,0 +1,21 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 #include "mlx/backend/rocm/utils.h"
 #include "mlx/backend/common/primitives.h"
 namespace mlx::core::rocm {
 // Basic kernel implementations will go here
 // This is a placeholder for ROCm-specific primitive operations
 void add_hip() {
  // Placeholder for HIP add operation
 }
 void multiply_hip() {
  // Placeholder for HIP multiply operation  
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/random.hip
+++ b/mlx/backend/rocm/random.hip
@ -0,0 +1,23 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 __global__ void random_uniform_kernel(float* output, int n, unsigned int seed) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    // Simple LCG placeholder - real implementation would use rocRAND
    unsigned int state = seed + idx;
    state = state * 1103515245 + 12345;
    output[idx] = (float)(state & 0x7FFFFFFF) / (float)0x7FFFFFFF;
  }
 }
 void launch_random_uniform(float* output, int n, unsigned int seed, hipStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  hipLaunchKernelGGL(random_uniform_kernel, dim3(blocks), dim3(threads), 0, stream, output, n, seed);
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/reduce.hip
+++ b/mlx/backend/rocm/reduce.hip
@ -0,0 +1,24 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 __global__ void sum_reduce_kernel(float* input, float* output, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Simple reduction placeholder
  if (idx == 0) {
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
      sum += input[i];
    }
    output[0] = sum;
  }
 }
 void launch_sum_reduce(float* input, float* output, int n, hipStream_t stream) {
  hipLaunchKernelGGL(sum_reduce_kernel, dim3(1), dim3(1), 0, stream, input, output, n);
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/rms_norm.hip
+++ b/mlx/backend/rocm/rms_norm.hip
@ -0,0 +1,13 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 __global__ void rms_norm_kernel(float* input, float* output, int n) {
  // Placeholder implementation
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  (void)input; (void)output; (void)n; (void)idx;
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/rocm.cpp
+++ b/mlx/backend/rocm/rocm.cpp
@ -0,0 +1,11 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/rocm/rocm.h"
 namespace mlx::core::rocm {
 bool is_available() {
  return true;
 }
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/rocm.h
+++ b/mlx/backend/rocm/rocm.h
@ -0,0 +1,10 @@
 // Copyright © 2025 Apple Inc.
 #pragma once
 namespace mlx::core::rocm {
 /* Check if the ROCm backend is available. */
 bool is_available();
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/rope.hip
+++ b/mlx/backend/rocm/rope.hip
@ -0,0 +1,13 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 __global__ void rope_kernel(float* input, float* output, int n) {
  // Placeholder for RoPE implementation
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  (void)input; (void)output; (void)n; (void)idx;
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/slicing.cpp
+++ b/mlx/backend/rocm/slicing.cpp
@ -0,0 +1,9 @@
 // Copyright © 2025 Apple Inc.
 namespace mlx::core::rocm {
 void slice() {
  // Placeholder for ROCm slicing operation
 }
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/softmax.hip
+++ b/mlx/backend/rocm/softmax.hip
@ -0,0 +1,22 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 __global__ void softmax_kernel(float* input, float* output, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    // Simplified softmax placeholder - real implementation needs reduction
    output[idx] = expf(input[idx]);
  }
 }
 void launch_softmax(float* input, float* output, int n, hipStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  hipLaunchKernelGGL(softmax_kernel, dim3(blocks), dim3(threads), 0, stream, input, output, n);
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/sort.hip
+++ b/mlx/backend/rocm/sort.hip
@ -0,0 +1 @@
--- a/mlx/backend/rocm/ternary.hip
+++ b/mlx/backend/rocm/ternary.hip
@ -0,0 +1,20 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 __global__ void select_kernel(float* condition, float* a, float* b, float* output, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    output[idx] = (condition[idx] != 0.0f) ? a[idx] : b[idx];
  }
 }
 void launch_select(float* condition, float* a, float* b, float* output, int n, hipStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  hipLaunchKernelGGL(select_kernel, dim3(blocks), dim3(threads), 0, stream, condition, a, b, output, n);
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/unary.hip
+++ b/mlx/backend/rocm/unary.hip
@ -0,0 +1,33 @@
 // Copyright © 2025 Apple Inc.
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 __global__ void relu_kernel(float* input, float* output, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    output[idx] = fmaxf(0.0f, input[idx]);
  }
 }
 __global__ void sigmoid_kernel(float* input, float* output, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    output[idx] = 1.0f / (1.0f + expf(-input[idx]));
  }
 }
 void launch_relu(float* input, float* output, int n, hipStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  hipLaunchKernelGGL(relu_kernel, dim3(blocks), dim3(threads), 0, stream, input, output, n);
 }
 void launch_sigmoid(float* input, float* output, int n, hipStream_t stream) {
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  hipLaunchKernelGGL(sigmoid_kernel, dim3(blocks), dim3(threads), 0, stream, input, output, n);
 }
 } // namespace mlx::core::rocm 
--- a/mlx/backend/rocm/utils.cpp
+++ b/mlx/backend/rocm/utils.cpp
@ -0,0 +1,17 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/rocm/utils.h"
 #include <sstream>
 #include <stdexcept>
 namespace mlx::core::rocm {
 void check_hip_error(const char* msg, hipError_t error) {
  if (error != hipSuccess) {
    std::ostringstream oss;
    oss << "[ROCm] " << msg << ": " << hipGetErrorString(error);
    throw std::runtime_error(oss.str());
  }
 }
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/utils.h
+++ b/mlx/backend/rocm/utils.h
@ -0,0 +1,12 @@
 // Copyright © 2025 Apple Inc.
 #pragma once
 #include <hip/hip_runtime.h>
 namespace mlx::core::rocm {
 // Utility function to check HIP errors
 void check_hip_error(const char* msg, hipError_t error);
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/worker.cpp
+++ b/mlx/backend/rocm/worker.cpp
@ -0,0 +1,61 @@
 // Copyright © 2025 Apple Inc.
 #include "mlx/backend/rocm/worker.h"
 namespace mlx::core::rocm {
 Worker::Worker() : worker_thread_(&Worker::worker_loop, this) {}
 Worker::~Worker() {
  {
    std::lock_guard<std::mutex> lock(mutex_);
    stop_ = true;
  }
  cv_.notify_all();
  if (worker_thread_.joinable()) {
    worker_thread_.join();
  }
 }
 void Worker::enqueue(std::function<void()> task) {
  {
    std::lock_guard<std::mutex> lock(mutex_);
    tasks_.push(task);
  }
  cv_.notify_one();
 }
 void Worker::commit() {
  std::lock_guard<std::mutex> lock(mutex_);
  committed_ = true;
 }
 void Worker::join() {
  std::unique_lock<std::mutex> lock(mutex_);
  cv_.wait(lock, [this] { return tasks_.empty() && committed_; });
 }
 void Worker::worker_loop() {
  while (true) {
    std::function<void()> task;
    {
      std::unique_lock<std::mutex> lock(mutex_);
      cv_.wait(lock, [this] { return stop_ || !tasks_.empty(); });
      if (stop_) {
        break;
      }
      if (!tasks_.empty()) {
        task = tasks_.front();
        tasks_.pop();
      }
    }
    if (task) {
      task();
    }
  }
 }
 } // namespace mlx::core::rocm
--- a/mlx/backend/rocm/worker.h
+++ b/mlx/backend/rocm/worker.h
@ -0,0 +1,38 @@
 // Copyright © 2025 Apple Inc.
 #pragma once
 #include <hip/hip_runtime.h>
 #include <functional>
 #include <future>
 #include <queue>
 #include <thread>
 namespace mlx::core::rocm {
 using HipStream = hipStream_t;
 class Worker {
 public:
  Worker();
  ~Worker();
  Worker(const Worker&) = delete;
  Worker& operator=(const Worker&) = delete;
  void enqueue(std::function<void()> task);
  void commit();
  void join();
 private:
  void worker_loop();
  std::thread worker_thread_;
  std::queue<std::function<void()>> tasks_;
  std::mutex mutex_;
  std::condition_variable cv_;
  bool stop_{false};
  bool committed_{false};
 };
 } // namespace mlx::core::rocm
--- a/mlx/device.cpp
+++ b/mlx/device.cpp
@ -6,10 +6,23 @@
 #include "mlx/backend/gpu/available.h"
 #include "mlx/device.h"
 #ifdef MLX_USE_ROCM
 #include "mlx/backend/rocm/rocm.h"
 #endif
 namespace mlx::core {
 Device& mutable_default_device() {
-  static Device default_device{gpu::is_available() ? Device::gpu : Device::cpu};
+  Device::DeviceType default_type = Device::cpu;
  if (gpu::is_available()) {
    default_type = Device::gpu;
  }
 #ifdef MLX_USE_ROCM
  else if (rocm::is_available()) {
    default_type = Device::gpu; // ROCm devices use the generic gpu type
  }
 #endif
  static Device default_device{default_type};
  return default_device;
 }
@ -38,7 +51,11 @@ bool is_available(const Device& d) {
    case Device::cpu:
      return cpu::is_available();
    case Device::gpu:
 #ifdef MLX_USE_ROCM
      return gpu::is_available() || rocm::is_available();
 #else
      return gpu::is_available();
 #endif
  }
  // appease compiler
  return false;