From 26e2d97e27d8cd02d5467555ae3d840880c5bc76 Mon Sep 17 00:00:00 2001 From: Awni Hannun Date: Thu, 9 Jan 2025 15:30:16 -0800 Subject: [PATCH] comment --- llms/export/mlxlm.h | 2 -- llms/export/third_party/CMakeLists.txt | 10 ++++------ llms/export/third_party/download_unicode.sh | 4 +--- llms/export/tokenizer.cpp | 8 ++++++-- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/llms/export/mlxlm.h b/llms/export/mlxlm.h index 0b1da6ec..9dba3b0a 100644 --- a/llms/export/mlxlm.h +++ b/llms/export/mlxlm.h @@ -10,8 +10,6 @@ std::function load_model(const std::string &path); BPETokenizer load_tokenizer(const std::string &path); -struct GenerationResponse {}; - void generate(const std::function &model, const BPETokenizer &tokenizer, const std::string &prompt, int max_tokens = 256); diff --git a/llms/export/third_party/CMakeLists.txt b/llms/export/third_party/CMakeLists.txt index 2ece0d1f..0496ea57 100644 --- a/llms/export/third_party/CMakeLists.txt +++ b/llms/export/third_party/CMakeLists.txt @@ -7,12 +7,10 @@ FetchContent_MakeAvailable(json) target_include_directories( mlxlm PRIVATE $) -add_custom_target(unicode) -add_custom_command( - TARGET unicode - PRE_BUILD - COMMAND /bin/bash ${CMAKE_CURRENT_SOURCE_DIR}/download_unicode.sh) -add_dependencies(mlxlm unicode) +execute_process( + COMMAND zsh "${CMAKE_CURRENT_SOURCE_DIR}/download_unicode.sh" "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND_ERROR_IS_FATAL ANY +) target_sources(mlxlm PRIVATE diff --git a/llms/export/third_party/download_unicode.sh b/llms/export/third_party/download_unicode.sh index bacc0c46..6e52fe38 100644 --- a/llms/export/third_party/download_unicode.sh +++ b/llms/export/third_party/download_unicode.sh @@ -5,7 +5,5 @@ url=https://raw.githubusercontent.com/ggerganov/llama.cpp/${commit}/src/ for file in 'unicode.cpp' 'unicode.h' 'unicode-data.cpp' 'unicode-data.h' do - curl -OL ${url}/${file} + curl -OL ${url}/${file} --output-dir $1 2>/dev/null done - -touch unicode_downloaded diff --git a/llms/export/tokenizer.cpp b/llms/export/tokenizer.cpp index 6ecd8b1e..5e68a43b 100644 --- a/llms/export/tokenizer.cpp +++ b/llms/export/tokenizer.cpp @@ -98,13 +98,17 @@ std::vector BPETokenizer::encode(std::string text) const { auto one_step_merge = [this](std::string segment, std::vector &splits) { int merge_idx; int rank = INT32_MAX; + std::string candidate; for (int i = 0; i < splits.size() - 2; ++i) { auto start = splits[i]; auto mid = splits[i + 1]; auto end = splits[i + 2]; - std::string candidate = segment.substr(start, mid - start); + candidate.clear(); + candidate.insert(candidate.end(), segment.begin() + start, + segment.begin() + mid); candidate += " "; - candidate += segment.substr(mid, end - mid); + candidate.insert(candidate.end(), segment.begin() + mid, + segment.begin() + end); if (auto it = merges_.find(candidate); it != merges_.end()) { if (it->second < rank) { merge_idx = i;