This commit is contained in:
Awni Hannun 2025-01-09 15:30:16 -08:00
parent 5e8f88d079
commit 26e2d97e27
4 changed files with 11 additions and 13 deletions

View File

@ -10,8 +10,6 @@ std::function<mx::Args(mx::Args)> load_model(const std::string &path);
// Builds a BPETokenizer from `path` and returns it by value.
// NOTE(review): whether `path` names a file or a directory is not visible
// from this declaration — confirm at the definition.
BPETokenizer load_tokenizer(const std::string &path);
// Empty type: no members are declared here.
struct GenerationResponse {};
// Entry point for text generation; returns nothing.
//   model      - callable taking mx::Args and returning mx::Args
//   tokenizer  - BPE tokenizer, passed by const reference
//   prompt     - input text, passed by const reference
//   max_tokens - defaults to 256; presumably the cap on the number of
//                generated tokens — TODO confirm against the definition
void generate(const std::function<mx::Args(mx::Args)> &model,
const BPETokenizer &tokenizer, const std::string &prompt,
int max_tokens = 256);

View File

@ -7,12 +7,10 @@ FetchContent_MakeAvailable(json)
# Add the nlohmann/json single-header directory to mlxlm's include path
# (PRIVATE, and only for the build interface).
target_include_directories(
mlxlm PRIVATE $<BUILD_INTERFACE:${json_SOURCE_DIR}/single_include/nlohmann>)
# NOTE(review): two different ways of running download_unicode.sh are visible
# below (a build-time custom target and a configure-time execute_process).
# This looks like the before/after of one change shown together — confirm
# that only one of them is meant to remain.
#
# Build-time variant: a `unicode` target with a PRE_BUILD command that runs
# the script with /bin/bash; mlxlm is made to depend on that target.
add_custom_target(unicode)
add_custom_command(
TARGET unicode
PRE_BUILD
COMMAND /bin/bash ${CMAKE_CURRENT_SOURCE_DIR}/download_unicode.sh)
add_dependencies(mlxlm unicode)
# Configure-time variant: run the script with zsh while CMake configures,
# passing the current binary directory as the script's first argument.
# COMMAND_ERROR_IS_FATAL ANY aborts configuration if the command fails.
execute_process(
COMMAND zsh "${CMAKE_CURRENT_SOURCE_DIR}/download_unicode.sh" "${CMAKE_CURRENT_BINARY_DIR}"
COMMAND_ERROR_IS_FATAL ANY
)
target_sources(mlxlm
PRIVATE

View File

@ -5,7 +5,5 @@ url=https://raw.githubusercontent.com/ggerganov/llama.cpp/${commit}/src/
# Download each of the four unicode source/header files from ${url}.
for file in 'unicode.cpp' 'unicode.h' 'unicode-data.cpp' 'unicode-data.h'
do
# -O keeps the remote file name, -L follows redirects; this form saves into
# the current working directory.
curl -OL ${url}/${file}
# Same download, but saved into the directory given as the script's first
# argument ($1), with curl's stderr output discarded.
# NOTE(review): both curl lines are present in this view, which looks like the
# before/after of one change — confirm that only one is intended to run.
curl -OL ${url}/${file} --output-dir $1 2>/dev/null
done
# Create (or refresh the timestamp of) a marker file in the current directory.
touch unicode_downloaded

View File

@ -98,13 +98,17 @@ std::vector<int> BPETokenizer::encode(std::string text) const {
auto one_step_merge = [this](std::string segment, std::vector<int> &splits) {
int merge_idx;
int rank = INT32_MAX;
std::string candidate;
for (int i = 0; i < splits.size() - 2; ++i) {
auto start = splits[i];
auto mid = splits[i + 1];
auto end = splits[i + 2];
std::string candidate = segment.substr(start, mid - start);
candidate.clear();
candidate.insert(candidate.end(), segment.begin() + start,
segment.begin() + mid);
candidate += " ";
candidate += segment.substr(mid, end - mid);
candidate.insert(candidate.end(), segment.begin() + mid,
segment.begin() + end);
if (auto it = merges_.find(candidate); it != merges_.end()) {
if (it->second < rank) {
merge_idx = i;