Even Faster I/O (#1369)

* try multithreading for faster IO

* smaller batch size

* Account for pread returning less than size

* nit

---------

Co-authored-by: Angelos Katharopoulos <a_katharopoulos@apple.com>
This commit is contained in:
Awni Hannun 2024-08-28 11:49:07 -07:00 committed by GitHub
parent 4e22a1dffe
commit fcb65a3897
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 157 additions and 27 deletions

View File

@ -33,8 +33,7 @@ void Load::eval(const std::vector<array>& inputs, array& out) {
assert(inputs.size() == 0); assert(inputs.size() == 0);
out.set_data(allocator::malloc_or_wait(out.nbytes())); out.set_data(allocator::malloc_or_wait(out.nbytes()));
reader_->seek(offset_); reader_->read(out.data<char>(), out.nbytes(), offset_);
reader_->read(out.data<char>(), out.nbytes());
if (swap_endianness_) { if (swap_endianness_) {
switch (out.itemsize()) { switch (out.itemsize()) {

View File

@ -298,7 +298,51 @@ array load(std::shared_ptr<io::Reader> in_stream, StreamOrDevice s) {
/** Load array from file in .npy format */ /** Load array from file in .npy format */
array load(std::string file, StreamOrDevice s) { array load(std::string file, StreamOrDevice s) {
return load(std::make_shared<io::FileReader>(std::move(file)), s); return load(std::make_shared<io::ParallelFileReader>(std::move(file), 4), s);
} }
namespace io {
// Sequential read: fill `data` with the next `n` bytes taken from the file
// descriptor's current position. Each ::read request is capped at INT32_MAX
// bytes and the loop keeps going until everything has arrived.
//
// Throws std::runtime_error when ::read returns zero or a negative value
// before all bytes were delivered; the message reports how many bytes were
// still outstanding at that point.
void ParallelFileReader::read(char* data, size_t n) {
  const size_t max_request = static_cast<size_t>(INT32_MAX);
  char* cursor = data;
  for (size_t remaining = n; remaining != 0;) {
    const auto got = ::read(fd_, cursor, std::min(remaining, max_request));
    if (got <= 0) {
      std::ostringstream msg;
      msg << "[read] Unable to read " << remaining << " bytes from file.";
      throw std::runtime_error(msg.str());
    }
    cursor += got;
    remaining -= got;
  }
}
/**
 * Read exactly `n` bytes starting at absolute file position `offset` into
 * `data`.
 *
 * The range is cut into batches of at most `batch_size_` bytes; each batch is
 * submitted to `thread_pool_` and read with pread, which takes an explicit
 * offset for every call.
 *
 * The function returns (or throws) only after every submitted batch has
 * finished, so no worker is still writing into `data` once control is back
 * with the caller.
 *
 * Throws std::runtime_error if any batch could not be read completely.
 */
void ParallelFileReader::read(char* data, size_t n, size_t offset) {
  auto readfn = [fd = fd_](size_t offset, size_t size, char* buffer) -> bool {
    while (size != 0) {
      auto m = pread(fd, buffer, size, offset);
      if (m <= 0) {
        return false;
      }
      buffer += m;
      size -= m;
      // pread always reads from the offset it is given, so after a short
      // read the next request must start where this one stopped. Without
      // this the same file range would be read again into the next part of
      // the buffer.
      offset += m;
    }
    return true;
  };

  std::vector<std::future<bool>> futs;
  // Reserve up front so emplace_back below never reallocates (and therefore
  // cannot fail) after a task has already been queued.
  futs.reserve(n / batch_size_ + 1);

  try {
    while (n != 0) {
      size_t m = std::min(batch_size_, n);
      futs.emplace_back(thread_pool_.enqueue(readfn, offset, m, data));
      data += m;
      n -= m;
      offset += m;
    }
  } catch (...) {
    // Submission failed part-way: let the batches that were already queued
    // finish before the exception reaches the owner of `data`.
    for (auto& f : futs) {
      f.wait();
    }
    throw;
  }

  // Collect every result before reporting a failure. Throwing on the first
  // failed batch would leave later batches running and writing into a buffer
  // the caller may release while handling the exception.
  bool ok = true;
  for (auto& f : futs) {
    ok = f.get() && ok;
  }
  if (!ok) {
    throw std::runtime_error("[read] Unable to read from file.");
  }
}
} // namespace io
} // namespace mlx::core } // namespace mlx::core

View File

@ -8,6 +8,8 @@
#include <memory> #include <memory>
#include <sstream> #include <sstream>
#include "mlx/io/threadpool.h"
namespace mlx::core { namespace mlx::core {
namespace io { namespace io {
@ -21,6 +23,7 @@ class Reader {
int64_t off, int64_t off,
std::ios_base::seekdir way = std::ios_base::beg) = 0; std::ios_base::seekdir way = std::ios_base::beg) = 0;
virtual void read(char* data, size_t n) = 0; virtual void read(char* data, size_t n) = 0;
virtual void read(char* data, size_t n, size_t offset) = 0;
virtual std::string label() const = 0; virtual std::string label() const = 0;
virtual ~Reader() = default; virtual ~Reader() = default;
}; };
@ -38,12 +41,14 @@ class Writer {
virtual ~Writer() = default; virtual ~Writer() = default;
}; };
class FileReader : public Reader { class ParallelFileReader : public Reader {
public: public:
explicit FileReader(std::string file_path) explicit ParallelFileReader(std::string file_path, int num_threads)
: fd_(open(file_path.c_str(), O_RDONLY)), label_(std::move(file_path)) {} : fd_(open(file_path.c_str(), O_RDONLY)),
label_(std::move(file_path)),
thread_pool_(ThreadPool(num_threads)) {}
~FileReader() override { ~ParallelFileReader() override {
close(fd_); close(fd_);
} }
@ -59,35 +64,26 @@ class FileReader : public Reader {
return lseek(fd_, 0, SEEK_CUR); return lseek(fd_, 0, SEEK_CUR);
} }
void seek(int64_t off, std::ios_base::seekdir way = std::ios_base::beg) void seek(int64_t, std::ios_base::seekdir = std::ios_base::beg) override {
override { throw std::runtime_error("[ParallelFileReader::seek] Not allowed");
if (way == std::ios_base::beg) {
lseek(fd_, off, 0);
} else {
lseek(fd_, off, SEEK_CUR);
}
} }
void read(char* data, size_t n) override { // Warning: do not use this function from multiple threads as
while (n != 0) { // it advances the file descriptor
auto m = ::read(fd_, data, std::min(n, static_cast<size_t>(INT32_MAX))); void read(char* data, size_t n) override;
if (m <= 0) {
std::ostringstream msg; void read(char* data, size_t n, size_t offset) override;
msg << "[read] Unable to read " << n << " bytes from file.";
throw std::runtime_error(msg.str());
}
data += m;
n -= m;
}
}
std::string label() const override { std::string label() const override {
return "file " + label_; return "file " + label_;
} }
private: private:
// 4MB
static constexpr size_t batch_size_ = (1 << 22);
int fd_; int fd_;
std::string label_; std::string label_;
ThreadPool thread_pool_;
}; };
class FileWriter : public Writer { class FileWriter : public Writer {

View File

@ -147,7 +147,7 @@ SafetensorsLoad load_safetensors(
} }
SafetensorsLoad load_safetensors(const std::string& file, StreamOrDevice s) { SafetensorsLoad load_safetensors(const std::string& file, StreamOrDevice s) {
return load_safetensors(std::make_shared<io::FileReader>(file), s); return load_safetensors(std::make_shared<io::ParallelFileReader>(file, 4), s);
} }
void save_safetensors( void save_safetensors(

86
mlx/io/threadpool.h Normal file
View File

@ -0,0 +1,86 @@
#pragma once
#include <condition_variable>
#include <cstddef>
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include <queue>
#include <stdexcept>
#include <thread>
#include <type_traits>
#include <utility>
#include <vector>
/**
 * A fixed-size pool of worker threads that run queued tasks in FIFO order.
 *
 * Tasks are submitted with enqueue(), which hands back a std::future for the
 * task's result. An exception thrown by a task is stored in that future and
 * does not stop the worker that ran it.
 *
 * The destructor lets the workers finish every task that is still queued and
 * then joins them.
 *
 * Note: a pool constructed with zero threads has no worker to pick tasks up,
 * so futures obtained from it never become ready.
 */
class ThreadPool {
 public:
  /** Start `threads` workers. If a thread cannot be started, the workers
   * already running are stopped and joined before the exception propagates. */
  ThreadPool(size_t);

  // The workers hold a pointer to this object, so the pool cannot be copied.
  // (The std::mutex member already made it non-copyable; this states it.)
  ThreadPool(const ThreadPool&) = delete;
  ThreadPool& operator=(const ThreadPool&) = delete;

  /**
   * Queue `f(args...)` for execution on a worker thread.
   *
   * The callable and its arguments are bound by std::bind, i.e. stored by
   * value unless wrapped in std::ref.
   *
   * Returns a future for the call's result.
   * Throws std::runtime_error if the pool has already been stopped.
   */
  template <class F, class... Args>
  auto enqueue(F&& f, Args&&... args)
      -> std::future<std::invoke_result_t<F, Args...>>;

  /** Stop accepting work, drain the queue and join all workers. */
  ~ThreadPool();

 private:
  // Body of every worker thread: pop and run tasks until the pool is stopped
  // and the queue is empty.
  void worker_loop();
  // Raise the stop flag, wake every worker and join the ones that started.
  void shutdown();

  // need to keep track of threads so we can join them
  std::vector<std::thread> workers;
  // the task queue
  std::queue<std::function<void()>> tasks;

  // synchronization
  std::mutex queue_mutex;
  std::condition_variable condition;
  bool stop;
};

inline void ThreadPool::worker_loop() {
  for (;;) {
    std::function<void()> task;
    {
      std::unique_lock<std::mutex> lock(queue_mutex);
      condition.wait(lock, [this] { return stop || !tasks.empty(); });
      // Only exit once the queue is empty so that pending work is not lost
      // when the pool is destroyed.
      if (stop && tasks.empty()) {
        return;
      }
      task = std::move(tasks.front());
      tasks.pop();
    }
    // Run outside the lock so other workers can dequeue concurrently.
    task();
  }
}

inline void ThreadPool::shutdown() {
  {
    std::unique_lock<std::mutex> lock(queue_mutex);
    stop = true;
  }
  condition.notify_all();
  for (std::thread& worker : workers) {
    if (worker.joinable()) {
      worker.join();
    }
  }
}

inline ThreadPool::ThreadPool(size_t threads) : stop(false) {
  try {
    workers.reserve(threads);
    for (size_t i = 0; i < threads; ++i) {
      workers.emplace_back([this] { worker_loop(); });
    }
  } catch (...) {
    // The destructor does not run for a partially constructed object, and
    // destroying a joinable std::thread calls std::terminate. Stop and join
    // the workers that did start before passing the error on.
    shutdown();
    throw;
  }
}

template <class F, class... Args>
auto ThreadPool::enqueue(F&& f, Args&&... args)
    -> std::future<std::invoke_result_t<F, Args...>> {
  // std::invoke_result_t replaces std::result_of_t, which is deprecated in
  // C++17 and removed in C++20.
  using return_type = std::invoke_result_t<F, Args...>;

  // packaged_task is move-only while std::function requires a copyable
  // callable, hence the shared_ptr indirection.
  auto task = std::make_shared<std::packaged_task<return_type()>>(
      std::bind(std::forward<F>(f), std::forward<Args>(args)...));

  std::future<return_type> res = task->get_future();
  {
    std::unique_lock<std::mutex> lock(queue_mutex);

    // don't allow enqueueing after stopping the pool
    if (stop) {
      throw std::runtime_error(
          "[ThreadPool::enqueue] Not allowed on stopped ThreadPool");
    }

    tasks.emplace([task]() { (*task)(); });
  }
  condition.notify_one();
  return res;
}

inline ThreadPool::~ThreadPool() {
  shutdown();
}

View File

@ -146,6 +146,11 @@ class PyFileReader : public io::Reader {
} }
} }
void read(char* data, size_t n, size_t offset) override {
seek(offset);
read(data, n);
}
std::string label() const override { std::string label() const override {
return "python file object"; return "python file object";
} }