// Copyright (c) 2015 Sandstorm Development Group, Inc. and contributors // Licensed under the MIT License: // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #if !_WIN32 #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif #include "filesystem.h" #include "debug.h" #include <sys/types.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <fcntl.h> #include <unistd.h> #include <stdio.h> #include <sys/mman.h> #include <errno.h> #include <dirent.h> #include <stdlib.h> #include "vector.h" #include "miniposix.h" #include <algorithm> #if __linux__ #include <syscall.h> #include <linux/fs.h> #include <sys/sendfile.h> #endif namespace kj { namespace { #define HIDDEN_PREFIX ".kj-tmp." // Prefix for temp files which should be hidden when listing a directory. // // If you change this, make sure to update the unit test. #ifdef O_CLOEXEC #define MAYBE_O_CLOEXEC O_CLOEXEC #else #define MAYBE_O_CLOEXEC 0 #endif #ifdef O_DIRECTORY #define MAYBE_O_DIRECTORY O_DIRECTORY #else #define MAYBE_O_DIRECTORY 0 #endif #if __APPLE__ // Mac OSX defines SEEK_HOLE, but it doesn't work. ("Inappropriate ioctl for device", it says.) #undef SEEK_HOLE #endif static void setCloexec(int fd) KJ_UNUSED; static void setCloexec(int fd) { // Set the O_CLOEXEC flag on the given fd. // // We try to avoid the need to call this by taking advantage of syscall flags that set it // atomically on new file descriptors. Unfortunately some platforms do not support such syscalls. #ifdef FIOCLEX // Yay, we can set the flag in one call. KJ_SYSCALL_HANDLE_ERRORS(ioctl(fd, FIOCLEX)) { case EINVAL: case EOPNOTSUPP: break; default: KJ_FAIL_SYSCALL("ioctl(fd, FIOCLEX)", error) { break; } break; } else { // success return; } #endif // Sadness, we must resort to read/modify/write. // // (On many platforms, FD_CLOEXEC is the only flag modifiable via F_SETFD and therefore we could // skip the read... but it seems dangerous to assume that's true of all platforms, and anyway // most platforms support FIOCLEX.) int flags; KJ_SYSCALL(flags = fcntl(fd, F_GETFD)); if (!(flags & FD_CLOEXEC)) { KJ_SYSCALL(fcntl(fd, F_SETFD, flags | FD_CLOEXEC)); } } static Date toKjDate(struct timespec tv) { return tv.tv_sec * SECONDS + tv.tv_nsec * NANOSECONDS + UNIX_EPOCH; } static FsNode::Type modeToType(mode_t mode) { switch (mode & S_IFMT) { case S_IFREG : return FsNode::Type::FILE; case S_IFDIR : return FsNode::Type::DIRECTORY; case S_IFLNK : return FsNode::Type::SYMLINK; case S_IFBLK : return FsNode::Type::BLOCK_DEVICE; case S_IFCHR : return FsNode::Type::CHARACTER_DEVICE; case S_IFIFO : return FsNode::Type::NAMED_PIPE; case S_IFSOCK: return FsNode::Type::SOCKET; default: return FsNode::Type::OTHER; } } static FsNode::Metadata statToMetadata(struct stat& stats) { // Probably st_ino and st_dev are usually under 32 bits, so mix by rotating st_dev left 32 bits // and XOR. uint64_t d = stats.st_dev; uint64_t hash = ((d << 32) | (d >> 32)) ^ stats.st_ino; return FsNode::Metadata { modeToType(stats.st_mode), implicitCast<uint64_t>(stats.st_size), implicitCast<uint64_t>(stats.st_blocks * 512u), #if __APPLE__ toKjDate(stats.st_mtimespec), #else toKjDate(stats.st_mtim), #endif implicitCast<uint>(stats.st_nlink), hash }; } static bool rmrf(int fd, StringPtr path); static void rmrfChildrenAndClose(int fd) { // Assumes fd is seeked to beginning. DIR* dir = fdopendir(fd); if (dir == nullptr) { close(fd); KJ_FAIL_SYSCALL("fdopendir", errno); }; KJ_DEFER(closedir(dir)); for (;;) { errno = 0; struct dirent* entry = readdir(dir); if (entry == nullptr) { int error = errno; if (error == 0) { break; } else { KJ_FAIL_SYSCALL("readdir", error); } } if (entry->d_name[0] == '.' && (entry->d_name[1] == '\0' || (entry->d_name[1] == '.' && entry->d_name[2] == '\0'))) { // ignore . and .. } else { #ifdef DT_UNKNOWN // d_type is not available on all platforms. if (entry->d_type == DT_DIR) { int subdirFd; KJ_SYSCALL(subdirFd = openat( fd, entry->d_name, O_RDONLY | MAYBE_O_DIRECTORY | MAYBE_O_CLOEXEC)); rmrfChildrenAndClose(subdirFd); KJ_SYSCALL(unlinkat(fd, entry->d_name, AT_REMOVEDIR)); } else if (entry->d_type != DT_UNKNOWN) { KJ_SYSCALL(unlinkat(fd, entry->d_name, 0)); } else { #endif KJ_ASSERT(rmrf(fd, entry->d_name)); #ifdef DT_UNKNOWN } #endif } } } static bool rmrf(int fd, StringPtr path) { struct stat stats; KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, path.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { case ENOENT: case ENOTDIR: // Doesn't exist. return false; default: KJ_FAIL_SYSCALL("lstat(path)", error, path) { return false; } } if (S_ISDIR(stats.st_mode)) { int subdirFd; KJ_SYSCALL(subdirFd = openat( fd, path.cStr(), O_RDONLY | MAYBE_O_DIRECTORY | MAYBE_O_CLOEXEC)) { return false; } rmrfChildrenAndClose(subdirFd); KJ_SYSCALL(unlinkat(fd, path.cStr(), AT_REMOVEDIR)) { return false; } } else { KJ_SYSCALL(unlinkat(fd, path.cStr(), 0)) { return false; } } return true; } struct MmapRange { uint64_t offset; uint64_t size; }; static MmapRange getMmapRange(uint64_t offset, uint64_t size) { // Rounds the given byte range up to page boundaries. #ifndef _SC_PAGESIZE #define _SC_PAGESIZE _SC_PAGE_SIZE #endif static const uint64_t pageSize = sysconf(_SC_PAGESIZE); uint64_t pageMask = pageSize - 1; uint64_t realOffset = offset & ~pageMask; uint64_t end = offset + size; uint64_t realEnd = (end + pageMask) & ~pageMask; return { realOffset, realEnd - realOffset }; } class MmapDisposer: public ArrayDisposer { protected: void disposeImpl(void* firstElement, size_t elementSize, size_t elementCount, size_t capacity, void (*destroyElement)(void*)) const { auto range = getMmapRange(reinterpret_cast<uintptr_t>(firstElement), elementSize * elementCount); KJ_SYSCALL(munmap(reinterpret_cast<byte*>(range.offset), range.size)) { break; } } }; constexpr MmapDisposer mmapDisposer = MmapDisposer(); class DiskHandle { // We need to implement each of ReadableFile, AppendableFile, File, ReadableDirectory, and // Directory for disk handles. There is a lot of implementation overlap between these, especially // stat(), sync(), etc. We can't have everything inherit from a common DiskFsNode that implements // these because then we get diamond inheritance which means we need to make all our inheritance // virtual which means downcasting requires RTTI which violates our goal of supporting compiling // with no RTTI. So instead we have the DiskHandle class which implements all the methods without // inheriting anything, and then we have DiskFile, DiskDirectory, etc. hold this and delegate to // it. Ugly, but works. public: DiskHandle(AutoCloseFd&& fd): fd(kj::mv(fd)) {} // OsHandle ------------------------------------------------------------------ AutoCloseFd clone() const { int fd2; #ifdef F_DUPFD_CLOEXEC KJ_SYSCALL_HANDLE_ERRORS(fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 3)) { case EINVAL: case EOPNOTSUPP: // fall back break; default: KJ_FAIL_SYSCALL("fnctl(fd, F_DUPFD_CLOEXEC, 3)", error) { break; } break; } else { return AutoCloseFd(fd2); } #endif KJ_SYSCALL(fd2 = ::dup(fd)); AutoCloseFd result(fd2); setCloexec(result); return result; } int getFd() const { return fd.get(); } // FsNode -------------------------------------------------------------------- FsNode::Metadata stat() const { struct stat stats; KJ_SYSCALL(::fstat(fd, &stats)); return statToMetadata(stats); } void sync() const { #if __APPLE__ // For whatever reason, fsync() on OSX only flushes kernel buffers. It does not flush hardware // disk buffers. This makes it not very useful. But OSX documents fcntl F_FULLFSYNC which does // the right thing. Why they don't just make fsync() do the right thing, I do not know. KJ_SYSCALL(fcntl(fd, F_FULLFSYNC)); #else KJ_SYSCALL(fsync(fd)); #endif } void datasync() const { // The presence of the _POSIX_SYNCHRONIZED_IO define is supposed to tell us that fdatasync() // exists. But Apple defines this yet doesn't offer fdatasync(). Thanks, Apple. #if _POSIX_SYNCHRONIZED_IO && !__APPLE__ KJ_SYSCALL(fdatasync(fd)); #else this->sync(); #endif } // ReadableFile -------------------------------------------------------------- size_t read(uint64_t offset, ArrayPtr<byte> buffer) const { // pread() probably never returns short reads unless it hits EOF. Unfortunately, though, per // spec we are not allowed to assume this. size_t total = 0; while (buffer.size() > 0) { ssize_t n; KJ_SYSCALL(n = pread(fd, buffer.begin(), buffer.size(), offset)); if (n == 0) break; total += n; offset += n; buffer = buffer.slice(n, buffer.size()); } return total; } Array<const byte> mmap(uint64_t offset, uint64_t size) const { auto range = getMmapRange(offset, size); const void* mapping = ::mmap(NULL, range.size, PROT_READ, MAP_SHARED, fd, range.offset); if (mapping == MAP_FAILED) { KJ_FAIL_SYSCALL("mmap", errno); } return Array<const byte>(reinterpret_cast<const byte*>(mapping) + (offset - range.offset), size, mmapDisposer); } Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const { auto range = getMmapRange(offset, size); void* mapping = ::mmap(NULL, range.size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, range.offset); if (mapping == MAP_FAILED) { KJ_FAIL_SYSCALL("mmap", errno); } return Array<byte>(reinterpret_cast<byte*>(mapping) + (offset - range.offset), size, mmapDisposer); } // File ---------------------------------------------------------------------- void write(uint64_t offset, ArrayPtr<const byte> data) const { // pwrite() probably never returns short writes unless there's no space left on disk. // Unfortunately, though, per spec we are not allowed to assume this. while (data.size() > 0) { ssize_t n; KJ_SYSCALL(n = pwrite(fd, data.begin(), data.size(), offset)); KJ_ASSERT(n > 0, "pwrite() returned zero?"); offset += n; data = data.slice(n, data.size()); } } void zero(uint64_t offset, uint64_t size) const { #ifdef FALLOC_FL_PUNCH_HOLE KJ_SYSCALL_HANDLE_ERRORS( fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, size)) { case EOPNOTSUPP: // fall back to below break; default: KJ_FAIL_SYSCALL("fallocate(FALLOC_FL_PUNCH_HOLE)", error) { return; } } else { return; } #endif static const byte ZEROS[4096] = { 0 }; #if __APPLE__ || __CYGWIN__ // Mac & Cygwin doesn't have pwritev(). while (size > sizeof(ZEROS)) { write(offset, ZEROS); size -= sizeof(ZEROS); offset += sizeof(ZEROS); } write(offset, kj::arrayPtr(ZEROS, size)); #else // Use a 4k buffer of zeros amplified by iov to write zeros with as few syscalls as possible. size_t count = (size + sizeof(ZEROS) - 1) / sizeof(ZEROS); const size_t iovmax = miniposix::iovMax(count); KJ_STACK_ARRAY(struct iovec, iov, kj::min(iovmax, count), 16, 256); for (auto& item: iov) { item.iov_base = const_cast<byte*>(ZEROS); item.iov_len = sizeof(ZEROS); } while (size > 0) { size_t iovCount; if (size >= iov.size() * sizeof(ZEROS)) { iovCount = iov.size(); } else { iovCount = size / sizeof(ZEROS); size_t rem = size % sizeof(ZEROS); if (rem > 0) { iov[iovCount++].iov_len = rem; } } ssize_t n; KJ_SYSCALL(n = pwritev(fd, iov.begin(), count, offset)); KJ_ASSERT(n > 0, "pwrite() returned zero?"); offset += n; size -= n; } #endif } void truncate(uint64_t size) const { KJ_SYSCALL(ftruncate(fd, size)); } class WritableFileMappingImpl final: public WritableFileMapping { public: WritableFileMappingImpl(Array<byte> bytes): bytes(kj::mv(bytes)) {} ArrayPtr<byte> get() const override { // const_cast OK because WritableFileMapping does indeed provide a writable view despite // being const itself. return arrayPtr(const_cast<byte*>(bytes.begin()), bytes.size()); } void changed(ArrayPtr<byte> slice) const override { KJ_REQUIRE(slice.begin() >= bytes.begin() && slice.end() <= bytes.end(), "byte range is not part of this mapping"); // msync() requires page-alignment, apparently, so use getMmapRange() to accomplish that. auto range = getMmapRange(reinterpret_cast<uintptr_t>(slice.begin()), slice.size()); KJ_SYSCALL(msync(reinterpret_cast<void*>(range.offset), range.size, MS_ASYNC)); } void sync(ArrayPtr<byte> slice) const override { KJ_REQUIRE(slice.begin() >= bytes.begin() && slice.end() <= bytes.end(), "byte range is not part of this mapping"); // msync() requires page-alignment, apparently, so use getMmapRange() to accomplish that. auto range = getMmapRange(reinterpret_cast<uintptr_t>(slice.begin()), slice.size()); KJ_SYSCALL(msync(reinterpret_cast<void*>(range.offset), range.size, MS_SYNC)); } private: Array<byte> bytes; }; Own<const WritableFileMapping> mmapWritable(uint64_t offset, uint64_t size) const { auto range = getMmapRange(offset, size); void* mapping = ::mmap(NULL, range.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, range.offset); if (mapping == MAP_FAILED) { KJ_FAIL_SYSCALL("mmap", errno); } auto array = Array<byte>(reinterpret_cast<byte*>(mapping) + (offset - range.offset), size, mmapDisposer); return heap<WritableFileMappingImpl>(kj::mv(array)); } size_t copyChunk(uint64_t offset, int fromFd, uint64_t fromOffset, uint64_t size) const { // Copies a range of bytes from `fromFd` to this file in the most efficient way possible for // the OS. Only returns less than `size` if EOF. Does not account for holes. #if __linux__ { KJ_SYSCALL(lseek(fd, offset, SEEK_SET)); off_t fromPos = fromOffset; off_t end = fromOffset + size; while (fromPos < end) { ssize_t n; KJ_SYSCALL_HANDLE_ERRORS(n = sendfile(fd, fromFd, &fromPos, end - fromPos)) { case EINVAL: case ENOSYS: goto sendfileNotAvailable; default: KJ_FAIL_SYSCALL("sendfile", error) { return fromPos - fromOffset; } } } return fromPos - fromOffset; } sendfileNotAvailable: #endif uint64_t total = 0; while (size > 0) { byte buffer[4096]; ssize_t n; KJ_SYSCALL(n = pread(fromFd, buffer, kj::min(sizeof(buffer), size), fromOffset)); if (n == 0) break; write(offset, arrayPtr(buffer, n)); fromOffset += n; offset += n; total += n; size -= n; } return total; } kj::Maybe<size_t> copy(uint64_t offset, const ReadableFile& from, uint64_t fromOffset, uint64_t size) const { KJ_IF_MAYBE(otherFd, from.getFd()) { #ifdef FICLONE if (offset == 0 && fromOffset == 0 && size == kj::maxValue && stat().size == 0) { if (ioctl(fd, FICLONE, *otherFd) >= 0) { return stat().size; } } else if (size > 0) { // src_length = 0 has special meaning for the syscall, so avoid. struct file_clone_range range; memset(&range, 0, sizeof(range)); range.src_fd = *otherFd; range.dest_offset = offset; range.src_offset = fromOffset; range.src_length = size == kj::maxValue ? 0 : size; if (ioctl(fd, FICLONERANGE, &range) >= 0) { // TODO(someday): What does FICLONERANGE actually do if the range goes past EOF? The docs // don't say. Maybe it only copies the parts that exist. Maybe it punches holes for the // rest. Where does the destination file's EOF marker end up? Who knows? return kj::min(from.stat().size - fromOffset, size); } } else { // size == 0 return size_t(0); } // ioctl failed. Almost all failures documented for these are of the form "the operation is // not supported for the filesystem(s) specified", so fall back to other approaches. #endif off_t toPos = offset; off_t fromPos = fromOffset; off_t end = size == kj::maxValue ? off_t(kj::maxValue) : off_t(fromOffset + size); for (;;) { // Handle data. { // Find out how much data there is before the next hole. off_t nextHole; #ifdef SEEK_HOLE KJ_SYSCALL_HANDLE_ERRORS(nextHole = lseek(*otherFd, fromPos, SEEK_HOLE)) { case EINVAL: // SEEK_HOLE probably not supported. Assume no holes. nextHole = end; break; case ENXIO: // Past EOF. Stop here. return fromPos - fromOffset; default: KJ_FAIL_SYSCALL("lseek(fd, pos, SEEK_HOLE)", error) { return fromPos - fromOffset; } } #else // SEEK_HOLE not supported. Assume no holes. nextHole = end; #endif // Copy the next chunk of data. off_t copyTo = kj::min(end, nextHole); size_t amount = copyTo - fromPos; if (amount > 0) { size_t n = copyChunk(toPos, *otherFd, fromPos, amount); fromPos += n; toPos += n; if (n < amount) { return fromPos - fromOffset; } } if (fromPos == end) { return fromPos - fromOffset; } } #ifdef SEEK_HOLE // Handle hole. { // Find out how much hole there is before the next data. off_t nextData; KJ_SYSCALL_HANDLE_ERRORS(nextData = lseek(*otherFd, fromPos, SEEK_DATA)) { case EINVAL: // SEEK_DATA probably not supported. But we should only have gotten here if we // were expecting a hole. KJ_FAIL_ASSERT("can't determine hole size; SEEK_DATA not supported"); break; case ENXIO: // No more data. Set to EOF. KJ_SYSCALL(nextData = lseek(*otherFd, 0, SEEK_END)); if (nextData > end) { end = nextData; } break; default: KJ_FAIL_SYSCALL("lseek(fd, pos, SEEK_HOLE)", error) { return fromPos - fromOffset; } } // Write zeros. off_t zeroTo = kj::min(end, nextData); off_t amount = zeroTo - fromPos; if (amount > 0) { zero(toPos, amount); toPos += amount; fromPos = zeroTo; } if (fromPos == end) { return fromPos - fromOffset; } } #endif } } // Indicates caller should call File::copy() default implementation. return nullptr; } // ReadableDirectory --------------------------------------------------------- template <typename Func> auto list(bool needTypes, Func&& func) const -> Array<Decay<decltype(func(instance<StringPtr>(), instance<FsNode::Type>()))>> { // Seek to start of directory. KJ_SYSCALL(lseek(fd, 0, SEEK_SET)); // Unfortunately, fdopendir() takes ownership of the file descriptor. Therefore we need to // make a duplicate. int duped; KJ_SYSCALL(duped = dup(fd)); DIR* dir = fdopendir(duped); if (dir == nullptr) { close(duped); KJ_FAIL_SYSCALL("fdopendir", errno); } KJ_DEFER(closedir(dir)); typedef Decay<decltype(func(instance<StringPtr>(), instance<FsNode::Type>()))> Entry; kj::Vector<Entry> entries; for (;;) { errno = 0; struct dirent* entry = readdir(dir); if (entry == nullptr) { int error = errno; if (error == 0) { break; } else { KJ_FAIL_SYSCALL("readdir", error); } } kj::StringPtr name = entry->d_name; if (name != "." && name != ".." && !name.startsWith(HIDDEN_PREFIX)) { #ifdef DT_UNKNOWN // d_type is not available on all platforms. if (entry->d_type != DT_UNKNOWN) { entries.add(func(name, modeToType(DTTOIF(entry->d_type)))); } else { #endif if (needTypes) { // Unknown type. Fall back to stat. struct stat stats; KJ_SYSCALL(fstatat(fd, name.cStr(), &stats, AT_SYMLINK_NOFOLLOW)); entries.add(func(name, modeToType(stats.st_mode))); } else { entries.add(func(name, FsNode::Type::OTHER)); } #ifdef DT_UNKNOWN } #endif } } auto result = entries.releaseAsArray(); std::sort(result.begin(), result.end()); return result; } Array<String> listNames() const { return list(false, [](StringPtr name, FsNode::Type type) { return heapString(name); }); } Array<ReadableDirectory::Entry> listEntries() const { return list(true, [](StringPtr name, FsNode::Type type) { return ReadableDirectory::Entry { type, heapString(name), }; }); } bool exists(PathPtr path) const { KJ_SYSCALL_HANDLE_ERRORS(faccessat(fd, path.toString().cStr(), F_OK, 0)) { case ENOENT: case ENOTDIR: return false; default: KJ_FAIL_SYSCALL("faccessat(fd, path)", error, path) { return false; } } return true; } Maybe<FsNode::Metadata> tryLstat(PathPtr path) const { struct stat stats; KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, path.toString().cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { case ENOENT: case ENOTDIR: return nullptr; default: KJ_FAIL_SYSCALL("faccessat(fd, path)", error, path) { return nullptr; } } return statToMetadata(stats); } Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const { int newFd; KJ_SYSCALL_HANDLE_ERRORS(newFd = openat( fd, path.toString().cStr(), O_RDONLY | MAYBE_O_CLOEXEC)) { case ENOENT: case ENOTDIR: return nullptr; default: KJ_FAIL_SYSCALL("openat(fd, path, O_RDONLY)", error, path) { return nullptr; } } kj::AutoCloseFd result(newFd); #ifndef O_CLOEXEC setCloexec(result); #endif return newDiskReadableFile(kj::mv(result)); } Maybe<AutoCloseFd> tryOpenSubdirInternal(PathPtr path) const { int newFd; KJ_SYSCALL_HANDLE_ERRORS(newFd = openat( fd, path.toString().cStr(), O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)) { case ENOENT: return nullptr; case ENOTDIR: // Could mean that a parent is not a directory, which we treat as "doesn't exist". // Could also mean that the specified file is not a directory, which should throw. // Check using exists(). if (!exists(path)) { return nullptr; } // fallthrough default: KJ_FAIL_SYSCALL("openat(fd, path, O_DIRECTORY)", error, path) { return nullptr; } } kj::AutoCloseFd result(newFd); #ifndef O_CLOEXEC setCloexec(result); #endif return kj::mv(result); } Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const { return tryOpenSubdirInternal(path).map(newDiskReadableDirectory); } Maybe<String> tryReadlink(PathPtr path) const { size_t trySize = 256; for (;;) { KJ_STACK_ARRAY(char, buf, trySize, 256, 4096); ssize_t n = readlinkat(fd, path.toString().cStr(), buf.begin(), buf.size()); if (n < 0) { int error = errno; switch (error) { case EINTR: continue; case ENOENT: case ENOTDIR: case EINVAL: // not a link return nullptr; default: KJ_FAIL_SYSCALL("readlinkat(fd, path)", error, path) { return nullptr; } } } if (n >= buf.size()) { // Didn't give it enough space. Better retry with a bigger buffer. trySize *= 2; continue; } return heapString(buf.begin(), n); } } // Directory ----------------------------------------------------------------- bool tryMkdir(PathPtr path, WriteMode mode, bool noThrow) const { // Internal function to make a directory. auto filename = path.toString(); mode_t acl = has(mode, WriteMode::PRIVATE) ? 0700 : 0777; KJ_SYSCALL_HANDLE_ERRORS(mkdirat(fd, filename.cStr(), acl)) { case EEXIST: { // Apparently this path exists. if (!has(mode, WriteMode::MODIFY)) { // Require exclusive create. return false; } // MODIFY is allowed, so we just need to check whether the existing entry is a directory. struct stat stats; KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, filename.cStr(), &stats, 0)) { default: // mkdir() says EEXIST but we can't stat it. Maybe it's a dangling link, or maybe // we can't access it for some reason. Assume failure. // // TODO(someday): Maybe we should be creating the directory at the target of the // link? goto failed; } return (stats.st_mode & S_IFMT) == S_IFDIR; } case ENOENT: if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 && tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY | WriteMode::CREATE_PARENT, true)) { // Retry, but make sure we don't try to create the parent again. return tryMkdir(path, mode - WriteMode::CREATE_PARENT, noThrow); } else { goto failed; } default: failed: if (noThrow) { // Caller requested no throwing. return false; } else { KJ_FAIL_SYSCALL("mkdirat(fd, path)", error, path); } } return true; } kj::Maybe<String> createNamedTemporary( PathPtr finalName, WriteMode mode, Function<int(StringPtr)> tryCreate) const { // Create a temporary file which will eventually replace `finalName`. // // Calls `tryCreate` to actually create the temporary, passing in the desired path. tryCreate() // is expected to behave like a syscall, returning a negative value and setting `errno` on // error. tryCreate() MUST fail with EEXIST if the path exists -- this is not checked in // advance, since it needs to be checked atomically. In the case of EEXIST, tryCreate() will // be called again with a new path. // // Returns the temporary path that succeeded. Only returns nullptr if there was an exception // but we're compiled with -fno-exceptions. if (finalName.size() == 0) { KJ_FAIL_REQUIRE("can't replace self") { break; } return nullptr; } static uint counter = 0; static const pid_t pid = getpid(); String pathPrefix; if (finalName.size() > 1) { pathPrefix = kj::str(finalName.parent(), '/'); } auto path = kj::str(pathPrefix, HIDDEN_PREFIX, pid, '.', counter++, '.', finalName.basename()[0], ".partial"); KJ_SYSCALL_HANDLE_ERRORS(tryCreate(path)) { case EEXIST: return createNamedTemporary(finalName, mode, kj::mv(tryCreate)); case ENOENT: if (has(mode, WriteMode::CREATE_PARENT) && finalName.size() > 1 && tryMkdir(finalName.parent(), WriteMode::CREATE | WriteMode::MODIFY | WriteMode::CREATE_PARENT, true)) { // Retry, but make sure we don't try to create the parent again. mode = mode - WriteMode::CREATE_PARENT; return createNamedTemporary(finalName, mode, kj::mv(tryCreate)); } // fallthrough default: KJ_FAIL_SYSCALL("create(path)", error, path) { break; } return nullptr; } return kj::mv(path); } bool tryReplaceNode(PathPtr path, WriteMode mode, Function<int(StringPtr)> tryCreate) const { // Replaces the given path with an object created by calling tryCreate(). // // tryCreate() must behave like a syscall which creates the node at the path passed to it, // returning a negative value on error. If the path passed to tryCreate already exists, it // MUST fail with EEXIST. // // When `mode` includes MODIFY, replaceNode() reacts to EEXIST by creating the node in a // temporary location and then rename()ing it into place. if (path.size() == 0) { KJ_FAIL_REQUIRE("can't replace self") { return false; } } auto filename = path.toString(); if (has(mode, WriteMode::CREATE)) { // First try just cerating the node in-place. KJ_SYSCALL_HANDLE_ERRORS(tryCreate(filename)) { case EEXIST: // Target exists. if (has(mode, WriteMode::MODIFY)) { // Fall back to MODIFY path, below. break; } else { return false; } case ENOENT: if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 && tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY | WriteMode::CREATE_PARENT, true)) { // Retry, but make sure we don't try to create the parent again. return tryReplaceNode(path, mode - WriteMode::CREATE_PARENT, kj::mv(tryCreate)); } default: KJ_FAIL_SYSCALL("create(path)", error, path) { return false; } } else { // Success. return true; } } // Either we don't have CREATE mode or the target already exists. We need to perform a // replacement instead. KJ_IF_MAYBE(tempPath, createNamedTemporary(path, mode, kj::mv(tryCreate))) { if (tryCommitReplacement(filename, fd, *tempPath, mode)) { return true; } else { KJ_SYSCALL_HANDLE_ERRORS(unlinkat(fd, tempPath->cStr(), 0)) { case ENOENT: // meh break; default: KJ_FAIL_SYSCALL("unlinkat(fd, tempPath, 0)", error, *tempPath); } return false; } } else { // threw, but exceptions are disabled return false; } } Maybe<AutoCloseFd> tryOpenFileInternal(PathPtr path, WriteMode mode, bool append) const { uint flags = O_RDWR | MAYBE_O_CLOEXEC; mode_t acl = 0666; if (has(mode, WriteMode::CREATE)) { flags |= O_CREAT; } if (!has(mode, WriteMode::MODIFY)) { if (!has(mode, WriteMode::CREATE)) { // Neither CREATE nor MODIFY -- impossible to satisfy preconditions. return nullptr; } flags |= O_EXCL; } if (append) { flags |= O_APPEND; } if (has(mode, WriteMode::EXECUTABLE)) { acl = 0777; } if (has(mode, WriteMode::PRIVATE)) { acl &= 0700; } auto filename = path.toString(); int newFd; KJ_SYSCALL_HANDLE_ERRORS(newFd = openat(fd, filename.cStr(), flags, acl)) { case ENOENT: if (has(mode, WriteMode::CREATE)) { // Either: // - The file is a broken symlink. // - A parent directory didn't exist. if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 && tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY | WriteMode::CREATE_PARENT, true)) { // Retry, but make sure we don't try to create the parent again. return tryOpenFileInternal(path, mode - WriteMode::CREATE_PARENT, append); } // Check for broken link. if (!has(mode, WriteMode::MODIFY) && faccessat(fd, filename.cStr(), F_OK, AT_SYMLINK_NOFOLLOW) >= 0) { // Yep. We treat this as already-exists, which means in CREATE-only mode this is a // simple failure. return nullptr; } KJ_FAIL_REQUIRE("parent is not a directory", path) { return nullptr; } } else { // MODIFY-only mode. ENOENT = doesn't exist = return null. return nullptr; } case ENOTDIR: if (!has(mode, WriteMode::CREATE)) { // MODIFY-only mode. ENOTDIR = parent not a directory = doesn't exist = return null. return nullptr; } goto failed; case EEXIST: if (!has(mode, WriteMode::MODIFY)) { // CREATE-only mode. EEXIST = already exists = return null. return nullptr; } goto failed; default: failed: KJ_FAIL_SYSCALL("openat(fd, path, O_RDWR | ...)", error, path) { return nullptr; } } kj::AutoCloseFd result(newFd); #ifndef O_CLOEXEC setCloexec(result); #endif return kj::mv(result); } bool tryCommitReplacement(StringPtr toPath, int fromDirFd, StringPtr fromPath, WriteMode mode, int* errorReason = nullptr) const { if (has(mode, WriteMode::CREATE) && has(mode, WriteMode::MODIFY)) { // Always clobber. Try it. KJ_SYSCALL_HANDLE_ERRORS(renameat(fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr())) { case EISDIR: case ENOTDIR: case ENOTEMPTY: case EEXIST: // Failed because target exists and due to the various weird quirks of rename(), it // can't remove it for us. On Linux we can try an exchange instead. On others we have // to move the target out of the way. break; default: if (errorReason == nullptr) { KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) { return false; } } else { *errorReason = error; return false; } } else { return true; } } #if __linux__ && defined(RENAME_EXCHANGE) // Try to use Linux's renameat2() to atomically check preconditions and apply. if (has(mode, WriteMode::MODIFY)) { // Use an exchange to implement modification. // // We reach this branch when performing a MODIFY-only, or when performing a CREATE | MODIFY // in which we determined above that there's a node of a different type blocking the // exchange. KJ_SYSCALL_HANDLE_ERRORS(syscall(SYS_renameat2, fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr(), RENAME_EXCHANGE)) { case ENOSYS: break; // fall back to traditional means case ENOENT: // Presumably because the target path doesn't exist. if (has(mode, WriteMode::CREATE)) { KJ_FAIL_ASSERT("rename(tmp, path) claimed path exists but " "renameat2(fromPath, toPath, EXCAHNGE) said it doest; concurrent modification?", fromPath, toPath) { return false; } } else { // Assume target doesn't exist. return false; } default: if (errorReason == nullptr) { KJ_FAIL_SYSCALL("renameat2(fromPath, toPath, EXCHANGE)", error, fromPath, toPath) { return false; } } else { *errorReason = error; return false; } } else { // Successful swap! Delete swapped-out content. rmrf(fromDirFd, fromPath); return true; } } else if (has(mode, WriteMode::CREATE)) { KJ_SYSCALL_HANDLE_ERRORS(syscall(SYS_renameat2, fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr(), RENAME_NOREPLACE)) { case ENOSYS: break; // fall back to traditional means case EEXIST: return false; default: if (errorReason == nullptr) { KJ_FAIL_SYSCALL("renameat2(fromPath, toPath, NOREPLACE)", error, fromPath, toPath) { return false; } } else { *errorReason = error; return false; } } else { return true; } } #endif // We're unable to do what we wanted atomically. :( if (has(mode, WriteMode::CREATE) && has(mode, WriteMode::MODIFY)) { // We failed to atomically delete the target previously. So now we need to do two calls in // rapid succession to move the old file away then move the new one into place. // Find out what kind of file exists at the target path. struct stat stats; KJ_SYSCALL(fstatat(fd, toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { return false; } // Create a temporary location to move the existing object to. Note that rename() allows a // non-directory to replace a non-directory, and allows a directory to replace an empty // directory. So we have to create the right type. Path toPathParsed = Path::parse(toPath); String away; KJ_IF_MAYBE(awayPath, createNamedTemporary(toPathParsed, WriteMode::CREATE, [&](StringPtr candidatePath) { if (S_ISDIR(stats.st_mode)) { return mkdirat(fd, candidatePath.cStr(), 0700); } else { #if __APPLE__ // No mknodat() on OSX, gotta open() a file, ugh. int newFd = openat(fd, candidatePath.cStr(), O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, 0700); if (newFd >= 0) close(newFd); return newFd; #else return mknodat(fd, candidatePath.cStr(), S_IFREG | 0600, dev_t()); #endif } })) { away = kj::mv(*awayPath); } else { // Already threw. return false; } // OK, now move the target object to replace the thing we just created. KJ_SYSCALL(renameat(fd, toPath.cStr(), fd, away.cStr())) { // Something went wrong. Remove the thing we just created. unlinkat(fd, away.cStr(), S_ISDIR(stats.st_mode) ? AT_REMOVEDIR : 0); return false; } // Now move the source object to the target location. KJ_SYSCALL_HANDLE_ERRORS(renameat(fromDirFd, fromPath.cStr(), fd, toPath.cStr())) { default: // Try to put things back where they were. If this fails, though, then we have little // choice but to leave things broken. KJ_SYSCALL_HANDLE_ERRORS(renameat(fd, away.cStr(), fd, toPath.cStr())) { default: break; } if (errorReason == nullptr) { KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) { return false; } } else { *errorReason = error; return false; } } // OK, success. Delete the old content. rmrf(fd, away); return true; } else { // Only one of CREATE or MODIFY is specified, so we need to verify non-atomically that the // corresponding precondition (must-not-exist or must-exist, respectively) is held. if (has(mode, WriteMode::CREATE)) { struct stat stats; KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd.get(), toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { case ENOENT: case ENOTDIR: break; // doesn't exist; continue default: KJ_FAIL_SYSCALL("fstatat(fd, toPath)", error, toPath) { return false; } } else { return false; // already exists; fail } } else if (has(mode, WriteMode::MODIFY)) { struct stat stats; KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd.get(), toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { case ENOENT: case ENOTDIR: return false; // doesn't exist; fail default: KJ_FAIL_SYSCALL("fstatat(fd, toPath)", error, toPath) { return false; } } else { // already exists; continue } } else { // Neither CREATE nor MODIFY. return false; } // Start over in create-and-modify mode. return tryCommitReplacement(toPath, fromDirFd, fromPath, WriteMode::CREATE | WriteMode::MODIFY, errorReason); } } template <typename T> class ReplacerImpl final: public Directory::Replacer<T> { public: ReplacerImpl(Own<const T>&& object, const DiskHandle& handle, String&& tempPath, String&& path, WriteMode mode) : Directory::Replacer<T>(mode), object(kj::mv(object)), handle(handle), tempPath(kj::mv(tempPath)), path(kj::mv(path)) {} ~ReplacerImpl() noexcept(false) { if (!committed) { rmrf(handle.fd, tempPath); } } const T& get() override { return *object; } bool tryCommit() override { KJ_ASSERT(!committed, "already committed") { return false; } return committed = handle.tryCommitReplacement(path, handle.fd, tempPath, Directory::Replacer<T>::mode); } private: Own<const T> object; const DiskHandle& handle; String tempPath; String path; bool committed = false; // true if *successfully* committed (in which case tempPath is gone) }; template <typename T> class BrokenReplacer final: public Directory::Replacer<T> { // For recovery path when exceptions are disabled. public: BrokenReplacer(Own<const T> inner) : Directory::Replacer<T>(WriteMode::CREATE | WriteMode::MODIFY), inner(kj::mv(inner)) {} const T& get() override { return *inner; } bool tryCommit() override { return false; } private: Own<const T> inner; }; Maybe<Own<const File>> tryOpenFile(PathPtr path, WriteMode mode) const { return tryOpenFileInternal(path, mode, false).map(newDiskFile); } Own<Directory::Replacer<File>> replaceFile(PathPtr path, WriteMode mode) const { mode_t acl = 0666; if (has(mode, WriteMode::EXECUTABLE)) { acl = 0777; } if (has(mode, WriteMode::PRIVATE)) { acl &= 0700; } int newFd_; KJ_IF_MAYBE(temp, createNamedTemporary(path, mode, [&](StringPtr candidatePath) { return newFd_ = openat(fd, candidatePath.cStr(), O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, acl); })) { AutoCloseFd newFd(newFd_); #ifndef O_CLOEXEC setCloexec(newFd); #endif return heap<ReplacerImpl<File>>(newDiskFile(kj::mv(newFd)), *this, kj::mv(*temp), path.toString(), mode); } else { // threw, but exceptions are disabled return heap<BrokenReplacer<File>>(newInMemoryFile(nullClock())); } } Own<const File> createTemporary() const { int newFd_; #if __linux__ && defined(O_TMPFILE) // Use syscall() to work around glibc bug with O_TMPFILE: // https://sourceware.org/bugzilla/show_bug.cgi?id=17523 KJ_SYSCALL_HANDLE_ERRORS(newFd_ = syscall( SYS_openat, fd.get(), ".", O_RDWR | O_TMPFILE, 0700)) { case EOPNOTSUPP: case EINVAL: // Maybe not supported by this kernel / filesystem. Fall back to below. break; default: KJ_FAIL_SYSCALL("open(O_TMPFILE)", error) { break; } break; } else { AutoCloseFd newFd(newFd_); #ifndef O_CLOEXEC setCloexec(newFd); #endif return newDiskFile(kj::mv(newFd)); } #endif KJ_IF_MAYBE(temp, createNamedTemporary(Path("unnamed"), WriteMode::CREATE, [&](StringPtr path) { return newFd_ = openat(fd, path.cStr(), O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, 0600); })) { AutoCloseFd newFd(newFd_); #ifndef O_CLOEXEC setCloexec(newFd); #endif auto result = newDiskFile(kj::mv(newFd)); KJ_SYSCALL(unlinkat(fd, temp->cStr(), 0)) { break; } return kj::mv(result); } else { // threw, but exceptions are disabled return newInMemoryFile(nullClock()); } } Maybe<Own<AppendableFile>> tryAppendFile(PathPtr path, WriteMode mode) const { return tryOpenFileInternal(path, mode, true).map(newDiskAppendableFile); } Maybe<Own<const Directory>> tryOpenSubdir(PathPtr path, WriteMode mode) const { // Must create before open. if (has(mode, WriteMode::CREATE)) { if (!tryMkdir(path, mode, false)) return nullptr; } return tryOpenSubdirInternal(path).map(newDiskDirectory); } Own<Directory::Replacer<Directory>> replaceSubdir(PathPtr path, WriteMode mode) const { mode_t acl = has(mode, WriteMode::PRIVATE) ? 0700 : 0777; KJ_IF_MAYBE(temp, createNamedTemporary(path, mode, [&](StringPtr candidatePath) { return mkdirat(fd, candidatePath.cStr(), acl); })) { int subdirFd_; KJ_SYSCALL_HANDLE_ERRORS(subdirFd_ = openat( fd, temp->cStr(), O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)) { default: KJ_FAIL_SYSCALL("open(just-created-temporary)", error); return heap<BrokenReplacer<Directory>>(newInMemoryDirectory(nullClock())); } AutoCloseFd subdirFd(subdirFd_); #ifndef O_CLOEXEC setCloexec(subdirFd); #endif return heap<ReplacerImpl<Directory>>( newDiskDirectory(kj::mv(subdirFd)), *this, kj::mv(*temp), path.toString(), mode); } else { // threw, but exceptions are disabled return heap<BrokenReplacer<Directory>>(newInMemoryDirectory(nullClock())); } } bool trySymlink(PathPtr linkpath, StringPtr content, WriteMode mode) const { return tryReplaceNode(linkpath, mode, [&](StringPtr candidatePath) { return symlinkat(content.cStr(), fd, candidatePath.cStr()); }); } bool tryTransfer(PathPtr toPath, WriteMode toMode, const Directory& fromDirectory, PathPtr fromPath, TransferMode mode, const Directory& self) const { KJ_REQUIRE(toPath.size() > 0, "can't replace self") { return false; } if (mode == TransferMode::LINK) { KJ_IF_MAYBE(fromFd, fromDirectory.getFd()) { // Other is a disk directory, so we can hopefully do an efficient move/link. return tryReplaceNode(toPath, toMode, [&](StringPtr candidatePath) { return linkat(*fromFd, fromPath.toString().cStr(), fd, candidatePath.cStr(), 0); }); }; } else if (mode == TransferMode::MOVE) { KJ_IF_MAYBE(fromFd, fromDirectory.getFd()) { KJ_ASSERT(mode == TransferMode::MOVE); int error = 0; if (tryCommitReplacement(toPath.toString(), *fromFd, fromPath.toString(), toMode, &error)) { return true; } else switch (error) { case 0: // Plain old WriteMode precondition failure. return false; case EXDEV: // Can't move between devices. Fall back to default implementation, which does // copy/delete. break; case ENOENT: // Either the destination directory doesn't exist or the source path doesn't exist. // Unfortunately we don't really know. If CREATE_PARENT was provided, try creating // the parent directory. Otherwise, we don't actually need to distinguish between // these two errors; just return false. if (has(toMode, WriteMode::CREATE) && has(toMode, WriteMode::CREATE_PARENT) && toPath.size() > 0 && tryMkdir(toPath.parent(), WriteMode::CREATE | WriteMode::MODIFY | WriteMode::CREATE_PARENT, true)) { // Retry, but make sure we don't try to create the parent again. return tryTransfer(toPath, toMode - WriteMode::CREATE_PARENT, fromDirectory, fromPath, mode, self); } return false; default: KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) { return false; } } } } // OK, we can't do anything efficient using the OS. Fall back to default implementation. return self.Directory::tryTransfer(toPath, toMode, fromDirectory, fromPath, mode); } bool tryRemove(PathPtr path) const { return rmrf(fd, path.toString()); } protected: AutoCloseFd fd; }; #define FSNODE_METHODS(classname) \ Maybe<int> getFd() const override { return DiskHandle::getFd(); } \ \ Own<const FsNode> cloneFsNode() const override { \ return heap<classname>(DiskHandle::clone()); \ } \ \ Metadata stat() const override { return DiskHandle::stat(); } \ void sync() const override { DiskHandle::sync(); } \ void datasync() const override { DiskHandle::datasync(); } class DiskReadableFile final: public ReadableFile, public DiskHandle { public: DiskReadableFile(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {} FSNODE_METHODS(DiskReadableFile); size_t read(uint64_t offset, ArrayPtr<byte> buffer) const override { return DiskHandle::read(offset, buffer); } Array<const byte> mmap(uint64_t offset, uint64_t size) const override { return DiskHandle::mmap(offset, size); } Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const override { return DiskHandle::mmapPrivate(offset, size); } }; class DiskAppendableFile final: public AppendableFile, public DiskHandle, public FdOutputStream { public: DiskAppendableFile(AutoCloseFd&& fd) : DiskHandle(kj::mv(fd)), FdOutputStream(DiskHandle::fd.get()) {} FSNODE_METHODS(DiskAppendableFile); void write(const void* buffer, size_t size) override { FdOutputStream::write(buffer, size); } void write(ArrayPtr<const ArrayPtr<const byte>> pieces) override { FdOutputStream::write(pieces); } }; class DiskFile final: public File, public DiskHandle { public: DiskFile(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {} FSNODE_METHODS(DiskFile); size_t read(uint64_t offset, ArrayPtr<byte> buffer) const override { return DiskHandle::read(offset, buffer); } Array<const byte> mmap(uint64_t offset, uint64_t size) const override { return DiskHandle::mmap(offset, size); } Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const override { return DiskHandle::mmapPrivate(offset, size); } void write(uint64_t offset, ArrayPtr<const byte> data) const override { DiskHandle::write(offset, data); } void zero(uint64_t offset, uint64_t size) const override { DiskHandle::zero(offset, size); } void truncate(uint64_t size) const override { DiskHandle::truncate(size); } Own<const WritableFileMapping> mmapWritable(uint64_t offset, uint64_t size) const override { return DiskHandle::mmapWritable(offset, size); } size_t copy(uint64_t offset, const ReadableFile& from, uint64_t fromOffset, uint64_t size) const override { KJ_IF_MAYBE(result, DiskHandle::copy(offset, from, fromOffset, size)) { return *result; } else { return File::copy(offset, from, fromOffset, size); } } }; class DiskReadableDirectory final: public ReadableDirectory, public DiskHandle { public: DiskReadableDirectory(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {} FSNODE_METHODS(DiskReadableDirectory); Array<String> listNames() const override { return DiskHandle::listNames(); } Array<Entry> listEntries() const override { return DiskHandle::listEntries(); } bool exists(PathPtr path) const override { return DiskHandle::exists(path); } Maybe<FsNode::Metadata> tryLstat(PathPtr path) const override { return DiskHandle::tryLstat(path); } Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const override { return DiskHandle::tryOpenFile(path); } Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const override { return DiskHandle::tryOpenSubdir(path); } Maybe<String> tryReadlink(PathPtr path) const override { return DiskHandle::tryReadlink(path); } }; class DiskDirectory final: public Directory, public DiskHandle { public: DiskDirectory(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {} FSNODE_METHODS(DiskDirectory); Array<String> listNames() const override { return DiskHandle::listNames(); } Array<Entry> listEntries() const override { return DiskHandle::listEntries(); } bool exists(PathPtr path) const override { return DiskHandle::exists(path); } Maybe<FsNode::Metadata> tryLstat(PathPtr path) const override { return DiskHandle::tryLstat(path); } Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const override { return DiskHandle::tryOpenFile(path); } Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const override { return DiskHandle::tryOpenSubdir(path); } Maybe<String> tryReadlink(PathPtr path) const override { return DiskHandle::tryReadlink(path); } Maybe<Own<const File>> tryOpenFile(PathPtr path, WriteMode mode) const override { return DiskHandle::tryOpenFile(path, mode); } Own<Replacer<File>> replaceFile(PathPtr path, WriteMode mode) const override { return DiskHandle::replaceFile(path, mode); } Own<const File> createTemporary() const override { return DiskHandle::createTemporary(); } Maybe<Own<AppendableFile>> tryAppendFile(PathPtr path, WriteMode mode) const override { return DiskHandle::tryAppendFile(path, mode); } Maybe<Own<const Directory>> tryOpenSubdir(PathPtr path, WriteMode mode) const override { return DiskHandle::tryOpenSubdir(path, mode); } Own<Replacer<Directory>> replaceSubdir(PathPtr path, WriteMode mode) const override { return DiskHandle::replaceSubdir(path, mode); } bool trySymlink(PathPtr linkpath, StringPtr content, WriteMode mode) const override { return DiskHandle::trySymlink(linkpath, content, mode); } bool tryTransfer(PathPtr toPath, WriteMode toMode, const Directory& fromDirectory, PathPtr fromPath, TransferMode mode) const override { return DiskHandle::tryTransfer(toPath, toMode, fromDirectory, fromPath, mode, *this); } // tryTransferTo() not implemented because we have nothing special we can do. bool tryRemove(PathPtr path) const override { return DiskHandle::tryRemove(path); } }; class DiskFilesystem final: public Filesystem { public: DiskFilesystem() : root(openDir("/")), current(openDir(".")), currentPath(computeCurrentPath()) {} const Directory& getRoot() const override { return root; } const Directory& getCurrent() const override { return current; } PathPtr getCurrentPath() const override { return currentPath; } private: DiskDirectory root; DiskDirectory current; Path currentPath; static AutoCloseFd openDir(const char* dir) { int newFd; KJ_SYSCALL(newFd = open(dir, O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)); AutoCloseFd result(newFd); #ifndef O_CLOEXEC setCloexec(result); #endif return result; } static Path computeCurrentPath() { // If env var PWD is set and points to the current directory, use it. This captures the current // path according to the user's shell, which may differ from the kernel's idea in the presence // of symlinks. const char* pwd = getenv("PWD"); if (pwd != nullptr) { Path result = nullptr; struct stat pwdStat, dotStat; KJ_IF_MAYBE(e, kj::runCatchingExceptions([&]() { KJ_ASSERT(pwd[0] == '/') { return; } result = Path::parse(pwd + 1); KJ_SYSCALL(lstat(result.toString(true).cStr(), &pwdStat), result) { return; } KJ_SYSCALL(lstat(".", &dotStat)) { return; } })) { // failed, give up on PWD KJ_LOG(WARNING, "PWD environment variable seems invalid", pwd, *e); } else { if (pwdStat.st_ino == dotStat.st_ino && pwdStat.st_dev == dotStat.st_dev) { return kj::mv(result); } else { KJ_LOG(WARNING, "PWD environment variable doesn't match current directory", pwd); } } } size_t size = 256; retry: KJ_STACK_ARRAY(char, buf, size, 256, 4096); if (getcwd(buf.begin(), size) == nullptr) { int error = errno; if (error == ENAMETOOLONG) { size *= 2; goto retry; } else { KJ_FAIL_SYSCALL("getcwd()", error); } } StringPtr path = buf.begin(); // On Linux, the path will start with "(unreachable)" if the working directory is not a subdir // of the root directory, which is possible via chroot() or mount namespaces. KJ_ASSERT(!path.startsWith("(unreachable)"), "working directory is not reachable from root", path); KJ_ASSERT(path.startsWith("/"), "current directory is not absolute", path); return Path::parse(path.slice(1)); } }; } // namespace Own<ReadableFile> newDiskReadableFile(kj::AutoCloseFd fd) { return heap<DiskReadableFile>(kj::mv(fd)); } Own<AppendableFile> newDiskAppendableFile(kj::AutoCloseFd fd) { return heap<DiskAppendableFile>(kj::mv(fd)); } Own<File> newDiskFile(kj::AutoCloseFd fd) { return heap<DiskFile>(kj::mv(fd)); } Own<ReadableDirectory> newDiskReadableDirectory(kj::AutoCloseFd fd) { return heap<DiskReadableDirectory>(kj::mv(fd)); } Own<Directory> newDiskDirectory(kj::AutoCloseFd fd) { return heap<DiskDirectory>(kj::mv(fd)); } Own<Filesystem> newDiskFilesystem() { return heap<DiskFilesystem>(); } } // namespace kj #endif // !_WIN32