Commit 3f4beb3b authored by Kenton Varda's avatar Kenton Varda

Add hashCode to FsNode::Metadata.

This allows detecting when two objects point to the same underlying file.
parent 447087af
......@@ -120,6 +120,11 @@ static FsNode::Type modeToType(mode_t mode) {
}
static FsNode::Metadata statToMetadata(struct stat& stats) {
// Probably st_ino and st_dev are usually under 32 bits, so mix by rotating st_dev left 32 bits
// and XOR.
uint64_t d = stats.st_dev;
uint64_t hash = ((d << 32) | (d >> 32)) ^ stats.st_ino;
return FsNode::Metadata {
modeToType(stats.st_mode),
implicitCast<uint64_t>(stats.st_size),
......@@ -129,7 +134,8 @@ static FsNode::Metadata statToMetadata(struct stat& stats) {
#else
toKjDate(stats.st_mtim),
#endif
implicitCast<uint>(stats.st_nlink)
implicitCast<uint>(stats.st_nlink),
hash
};
}
......
......@@ -104,6 +104,12 @@ static FsNode::Type modeToType(DWORD attrs, DWORD reparseTag) {
static FsNode::Metadata statToMetadata(const BY_HANDLE_FILE_INFORMATION& stats) {
uint64_t size = (implicitCast<uint64_t>(stats.nFileSizeHigh) << 32) | stats.nFileSizeLow;
// Assume file index is usually a small number, i.e. nFileIndexHigh is usually 0. So we try to
// put the serial number in the upper 32 bits and the index in the lower.
uint64_t hash = ((uint64_t(stats.dwVolumeSerialNumber) << 32)
^ (uint64_t(stats.nFileIndexHigh) << 32))
| (uint64_t(stats.nFileIndexLow));
return FsNode::Metadata {
modeToType(stats.dwFileAttributes, 0),
size,
......@@ -111,7 +117,8 @@ static FsNode::Metadata statToMetadata(const BY_HANDLE_FILE_INFORMATION& stats)
// syscall for something rarely used would be sad.
size,
toKjDate(stats.ftLastWriteTime),
stats.nNumberOfLinks
stats.nNumberOfLinks,
hash
};
}
......@@ -126,7 +133,9 @@ static FsNode::Metadata statToMetadata(const WIN32_FIND_DATAW& stats) {
size,
toKjDate(stats.ftLastWriteTime),
// We can't get the number of links without opening the file, apparently. Meh.
1
1,
// We can't produce a reliable hashCode without opening the file.
0
};
}
......@@ -1483,7 +1492,7 @@ public:
}
Metadata stat() override {
return { Type::DIRECTORY, 0, 0, UNIX_EPOCH, 1 };
return { Type::DIRECTORY, 0, 0, UNIX_EPOCH, 1, 0 };
}
void sync() override {}
void datasync() override {}
......
......@@ -779,7 +779,8 @@ public:
}
Metadata stat() override {
return Metadata { Type::FILE, size, size, lastModified, 1 };
uint64_t hash = reinterpret_cast<uintptr_t>(this);
return Metadata { Type::FILE, size, size, lastModified, 1, hash };
}
void sync() override {}
......@@ -967,7 +968,8 @@ public:
}
Metadata stat() override {
return Metadata { Type::DIRECTORY, 0, 0, lastModified, 1 };
uint64_t hash = reinterpret_cast<uintptr_t>(this);
return Metadata { Type::DIRECTORY, 0, 0, lastModified, 1, hash };
}
void sync() override {}
......@@ -1023,7 +1025,8 @@ public:
return entry->node.get<DirectoryNode>().directory->stat();
} else if (entry->node.is<SymlinkNode>()) {
auto& link = entry->node.get<SymlinkNode>();
return FsNode::Metadata { FsNode::Type::SYMLINK, 0, 0, link.lastModified, 1 };
uint64_t hash = reinterpret_cast<uintptr_t>(link.content.begin());
return FsNode::Metadata { FsNode::Type::SYMLINK, 0, 0, link.lastModified, 1, hash };
} else {
KJ_FAIL_ASSERT("unknown node type") { return nullptr; }
}
......
......@@ -341,16 +341,37 @@ public:
uint linkCount = 1;
// Number of hard links pointing to this node.
uint64_t hashCode = 0;
// Hint which can be used to determine if two FsNode instances point to the same underlying
// file object. If two FsNodes report different hashCodes, then they are not the same object.
// If they report the same hashCode, then they may or may not be the same object.
//
// The Unix filesystem implementation builds the hashCode based on st_dev and st_ino of
// `struct stat`. However, note that some filesystems -- especially FUSE-based -- may not fill
// in st_ino.
//
// The Windows filesystem implementation builds the hashCode based on dwVolumeSerialNumber and
// nFileIndex{Low,High} of the BY_HANDLE_FILE_INFORMATION structure. However, these are again
// not guaranteed to be unique on all filesystems. In particular the documentation says that
// ReFS uses 128-bit identifiers which can't be represented here, and again virtual filesystems
// may often not report real identifiers.
//
// Of course, the process of hashing values into a single hash code can also cause collisions
// even if the filesystem reports reliable information.
//
// Additionally note that this value is not reliable when returned by `lstat()`. You should
// actually open the object, then call `stat()` on the opened object.
// Not currently included:
// - Device / inode number: Rarely useful, and not safe to use in practice anyway.
// - Access control info: Differs wildly across platforms, and KJ prefers capabilities anyway.
// - Other timestamps: Differs across platforms.
// - Device number: If you care, you're probably doing platform-specific stuff anyway.
Metadata() = default;
Metadata(Type type, uint64_t size, uint64_t spaceUsed, Date lastModified, uint linkCount)
Metadata(Type type, uint64_t size, uint64_t spaceUsed, Date lastModified, uint linkCount,
uint64_t hashCode)
: type(type), size(size), spaceUsed(spaceUsed), lastModified(lastModified),
linkCount(linkCount) {}
linkCount(linkCount), hashCode(hashCode) {}
// TODO(cleanup): This constructor is redundant in C++14, but needed in C++11.
};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment