Commit 65da9fd9 authored by Laszlo Csomor

io_win32: support non-ASCII paths

Fixes https://github.com/google/protobuf/issues/3951
parent 953a0253
...@@ -30,10 +30,11 @@ ...@@ -30,10 +30,11 @@
// Author: laszlocsomor@google.com (Laszlo Csomor) // Author: laszlocsomor@google.com (Laszlo Csomor)
// //
// Implementation for long-path-aware open/mkdir/etc. on Windows. // Implementation for long-path-aware open/mkdir/access/etc. on Windows, as well
// as for the supporting utility functions.
// //
// These functions convert the input path to an absolute Windows path // These functions convert the input path to an absolute Windows path
// with "\\?\" prefix if necessary, then pass that to _wopen/_wmkdir/etc. // with "\\?\" prefix, then pass that to _wopen/_wmkdir/_waccess/etc.
// (declared in <io.h>) respectively. This allows working with files/directories // (declared in <io.h>) respectively. This allows working with files/directories
// whose paths are longer than MAX_PATH (260 chars). // whose paths are longer than MAX_PATH (260 chars).
// //
...@@ -59,7 +60,6 @@ ...@@ -59,7 +60,6 @@
#include <google/protobuf/stubs/io_win32.h> #include <google/protobuf/stubs/io_win32.h>
#include <google/protobuf/stubs/scoped_ptr.h> #include <google/protobuf/stubs/scoped_ptr.h>
#include <cassert>
#include <memory> #include <memory>
#include <sstream> #include <sstream>
#include <string> #include <string>
...@@ -89,6 +89,11 @@ struct CharTraits<wchar_t> { ...@@ -89,6 +89,11 @@ struct CharTraits<wchar_t> {
static bool is_alpha(wchar_t ch) { return iswalpha(ch); } static bool is_alpha(wchar_t ch) { return iswalpha(ch); }
}; };
// Reports whether `s` carries no characters: true when `s` is a null pointer
// or when it points directly at the terminating NUL; false otherwise.
template <typename char_type>
bool null_or_empty(const char_type* s) {
  if (s == nullptr) {
    return true;
  }
  return *s == 0;
}
// Returns true if the path starts with a drive letter, e.g. "c:". // Returns true if the path starts with a drive letter, e.g. "c:".
// Note that this won't check for the "\" after the drive letter, so this also // Note that this won't check for the "\" after the drive letter, so this also
// returns true for "c:foo" (which is "c:\${PWD}\foo"). // returns true for "c:foo" (which is "c:\${PWD}\foo").
...@@ -121,16 +126,7 @@ bool is_drive_relative(const char_type* path) { ...@@ -121,16 +126,7 @@ bool is_drive_relative(const char_type* path) {
return has_drive_letter(path) && (path[2] == 0 || !is_separator(path[2])); return has_drive_letter(path) && (path[2] == 0 || !is_separator(path[2]));
} }
template <typename char_type> wstring join_paths(const wstring& path1, const wstring& path2) {
void replace_directory_separators(char_type* p) {
for (; *p; ++p) {
if (*p == '/') {
*p = '\\';
}
}
}
string join_paths(const string& path1, const string& path2) {
if (path1.empty() || is_path_absolute(path2.c_str()) || if (path1.empty() || is_path_absolute(path2.c_str()) ||
has_longpath_prefix(path2.c_str())) { has_longpath_prefix(path2.c_str())) {
return path2; return path2;
...@@ -144,23 +140,23 @@ string join_paths(const string& path1, const string& path2) { ...@@ -144,23 +140,23 @@ string join_paths(const string& path1, const string& path2) {
: (path1 + path2); : (path1 + path2);
} else { } else {
return is_separator(path2[0]) ? (path1 + path2) return is_separator(path2[0]) ? (path1 + path2)
: (path1 + '\\' + path2); : (path1 + L'\\' + path2);
} }
} }
string normalize(string path) { wstring normalize(wstring path) {
if (has_longpath_prefix(path.c_str())) { if (has_longpath_prefix(path.c_str())) {
path = path.substr(4); path = path.substr(4);
} }
static const string dot("."); static const wstring dot(L".");
static const string dotdot(".."); static const wstring dotdot(L"..");
std::vector<string> segments; std::vector<wstring> segments;
int segment_start = -1; int segment_start = -1;
// Find the path segments in `path` (separated by "/"). // Find the path segments in `path` (separated by "/").
for (int i = 0;; ++i) { for (int i = 0;; ++i) {
if (!is_separator(path[i]) && path[i] != '\0') { if (!is_separator(path[i]) && path[i] != L'\0') {
// The current character does not end a segment, so start one unless it's // The current character does not end a segment, so start one unless it's
// already started. // already started.
if (segment_start < 0) { if (segment_start < 0) {
...@@ -169,7 +165,7 @@ string normalize(string path) { ...@@ -169,7 +165,7 @@ string normalize(string path) {
} else if (segment_start >= 0 && i > segment_start) { } else if (segment_start >= 0 && i > segment_start) {
// The current character is "/" or "\0", so this ends a segment. // The current character is "/" or "\0", so this ends a segment.
// Add that to `segments` if there's anything to add; handle "." and "..". // Add that to `segments` if there's anything to add; handle "." and "..".
string segment(path, segment_start, i - segment_start); wstring segment(path, segment_start, i - segment_start);
segment_start = -1; segment_start = -1;
if (segment == dotdot) { if (segment == dotdot) {
if (!segments.empty() && if (!segments.empty() &&
...@@ -180,7 +176,7 @@ string normalize(string path) { ...@@ -180,7 +176,7 @@ string normalize(string path) {
segments.push_back(segment); segments.push_back(segment);
} }
} }
if (path[i] == '\0') { if (path[i] == L'\0') {
break; break;
} }
} }
...@@ -189,64 +185,58 @@ string normalize(string path) { ...@@ -189,64 +185,58 @@ string normalize(string path) {
// form of it, e.g. "c:\.."). // form of it, e.g. "c:\..").
if (segments.size() == 1 && segments[0].size() == 2 && if (segments.size() == 1 && segments[0].size() == 2 &&
has_drive_letter(segments[0].c_str())) { has_drive_letter(segments[0].c_str())) {
return segments[0] + '\\'; return segments[0] + L'\\';
} }
// Join all segments. // Join all segments.
bool first = true; bool first = true;
std::ostringstream result; std::wstringstream result;
for (int i = 0; i < segments.size(); ++i) { for (int i = 0; i < segments.size(); ++i) {
if (!first) { if (!first) {
result << '\\'; result << L'\\';
} }
first = false; first = false;
result << segments[i]; result << segments[i];
} }
// Preserve trailing separator if the input contained it. // Preserve trailing separator if the input contained it.
if (!path.empty() && is_separator(path[path.size() - 1])) { if (!path.empty() && is_separator(path[path.size() - 1])) {
result << '\\'; result << L'\\';
} }
return result.str(); return result.str();
} }
WCHAR* as_wstring(const string& s) { bool as_windows_path(const char* path, wstring* result) {
int len = ::MultiByteToWideChar(CP_UTF8, 0, s.c_str(), s.size(), NULL, 0); if (null_or_empty(path)) {
WCHAR* result = new WCHAR[len + 1];
::MultiByteToWideChar(CP_UTF8, 0, s.c_str(), s.size(), result, len + 1);
result[len] = 0;
return result;
}
void as_wchar_path(const string& path, wstring* wchar_path) {
scoped_array<WCHAR> wbuf(as_wstring(path));
replace_directory_separators(wbuf.get());
wchar_path->assign(wbuf.get());
}
bool as_windows_path(const string& path, wstring* result) {
if (path.empty()) {
result->clear(); result->clear();
return true; return true;
} }
if (is_separator(path[0]) || is_drive_relative(path.c_str())) { if (is_separator(path[0]) || is_drive_relative(path)) {
return false;
}
wstring wpath;
if (!strings::utf8_to_wcs(path, &wpath)) {
return false; return false;
} }
string mutable_path = path; if (!is_path_absolute(wpath.c_str()) && !has_longpath_prefix(wpath.c_str())) {
if (!is_path_absolute(mutable_path.c_str()) && int size = ::GetCurrentDirectoryW(0, NULL);
!has_longpath_prefix(mutable_path.c_str())) { if (size == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
char cwd[MAX_PATH]; return false;
::GetCurrentDirectoryA(MAX_PATH, cwd); }
mutable_path = join_paths(cwd, mutable_path); scoped_array<WCHAR> wcwd(new WCHAR[size]);
::GetCurrentDirectoryW(size, wcwd.get());
wpath = join_paths(wcwd.get(), wpath);
} }
as_wchar_path(normalize(mutable_path), result); wpath = normalize(wpath);
if (!has_longpath_prefix(result->c_str())) { if (!has_longpath_prefix(wpath.c_str())) {
// Add the "\\?\" prefix unconditionally. This way we prevent the Win32 API // Add the "\\?\" prefix unconditionally. This way we prevent the Win32 API
// from processing the path and "helpfully" removing trailing dots from the // from processing the path and "helpfully" removing trailing dots from the
// path, for example. // path, for example.
// See https://github.com/bazelbuild/bazel/issues/2935 // See https://github.com/bazelbuild/bazel/issues/2935
*result = wstring(L"\\\\?\\") + *result; wpath = wstring(L"\\\\?\\") + wpath;
} }
*result = wpath;
return true; return true;
} }
...@@ -319,13 +309,21 @@ int stat(const char* path, struct _stat* buffer) { ...@@ -319,13 +309,21 @@ int stat(const char* path, struct _stat* buffer) {
FILE* fopen(const char* path, const char* mode) { FILE* fopen(const char* path, const char* mode) {
#ifdef SUPPORT_LONGPATHS #ifdef SUPPORT_LONGPATHS
if (null_or_empty(path)) {
errno = EINVAL;
return NULL;
}
wstring wpath; wstring wpath;
if (!as_windows_path(path, &wpath)) { if (!as_windows_path(path, &wpath)) {
errno = ENOENT; errno = ENOENT;
return NULL; return NULL;
} }
scoped_array<WCHAR> wmode(as_wstring(mode)); wstring wmode;
return ::_wfopen(wpath.c_str(), wmode.get()); if (!strings::utf8_to_wcs(mode, &wmode)) {
errno = EINVAL;
return NULL;
}
return ::_wfopen(wpath.c_str(), wmode.c_str());
#else #else
return ::fopen(path, mode); return ::fopen(path, mode);
#endif #endif
...@@ -347,16 +345,61 @@ int write(int fd, const void* buffer, size_t size) { ...@@ -347,16 +345,61 @@ int write(int fd, const void* buffer, size_t size) {
return ::_write(fd, buffer, size); return ::_write(fd, buffer, size);
} }
wstring testonly_path_to_winpath(const string& path) { wstring testonly_utf8_to_winpath(const char* path) {
wstring wpath; wstring wpath;
as_windows_path(path, &wpath); return as_windows_path(path, &wpath) ? wpath : wstring();
return wpath; }
namespace strings {
bool wcs_to_mbs(const WCHAR* s, string* out, bool outUtf8) {
if (null_or_empty(s)) {
out->clear();
return true;
}
BOOL usedDefaultChar = FALSE;
SetLastError(0);
int size = WideCharToMultiByte(
outUtf8 ? CP_UTF8 : CP_ACP, 0, s, -1, NULL, 0, NULL,
outUtf8 ? NULL : &usedDefaultChar);
if ((size == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER)
|| usedDefaultChar) {
return false;
}
scoped_array<CHAR> astr(new CHAR[size]);
WideCharToMultiByte(
outUtf8 ? CP_UTF8 : CP_ACP, 0, s, -1, astr.get(), size, NULL, NULL);
out->assign(astr.get());
return true;
}
bool mbs_to_wcs(const char* s, wstring* out, bool inUtf8) {
if (null_or_empty(s)) {
out->clear();
return true;
}
SetLastError(0);
int size =
MultiByteToWideChar(inUtf8 ? CP_UTF8 : CP_ACP, 0, s, -1, NULL, 0);
if (size == 0 && GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
return false;
}
scoped_array<WCHAR> wstr(new WCHAR[size]);
MultiByteToWideChar(
inUtf8 ? CP_UTF8 : CP_ACP, 0, s, -1, wstr.get(), size + 1);
out->assign(wstr.get());
return true;
} }
// Converts NUL-terminated UTF-8 text to UTF-16 by delegating to mbs_to_wcs
// with the "input is UTF-8" flag set; returns whatever mbs_to_wcs returns.
bool utf8_to_wcs(const char* input, wstring* out) {
  const bool input_is_utf8 = true;
  return mbs_to_wcs(input, out, input_is_utf8);
}
} // namespace strings
} // namespace win32 } // namespace win32
} // namespace internal } // namespace internal
} // namespace protobuf } // namespace protobuf
} // namespace google } // namespace google
#endif // defined(_WIN32) #endif // defined(_WIN32)
...@@ -69,8 +69,22 @@ LIBPROTOBUF_EXPORT int read(int fd, void* buffer, size_t size); ...@@ -69,8 +69,22 @@ LIBPROTOBUF_EXPORT int read(int fd, void* buffer, size_t size);
LIBPROTOBUF_EXPORT int setmode(int fd, int mode); LIBPROTOBUF_EXPORT int setmode(int fd, int mode);
LIBPROTOBUF_EXPORT int stat(const char* path, struct _stat* buffer); LIBPROTOBUF_EXPORT int stat(const char* path, struct _stat* buffer);
LIBPROTOBUF_EXPORT int write(int fd, const void* buffer, size_t size); LIBPROTOBUF_EXPORT int write(int fd, const void* buffer, size_t size);
LIBPROTOBUF_EXPORT std::wstring testonly_path_to_winpath( LIBPROTOBUF_EXPORT std::wstring testonly_utf8_to_winpath(const char* path);
const std::string& path);
// Text-encoding helpers built on the Win32 conversion APIs. All functions
// take NUL-terminated input, report success through their bool return value
// and write the converted text to `*out`.
namespace strings {
// Convert from UTF-16 to Active-Code-Page-encoded or to UTF-8-encoded text.
//
// `s`       : NUL-terminated UTF-16 input; null or empty input clears `*out`
//             and returns true.
// `out`     : receives the converted text on success.
// `outUtf8` : true to produce UTF-8, false to produce Active-Code-Page text.
//
// Returns false if the conversion fails, or, when `outUtf8` is false, if a
// character had to be replaced by the code page's default character.
LIBPROTOBUF_EXPORT bool wcs_to_mbs(
const wchar_t* s, std::string* out, bool outUtf8);
// Convert from Active-Code-Page-encoded or UTF-8-encoded text to UTF-16.
//
// `s`      : NUL-terminated input; null or empty input clears `*out` and
//            returns true.
// `out`    : receives the UTF-16 text on success.
// `inUtf8` : true if `s` is UTF-8, false if it is Active-Code-Page text.
//
// Returns false if the conversion fails.
LIBPROTOBUF_EXPORT bool mbs_to_wcs(
const char* s, std::wstring* out, bool inUtf8);
// Convert from UTF-8-encoded text to UTF-16.
// Equivalent to mbs_to_wcs(input, out, /* inUtf8 */ true).
LIBPROTOBUF_EXPORT bool utf8_to_wcs(const char* input, std::wstring* out);
} // namespace strings
} // namespace win32 } // namespace win32
} // namespace internal } // namespace internal
......
...@@ -30,7 +30,8 @@ ...@@ -30,7 +30,8 @@
// Author: laszlocsomor@google.com (Laszlo Csomor) // Author: laszlocsomor@google.com (Laszlo Csomor)
// //
// Unit tests for long-path-aware open/mkdir/access on Windows. // Unit tests for long-path-aware open/mkdir/access/etc. on Windows, as well as
// for the supporting utility functions.
// //
// This file is only used on Windows, it's empty on other platforms. // This file is only used on Windows, it's empty on other platforms.
...@@ -89,13 +90,17 @@ void StripTrailingSlashes(string* str) { ...@@ -89,13 +90,17 @@ void StripTrailingSlashes(string* str) {
str->resize(i+1); str->resize(i+1);
} }
bool GetEnvVar(const char* name, string* result) { bool GetEnvVarAsUtf8(const WCHAR* name, string* result) {
DWORD size = ::GetEnvironmentVariableA(name, NULL, 0); DWORD size = ::GetEnvironmentVariableW(name, NULL, 0);
if (size > 0 && GetLastError() != ERROR_ENVVAR_NOT_FOUND) { if (size > 0 && GetLastError() != ERROR_ENVVAR_NOT_FOUND) {
scoped_array<char> str(new char[size]); scoped_array<WCHAR> wcs(new WCHAR[size]);
::GetEnvironmentVariableA(name, str.get(), size); ::GetEnvironmentVariableW(name, wcs.get(), size);
result->assign(str.get()); // GetEnvironmentVariableA retrieves an Active-Code-Page-encoded text which
return true; // we'd first need to convert to UTF-16 then to UTF-8, because there seems
// to be no API function to do that conversion directly.
// GetEnvironmentVariableW retrieves an UTF-16-encoded text, which we need
// to convert to UTF-8.
return strings::wcs_to_mbs(wcs.get(), result, true);
} else { } else {
return false; return false;
} }
...@@ -104,30 +109,30 @@ bool GetEnvVar(const char* name, string* result) { ...@@ -104,30 +109,30 @@ bool GetEnvVar(const char* name, string* result) {
} // namespace } // namespace
void IoWin32Test::SetUp() { void IoWin32Test::SetUp() {
test_tmpdir.clear();
wtest_tmpdir.clear();
string tmp; string tmp;
bool ok = false; bool ok = false;
if (!ok) { if (!ok) {
ok = GetEnvVar("TEST_TMPDIR", &tmp); ok = GetEnvVarAsUtf8(L"TEST_TMPDIR", &tmp);
} }
if (!ok) { if (!ok) {
ok = GetEnvVar("TEMP", &tmp); ok = GetEnvVarAsUtf8(L"TEMP", &tmp);
} }
if (!ok) { if (!ok) {
ok = GetEnvVar("TMP", &tmp); ok = GetEnvVarAsUtf8(L"TMP", &tmp);
} }
if (!ok || tmp.empty()) { if (!ok || tmp.empty()) {
FAIL(); FAIL();
} }
StripTrailingSlashes(&tmp); StripTrailingSlashes(&tmp);
test_tmpdir = tmp + "\\io_win32_unittest.tmp"; test_tmpdir = tmp + "\\io_win32_unittest.tmp";
wtest_tmpdir = testonly_path_to_winpath(test_tmpdir); wtest_tmpdir = testonly_utf8_to_winpath(test_tmpdir.c_str());
if (!DeleteAllUnder(wtest_tmpdir) || !CreateAllUnder(wtest_tmpdir)) { ASSERT_FALSE(wtest_tmpdir.empty());
FAIL(); ASSERT_TRUE(DeleteAllUnder(wtest_tmpdir));
test_tmpdir.clear(); ASSERT_TRUE(CreateAllUnder(wtest_tmpdir));
wtest_tmpdir.clear();
}
} }
void IoWin32Test::TearDown() { void IoWin32Test::TearDown() {
...@@ -171,8 +176,8 @@ bool IoWin32Test::DeleteAllUnder(wstring path) { ...@@ -171,8 +176,8 @@ bool IoWin32Test::DeleteAllUnder(wstring path) {
path = wstring(L"\\\\?\\") + path; path = wstring(L"\\\\?\\") + path;
} }
// Append "\" if necessary. // Append "\" if necessary.
if (path[path.size() - 1] != '\\') { if (path[path.size() - 1] != L'\\') {
path.push_back('\\'); path.push_back(L'\\');
} }
WIN32_FIND_DATAW metadata; WIN32_FIND_DATAW metadata;
...@@ -290,12 +295,12 @@ TEST_F(IoWin32Test, MkdirTest) { ...@@ -290,12 +295,12 @@ TEST_F(IoWin32Test, MkdirTest) {
} }
TEST_F(IoWin32Test, ChdirTest) { TEST_F(IoWin32Test, ChdirTest) {
char owd[MAX_PATH]; WCHAR owd[MAX_PATH];
EXPECT_GT(::GetCurrentDirectoryA(MAX_PATH, owd), 0); EXPECT_GT(::GetCurrentDirectoryW(MAX_PATH, owd), 0);
string path("C:\\"); string path("C:\\");
EXPECT_EQ(access(path.c_str(), F_OK), 0); EXPECT_EQ(access(path.c_str(), F_OK), 0);
ASSERT_EQ(chdir(path.c_str()), 0); ASSERT_EQ(chdir(path.c_str()), 0);
EXPECT_TRUE(::SetCurrentDirectoryA(owd)); EXPECT_TRUE(::SetCurrentDirectoryW(owd));
// Do not try to chdir into the test_tmpdir, it may already contain directory // Do not try to chdir into the test_tmpdir, it may already contain directory
// names with trailing dots. // names with trailing dots.
...@@ -316,11 +321,11 @@ TEST_F(IoWin32Test, AsWindowsPathTest) { ...@@ -316,11 +321,11 @@ TEST_F(IoWin32Test, AsWindowsPathTest) {
EXPECT_GT(GetCurrentDirectoryW(size, cwd_str.get()), 0); EXPECT_GT(GetCurrentDirectoryW(size, cwd_str.get()), 0);
wstring cwd = wstring(L"\\\\?\\") + cwd_str.get(); wstring cwd = wstring(L"\\\\?\\") + cwd_str.get();
ASSERT_EQ(testonly_path_to_winpath("relative_mkdirtest"), ASSERT_EQ(testonly_utf8_to_winpath("relative_mkdirtest"),
cwd + L"\\relative_mkdirtest"); cwd + L"\\relative_mkdirtest");
ASSERT_EQ(testonly_path_to_winpath("preserve//\\trailing///"), ASSERT_EQ(testonly_utf8_to_winpath("preserve//\\trailing///"),
cwd + L"\\preserve\\trailing\\"); cwd + L"\\preserve\\trailing\\");
ASSERT_EQ(testonly_path_to_winpath("./normalize_me\\/../blah"), ASSERT_EQ(testonly_utf8_to_winpath("./normalize_me\\/../blah"),
cwd + L"\\blah"); cwd + L"\\blah");
std::ostringstream relpath; std::ostringstream relpath;
for (wchar_t* p = cwd_str.get(); *p; ++p) { for (wchar_t* p = cwd_str.get(); *p; ++p) {
...@@ -329,18 +334,43 @@ TEST_F(IoWin32Test, AsWindowsPathTest) { ...@@ -329,18 +334,43 @@ TEST_F(IoWin32Test, AsWindowsPathTest) {
} }
} }
relpath << ".\\/../\\./beyond-toplevel"; relpath << ".\\/../\\./beyond-toplevel";
ASSERT_EQ(testonly_path_to_winpath(relpath.str()), ASSERT_EQ(testonly_utf8_to_winpath(relpath.str().c_str()),
wstring(L"\\\\?\\") + cwd_str.get()[0] + L":\\beyond-toplevel"); wstring(L"\\\\?\\") + cwd_str.get()[0] + L":\\beyond-toplevel");
// Absolute unix paths lack drive letters, driveless absolute windows paths // Absolute unix paths lack drive letters, driveless absolute windows paths
// do too. Neither can be converted to a drive-specifying absolute Windows // do too. Neither can be converted to a drive-specifying absolute Windows
// path. // path.
ASSERT_EQ(testonly_path_to_winpath("/absolute/unix/path"), L""); ASSERT_EQ(testonly_utf8_to_winpath("/absolute/unix/path"), L"");
// Though valid on Windows, we also don't support UNC paths (\\UNC\\blah). // Though valid on Windows, we also don't support UNC paths (\\UNC\\blah).
ASSERT_EQ(testonly_path_to_winpath("\\driveless\\absolute"), L""); ASSERT_EQ(testonly_utf8_to_winpath("\\driveless\\absolute"), L"");
// Though valid in cmd.exe, drive-relative paths are not supported. // Though valid in cmd.exe, drive-relative paths are not supported.
ASSERT_EQ(testonly_path_to_winpath("c:foo"), L""); ASSERT_EQ(testonly_utf8_to_winpath("c:foo"), L"");
ASSERT_EQ(testonly_path_to_winpath("c:/foo"), L"\\\\?\\c:\\foo"); ASSERT_EQ(testonly_utf8_to_winpath("c:/foo"), L"\\\\?\\c:\\foo");
}
// Verifies that strings::utf8_to_wcs converts a mixed ASCII/Cyrillic UTF-8
// string into the expected UTF-16 code units.
TEST_F(IoWin32Test, Utf8ToUtf16Test) {
  // The lead/continuation bytes are >= 0x80 and therefore do not fit in a
  // signed `char`. Inside a braced initializer that is a narrowing conversion,
  // so each byte is cast explicitly.
  const char hi_utf8[] = {
      'h', 'i', ' ',
      // utf-8: 11010000 10011111, utf-16: 100 0001 1111 = 0x041F
      static_cast<char>(0xd0), static_cast<char>(0x9f),
      // utf-8: 11010001 10000000, utf-16: 100 0100 0000 = 0x0440
      static_cast<char>(0xd1), static_cast<char>(0x80),
      // utf-8: 11010000 10111000, utf-16: 100 0011 1000 = 0x0438
      static_cast<char>(0xd0), static_cast<char>(0xb8),
      // utf-8: 11010000 10110010, utf-16: 100 0011 0010 = 0x0432
      static_cast<char>(0xd0), static_cast<char>(0xb2),
      // utf-8: 11010000 10110101, utf-16: 100 0011 0101 = 0x0435
      static_cast<char>(0xd0), static_cast<char>(0xb5),
      // utf-8: 11010001 10000010, utf-16: 100 0100 0010 = 0x0442
      static_cast<char>(0xd1), static_cast<char>(0x82), 0
  };
  const wchar_t hi_utf16[] = {
      L'h', L'i', L' ', 0x041f, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, 0
  };
  wstring wcs;
  ASSERT_TRUE(strings::utf8_to_wcs(hi_utf8, &wcs));
  ASSERT_EQ(wcs, hi_utf16);
}
} // namespace } // namespace
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment