diff --git a/adb/sysdeps.h b/adb/sysdeps.h index 59425923b..16796cd5c 100644 --- a/adb/sysdeps.h +++ b/adb/sysdeps.h @@ -27,6 +27,7 @@ #include #include +#include // Include this before open/unlink are defined as macros below. #include @@ -288,6 +289,8 @@ extern int adb_chmod(const char *, int); extern int adb_vfprintf(FILE *stream, const char *format, va_list ap) __attribute__((__format__(ADB_FORMAT_ARCHETYPE, 2, 0))); +extern int adb_vprintf(const char *format, va_list ap) + __attribute__((__format__(ADB_FORMAT_ARCHETYPE, 1, 0))); extern int adb_fprintf(FILE *stream, const char *format, ...) __attribute__((__format__(ADB_FORMAT_ARCHETYPE, 2, 3))); extern int adb_printf(const char *format, ...) @@ -295,6 +298,8 @@ extern int adb_printf(const char *format, ...) extern int adb_fputs(const char* buf, FILE* stream); extern int adb_fputc(int ch, FILE* stream); +extern int adb_putchar(int ch); +extern int adb_puts(const char* buf); extern size_t adb_fwrite(const void* ptr, size_t size, size_t nmemb, FILE* stream); @@ -321,13 +326,20 @@ inline void seekdir(DIR*, long) { #define chmod adb_chmod #define vfprintf adb_vfprintf +#define vprintf adb_vprintf #define fprintf adb_fprintf #define printf adb_printf #define fputs adb_fputs #define fputc adb_fputc +// putc may be a macro, so if so, undefine it, so that we can redefine it. +#undef putc +#define putc(c, s) adb_fputc(c, s) +#define putchar adb_putchar +#define puts adb_puts #define fwrite adb_fwrite #define fopen adb_fopen +#define freopen freopen_utf8_not_yet_implemented #define getenv adb_getenv #define putenv putenv_utf8_not_yet_implemented @@ -386,6 +398,12 @@ public: // but does not check if the handle != INVALID_HANDLE_VALUE. typedef std::unique_ptr unique_handle; +namespace internal { + +size_t ParseCompleteUTF8(const char* first, const char* last, std::vector* remaining_bytes); + +} + #else /* !_WIN32 a.k.a. Unix */ #include "fdevent.h" diff --git a/adb/sysdeps_win32.cpp b/adb/sysdeps_win32.cpp index dece96f6d..0b0898186 100644 --- a/adb/sysdeps_win32.cpp +++ b/adb/sysdeps_win32.cpp @@ -2451,12 +2451,15 @@ static void _fh_socketpair_hook( FH fh, int events, EventHook hook ) } +static adb_mutex_t g_console_output_buffer_lock; + void adb_sysdeps_init( void ) { #define ADB_MUTEX(x) InitializeCriticalSection( & x ); #include "mutex_list.h" InitializeCriticalSection( &_win32_lock ); + InitializeCriticalSection( &g_console_output_buffer_lock ); } /**************************************************************************/ @@ -2525,6 +2528,8 @@ static HANDLE _get_console_handle(int fd, DWORD* mode=nullptr) { // Returns a console handle if |stream| is a console, otherwise returns nullptr. static HANDLE _get_console_handle(FILE* const stream) { + // Save and restore errno to make it easier for callers to prevent from overwriting errno. + android::base::ErrnoRestorer er; const int fd = fileno(stream); if (fd < 0) { return nullptr; @@ -3296,6 +3301,9 @@ static HANDLE _console_handle; // when set, console mode should be restored void stdin_raw_init() { const HANDLE in = _get_console_handle(STDIN_FILENO, &_old_console_mode); + if (in == nullptr) { + return; + } // Disable ENABLE_PROCESSED_INPUT so that Ctrl-C is read instead of // calling the process Ctrl-C routine (configured by @@ -3617,16 +3625,98 @@ int adb_chmod(const char* path, int mode) { return _wchmod(path_wide.c_str(), mode); } -// Internal helper function to write UTF-8 bytes to a console. Returns -1 -// on error. -static int _console_write_utf8(const char* buf, size_t size, FILE* stream, - HANDLE console) { - std::wstring output; +// From libutils/Unicode.cpp, get the length of a UTF-8 sequence given the lead byte. +static inline size_t utf8_codepoint_len(uint8_t ch) { + return ((0xe5000000 >> ((ch >> 3) & 0x1e)) & 3) + 1; +} - // Try to convert from data that might be UTF-8 to UTF-16, ignoring errors. - // Data might not be UTF-8 if the user cat's random data, runs dmesg, etc. +namespace internal { + +// Given a sequence of UTF-8 bytes (denoted by the range [first, last)), return the number of bytes +// (from the beginning) that are complete UTF-8 sequences and append the remaining bytes to +// remaining_bytes. +size_t ParseCompleteUTF8(const char* const first, const char* const last, + std::vector* const remaining_bytes) { + // Walk backwards from the end of the sequence looking for the beginning of a UTF-8 sequence. + // Current_after points one byte past the current byte to be examined. + for (const char* current_after = last; current_after != first; --current_after) { + const char* const current = current_after - 1; + const char ch = *current; + const char kHighBit = 0x80u; + const char kTwoHighestBits = 0xC0u; + if ((ch & kHighBit) == 0) { // high bit not set + // The buffer ends with a one-byte UTF-8 sequence, possibly followed by invalid trailing + // bytes with no leading byte, so return the entire buffer. + break; + } else if ((ch & kTwoHighestBits) == kTwoHighestBits) { // top two highest bits set + // Lead byte in UTF-8 sequence, so check if we have all the bytes in the sequence. + const size_t bytes_available = last - current; + if (bytes_available < utf8_codepoint_len(ch)) { + // We don't have all the bytes in the UTF-8 sequence, so return all the bytes + // preceding the current incomplete UTF-8 sequence and append the remaining bytes + // to remaining_bytes. + remaining_bytes->insert(remaining_bytes->end(), current, last); + return current - first; + } else { + // The buffer ends with a complete UTF-8 sequence, possibly followed by invalid + // trailing bytes with no lead byte, so return the entire buffer. + break; + } + } else { + // Trailing byte, so keep going backwards looking for the lead byte. + } + } + + // Return the size of the entire buffer. It is possible that we walked backward past invalid + // trailing bytes with no lead byte, in which case we want to return all those invalid bytes + // so that they can be processed. + return last - first; +} + +} + +// Bytes that have not yet been output to the console because they are incomplete UTF-8 sequences. +// Note that we use only one buffer even though stderr and stdout are logically separate streams. +// This matches the behavior of Linux. +// Protected by g_console_output_buffer_lock. +static auto& g_console_output_buffer = *new std::vector(); + +// Internal helper function to write UTF-8 bytes to a console. Returns -1 on error. +static int _console_write_utf8(const char* const buf, const size_t buf_size, FILE* stream, + HANDLE console) { + const int saved_errno = errno; + std::vector combined_buffer; + + // Complete UTF-8 sequences that should be immediately written to the console. + const char* utf8; + size_t utf8_size; + + adb_mutex_lock(&g_console_output_buffer_lock); + if (g_console_output_buffer.empty()) { + // If g_console_output_buffer doesn't have a buffered up incomplete UTF-8 sequence (the + // common case with plain ASCII), parse buf directly. + utf8 = buf; + utf8_size = internal::ParseCompleteUTF8(buf, buf + buf_size, &g_console_output_buffer); + } else { + // If g_console_output_buffer has a buffered up incomplete UTF-8 sequence, move it to + // combined_buffer (and effectively clear g_console_output_buffer) and append buf to + // combined_buffer, then parse it all together. + combined_buffer.swap(g_console_output_buffer); + combined_buffer.insert(combined_buffer.end(), buf, buf + buf_size); + + utf8 = combined_buffer.data(); + utf8_size = internal::ParseCompleteUTF8(utf8, utf8 + combined_buffer.size(), + &g_console_output_buffer); + } + adb_mutex_unlock(&g_console_output_buffer_lock); + + std::wstring utf16; + + // Try to convert from data that might be UTF-8 to UTF-16, ignoring errors (just like Linux + // which does not return an error on bad UTF-8). Data might not be UTF-8 if the user cat's + // random data, runs dmesg (which might have non-UTF-8), etc. // This could throw std::bad_alloc. - (void)android::base::UTF8ToWide(buf, size, &output); + (void)android::base::UTF8ToWide(utf8, utf8_size, &utf16); // Note that this does not do \n => \r\n translation because that // doesn't seem necessary for the Windows console. For the Windows @@ -3639,16 +3729,16 @@ static int _console_write_utf8(const char* buf, size_t size, FILE* stream, // Write UTF-16 to the console. DWORD written = 0; - if (!WriteConsoleW(console, output.c_str(), output.length(), &written, - NULL)) { + if (!WriteConsoleW(console, utf16.c_str(), utf16.length(), &written, NULL)) { errno = EIO; return -1; } - // This is the number of UTF-16 chars written, which might be different - // than the number of UTF-8 chars passed in. It doesn't seem practical to - // get this count correct. - return written; + // Return the size of the original buffer passed in, signifying that we consumed it all, even + // if nothing was displayed, in the case of being passed an incomplete UTF-8 sequence. This + // matches the Linux behavior. + errno = saved_errno; + return buf_size; } // Function prototype because attributes cannot be placed on func definitions. @@ -3660,14 +3750,21 @@ static int _console_vfprintf(const HANDLE console, FILE* stream, // Returns -1 on error. static int _console_vfprintf(const HANDLE console, FILE* stream, const char *format, va_list ap) { + const int saved_errno = errno; std::string output_utf8; // Format the string. // This could throw std::bad_alloc. android::base::StringAppendV(&output_utf8, format, ap); - return _console_write_utf8(output_utf8.c_str(), output_utf8.length(), - stream, console); + const int result = _console_write_utf8(output_utf8.c_str(), output_utf8.length(), stream, + console); + if (result != -1) { + errno = saved_errno; + } else { + // If -1 was returned, errno has been set. + } + return result; } // Version of vfprintf() that takes UTF-8 and can write Unicode to a @@ -3689,6 +3786,11 @@ int adb_vfprintf(FILE *stream, const char *format, va_list ap) { } } +// Version of vprintf() that takes UTF-8 and can write Unicode to a Windows console. +int adb_vprintf(const char *format, va_list ap) { + return adb_vfprintf(stdout, format, ap); +} + // Version of fprintf() that takes UTF-8 and can write Unicode to a // Windows console. int adb_fprintf(FILE *stream, const char *format, ...) { @@ -3716,6 +3818,7 @@ int adb_printf(const char *format, ...) { int adb_fputs(const char* buf, FILE* stream) { // adb_fprintf returns -1 on error, which is conveniently the same as EOF // which fputs (and hence adb_fputs) should return on error. + static_assert(EOF == -1, "EOF is not -1, so this code needs to be fixed"); return adb_fprintf(stream, "%s", buf); } @@ -3723,32 +3826,32 @@ int adb_fputs(const char* buf, FILE* stream) { // Windows console. int adb_fputc(int ch, FILE* stream) { const int result = adb_fprintf(stream, "%c", ch); - if (result <= 0) { - // If there was an error, or if nothing was printed (which should be an - // error), return an error, which fprintf signifies with EOF. + if (result == -1) { return EOF; } // For success, fputc returns the char, cast to unsigned char, then to int. return static_cast(ch); } +// Version of putchar() that takes UTF-8 and can write Unicode to a Windows console. +int adb_putchar(int ch) { + return adb_fputc(ch, stdout); +} + +// Version of puts() that takes UTF-8 and can write Unicode to a Windows console. +int adb_puts(const char* buf) { + // adb_printf returns -1 on error, which is conveniently the same as EOF + // which puts (and hence adb_puts) should return on error. + static_assert(EOF == -1, "EOF is not -1, so this code needs to be fixed"); + return adb_printf("%s\n", buf); +} + // Internal function to write UTF-8 to a Win32 console. Returns the number of // items (of length size) written. On error, returns a short item count or 0. static size_t _console_fwrite(const void* ptr, size_t size, size_t nmemb, FILE* stream, HANDLE console) { - // TODO: Note that a Unicode character could be several UTF-8 bytes. But - // if we're passed only some of the bytes of a character (for example, from - // the network socket for adb shell), we won't be able to convert the char - // to a complete UTF-16 char (or surrogate pair), so the output won't look - // right. - // - // To fix this, see libutils/Unicode.cpp for hints on decoding UTF-8. - // - // For now we ignore this problem because the alternative is that we'd have - // to parse UTF-8 and buffer things up (doable). At least this is better - // than what we had before -- always incorrect multi-byte UTF-8 output. - int result = _console_write_utf8(reinterpret_cast(ptr), - size * nmemb, stream, console); + const int result = _console_write_utf8(reinterpret_cast(ptr), size * nmemb, stream, + console); if (result == -1) { return 0; } diff --git a/adb/sysdeps_win32_test.cpp b/adb/sysdeps_win32_test.cpp index 1d4028173..8f610cfda 100755 --- a/adb/sysdeps_win32_test.cpp +++ b/adb/sysdeps_win32_test.cpp @@ -137,3 +137,60 @@ TEST(sysdeps_win32, unix_isatty) { // Make sure an invalid FD is handled correctly. EXPECT_EQ(0, unix_isatty(-1)); } + +void TestParseCompleteUTF8(const char* buf, const size_t buf_size, + const size_t expected_complete_bytes, + const std::vector& expected_remaining_bytes) { + std::vector remaining_bytes; + const size_t complete_bytes = internal::ParseCompleteUTF8(buf, buf + buf_size, + &remaining_bytes); + EXPECT_EQ(expected_complete_bytes, complete_bytes); + EXPECT_EQ(expected_remaining_bytes, remaining_bytes); +} + +TEST(sysdeps_win32, ParseCompleteUTF8) { + const std::vector> multi_byte_sequences = { + { '\xc2', '\xa9' }, // 2 byte UTF-8 sequence + { '\xe1', '\xb4', '\xa8' }, // 3 byte UTF-8 sequence + { '\xf0', '\x9f', '\x98', '\x80' }, // 4 byte UTF-8 sequence + }; + std::vector> all_sequences = { + {}, // 0 bytes + { '\0' }, // NULL byte + { 'a' }, // 1 byte UTF-8 sequence + }; + all_sequences.insert(all_sequences.end(), multi_byte_sequences.begin(), + multi_byte_sequences.end()); + + // Vary a prefix of bytes in front of the sequence that we're actually interested in parsing. + for (const auto& prefix : all_sequences) { + // Parse (prefix + one byte of the sequence at a time) + for (const auto& seq : multi_byte_sequences) { + std::vector buffer(prefix); + + // For every byte of the sequence except the last + for (size_t i = 0; i < seq.size() - 1; ++i) { + buffer.push_back(seq[i]); + + // When parsing an incomplete UTF-8 sequence, the amount of the buffer preceding + // the start of the incomplete UTF-8 sequence is valid. The remaining bytes are the + // bytes of the incomplete UTF-8 sequence. + TestParseCompleteUTF8(buffer.data(), buffer.size(), prefix.size(), + std::vector(seq.begin(), seq.begin() + i + 1)); + } + + // For the last byte of the sequence + buffer.push_back(seq.back()); + TestParseCompleteUTF8(buffer.data(), buffer.size(), buffer.size(), std::vector()); + } + + // Parse (prefix (aka sequence) + invalid trailing bytes) to verify that the invalid + // trailing bytes are immediately "returned" to prevent them from being stuck in some + // buffer. + std::vector buffer(prefix); + for (size_t i = 0; i < 8; ++i) { + buffer.push_back(0x80); // trailing byte + TestParseCompleteUTF8(buffer.data(), buffer.size(), buffer.size(), std::vector()); + } + } +}