From 27393d5ca63c55313182bd6cbb8ef2ecd4c1472f Mon Sep 17 00:00:00 2001
From: Count Count
Date: Sun, 21 Mar 2021 08:19:34 +0100
Subject: [PATCH] fix incomplete UTF-8 writes in Windows console stdio

---
 library/std/src/sys/windows/stdio.rs | 116 +++++++++++++++++++++++----
 1 file changed, 102 insertions(+), 14 deletions(-)

diff --git a/library/std/src/sys/windows/stdio.rs b/library/std/src/sys/windows/stdio.rs
index be3141e46a1c..160d9bcd4d22 100644
--- a/library/std/src/sys/windows/stdio.rs
+++ b/library/std/src/sys/windows/stdio.rs
@@ -14,8 +14,18 @@ use crate::sys::handle::Handle;
 pub struct Stdin {
     surrogate: u16,
 }
-pub struct Stdout;
-pub struct Stderr;
+pub struct Stdout {
+    incomplete_utf8: IncompleteUtf8,
+}
+
+pub struct Stderr {
+    incomplete_utf8: IncompleteUtf8,
+}
+
+struct IncompleteUtf8 {
+    bytes: [u8; 4], // bytes of a UTF-8 sequence that has only been partially written so far
+    len: u8, // number of buffered bytes in `bytes`; 0 means nothing is pending
+}
 
 // Apparently Windows doesn't handle large reads on stdin or writes to stdout/stderr well (see
 // #13304 for details).
@@ -50,7 +60,27 @@ fn is_console(handle: c::HANDLE) -> bool {
     unsafe { c::GetConsoleMode(handle, &mut mode) != 0 }
 }
 
-fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result<usize> {
+// Simple reimplementation of std::str::utf8_char_width() which is feature-gated
+fn utf8_char_width(b: u8) -> usize {
+    match b {
+        0x00..=0x7F => 1,
+        0x80..=0xC1 => 0,
+        0xC2..=0xDF => 2,
+        0xE0..=0xEF => 3,
+        0xF0..=0xF4 => 4,
+        0xF5..=0xFF => 0,
+    }
+}
+
+fn write(
+    handle_id: c::DWORD,
+    data: &[u8],
+    incomplete_utf8: &mut IncompleteUtf8,
+) -> io::Result<usize> {
+    if data.is_empty() {
+        return Ok(0);
+    }
+
     let handle = get_handle(handle_id)?;
     if !is_console(handle) {
         let handle = Handle::new(handle);
@@ -59,22 +89,74 @@ fn write(handle_id: c::DWORD, data: &[u8]) -> io::Result<usize> {
         return ret;
     }
 
-    // As the console is meant for presenting text, we assume bytes of `data` come from a string
-    // and are encoded as UTF-8, which needs to be encoded as UTF-16.
+    match incomplete_utf8.len {
+        0 => {} // no partial sequence pending from an earlier call
+        1..=3 => {
+            if data[0] >> 6 != 0b10 {
+                incomplete_utf8.len = 0;
+                // not a continuation byte - reject
+                return Err(io::Error::new_const(
+                    io::ErrorKind::InvalidData,
+                    &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
+                ));
+            }
+            incomplete_utf8.bytes[incomplete_utf8.len as usize] = data[0];
+            incomplete_utf8.len += 1;
+            let char_width = utf8_char_width(incomplete_utf8.bytes[0]);
+            if (incomplete_utf8.len as usize) < char_width {
+                // more bytes needed
+                return Ok(1);
+            }
+            let s = str::from_utf8(&incomplete_utf8.bytes[0..incomplete_utf8.len as usize]);
+            incomplete_utf8.len = 0;
+            match s {
+                Ok(s) => {
+                    assert_eq!(char_width, s.len());
+                    let written = write_valid_utf8(handle, s)?;
+                    assert_eq!(written, s.len()); // ensured by write_valid_utf8() for one codepoint
+                    return Ok(1);
+                }
+                Err(_) => {
+                    return Err(io::Error::new_const(
+                        io::ErrorKind::InvalidData,
+                        &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
+                    ));
+                }
+            }
+        }
+        _ => {
+            panic!("Unexpected number of incomplete UTF-8 chars.");
+        }
+    }
+
+    // As the console is meant for presenting text, we assume bytes of `data` are encoded as UTF-8,
+    // which needs to be encoded as UTF-16.
     //
     // If the data is not valid UTF-8 we write out as many bytes as are valid.
-    // Only when there are no valid bytes (which will happen on the next call), return an error.
+    // If the first byte is invalid it is either the first byte of a multi-byte sequence but the
+    // provided byte slice is too short or it is the first byte of an invalid multi-byte sequence.
     let len = cmp::min(data.len(), MAX_BUFFER_SIZE / 2);
     let utf8 = match str::from_utf8(&data[..len]) {
         Ok(s) => s,
         Err(ref e) if e.valid_up_to() == 0 => {
-            return Err(io::Error::new_const(
-                io::ErrorKind::InvalidData,
-                &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
-            ));
+            if data.len() < utf8_char_width(data[0]) {
+                incomplete_utf8.bytes[0] = data[0];
+                incomplete_utf8.len = 1;
+                return Ok(1);
+            } else {
+                return Err(io::Error::new_const(
+                    io::ErrorKind::InvalidData,
+                    &"Windows stdio in console mode does not support writing non-UTF-8 byte sequences",
+                ));
+            }
         }
         Err(e) => str::from_utf8(&data[..e.valid_up_to()]).unwrap(),
     };
+
+    write_valid_utf8(handle, utf8)
+}
+
+fn write_valid_utf8(handle: c::HANDLE, utf8: &str) -> io::Result<usize> {
     let mut utf16 = [0u16; MAX_BUFFER_SIZE / 2];
     let mut len_utf16 = 0;
     for (chr, dest) in utf8.encode_utf16().zip(utf16.iter_mut()) {
@@ -254,15 +336,21 @@ fn utf16_to_utf8(utf16: &[u16], utf8: &mut [u8]) -> io::Result<usize> {
     Ok(written)
 }
 
+impl IncompleteUtf8 {
+    pub const fn new() -> IncompleteUtf8 {
+        IncompleteUtf8 { bytes: [0; 4], len: 0 }
+    }
+}
+
 impl Stdout {
     pub const fn new() -> Stdout {
-        Stdout
+        Stdout { incomplete_utf8: IncompleteUtf8::new() }
     }
 }
 
 impl io::Write for Stdout {
     fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        write(c::STD_OUTPUT_HANDLE, buf)
+        write(c::STD_OUTPUT_HANDLE, buf, &mut self.incomplete_utf8)
     }
 
     fn flush(&mut self) -> io::Result<()> {
@@ -272,13 +360,13 @@ impl io::Write for Stdout {
 
 impl Stderr {
     pub const fn new() -> Stderr {
-        Stderr
+        Stderr { incomplete_utf8: IncompleteUtf8::new() }
     }
 }
 
 impl io::Write for Stderr {
     fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        write(c::STD_ERROR_HANDLE, buf)
+        write(c::STD_ERROR_HANDLE, buf, &mut self.incomplete_utf8)
     }
 
     fn flush(&mut self) -> io::Result<()> {