From ccebe901d7d76dfddc082d994efa54ef2aefee57 Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Sat, 9 Nov 2013 06:01:47 +0100 Subject: Check UTF-8 encoding, and convert strings to UTF-8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Handle conversion errors properly by inserting � instead. Add a binary header to provide portable way to write binary literals (I like them) Also add a test file. ref #2404 --- src/main.cpp | 29 +++-------- src/test.cpp | 43 +++++++++++++++ src/utils/binary.hpp | 16 ++++++ src/utils/encoding.cpp | 139 +++++++++++++++++++++++++++++++++++++++++++++++++ src/utils/encoding.hpp | 21 ++++++++ 5 files changed, 226 insertions(+), 22 deletions(-) create mode 100644 src/test.cpp create mode 100644 src/utils/binary.hpp create mode 100644 src/utils/encoding.cpp create mode 100644 src/utils/encoding.hpp (limited to 'src') diff --git a/src/main.cpp b/src/main.cpp index b0fb140..b7fa01e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,31 +1,16 @@ -#include -#include #include - -#include -#include +#include #include -#include -#include - -#include - int main() { Poller p; - // Now I'm the bridge, creating an ircclient because needed. 
- std::shared_ptr c = std::make_shared(); - p.add_socket_handler(c); - std::shared_ptr d = std::make_shared(); - p.add_socket_handler(d); - std::shared_ptr e = std::make_shared(); - p.add_socket_handler(e); - c->connect("localhost", "7877"); - d->connect("localhost", "7878"); - e->connect("localhost", "7879"); - while (true) - p.poll(); + std::shared_ptr xmpp_component = + std::make_shared("irc.localhost", "secret"); + p.add_socket_handler(xmpp_component); + xmpp_component->start(); + while (p.poll()) + ; return 0; } diff --git a/src/test.cpp b/src/test.cpp new file mode 100644 index 0000000..e3bfa55 --- /dev/null +++ b/src/test.cpp @@ -0,0 +1,43 @@ +/** + * Just a very simple test suite, by hand, using assert() + */ + +#include + +#include + +#include +#include + +#include + +int main() +{ + /** + * Encoding + */ + const char* valid = "C̡͔͕̩͙̽ͫ̈́ͥ̿̆ͧ̚r̸̩̘͍̻͖̆͆͛͊̉̕͡o͇͈̳̤̱̊̈͢q̻͍̦̮͕ͥͬͬ̽ͭ͌̾ͅǔ͉͕͇͚̙͉̭͉̇̽ȇ͈̮̼͍͔ͣ͊͞͝ͅ ͫ̾ͪ̓ͥ̆̋̔҉̢̦̠͈͔̖̲̯̦ụ̶̯͐̃̋ͮ͆͝n̬̱̭͇̻̱̰̖̤̏͛̏̿̑͟ë́͐҉̸̥̪͕̹̻̙͉̰ ̹̼̱̦̥ͩ͑̈́͑͝ͅt͍̥͈̹̝ͣ̃̔̈̔ͧ̕͝ḙ̸̖̟̙͙ͪ͢ų̯̞̼̲͓̻̞͛̃̀́b̮̰̗̩̰̊̆͗̾̎̆ͯ͌͝.̗̙͎̦ͫ̈́ͥ͌̈̓ͬ"; + assert(utils::is_valid_utf8(valid) == true); + const char* invalid = "\xF0\x0F"; + assert(utils::is_valid_utf8(invalid) == false); + const char* invalid2 = "\xFE\xFE\xFF\xFF"; + assert(utils::is_valid_utf8(invalid2) == false); + + std::string in = "coucou les copains ♥ "; + assert(utils::is_valid_utf8(in.c_str()) == true); + std::string res = utils::convert_to_utf8(in, "UTF-8"); + assert(utils::is_valid_utf8(res.c_str()) == true && res == in); + + std::string original_utf8("couc¥ou"); + std::string original_latin1("couc\xa5ou"); + + // When converting back to utf-8 + std::string from_latin1 = utils::convert_to_utf8(original_latin1.c_str(), "ISO-8859-1"); + assert(from_latin1 == original_utf8); + + // Check the behaviour when the decoding fails (here because we provide a + // wrong charset) + std::string from_ascii = utils::convert_to_utf8(original_latin1, "US-ASCII"); + assert(from_ascii == "couc�ou"); + return 0; +} diff 
--git a/src/utils/binary.hpp b/src/utils/binary.hpp
new file mode 100644
index 0000000..10807bc
--- /dev/null
+++ b/src/utils/binary.hpp
@@ -0,0 +1,20 @@
+#ifndef BINARY_INCLUDED
+# define BINARY_INCLUDED
+
+/**
+ * Compile-time binary literals: 101_b == 5. Each digit is checked by a
+ * static_assert, so anything but '0' and '1' is a compile error.
+ */
+template<char FIRST, char... REST> struct binary
+{
+  static_assert(FIRST == '0' || FIRST == '1', "invalid binary digit" );
+  enum { value = ((FIRST - '0') << sizeof...(REST)) + binary<REST...>::value };
+};
+
+template<> struct binary<'0'> { enum { value = 0 }; };
+template<> struct binary<'1'> { enum { value = 1 }; };
+
+template<char... LITERAL> inline
+constexpr unsigned int operator "" _b() { return binary<LITERAL...>::value; }
+
+#endif // BINARY_INCLUDED
diff --git a/src/utils/encoding.cpp b/src/utils/encoding.cpp
new file mode 100644
index 0000000..a1bc01b
--- /dev/null
+++ b/src/utils/encoding.cpp
@@ -0,0 +1,196 @@
+// NOTE(review): the two project headers are assumed to be reachable as
+// <utils/...> (include root at src/) -- confirm against the build flags.
+#include <utils/encoding.hpp>
+#include <utils/binary.hpp>
+
+#include <stdexcept>
+
+#include <errno.h>
+#include <stddef.h>
+#include <iconv.h>
+
+/**
+ * The UTF-8-encoded character used as a place holder when a character conversion fails.
+ * This is U+FFFD � "replacement character"
+ */
+static const char* const invalid_char = "\xef\xbf\xbd";
+static const size_t invalid_char_len = 3;
+
+namespace
+{
+  /**
+   * Owns an iconv conversion descriptor and closes it when leaving the
+   * scope, whichever way convert_to_utf8 is exited.
+   */
+  struct IconvCloser
+  {
+    explicit IconvCloser(const iconv_t cd):
+      cd(cd)
+    {}
+    ~IconvCloser()
+    {
+      iconv_close(this->cd);
+    }
+    IconvCloser(const IconvCloser&) = delete;
+    IconvCloser& operator=(const IconvCloser&) = delete;
+
+    const iconv_t cd;
+  };
+}
+
+namespace utils
+{
+  /**
+   * Strict check, following the well-formed byte sequences of RFC 3629:
+   * on top of the bit patterns described at
+   * http://en.wikipedia.org/wiki/UTF-8#Description it also rejects overlong
+   * encodings, UTF-16 surrogates (U+D800..U+DFFF) and code points above
+   * U+10FFFF. Returns false for a null pointer.
+   */
+  bool is_valid_utf8(const char* s)
+  {
+    if (!s)
+      return false;
+
+    const unsigned char* str = reinterpret_cast<const unsigned char*>(s);
+
+    while (*str)
+      {
+        const unsigned char lead = str[0];
+
+        // 1 byte: 0xxxxxxx
+        if ((lead & 10000000_b) == 0)
+          {
+            str++;
+            continue;
+          }
+
+        // Number of bytes of the sequence, and the range allowed for its
+        // second byte. That range is narrower than 10xxxxxx for a few lead
+        // bytes, which is how overlong forms, surrogates and values above
+        // U+10FFFF are excluded.
+        size_t len;
+        unsigned char low = 0x80;
+        unsigned char high = 0xBF;
+
+        // 2 bytes: 110xxxxx 10xxxxxx
+        if ((lead & 11100000_b) == 11000000_b)
+          {
+            // 0xC0 and 0xC1 could only encode U+0000..U+007F (overlong)
+            if (lead < 0xC2)
+              return false;
+            len = 2;
+          }
+        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+        else if ((lead & 11110000_b) == 11100000_b)
+          {
+            if (lead == 0xE0)
+              low = 0xA0; // anything lower is overlong
+            else if (lead == 0xED)
+              high = 0x9F; // anything higher is a UTF-16 surrogate
+            len = 3;
+          }
+        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        else if ((lead & 11111000_b) == 11110000_b)
+          {
+            // 0xF5..0xF7 would encode values above U+10FFFF
+            if (lead > 0xF4)
+              return false;
+            if (lead == 0xF0)
+              low = 0x90; // anything lower is overlong
+            else if (lead == 0xF4)
+              high = 0x8F; // anything higher is above U+10FFFF
+            len = 4;
+          }
+        // Stray continuation byte (10xxxxxx) or invalid lead (11111xxx)
+        else
+          return false;
+
+        // The terminating NUL is outside every accepted range, so a
+        // truncated sequence is rejected before reading past the end.
+        if (str[1] < low || str[1] > high)
+          return false;
+        for (size_t i = 2; i < len; ++i)
+          if ((str[i] & 11000000_b) != 10000000_b)
+            return false;
+        str += len;
+      }
+    return true;
+  }
+
+  /**
+   * Converts str, encoded in the given charset, to UTF-8. The output is
+   * produced chunk by chunk through a fixed-size buffer and appended to
+   * the result, so no assumption is made on the size of the converted
+   * text. Throws std::runtime_error if iconv does not know the charset.
+   */
+  std::string convert_to_utf8(const std::string& str, const char* charset)
+  {
+    if (!charset)
+      throw std::runtime_error("Cannot convert into UTF-8");
+
+    const iconv_t cd = iconv_open("UTF-8", charset);
+    if (cd == (iconv_t)-1)
+      throw std::runtime_error("Cannot convert into UTF-8");
+
+    // Make sure cd is always closed when we leave this function
+    const IconvCloser closer(cd);
+
+    // iconv will not attempt to modify this buffer, but it still requires
+    // a char**.
+    size_t inbytesleft = str.size();
+    char* inbuf_ptr = const_cast<char*>(str.c_str());
+
+    std::string res;
+    res.reserve(str.size());
+    char outbuf[4096];
+
+    bool done = false;
+    while (done == false)
+      {
+        char* outbuf_ptr = outbuf;
+        size_t outbytesleft = sizeof(outbuf);
+        const size_t ret = iconv(cd, &inbuf_ptr, &inbytesleft, &outbuf_ptr, &outbytesleft);
+        // Save errno right away, before any other call can overwrite it
+        const int error = ((size_t)-1 == ret) ? errno : 0;
+
+        // Keep whatever was converted before iconv stopped. The length is
+        // explicit, so an embedded NUL does not truncate the result.
+        res.append(outbuf, outbuf_ptr - outbuf);
+
+        switch (error)
+          {
+          case 0:
+            // The conversion finished without any error, stop converting
+            done = true;
+            break;
+          case EILSEQ:
+            // Invalid byte found. Insert a placeholder instead of the
+            // converted character, jump one byte and continue
+            res.append(invalid_char, invalid_char_len);
+            inbytesleft--;
+            inbuf_ptr++;
+            break;
+          case EINVAL:
+            // A multibyte sequence is not terminated, but we can't
+            // provide any more data, so we just add a placeholder to
+            // indicate that the character is not properly converted,
+            // and we stop the conversion
+            res.append(invalid_char, invalid_char_len);
+            done = true;
+            break;
+          case E2BIG:
+            // outbuf is full and has just been flushed into res: call
+            // iconv again to convert the rest of the input
+            break;
+          default:
+            // This should never happen
+            done = true;
+            break;
+          }
+      }
+    return res;
+  }
+
+}
+
diff --git a/src/utils/encoding.hpp b/src/utils/encoding.hpp
new file mode 100644
index 0000000..362f1df
--- /dev/null
+++ b/src/utils/encoding.hpp
@@ -0,0 +1,23 @@
+#ifndef ENCODING_INCLUDED
+# define ENCODING_INCLUDED
+
+#include <string>
+
+namespace utils
+{
+  /**
+   * Returns true if the given null-terminated string is valid utf-8.
+   *
+   * Based on http://en.wikipedia.org/wiki/UTF-8#Description
+   * Overlong forms, UTF-16 surrogates and values above U+10FFFF are invalid.
+   */
+  bool is_valid_utf8(const char* s);
+  /**
+   * Convert the given string (encoded in "encoding") into valid utf-8.
+   * If some decoding fails, insert an utf-8 placeholder character instead.
+   * Throws std::runtime_error if the encoding is not known by iconv.
+   */
+  std::string convert_to_utf8(const std::string& str, const char* encoding);
+}
+
+#endif // ENCODING_INCLUDED
-- 
cgit v1.2.3