From 0ec82c104ded01a44ed36d20e25220fa41887fd0 Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Fri, 27 Feb 2015 12:18:34 +0100 Subject: Add louloulibs as a submodule --- louloulibs | 1 + 1 file changed, 1 insertion(+) create mode 160000 louloulibs (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs b/louloulibs new file mode 160000 index 0000000..b6af145 --- /dev/null +++ b/louloulibs @@ -0,0 +1 @@ +Subproject commit b6af145bfb9561a1bb1ecb940f50163c5ce4dbbb -- cgit v1.2.3 From e6569a1090be063f34624474f0d4578f37a169ae Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Fri, 27 Feb 2015 12:40:50 +0100 Subject: Only use include_directory() if the directory path is defined --- louloulibs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs b/louloulibs index b6af145..d6a3724 160000 --- a/louloulibs +++ b/louloulibs @@ -1 +1 @@ -Subproject commit b6af145bfb9561a1bb1ecb940f50163c5ce4dbbb +Subproject commit d6a3724c6a0127a49a9e7adb1090bb7438c8d0f2 -- cgit v1.2.3 From e4c696861d86b62305ca0ec8136e79f147837b94 Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Mon, 2 Mar 2015 11:06:40 +0100 Subject: Update louloulibs to last revision --- louloulibs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs b/louloulibs index d6a3724..5f3a1bb 160000 --- a/louloulibs +++ b/louloulibs @@ -1 +1 @@ -Subproject commit d6a3724c6a0127a49a9e7adb1090bb7438c8d0f2 +Subproject commit 5f3a1bb54df4de5f332282bbdf791bdce07c71c4 -- cgit v1.2.3 From d88ec5fdf10ecb168355bc38dc81d83ff59a0234 Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Mon, 2 Mar 2015 11:32:18 +0100 Subject: Update to latest louloulibs revision --- louloulibs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs b/louloulibs index 5f3a1bb..d0b8695 160000 --- a/louloulibs +++ b/louloulibs @@ -1 +1 @@ -Subproject commit 5f3a1bb54df4de5f332282bbdf791bdce07c71c4 +Subproject commit d0b8695ceb13e0c6d72821fe605de36e494afcdf -- cgit v1.2.3 From c243fea660723eba00b65e639b76d0783cb59064 Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Wed, 4 Mar 2015 05:56:44 +0100 Subject: Update to latest louloulibs revision --- louloulibs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs b/louloulibs index d0b8695..99757a4 160000 --- a/louloulibs +++ b/louloulibs @@ -1 +1 @@ -Subproject commit d0b8695ceb13e0c6d72821fe605de36e494afcdf +Subproject commit 99757a44b49619ff59cae9e6d983a3b7c20c56bf -- cgit v1.2.3 From ad0465b32051e224f6a234f3ed36494905e59cbf Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Mon, 20 Apr 2015 20:33:02 +0200 Subject: Decode incoming JIDs local part according to xep 0106 This let users send message to nicks such as Q@CServe.quakenet.org fix #3047 --- louloulibs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs b/louloulibs index 99757a4..88d2b13 160000 --- a/louloulibs +++ b/louloulibs @@ -1 +1 @@ -Subproject commit 99757a44b49619ff59cae9e6d983a3b7c20c56bf +Subproject commit 88d2b136e5f133f0d0dc01f59449284f663d53ea -- cgit v1.2.3 From 0d706741c6b3a8bdf6b4f8ca0b1ac00cb27bd8b8 Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Mon, 20 Apr 2015 20:35:32 +0200 Subject: Update louloulibs submodule to the correct revision --- louloulibs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs b/louloulibs index 88d2b13..b53ae92 160000 --- a/louloulibs +++ b/louloulibs @@ -1 +1 @@ -Subproject commit 88d2b136e5f133f0d0dc01f59449284f663d53ea +Subproject commit b53ae922f48f1465a7fa61136f65ec39e38a452e -- cgit v1.2.3 From a8225dc54c019788722bda3bda8d55151c1ccdef Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Tue, 21 Apr 2015 15:35:10 +0200 Subject: Properly check for connecting or connected status before reconnecting Note, in our context, is_connecting() includes the resolving part as well as the actual connection (if we are using c-ares) fix #3048 --- louloulibs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs b/louloulibs index b53ae92..6c812cd 160000 --- a/louloulibs +++ b/louloulibs @@ -1 +1 @@ -Subproject commit b53ae922f48f1465a7fa61136f65ec39e38a452e +Subproject commit 6c812cd86e31569db61cac4e30f77e296d207191 -- cgit v1.2.3 From 71fec776c4d7b99b76a44deae6f333d9cffa1496 Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Thu, 7 May 2015 17:42:37 +0200 Subject: Update to latest louloulibs fix #3042 --- louloulibs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs b/louloulibs index 6c812cd..eaa4fbb 160000 --- a/louloulibs +++ b/louloulibs @@ -1 +1 @@ -Subproject commit 6c812cd86e31569db61cac4e30f77e296d207191 +Subproject commit eaa4fbba814b56b4fe7ffb62984fddfbb9280291 -- cgit v1.2.3 From fbeb5af364db54c8a82f5ea30b83df441988ea4b Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Wed, 13 May 2015 20:17:43 +0200 Subject: Update to latest louloulibs revision, and add test for hostname validity fix #2694 --- louloulibs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs b/louloulibs index eaa4fbb..89398b5 160000 --- a/louloulibs +++ b/louloulibs @@ -1 +1 @@ -Subproject commit eaa4fbba814b56b4fe7ffb62984fddfbb9280291 +Subproject commit 89398b5d886744c3812b65195308cae57eca2b53 -- cgit v1.2.3 From 897b281e67dc82700db9fd9c2dedc5e01e5871ee Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Wed, 27 May 2015 23:44:23 +0200 Subject: Avoid some potential race conditions by blocking the signals we manage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit They are atomically unblocked in the ppoll/epoll_pwait calls, avoiding any race condition on the check of the “stop” or “reload” booleans. --- louloulibs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs b/louloulibs index 89398b5..0f3c118 160000 --- a/louloulibs +++ b/louloulibs @@ -1 +1 @@ -Subproject commit 89398b5d886744c3812b65195308cae57eca2b53 +Subproject commit 0f3c1183e2bf0941ae2bffd3f31577bce4f3001c -- cgit v1.2.3 From e1a7114c8daa10589c830ce972cf461c3540111b Mon Sep 17 00:00:00 2001 From: Florent Le Coz Date: Thu, 28 May 2015 23:42:52 +0200 Subject: louloulibs is directly included, instead of being a submodule Because this is a nightmare to manage --- louloulibs | 1 - louloulibs/utils/encoding.cpp | 254 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 254 insertions(+), 1 deletion(-) delete mode 160000 louloulibs create mode 100644 louloulibs/utils/encoding.cpp (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs b/louloulibs deleted file mode 160000 index 0f3c118..0000000 --- a/louloulibs +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0f3c1183e2bf0941ae2bffd3f31577bce4f3001c diff --git a/louloulibs/utils/encoding.cpp b/louloulibs/utils/encoding.cpp new file mode 100644 index 0000000..f738ce2 --- /dev/null +++ b/louloulibs/utils/encoding.cpp @@ -0,0 +1,254 @@ +#include + +#include + +#include + +#include +#include +#include + +#include +#include + +/** + * The UTF-8-encoded character used as a place holder when a character conversion fails. + * This is U+FFFD � "replacement character" + */ +static const char* invalid_char = "\xef\xbf\xbd"; +static const size_t invalid_char_len = 3; + +namespace utils +{ + /** + * Based on http://en.wikipedia.org/wiki/UTF-8#Description + */ + bool is_valid_utf8(const char* s) + { + if (!s) + return false; + + const unsigned char* str = reinterpret_cast(s); + + while (*str) + { + // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + if ((str[0] & 0b11111000) == 0b11110000) + { + if (!str[1] || !str[2] || !str[3] + || ((str[1] & 0b11000000) != 0b10000000) + || ((str[2] & 0b11000000) != 0b10000000) + || ((str[3] & 0b11000000) != 0b10000000)) + return false; + str += 4; + } + // 3 bytes: 1110xxx 10xxxxxx 10xxxxxx + else if ((str[0] & 0b11110000) == 0b11100000) + { + if (!str[1] || !str[2] + || ((str[1] & 0b11000000) != 0b10000000) + || ((str[2] & 0b11000000) != 0b10000000)) + return false; + str += 3; + } + // 2 bytes: 110xxxxx 10xxxxxx + else if (((str[0]) & 0b11100000) == 0b11000000) + { + if (!str[1] || + ((str[1] & 0b11000000) != 0b10000000)) + return false; + str += 2; + } + // 1 byte: 0xxxxxxx + else if ((str[0] & 0b10000000) != 0) + return false; + else + str++; + } + return true; + } + + std::string remove_invalid_xml_chars(const std::string& original) + { + // The given string MUST be a valid utf-8 string + unsigned char* res = new unsigned char[original.size()]; + ScopeGuard sg([&res]() { delete[] res;}); + + // pointer where we write valid chars + unsigned char* r = res; + + const unsigned char* str = reinterpret_cast(original.c_str()); + std::bitset<20> codepoint; + + while (*str) + { + // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + if ((str[0] & 0b11111000) == 0b11110000) + { + codepoint = ((str[0] & 0b00000111) << 18); + codepoint |= ((str[1] & 0b00111111) << 12); + codepoint |= ((str[2] & 0b00111111) << 6 ); + codepoint |= ((str[3] & 0b00111111) << 0 ); + if (codepoint.to_ulong() <= 0x10FFFF) + { + ::memcpy(r, str, 4); + r += 4; + } + str += 4; + } + // 3 bytes: 1110xxx 10xxxxxx 10xxxxxx + else if ((str[0] & 0b11110000) == 0b11100000) + { + codepoint = ((str[0] & 0b00001111) << 12); + codepoint |= ((str[1] & 0b00111111) << 6); + codepoint |= ((str[2] & 0b00111111) << 0 ); + if (codepoint.to_ulong() <= 0xD7FF || + (codepoint.to_ulong() >= 0xE000 && codepoint.to_ulong() <= 0xFFFD)) + { + ::memcpy(r, str, 3); + r += 3; + } + str += 3; + } + // 2 bytes: 110xxxxx 10xxxxxx + else if (((str[0]) & 0b11100000) == 0b11000000) + { + // All 2 bytes char are valid, don't even bother calculating + // the codepoint + ::memcpy(r, str, 2); + r += 2; + str += 2; + } + // 1 byte: 0xxxxxxx + else if ((str[0] & 0b10000000) == 0) + { + codepoint = ((str[0] & 0b01111111)); + if (codepoint.to_ulong() == 0x09 || + codepoint.to_ulong() == 0x0A || + codepoint.to_ulong() == 0x0D || + codepoint.to_ulong() >= 0x20) + { + ::memcpy(r, str, 1); + r += 1; + } + str += 1; + } + else + throw std::runtime_error("Invalid UTF-8 passed to remove_invalid_xml_chars"); + } + return std::string(reinterpret_cast(res), r-res); + } + + std::string convert_to_utf8(const std::string& str, const char* charset) + { + std::string res; + + const iconv_t cd = iconv_open("UTF-8", charset); + if (cd == (iconv_t)-1) + throw std::runtime_error("Cannot convert into UTF-8"); + + // Make sure cd is always closed when we leave this function + ScopeGuard sg([&]{ iconv_close(cd); }); + + size_t inbytesleft = str.size(); + + // iconv will not attempt to modify this buffer, but some plateform + // require a char** anyway +#ifdef ICONV_SECOND_ARGUMENT_IS_CONST + const char* inbuf_ptr = str.c_str(); +#else + char* inbuf_ptr = const_cast(str.c_str()); +#endif + + size_t outbytesleft = str.size() * 4; + char* outbuf = new char[outbytesleft]; + char* outbuf_ptr = outbuf; + + // Make sure outbuf is always deleted when we leave this function + sg.add_callback([&]{ delete[] outbuf; }); + + bool done = false; + while (done == false) + { + size_t error = iconv(cd, &inbuf_ptr, &inbytesleft, &outbuf_ptr, &outbytesleft); + if ((size_t)-1 == error) + { + switch (errno) + { + case EILSEQ: + // Invalid byte found. Insert a placeholder instead of the + // converted character, jump one byte and continue + memcpy(outbuf_ptr, invalid_char, invalid_char_len); + outbuf_ptr += invalid_char_len; + inbytesleft--; + inbuf_ptr++; + break; + case EINVAL: + // A multibyte sequence is not terminated, but we can't + // provide any more data, so we just add a placeholder to + // indicate that the character is not properly converted, + // and we stop the conversion + memcpy(outbuf_ptr, invalid_char, invalid_char_len); + outbuf_ptr += invalid_char_len; + outbuf_ptr++; + done = true; + break; + case E2BIG: + // This should never happen + done = true; + break; + default: + // This should happen even neverer + done = true; + break; + } + } + else + { + // The conversion finished without any error, stop converting + done = true; + } + } + // Terminate the converted buffer, and copy that buffer it into the + // string we return + *outbuf_ptr = '\0'; + res = outbuf; + return res; + } + +} + +namespace xep0106 +{ + static const std::map encode_map = { + {' ', "\\20"}, + {'"', "\\22"}, + {'&', "\\26"}, + {'\'',"\\27"}, + {'/', "\\2f"}, + {':', "\\3a"}, + {'<', "\\3c"}, + {'>', "\\3e"}, + {'@', "\\40"}, + }; + + void decode(std::string& s) + { + std::string::size_type pos; + for (const auto& pair: encode_map) + while ((pos = s.find(pair.second)) != std::string::npos) + s.replace(pos, pair.second.size(), + 1, pair.first); + } + + void encode(std::string& s) + { + std::string::size_type pos; + while ((pos = s.find_first_of(" \"&'/:<>@")) != std::string::npos) + { + auto it = encode_map.find(s[pos]); + assert(it != encode_map.end()); + s.replace(pos, 1, it->second); + } + } +} -- cgit v1.2.3 From 6235fb2d0326b18a9e013ae13dfb1fd0577ffd9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?louiz=E2=80=99?= Date: Wed, 15 Jun 2016 00:38:43 +0200 Subject: Add get_next_codepoint_size --- louloulibs/utils/encoding.cpp | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'louloulibs/utils/encoding.cpp') diff --git a/louloulibs/utils/encoding.cpp b/louloulibs/utils/encoding.cpp index f738ce2..507f38a 100644 --- a/louloulibs/utils/encoding.cpp +++ b/louloulibs/utils/encoding.cpp @@ -23,6 +23,17 @@ namespace utils /** * Based on http://en.wikipedia.org/wiki/UTF-8#Description */ + std::size_t get_next_codepoint_size(const unsigned char c) + { + if ((c & 0b11111000) == 0b11110000) // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + return 4; + else if ((c & 0b11110000) == 0b11100000) // 3 bytes: 1110xxx 10xxxxxx 10xxxxxx + return 3; + else if ((c & 0b11100000) == 0b11000000) // 2 bytes: 110xxxxx 10xxxxxx + return 2; + return 1; // 1 byte: 0xxxxxxx + } + bool is_valid_utf8(const char* s) { if (!s) @@ -32,38 +43,31 @@ namespace utils while (*str) { - // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - if ((str[0] & 0b11111000) == 0b11110000) + const auto codepoint_size = get_next_codepoint_size(str[0]); + if (codepoint_size == 4) { if (!str[1] || !str[2] || !str[3] || ((str[1] & 0b11000000) != 0b10000000) || ((str[2] & 0b11000000) != 0b10000000) || ((str[3] & 0b11000000) != 0b10000000)) return false; - str += 4; } - // 3 bytes: 1110xxx 10xxxxxx 10xxxxxx - else if ((str[0] & 0b11110000) == 0b11100000) + else if (codepoint_size == 3) { if (!str[1] || !str[2] || ((str[1] & 0b11000000) != 0b10000000) || ((str[2] & 0b11000000) != 0b10000000)) return false; - str += 3; } - // 2 bytes: 110xxxxx 10xxxxxx - else if (((str[0]) & 0b11100000) == 0b11000000) + else if (codepoint_size == 2) { if (!str[1] || ((str[1] & 0b11000000) != 0b10000000)) return false; - str += 2; } - // 1 byte: 0xxxxxxx else if ((str[0] & 0b10000000) != 0) return false; - else - str++; + str += codepoint_size; } return true; } -- cgit v1.2.3