diff options
Diffstat (limited to 'louloulibs/utils/encoding.cpp')
-rw-r--r-- | louloulibs/utils/encoding.cpp | 254 |
1 files changed, 0 insertions, 254 deletions
// Project headers. They are included whenever the include path provides them;
// the guard only exists so that this file can also be compiled on its own.
#if defined(__has_include)
# if __has_include(<utils/encoding.hpp>)
#  include <utils/encoding.hpp>
# endif
# if __has_include(<utils/scopeguard.hpp>)
#  include <utils/scopeguard.hpp>
# endif
#else
# include <utils/encoding.hpp>
# include <utils/scopeguard.hpp>
#endif

#include <stdexcept>

#include <assert.h>
#include <string.h>
#include <iconv.h>
#include <cerrno>

#include <cstddef>
#include <cstdint>
#include <map>
#include <bitset>
#include <string>
#include <vector>

/**
 * The UTF-8-encoded character used as a placeholder when a character
 * conversion fails. This is U+FFFD "replacement character".
 */
static const char* invalid_char = "\xef\xbf\xbd";
static const size_t invalid_char_len = 3;

namespace
{
  /**
   * Closes an iconv conversion descriptor when it goes out of scope, so the
   * descriptor is released on every exit path (including exceptions).
   */
  struct IconvCloser
  {
    explicit IconvCloser(iconv_t descriptor):
      cd(descriptor)
    {}
    ~IconvCloser()
    {
      iconv_close(cd);
    }
    IconvCloser(const IconvCloser&) = delete;
    IconvCloser& operator=(const IconvCloser&) = delete;

    iconv_t cd;
  };
}

namespace utils
{
  /**
   * Return the number of bytes announced by the lead byte `c` of a UTF-8
   * sequence (2, 3 or 4). Every other byte value yields 1, which includes
   * plain ASCII but also bytes that cannot start a sequence; callers that
   * care must check the high bit themselves.
   *
   * Based on http://en.wikipedia.org/wiki/UTF-8#Description
   */
  std::size_t get_next_codepoint_size(const unsigned char c)
  {
    if ((c & 0b11111000) == 0b11110000) // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      return 4;
    if ((c & 0b11110000) == 0b11100000) // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      return 3;
    if ((c & 0b11100000) == 0b11000000) // 2 bytes: 110xxxxx 10xxxxxx
      return 2;
    return 1;                           // 1 byte:  0xxxxxxx (or not a lead byte)
  }

  /**
   * Check the structure of the NUL-terminated string `s`: every lead byte
   * must be followed by the announced number of 10xxxxxx continuation bytes,
   * and every single byte must have its high bit clear.
   *
   * Only the byte structure is checked; the decoded values are not (overlong
   * forms, surrogates and values above U+10FFFF are accepted).
   *
   * @return false if `s` is null or malformed, true otherwise.
   */
  bool is_valid_utf8(const char* s)
  {
    if (!s)
      return false;

    const unsigned char* str = reinterpret_cast<const unsigned char*>(s);

    while (*str)
      {
        const std::size_t codepoint_size = get_next_codepoint_size(str[0]);
        if (codepoint_size == 1)
          {
            if ((str[0] & 0b10000000) != 0)
              return false;
          }
        else
          {
            // A terminating NUL fails the continuation test too, so we never
            // look past the end of the string.
            for (std::size_t i = 1; i < codepoint_size; ++i)
              if ((str[i] & 0b11000000) != 0b10000000)
                return false;
          }
        str += codepoint_size;
      }
    return true;
  }

  /**
   * Return a copy of `original` without the characters that the checks below
   * reject: single bytes other than 0x09, 0x0A, 0x0D and those >= 0x20,
   * 3-byte values in U+D800..U+DFFF or above U+FFFD, and 4-byte values above
   * U+10FFFF. All 2-byte sequences are kept.
   *
   * `original` is expected to be valid UTF-8. Continuation bytes are not
   * validated here.
   *
   * @throws std::runtime_error if a byte that cannot start a sequence is
   *         found where a lead byte is expected, or if the last sequence is
   *         cut short by the end of the string.
   */
  std::string remove_invalid_xml_chars(const std::string& original)
  {
    static const char* const error_message =
        "Invalid UTF-8 passed to remove_invalid_xml_chars";

    std::string res;
    res.reserve(original.size());

    const unsigned char* bytes =
        reinterpret_cast<const unsigned char*>(original.data());
    const std::size_t length = original.size();

    // Iterate over the real size rather than up to the first NUL: a NUL is
    // just one more forbidden control character to drop.
    std::size_t i = 0;
    while (i < length)
      {
        const unsigned char lead = bytes[i];
        const std::size_t seq_len = get_next_codepoint_size(lead);

        if (seq_len == 1 && (lead & 0b10000000) != 0)
          throw std::runtime_error(error_message);
        if (length - i < seq_len)
          throw std::runtime_error(error_message);

        // Payload bits of the lead byte: 0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx.
        static const unsigned char lead_masks[] = {0, 0b01111111, 0b00011111,
                                                   0b00001111, 0b00000111};
        // A 4-byte sequence carries 21 bits, so a 32-bit integer is required.
        std::uint32_t codepoint = lead & lead_masks[seq_len];
        for (std::size_t k = 1; k < seq_len; ++k)
          codepoint = (codepoint << 6) | (bytes[i + k] & 0b00111111u);

        bool keep;
        switch (seq_len)
          {
          case 4:
            keep = codepoint <= 0x10FFFF;
            break;
          case 3:
            keep = codepoint <= 0xD7FF ||
                   (codepoint >= 0xE000 && codepoint <= 0xFFFD);
            break;
          case 2:
            // All 2-byte characters are valid.
            keep = true;
            break;
          default:
            keep = codepoint == 0x09 || codepoint == 0x0A ||
                   codepoint == 0x0D || codepoint >= 0x20;
            break;
          }

        if (keep)
          res.append(original, i, seq_len);
        i += seq_len;
      }
    return res;
  }

  /**
   * Convert `str`, encoded in `charset`, into UTF-8 using iconv.
   *
   * A byte that iconv rejects (EILSEQ) is replaced by U+FFFD and skipped. An
   * unterminated multibyte sequence at the end of the input (EINVAL) is
   * replaced by a single U+FFFD and ends the conversion.
   *
   * @throws std::runtime_error if iconv cannot open a conversion from
   *         `charset` to UTF-8.
   */
  std::string convert_to_utf8(const std::string& str, const char* charset)
  {
    // (iconv_t)-1 is the failure value documented for iconv_open.
    const iconv_t cd = iconv_open("UTF-8", charset);
    if (cd == (iconv_t)-1)
      throw std::runtime_error("Cannot convert into UTF-8");

    // Make sure cd is always closed when we leave this function
    const IconvCloser closer(cd);

    size_t inbytesleft = str.size();

    // iconv will not attempt to modify this buffer, but some platforms
    // require a char** anyway
#ifdef ICONV_SECOND_ARGUMENT_IS_CONST
    const char* inbuf_ptr = str.data();
#else
    char* inbuf_ptr = const_cast<char*>(str.data());
#endif

    // Initial guess of four output bytes per input byte. The buffer is never
    // empty and grows on demand, so no write can land outside of it.
    std::vector<char> outbuf(str.size() * 4 + 16);
    size_t used = 0;

    const auto append_placeholder = [&outbuf, &used]()
      {
        if (outbuf.size() - used < invalid_char_len)
          outbuf.resize(used + invalid_char_len);
        ::memcpy(outbuf.data() + used, invalid_char, invalid_char_len);
        used += invalid_char_len;
      };

    bool done = false;
    while (!done)
      {
        char* outbuf_ptr = outbuf.data() + used;
        size_t outbytesleft = outbuf.size() - used;

        const size_t status = iconv(cd, &inbuf_ptr, &inbytesleft,
                                    &outbuf_ptr, &outbytesleft);
        // Read errno before anything else gets a chance to change it.
        const int error = errno;
        used = outbuf.size() - outbytesleft;

        if (status != static_cast<size_t>(-1))
          {
            // The conversion finished without any error, stop converting
            done = true;
            continue;
          }

        switch (error)
          {
          case EILSEQ:
            // Invalid byte found. Insert a placeholder instead of the
            // converted character, jump one byte and continue
            append_placeholder();
            if (inbytesleft == 0)
              done = true;
            else
              {
                --inbytesleft;
                ++inbuf_ptr;
              }
            break;
          case EINVAL:
            // A multibyte sequence is not terminated, but we can't
            // provide any more data, so we just add a placeholder to
            // indicate that the character is not properly converted,
            // and we stop the conversion
            append_placeholder();
            done = true;
            break;
          case E2BIG:
            // The output buffer is full: enlarge it and resume where iconv
            // stopped.
            outbuf.resize(outbuf.size() * 2);
            break;
          default:
            done = true;
            break;
          }
      }
    // Build the result from the exact number of bytes produced, so that no
    // terminator is needed and NUL bytes in the output are preserved.
    return std::string(outbuf.data(), used);
  }

}

namespace xep0106
{
  /**
   * The characters handled by encode() and decode(), each with the escape
   * sequence that stands for it.
   */
  static const std::map<const char, const std::string> encode_map = {
    {' ', "\\20"},
    {'"', "\\22"},
    {'&', "\\26"},
    {'\'',"\\27"},
    {'/', "\\2f"},
    {':', "\\3a"},
    {'<', "\\3c"},
    {'>', "\\3e"},
    {'@', "\\40"},
  };

  /**
   * Replace, in place, every escape sequence of encode_map found in `s` by
   * the character it stands for.
   */
  void decode(std::string& s)
  {
    for (const auto& pair: encode_map)
      {
        const std::string& escape = pair.second;
        std::string::size_type pos = s.find(escape);
        while (pos != std::string::npos)
          {
            s.replace(pos, escape.size(), 1, pair.first);
            // The decoded character cannot be part of an escape sequence, so
            // the search can resume right after it.
            pos = s.find(escape, pos + 1);
          }
      }
  }

  /**
   * Replace, in place, every character of `s` that is a key of encode_map by
   * its escape sequence.
   */
  void encode(std::string& s)
  {
    static const char* const special_chars = " \"&'/:<>@";

    std::string::size_type pos = s.find_first_of(special_chars);
    while (pos != std::string::npos)
      {
        const auto it = encode_map.find(s[pos]);
        assert(it != encode_map.end());
        s.replace(pos, 1, it->second);
        // Escape sequences contain no special character: skip over the one
        // that was just inserted.
        pos = s.find_first_of(special_chars, pos + it->second.size());
      }
  }
}