diff options
Diffstat (limited to 'src/utils')
-rw-r--r-- | src/utils/encoding.cpp | 73 | ||||
-rw-r--r-- | src/utils/encoding.hpp | 8 |
2 files changed, 81 insertions, 0 deletions
/**
 * Remove all invalid codepoints from the given utf-8-encoded string.
 * The value returned is a copy of the string, without the removed chars.
 *
 * See http://www.w3.org/TR/xml/#charsets for the list of valid characters
 * in XML.
 *
 * The input is expected to be valid utf-8. Sequences are delimited using
 * their lead byte only; continuation bytes are not validated. What is kept:
 *  - 1 byte sequences: U+0009, U+000A, U+000D and everything >= U+0020
 *  - 2 bytes sequences: always
 *  - 3 bytes sequences: up to U+D7FF, and U+E000 to U+FFFD
 *  - 4 bytes sequences: up to U+10FFFF
 *
 * The whole string is processed, based on its size(): an embedded NUL byte
 * is removed like any other forbidden control character, it does not end
 * the processing.
 *
 * @param original the utf-8 string to filter
 * @return a copy of original without the characters that are invalid in XML
 * @throws std::runtime_error if a byte that cannot start a utf-8 sequence
 *         is found where a sequence should start, or if the last sequence
 *         is cut short by the end of the string.
 */
std::string remove_invalid_xml_chars(const std::string& original)
{
  static const char* const invalid_utf8_message =
    "Invalid UTF-8 passed to remove_invalid_xml_chars";

  const unsigned char* const bytes =
    reinterpret_cast<const unsigned char*>(original.data());
  const std::string::size_type size = original.size();

  // The result is never longer than the input: reserve once, append after.
  std::string res;
  res.reserve(size);

  std::string::size_type pos = 0;
  while (pos < size)
    {
      const unsigned char lead = bytes[pos];

      // Length of the sequence, and payload bits carried by the lead byte
      std::string::size_type length = 0;
      // unsigned long is at least 32 bits wide: enough for the 21 bits
      // that a 4 bytes sequence can carry.
      unsigned long codepoint = 0;

      // 4 bytes:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      if ((lead & 0xF8) == 0xF0)
        {
          length = 4;
          codepoint = lead & 0x07;
        }
      // 3 bytes:  1110xxxx 10xxxxxx 10xxxxxx
      else if ((lead & 0xF0) == 0xE0)
        {
          length = 3;
          codepoint = lead & 0x0F;
        }
      // 2 bytes:  110xxxxx 10xxxxxx
      else if ((lead & 0xE0) == 0xC0)
        {
          length = 2;
          codepoint = lead & 0x1F;
        }
      // 1 byte:  0xxxxxxx
      else if ((lead & 0x80) == 0x00)
        {
          length = 1;
          codepoint = lead & 0x7F;
        }
      else
        throw std::runtime_error(invalid_utf8_message);

      // Never read (or copy) past the end of the input
      if (length > size - pos)
        throw std::runtime_error(invalid_utf8_message);

      // Each following byte contributes its 6 low bits
      for (std::string::size_type i = 1; i < length; ++i)
        codepoint = (codepoint << 6) | (bytes[pos + i] & 0x3Fu);

      bool keep = false;
      switch (length)
        {
        case 1:
          keep = (codepoint == 0x09 || codepoint == 0x0A ||
                  codepoint == 0x0D || codepoint >= 0x20);
          break;
        case 2:
          // Everything that fits in 2 bytes is allowed in XML
          keep = true;
          break;
        case 3:
          // Excludes the surrogates, U+FFFE and U+FFFF
          keep = (codepoint <= 0xD7FF ||
                  (codepoint >= 0xE000 && codepoint <= 0xFFFD));
          break;
        default:
          keep = (codepoint <= 0x10FFFF);
          break;
        }

      if (keep)
        res.append(original, pos, length);
      pos += length;
    }
  return res;
}