// Origin: git commit 3960e4d5afa09c299f595b411ee8522db30580fd
// Author: Florent Le Coz, Wed, 11 Dec 2013 21:07:39 +0100
// Subject: Functions to provide xml-valid strings
//          By removing invalid chars, see http://www.w3.org/TR/xml/#charsets
// Files touched by that commit: src/utils/encoding.cpp (definition below),
// src/utils/encoding.hpp (declaration of the same function in namespace utils).

#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <string>

namespace utils
{
  /**
   * Remove all invalid codepoints from the given utf-8-encoded string.
   * The value returned is a copy of the string, without the removed chars.
   *
   * See http://www.w3.org/TR/xml/#charsets for the list of valid characters
   * in XML.
   *
   * The given string MUST be a valid utf-8 string: continuation bytes and
   * overlong encodings are not verified here.  Only the two conditions that
   * would otherwise lead to out-of-bounds memory accesses are detected.
   *
   * @param original the utf-8-encoded string to filter.
   * @return a copy of `original` where every character outside of the XML
   *         "Char" production has been dropped (this includes NUL bytes
   *         embedded in the string).
   * @throws std::runtime_error if a byte that cannot start a utf-8 sequence
   *         is found where a new character is expected, or if the string
   *         ends in the middle of a multi-byte sequence.
   */
  std::string remove_invalid_xml_chars(const std::string& original)
  {
    // The result can never be longer than the input, so a single
    // allocation is enough.
    std::string res;
    res.reserve(original.size());

    const unsigned char* str =
      reinterpret_cast<const unsigned char*>(original.data());
    const unsigned char* const end = str + original.size();

    // Iterate over the real size of the string instead of stopping at the
    // first NUL byte: a std::string may contain embedded NULs, and NUL is
    // just one more character that is not allowed in XML.
    while (str != end)
      {
        const unsigned char lead = str[0];

        // Number of bytes of the sequence, deduced from the first byte.
        std::size_t length = 0;
        if ((lead & 0x80) == 0x00)      // 1 byte:  0xxxxxxx
          length = 1;
        else if ((lead & 0xE0) == 0xC0) // 2 bytes: 110xxxxx 10xxxxxx
          length = 2;
        else if ((lead & 0xF0) == 0xE0) // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
          length = 3;
        else if ((lead & 0xF8) == 0xF0) // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          length = 4;

        // Either the lead byte is not a valid one, or the sequence it
        // announces is longer than what is left in the string.  Reading
        // further would access memory past the end of the input.
        if (length == 0 || static_cast<std::size_t>(end - str) < length)
          throw std::runtime_error("Invalid UTF-8 passed to remove_invalid_xml_chars");

        bool keep = true;
        if (length == 4)
          {
            // A 4 bytes sequence carries 21 bits of payload, so the
            // codepoint must be stored in a type wide enough to hold the
            // values above 0xFFFFF, otherwise the comparison below is
            // always true.
            const std::uint32_t codepoint =
              (static_cast<std::uint32_t>(str[0] & 0x07) << 18) |
              (static_cast<std::uint32_t>(str[1] & 0x3F) << 12) |
              (static_cast<std::uint32_t>(str[2] & 0x3F) << 6) |
              (static_cast<std::uint32_t>(str[3] & 0x3F) << 0);
            keep = codepoint <= 0x10FFFF;
          }
        else if (length == 3)
          {
            const std::uint32_t codepoint =
              (static_cast<std::uint32_t>(str[0] & 0x0F) << 12) |
              (static_cast<std::uint32_t>(str[1] & 0x3F) << 6) |
              (static_cast<std::uint32_t>(str[2] & 0x3F) << 0);
            // Surrogates (0xD800-0xDFFF), 0xFFFE and 0xFFFF are forbidden.
            keep = codepoint <= 0xD7FF ||
              (codepoint >= 0xE000 && codepoint <= 0xFFFD);
          }
        else if (length == 1)
          {
            // Among the control characters, only tab, line feed and
            // carriage return are allowed.
            keep = lead == 0x09 || lead == 0x0A || lead == 0x0D ||
              lead >= 0x20;
          }
        // All 2 bytes chars are valid, don't even bother calculating
        // the codepoint.

        if (keep)
          res.append(reinterpret_cast<const char*>(str), length);
        str += length;
      }
    return res;
  }
}