summaryrefslogtreecommitdiff
path: root/src/utils
diff options
context:
space:
mode:
Diffstat (limited to 'src/utils')
-rw-r--r--src/utils/encoding.cpp73
-rw-r--r--src/utils/encoding.hpp8
2 files changed, 81 insertions, 0 deletions
diff --git a/src/utils/encoding.cpp b/src/utils/encoding.cpp
index 634964b..76d1922 100644
--- a/src/utils/encoding.cpp
+++ b/src/utils/encoding.cpp
@@ -9,6 +9,8 @@
#include <config.h>
+#include <bitset>
+
/**
* The UTF-8-encoded character used as a placeholder when a character conversion fails.
* This is U+FFFD � "replacement character"
@@ -66,6 +68,77 @@ namespace utils
return true;
}
/**
 * Return a copy of the given UTF-8 string from which every character that
 * is not allowed in an XML 1.0 document has been removed.
 *
 * A character is kept when its decoded code point is one of:
 *   U+0009, U+000A, U+000D,
 *   U+0020..U+D7FF, U+E000..U+FFFD, U+10000..U+10FFFF
 * (see http://www.w3.org/TR/xml/#charsets).
 *
 * The whole std::string is processed, so an embedded NUL byte is simply
 * dropped (it is not a valid XML character) instead of ending the scan.
 * The decision is made on the decoded code point, so an overlong encoding
 * of a forbidden character is removed as well.
 *
 * @param original a string that is expected to be valid UTF-8.
 * @return the filtered copy of original.
 * @throws std::runtime_error if a byte that cannot start a UTF-8 sequence
 *         is found where a new character should begin, or if the string
 *         ends in the middle of a multi-byte sequence.
 */
std::string remove_invalid_xml_chars(const std::string& original)
{
  // The "Char" production of XML 1.0, applied to a decoded code point.
  const auto is_xml_char = [](const unsigned long cp) -> bool
  {
    return cp == 0x09 || cp == 0x0A || cp == 0x0D ||
           (cp >= 0x20 && cp <= 0xD7FF) ||
           (cp >= 0xE000 && cp <= 0xFFFD) ||
           (cp >= 0x10000 && cp <= 0x10FFFF);
  };

  std::string res;
  // The result can never be longer than the input.
  res.reserve(original.size());

  const std::string::size_type size = original.size();
  std::string::size_type pos = 0;

  while (pos < size)
    {
      const unsigned char lead = static_cast<unsigned char>(original[pos]);

      // Number of bytes of the sequence, as announced by the lead byte
      std::string::size_type length = 0;
      // Payload bits gathered so far; unsigned long is at least 32 bits
      // wide, which is enough for the 21 bits of a 4-byte sequence.
      unsigned long codepoint = 0;

      // 1 byte: 0xxxxxxx
      if ((lead & 0x80) == 0x00)
        {
          length = 1;
          codepoint = lead;
        }
      // 2 bytes: 110xxxxx 10xxxxxx
      else if ((lead & 0xE0) == 0xC0)
        {
          length = 2;
          codepoint = lead & 0x1F;
        }
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      else if ((lead & 0xF0) == 0xE0)
        {
          length = 3;
          codepoint = lead & 0x0F;
        }
      // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      else if ((lead & 0xF8) == 0xF0)
        {
          length = 4;
          codepoint = lead & 0x07;
        }
      else
        throw std::runtime_error("Invalid UTF-8 passed to remove_invalid_xml_chars");

      // Never read (or copy) past the end of the input when the last
      // sequence is truncated.
      if (length > size - pos)
        throw std::runtime_error("Invalid UTF-8 passed to remove_invalid_xml_chars");

      // Each continuation byte contributes its 6 low bits
      for (std::string::size_type i = 1; i < length; ++i)
        {
          const unsigned char next = static_cast<unsigned char>(original[pos + i]);
          codepoint = (codepoint << 6) | (next & 0x3Fu);
        }

      if (is_xml_char(codepoint))
        res.append(original, pos, length);
      pos += length;
    }
  return res;
}
+
std::string convert_to_utf8(const std::string& str, const char* charset)
{
std::string res;
diff --git a/src/utils/encoding.hpp b/src/utils/encoding.hpp
index 362f1df..a3bccfc 100644
--- a/src/utils/encoding.hpp
+++ b/src/utils/encoding.hpp
@@ -12,6 +12,14 @@ namespace utils
*/
bool is_valid_utf8(const char* s);
/**
+ * Remove all invalid codepoints from the given utf-8-encoded string.
+ * The value returned is a copy of the string, without the removed chars.
+ *
+ * See http://www.w3.org/TR/xml/#charsets for the list of valid characters
+ * in XML.
+ */
+ std::string remove_invalid_xml_chars(const std::string& original);
+ /**
* Convert the given string (encoded in "encoding") into valid utf-8.
* If some decoding fails, insert a utf-8 placeholder character instead.
*/