2 files changed, 81 insertions, 0 deletions
diff --git a/src/utils/encoding.cpp b/src/utils/encoding.cpp
index 634964b..76d1922 100644
--- a/src/utils/encoding.cpp
+++ b/src/utils/encoding.cpp
@@ -9,6 +9,8 @@
 
 #include <config.h>
 
+#include <bitset>
+
 /**
  * The UTF-8-encoded character used as a place holder when a character conversion fails.
  * This is U+FFFD � "replacement character"
@@ -66,6 +68,77 @@ namespace utils
     return true;
   }
 
+  std::string remove_invalid_xml_chars(const std::string& original)
+  {
+    // The given string MUST be a valid utf-8 string: a byte that cannot
+    // start a sequence, or a sequence cut short by the end of the string,
+    // is reported with a std::runtime_error.
+    const unsigned char* str = reinterpret_cast<const unsigned char*>(original.data());
+    const std::string::size_type size = original.size();
+
+    // We only ever drop bytes, so the input size is an upper bound
+    std::string res;
+    res.reserve(size);
+
+    // Walk the whole buffer instead of stopping at the first NUL byte: an
+    // embedded U+0000 is one more invalid char to drop, not the end of
+    // the string.
+    std::string::size_type pos = 0;
+    while (pos < size)
+      {
+        const unsigned char lead = str[pos];
+        // Length of the sequence, and payload bits carried by its lead byte
+        std::string::size_type len;
+        unsigned long codepoint;
+
+        // 1 byte:  0xxxxxxx
+        if ((lead & 0x80) == 0x00)
+          {
+            len = 1;
+            codepoint = lead;
+          }
+        // 2 bytes:  110xxxxx 10xxxxxx
+        else if ((lead & 0xE0) == 0xC0)
+          {
+            len = 2;
+            codepoint = lead & 0x1F;
+          }
+        // 3 bytes:  1110xxxx 10xxxxxx 10xxxxxx
+        else if ((lead & 0xF0) == 0xE0)
+          {
+            len = 3;
+            codepoint = lead & 0x0F;
+          }
+        // 4 bytes:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        else if ((lead & 0xF8) == 0xF0)
+          {
+            len = 4;
+            codepoint = lead & 0x07;
+          }
+        else
+          throw std::runtime_error("Invalid UTF-8 passed to remove_invalid_xml_chars");
+
+        // Never read, nor copy, past the end of the input
+        if (len > size - pos)
+          throw std::runtime_error("Invalid UTF-8 passed to remove_invalid_xml_chars");
+
+        // Each continuation byte brings 6 more bits.  A plain integer is
+        // used because U+10FFFF needs 21 bits.
+        for (std::string::size_type i = 1; i < len; ++i)
+          codepoint = (codepoint << 6) | (str[pos + i] & 0x3F);
+
+        // Keep only what http://www.w3.org/TR/xml/#charsets allows:
+        // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+        if (codepoint == 0x09 || codepoint == 0x0A || codepoint == 0x0D ||
+            (codepoint >= 0x20 && codepoint <= 0xD7FF) ||
+            (codepoint >= 0xE000 && codepoint <= 0xFFFD) ||
+            (codepoint >= 0x10000 && codepoint <= 0x10FFFF))
+          res.append(original, pos, len);
+        pos += len;
+      }
+    return res;
+  }
+
   std::string convert_to_utf8(const std::string& str, const char* charset)
   {
     std::string res;
diff --git a/src/utils/encoding.hpp b/src/utils/encoding.hpp
index 362f1df..a3bccfc 100644
--- a/src/utils/encoding.hpp
+++ b/src/utils/encoding.hpp
@@ -12,6 +12,14 @@ namespace utils
    */
   bool is_valid_utf8(const char* s);
   /**
+   * Remove every codepoint that is not allowed in XML from the given
+   * utf-8-encoded string. The original is untouched: a filtered copy is
+   * returned. Throws std::runtime_error on a byte that cannot start a sequence.
+   * See http://www.w3.org/TR/xml/#charsets for the list of valid characters
+   * in XML.
+   */
+  std::string remove_invalid_xml_chars(const std::string& original);
+  /**
    * Convert the given string (encoded is "encoding") into valid utf-8.
    * If some decoding fails, insert an utf-8 placeholder character instead.
    */