Check UTF-8 encoding, and convert strings to UTF-8

Handle conversion errors properly by inserting � in place of the characters that cannot be converted. Add a binary header to provide a portable way to write binary literals (I like them). Also add a test file. ref #2404
author: Florent Le Coz <louiz@louiz.org> 2013-11-09 06:01:47 +0100
committer: Florent Le Coz <louiz@louiz.org> 2013-11-09 06:01:47 +0100
commit: ccebe901d7d76dfddc082d994efa54ef2aefee57 (patch)
tree: 97a542cce7c3c3185553859f679dc074f8f0286f
parent: a418b6ed5d70f0e61e71bb1adce2a693ade89e30 (diff)
download: biboumi-ccebe901d7d76dfddc082d994efa54ef2aefee57.tar.gz
biboumi-ccebe901d7d76dfddc082d994efa54ef2aefee57.tar.bz2
biboumi-ccebe901d7d76dfddc082d994efa54ef2aefee57.tar.xz
biboumi-ccebe901d7d76dfddc082d994efa54ef2aefee57.zip
6 files changed, 247 insertions, 25 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bff724c..bd8ca76 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,13 @@ include_directories("src/")
 include_directories(SYSTEM ${CRYPTO++_INCLUDE_DIR})
 
 #
+## utils
+#
+file(GLOB source_utils
+  src/utils/*.[hc]pp)
+add_library(utils STATIC ${source_utils})
+
+#
 ## network
 #
 file(GLOB source_network
@@ -32,7 +39,7 @@ add_library(network STATIC ${source_network})
 file(GLOB source_irc
   src/irc/*.[hc]pp)
 add_library(irc STATIC ${source_irc})
-target_link_libraries(irc network)
+target_link_libraries(irc network utils)
 
 #
 ## xmpplib
@@ -40,7 +47,7 @@ target_link_libraries(irc network)
 file(GLOB source_xmpp
   src/xmpp/*.[hc]pp)
 add_library(xmpp STATIC ${source_xmpp})
-target_link_libraries(xmpp bridge network ${CRYPTO++_LIBRARIES} expatpp)
+target_link_libraries(xmpp bridge network utils ${CRYPTO++_LIBRARIES} expatpp)
 
 #
 ## bridge
@@ -54,4 +61,15 @@ add_executable(${PROJECT_NAME} src/main.cpp)
 target_link_libraries(${PROJECT_NAME}
   xmpp
   irc
-  bridge)
-\ No newline at end of file
+  bridge)
+
+#
+## Tests
+#
+
+add_executable(test src/test.cpp)
+target_link_libraries(test
+  xmpp
+  irc
+  bridge
+  utils)
diff --git a/src/main.cpp b/src/main.cpp
index b0fb140..b7fa01e 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,31 +1,16 @@
-#include <irc/irc_client.hpp>
-#include <xmpp/xmpp_component.hpp>
 #include <network/poller.hpp>
-
-#include <xmpp/xmpp_parser.hpp>
-#include <xmpp/xmpp_stanza.hpp>
+#include <xmpp/xmpp_component.hpp>
 
 #include <memory>
 
-#include <xmpp/jid.hpp>
-#include <irc/iid.hpp>
-
-#include <iostream>
-
 int main()
 {
   Poller p;
-  // Now I'm the bridge, creating an ircclient because needed.
-  std::shared_ptr<IrcClient> c = std::make_shared<IrcClient>();
-  p.add_socket_handler(c);
-  std::shared_ptr<IrcClient> d = std::make_shared<IrcClient>();
-  p.add_socket_handler(d);
-  std::shared_ptr<IrcClient> e = std::make_shared<IrcClient>();
-  p.add_socket_handler(e);
-  c->connect("localhost", "7877");
-  d->connect("localhost", "7878");
-  e->connect("localhost", "7879");
-  while (true)
-    p.poll();
+  std::shared_ptr<XmppComponent> xmpp_component =
+    std::make_shared<XmppComponent>("irc.localhost", "secret");
+  p.add_socket_handler(xmpp_component);
+  xmpp_component->start();
+  while (p.poll())
+    ;
   return 0;
 }
diff --git a/src/test.cpp b/src/test.cpp
new file mode 100644
index 0000000..e3bfa55
--- /dev/null
+++ b/src/test.cpp
@@ -0,0 +1,43 @@
+/**
+ * Just a very simple test suite, by hand, using assert()
+ */
+
+#include <assert.h>
+
+#include <iostream>
+
+#include <utils/encoding.hpp>
+#include <string.h>
+
+#include <fstream>
+
+int main()
+{
+  /**
+   * Encoding
+   */
+  const char* valid = "C̡͔͕̩͙̽ͫ̈́ͥ̿̆ͧ̚r̸̩̘͍̻͖̆͆͛͊̉̕͡o͇͈̳̤̱̊̈͢q̻͍̦̮͕ͥͬͬ̽ͭ͌̾ͅǔ͉͕͇͚̙͉̭͉̇̽ȇ͈̮̼͍͔ͣ͊͞͝ͅ ͫ̾ͪ̓ͥ̆̋̔҉̢̦̠͈͔̖̲̯̦ụ̶̯͐̃̋ͮ͆͝n̬̱̭͇̻̱̰̖̤̏͛̏̿̑͟ë́͐҉̸̥̪͕̹̻̙͉̰ ̹̼̱̦̥ͩ͑̈́͑͝ͅt͍̥͈̹̝ͣ̃̔̈̔ͧ̕͝ḙ̸̖̟̙͙ͪ͢ų̯̞̼̲͓̻̞͛̃̀́b̮̰̗̩̰̊̆͗̾̎̆ͯ͌͝.̗̙͎̦ͫ̈́ͥ͌̈̓ͬ";
+  assert(utils::is_valid_utf8(valid) == true);
+  const char* invalid = "\xF0\x0F";
+  assert(utils::is_valid_utf8(invalid) == false);
+  const char* invalid2 = "\xFE\xFE\xFF\xFF";
+  assert(utils::is_valid_utf8(invalid2) == false);
+
+  std::string in = "coucou les copains  ♥ ";
+  assert(utils::is_valid_utf8(in.c_str()) == true);
+  std::string res = utils::convert_to_utf8(in, "UTF-8");
+  assert(utils::is_valid_utf8(res.c_str()) == true && res == in);
+
+  std::string original_utf8("couc¥ou");
+  std::string original_latin1("couc\xa5ou");
+
+  // When converting back to utf-8
+  std::string from_latin1 = utils::convert_to_utf8(original_latin1.c_str(), "ISO-8859-1");
+  assert(from_latin1 == original_utf8);
+
+  // Check the behaviour when the decoding fails (here because we provide a
+  // wrong charset)
+  std::string from_ascii = utils::convert_to_utf8(original_latin1, "US-ASCII");
+  assert(from_ascii == "couc�ou");
+  return 0;
+}
diff --git a/src/utils/binary.hpp b/src/utils/binary.hpp
new file mode 100644
index 0000000..10807bc
--- /dev/null
+++ b/src/utils/binary.hpp
@@ -0,0 +1,16 @@
+#ifndef BINARY_INCLUDED
+# define BINARY_INCLUDED
+
+template<char FIRST, char... REST> struct binary
+{
+  static_assert(FIRST == '0' || FIRST == '1', "invalid binary digit" );
+  enum { value = ((FIRST - '0') << sizeof...(REST)) + binary<REST...>::value };
+};
+
+template<> struct binary<'0'> { enum { value = 0 }; };
+template<> struct binary<'1'> { enum { value = 1 }; };
+
+template<char... LITERAL> inline
+constexpr unsigned int operator "" _b() { return binary<LITERAL...>::value; }
+
+#endif // BINARY_INCLUDED
diff --git a/src/utils/encoding.cpp b/src/utils/encoding.cpp
new file mode 100644
index 0000000..a1bc01b
--- /dev/null
+++ b/src/utils/encoding.cpp
@@ -0,0 +1,139 @@
+#include <utils/encoding.hpp>
+#include <utils/binary.hpp>
+
+#include <utils/scopeguard.hpp>
+
+#include <assert.h>
+#include <string.h>
+#include <iconv.h>
+
+/**
+ * The UTF-8-encoded character used as a place holder when a character conversion fails.
+ * This is U+FFFD � "replacement character"
+ */
+static const char* invalid_char = "\xef\xbf\xbd";
+static const size_t invalid_char_len = 3;
+
+namespace utils
+{
+  /**
+   * Based on http://en.wikipedia.org/wiki/UTF-8#Description
+   */
+  bool is_valid_utf8(const char* s)
+  {
+    if (!s)
+      return false;
+
+    const unsigned char* str = reinterpret_cast<const unsigned char*>(s);
+
+    while (*str)
+      {
+        // 4 bytes:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        if ((str[0] & 11111000_b) == 11110000_b)
+          {
+            if (!str[1] || !str[2] || !str[3]
+                || ((str[1] & 11000000_b) != 10000000_b)
+                || ((str[2] & 11000000_b) != 10000000_b)
+                || ((str[3] & 11000000_b) != 10000000_b))
+              return false;
+            str += 4;
+          }
+        // 3 bytes:  1110xxxx 10xxxxxx 10xxxxxx
+        else if ((str[0] & 11110000_b) == 11100000_b)
+          {
+            if (!str[1] || !str[2]
+                || ((str[1] & 11000000_b) != 10000000_b)
+                || ((str[2] & 11000000_b) != 10000000_b))
+              return false;
+            str += 3;
+          }
+        // 2 bytes:  110xxxxx 10xxxxxx
+        else if (((str[0]) & 11100000_b) == 11000000_b)
+          {
+            if (!str[1] ||
+                ((str[1] & 11000000_b) != 10000000_b))
+              return false;
+            str += 2;
+          }
+        // 1 byte:  0xxxxxxx
+        else if ((str[0] & 10000000_b) != 0)
+          return false;
+        else
+          str++;
+      }
+    return true;
+  }
+
+  std::string convert_to_utf8(const std::string& str, const char* charset)
+  {
+    std::string res;
+
+    const iconv_t cd = iconv_open("UTF-8", charset);
+    if (cd == (iconv_t)-1)
+      throw std::runtime_error("Cannot convert into UTF-8");
+
+    // Make sure cd is always closed when we leave this function
+    ScopeGuard sg([&]{ iconv_close(cd); });
+
+    // iconv will not attempt to modify this buffer, but it still requires
+    // a char**.
+    size_t inbytesleft = str.size();
+    char* inbuf_ptr = const_cast<char*>(str.c_str());
+
+    size_t outbytesleft = str.size() * 4;
+    char* outbuf = new char[outbytesleft];
+    char* outbuf_ptr = outbuf;
+
+    // Make sure outbuf is always deleted when we leave this function
+    sg.add_callback([&]{ delete[] outbuf; });
+
+    bool done = false;
+    while (done == false)
+      {
+        size_t error = iconv(cd, &inbuf_ptr, &inbytesleft, &outbuf_ptr, &outbytesleft);
+        if ((size_t)-1 == error)
+          {
+            switch (errno)
+              {
+              case EILSEQ:
+                // Invalid byte found. Insert a placeholder instead of the
+                // converted character, jump one byte and continue
+                memcpy(outbuf_ptr, invalid_char, invalid_char_len);
+                outbuf_ptr += invalid_char_len;
+                inbytesleft--;
+                inbuf_ptr++;
+                break;
+              case EINVAL:
+                // A multibyte sequence is not terminated, but we can't
+                // provide any more data, so we just add a placeholder to
+                // indicate that the character is not properly converted,
+                // and we stop the conversion
+                memcpy(outbuf_ptr, invalid_char, invalid_char_len);
+                outbuf_ptr += invalid_char_len;
+                outbuf_ptr++;
+                done = true;
+                break;
+              case E2BIG:
+                // This should never happen
+                done = true;
+              default:
+                // This should happen even neverer
+                done = true;
+                break;
+              }
+          }
+        else
+          {
+            // The conversion finished without any error, stop converting
+            done = true;
+          }
+      }
+    // Terminate the converted buffer, and copy that buffer into the
+    // string we return
+    *outbuf_ptr = '\0';
+    res = outbuf;
+    return res;
+  }
+
+}
+
diff --git a/src/utils/encoding.hpp b/src/utils/encoding.hpp
new file mode 100644
index 0000000..362f1df
--- /dev/null
+++ b/src/utils/encoding.hpp
@@ -0,0 +1,21 @@
+#ifndef ENCODING_INCLUDED
+# define ENCODING_INCLUDED
+
+#include <string>
+
+namespace utils
+{
+  /**
+   * Returns true if the given null-terminated string is valid utf-8.
+   *
+   * Based on http://en.wikipedia.org/wiki/UTF-8#Description
+   */
+  bool is_valid_utf8(const char* s);
+  /**
+   * Convert the given string (encoded in "encoding") into valid utf-8.
+   * If some decoding fails, insert a utf-8 placeholder character instead.
+   */
+  std::string convert_to_utf8(const std::string& str, const char* encoding);
+}
+
+#endif // ENCODING_INCLUDED
author	Florent Le Coz <louiz@louiz.org>	2013-11-09 06:01:47 +0100
committer	Florent Le Coz <louiz@louiz.org>	2013-11-09 06:01:47 +0100
commit	ccebe901d7d76dfddc082d994efa54ef2aefee57 (patch)
tree	97a542cce7c3c3185553859f679dc074f8f0286f
parent	a418b6ed5d70f0e61e71bb1adce2a693ade89e30 (diff)
download	biboumi-ccebe901d7d76dfddc082d994efa54ef2aefee57.tar.gz biboumi-ccebe901d7d76dfddc082d994efa54ef2aefee57.tar.bz2 biboumi-ccebe901d7d76dfddc082d994efa54ef2aefee57.tar.xz biboumi-ccebe901d7d76dfddc082d994efa54ef2aefee57.zip