summaryrefslogtreecommitdiff
path: root/src/utils
diff options
context:
space:
mode:
authorFlorent Le Coz <louiz@louiz.org>2013-11-09 06:01:47 +0100
committerFlorent Le Coz <louiz@louiz.org>2013-11-09 06:01:47 +0100
commitccebe901d7d76dfddc082d994efa54ef2aefee57 (patch)
tree97a542cce7c3c3185553859f679dc074f8f0286f /src/utils
parenta418b6ed5d70f0e61e71bb1adce2a693ade89e30 (diff)
downloadbiboumi-ccebe901d7d76dfddc082d994efa54ef2aefee57.tar.gz
biboumi-ccebe901d7d76dfddc082d994efa54ef2aefee57.tar.bz2
biboumi-ccebe901d7d76dfddc082d994efa54ef2aefee57.tar.xz
biboumi-ccebe901d7d76dfddc082d994efa54ef2aefee57.zip
Check UTF-8 encoding, and convert strings to UTF-8
Handle conversion errors properly by inserting � instead. Add a binary header to provide a portable way to write binary literals (I like them). Also add a test file. ref #2404
Diffstat (limited to 'src/utils')
-rw-r--r--src/utils/binary.hpp16
-rw-r--r--src/utils/encoding.cpp139
-rw-r--r--src/utils/encoding.hpp21
3 files changed, 176 insertions, 0 deletions
diff --git a/src/utils/binary.hpp b/src/utils/binary.hpp
new file mode 100644
index 0000000..10807bc
--- /dev/null
+++ b/src/utils/binary.hpp
@@ -0,0 +1,16 @@
+#ifndef BINARY_INCLUDED
+# define BINARY_INCLUDED
+
+# include <limits.h>
+
+/**
+ * Portable binary literals, for compilers that lack the 0b prefix.
+ *
+ * Writing 10100000_b yields the unsigned int whose binary representation
+ * is that sequence of digits (160 here). Everything is computed at compile
+ * time: binary<DIGITS...>::value consumes the digits most significant
+ * first, each digit being shifted left by the number of digits that
+ * follow it.
+ *
+ * Any character other than '0' or '1', or a literal with more digits than
+ * an unsigned int has bits, is rejected with a static_assert.
+ */
+template<char FIRST, char... REST> struct binary
+{
+  static_assert(FIRST == '0' || FIRST == '1', "invalid binary digit" );
+  static_assert(sizeof...(REST) < sizeof(unsigned int) * CHAR_BIT,
+                "binary literal does not fit in an unsigned int");
+  // The shift is done on an unsigned value so that a literal using every
+  // bit of the type never shifts into a sign bit.
+  enum : unsigned int
+  {
+    value = (static_cast<unsigned int>(FIRST - '0') << sizeof...(REST))
+            + binary<REST...>::value
+  };
+};
+
+// End of the recursion: a single digit is its own value.
+template<> struct binary<'0'> { enum : unsigned int { value = 0 }; };
+template<> struct binary<'1'> { enum : unsigned int { value = 1 }; };
+
+// Literal operator template: receives the digits of the literal as
+// individual characters. constexpr already implies inline.
+template<char... LITERAL>
+constexpr unsigned int operator""_b() { return binary<LITERAL...>::value; }
+
+#endif // BINARY_INCLUDED
diff --git a/src/utils/encoding.cpp b/src/utils/encoding.cpp
new file mode 100644
index 0000000..a1bc01b
--- /dev/null
+++ b/src/utils/encoding.cpp
@@ -0,0 +1,139 @@
+#include <utils/encoding.hpp>
+#include <utils/binary.hpp>
+
+#include <utils/scopeguard.hpp>
+
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <iconv.h>
+
+#include <stdexcept>
+#include <string>
+
+/**
+ * The UTF-8-encoded character used as a placeholder when a character
+ * conversion fails.
+ * This is U+FFFD � "replacement character", which UTF-8 encodes as the
+ * three bytes EF BF BD. invalid_char_len is the number of bytes of that
+ * sequence, not counting the terminating NUL of the string literal.
+ */
+static const char* invalid_char = "\xef\xbf\xbd";
+static const size_t invalid_char_len = 3;
+
+namespace utils
+{
+  /**
+   * Returns true if the given null-terminated string is well-formed UTF-8,
+   * false otherwise. A null pointer is never valid.
+   *
+   * The sequence layout is based on
+   * http://en.wikipedia.org/wiki/UTF-8#Description
+   * On top of the bit patterns, the sequences that RFC 3629 forbids are
+   * rejected as well: overlong encodings, UTF-16 surrogates
+   * (U+D800..U+DFFF) and code points above U+10FFFF.
+   */
+  bool is_valid_utf8(const char* s)
+  {
+    if (!s)
+      return false;
+
+    const unsigned char* str = reinterpret_cast<const unsigned char*>(s);
+
+    while (*str)
+      {
+        // 4 bytes:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        if ((str[0] & 11111000_b) == 11110000_b)
+          {
+            // The !str[n] tests come first so that we never read past the
+            // terminating NUL of a truncated sequence.
+            if (!str[1] || !str[2] || !str[3]
+                || ((str[1] & 11000000_b) != 10000000_b)
+                || ((str[2] & 11000000_b) != 10000000_b)
+                || ((str[3] & 11000000_b) != 10000000_b))
+              return false;
+            // F0 80..8F is an overlong form of a code point below U+10000,
+            // F4 90..BF and F5..F7 encode code points above U+10FFFF.
+            if ((str[0] == 0xf0 && str[1] < 0x90)
+                || (str[0] == 0xf4 && str[1] > 0x8f)
+                || str[0] > 0xf4)
+              return false;
+            str += 4;
+          }
+        // 3 bytes:  1110xxxx 10xxxxxx 10xxxxxx
+        else if ((str[0] & 11110000_b) == 11100000_b)
+          {
+            if (!str[1] || !str[2]
+                || ((str[1] & 11000000_b) != 10000000_b)
+                || ((str[2] & 11000000_b) != 10000000_b))
+              return false;
+            // E0 80..9F is an overlong form of a code point below U+0800,
+            // ED A0..BF encodes a UTF-16 surrogate.
+            if ((str[0] == 0xe0 && str[1] < 0xa0)
+                || (str[0] == 0xed && str[1] > 0x9f))
+              return false;
+            str += 3;
+          }
+        // 2 bytes:  110xxxxx 10xxxxxx
+        else if (((str[0]) & 11100000_b) == 11000000_b)
+          {
+            if (!str[1] ||
+                ((str[1] & 11000000_b) != 10000000_b))
+              return false;
+            // C0 and C1 can only start an overlong form of an ASCII
+            // character.
+            if (str[0] < 0xc2)
+              return false;
+            str += 2;
+          }
+        // 1 byte:  0xxxxxxx
+        // Anything else with the high bit set is a stray continuation
+        // byte (10xxxxxx) or an invalid lead byte (11111xxx).
+        else if ((str[0] & 10000000_b) != 0)
+          return false;
+        else
+          str++;
+      }
+    return true;
+  }
+
+  /**
+   * Convert str, encoded in charset, into UTF-8 using iconv.
+   *
+   * Every input byte that cannot be decoded is replaced by the U+FFFD
+   * placeholder (invalid_char) and the conversion goes on with the next
+   * byte. If the input ends in the middle of a multibyte sequence, one
+   * placeholder is appended and the conversion stops.
+   *
+   * The result is built from the exact number of bytes produced, so an
+   * empty input yields an empty string and NUL bytes in the converted
+   * text do not truncate it.
+   *
+   * Throws std::runtime_error if iconv_open() cannot provide a conversion
+   * from charset to UTF-8.
+   */
+  std::string convert_to_utf8(const std::string& str, const char* charset)
+  {
+    const iconv_t cd = iconv_open("UTF-8", charset);
+    if (cd == (iconv_t)-1)
+      throw std::runtime_error("Cannot convert into UTF-8");
+
+    // Make sure cd is always closed when we leave this function
+    ScopeGuard sg([&]{ iconv_close(cd); });
+
+    // iconv will not attempt to modify this buffer, but it still requires
+    // a char**.
+    size_t inbytesleft = str.size();
+    char* inbuf_ptr = const_cast<char*>(str.c_str());
+
+    std::string res;
+    res.reserve(str.size());
+
+    // The conversion is done through a fixed-size chunk that is flushed
+    // into res after each call to iconv, so the output can never overflow
+    // whatever the expansion ratio of the conversion is.
+    char chunk[4096];
+
+    bool done = false;
+    while (done == false)
+      {
+        char* outbuf_ptr = chunk;
+        size_t outbytesleft = sizeof(chunk);
+
+        const size_t error = iconv(cd, &inbuf_ptr, &inbytesleft, &outbuf_ptr, &outbytesleft);
+        // Read errno right away: appending to res may allocate memory and
+        // overwrite it.
+        const int saved_errno = errno;
+
+        // Even when iconv fails, everything it converted before the
+        // failure has been written into chunk and must be kept.
+        const size_t converted = sizeof(chunk) - outbytesleft;
+        res.append(chunk, converted);
+
+        if ((size_t)-1 != error)
+          {
+            // The conversion finished without any error, stop converting
+            done = true;
+          }
+        else
+          {
+            switch (saved_errno)
+              {
+              case EILSEQ:
+                // Invalid byte found. Insert a placeholder instead of the
+                // converted character, jump one byte and continue
+                res.append(invalid_char, invalid_char_len);
+                inbytesleft--;
+                inbuf_ptr++;
+                break;
+              case EINVAL:
+                // A multibyte sequence is not terminated, but we can't
+                // provide any more data, so we just add a placeholder to
+                // indicate that the character is not properly converted,
+                // and we stop the conversion
+                res.append(invalid_char, invalid_char_len);
+                done = true;
+                break;
+              case E2BIG:
+                // The chunk is full and has just been flushed: go on with
+                // the rest of the input. If nothing at all was converted,
+                // stop instead of looping forever.
+                if (converted == 0)
+                  done = true;
+                break;
+              default:
+                // Unexpected error, keep what has been converted so far
+                done = true;
+                break;
+              }
+          }
+      }
+    return res;
+  }
+
+}
+
diff --git a/src/utils/encoding.hpp b/src/utils/encoding.hpp
new file mode 100644
index 0000000..362f1df
--- /dev/null
+++ b/src/utils/encoding.hpp
@@ -0,0 +1,21 @@
+#ifndef ENCODING_INCLUDED
+# define ENCODING_INCLUDED
+
+#include <string>
+
+namespace utils
+{
+ /**
+ * Returns true if the given null-terminated string is valid utf-8,
+ * false otherwise. A null pointer is never valid and yields false.
+ *
+ * Based on http://en.wikipedia.org/wiki/UTF-8#Description
+ */
+ bool is_valid_utf8(const char* s);
+ /**
+ * Convert the given string (encoded in "encoding") into valid utf-8.
+ * If some decoding fails, insert a utf-8 placeholder character
+ * (U+FFFD, "replacement character") instead.
+ *
+ * "encoding" is the name of a character set as understood by
+ * iconv_open(). Throws std::runtime_error if no conversion from that
+ * character set to UTF-8 can be opened.
+ */
+ std::string convert_to_utf8(const std::string& str, const char* encoding);
+}
+
+#endif // ENCODING_INCLUDED