From 6235fb2d0326b18a9e013ae13dfb1fd0577ffd9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?louiz=E2=80=99?= Date: Wed, 15 Jun 2016 00:38:43 +0200 Subject: Add get_next_codepoint_size --- louloulibs/utils/encoding.cpp | 28 ++++++++++++++++------------ louloulibs/utils/encoding.hpp | 5 +++++ 2 files changed, 21 insertions(+), 12 deletions(-) (limited to 'louloulibs/utils') diff --git a/louloulibs/utils/encoding.cpp b/louloulibs/utils/encoding.cpp index f738ce2..507f38a 100644 --- a/louloulibs/utils/encoding.cpp +++ b/louloulibs/utils/encoding.cpp @@ -23,6 +23,17 @@ namespace utils /** * Based on http://en.wikipedia.org/wiki/UTF-8#Description */ + std::size_t get_next_codepoint_size(const unsigned char c) + { + if ((c & 0b11111000) == 0b11110000) // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + return 4; + else if ((c & 0b11110000) == 0b11100000) // 3 bytes: 1110xxx 10xxxxxx 10xxxxxx + return 3; + else if ((c & 0b11100000) == 0b11000000) // 2 bytes: 110xxxxx 10xxxxxx + return 2; + return 1; // 1 byte: 0xxxxxxx + } + bool is_valid_utf8(const char* s) { if (!s) @@ -32,38 +43,31 @@ namespace utils while (*str) { - // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - if ((str[0] & 0b11111000) == 0b11110000) + const auto codepoint_size = get_next_codepoint_size(str[0]); + if (codepoint_size == 4) { if (!str[1] || !str[2] || !str[3] || ((str[1] & 0b11000000) != 0b10000000) || ((str[2] & 0b11000000) != 0b10000000) || ((str[3] & 0b11000000) != 0b10000000)) return false; - str += 4; } - // 3 bytes: 1110xxx 10xxxxxx 10xxxxxx - else if ((str[0] & 0b11110000) == 0b11100000) + else if (codepoint_size == 3) { if (!str[1] || !str[2] || ((str[1] & 0b11000000) != 0b10000000) || ((str[2] & 0b11000000) != 0b10000000)) return false; - str += 3; } - // 2 bytes: 110xxxxx 10xxxxxx - else if (((str[0]) & 0b11100000) == 0b11000000) + else if (codepoint_size == 2) { if (!str[1] || ((str[1] & 0b11000000) != 0b10000000)) return false; - str += 2; } - // 1 byte: 0xxxxxxx else if ((str[0] & 0b10000000) != 0) return false; - else - str++; + str += codepoint_size; } return true; } diff --git a/louloulibs/utils/encoding.hpp b/louloulibs/utils/encoding.hpp index 6b7ccd2..3f55055 100644 --- a/louloulibs/utils/encoding.hpp +++ b/louloulibs/utils/encoding.hpp @@ -5,6 +5,11 @@ namespace utils { + /** + * Return the size, in bytes, of the next UTF-8 codepoint, based on + * the given char. + */ + std::size_t get_next_codepoint_size(const unsigned char c); /** * Returns true if the given null-terminated string is valid utf-8. * -- cgit v1.2.3