Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 55 additions & 48 deletions mregexp.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/*
/*
* Copyright (c) 2020 Fabian van Rissenbeck

* Permission is hereby granted, free of charge, to any person obtaining a copy
Expand Down Expand Up @@ -34,65 +34,72 @@
#define __SIZE_MAX__ 4294967296
#endif

static inline unsigned utf8_char_width(uint8_t c)
static inline uint32_t utf8_char_width(uint8_t c)
{
int a1 = !(128 & c) && 1;
int a2 = (128 & c) && (64 & c) && !(32 & c);
int a3 = (128 & c) && (64 & c) && (32 & c) && !(16 & c);
int a4 = (128 & c) && (64 & c) && (32 & c) && (16 & c) && !(8 & c);

return a1 * 1 + a2 * 2 + a3 * 3 + a4 * 4;
// "optimal" branchless implementation...
uint32_t ret = (c < 0x80); // 1
ret |= ((c & 0xE0) == 0xC0) << 1; // 2
ret |= ((c & 0xF0) == 0xE0) * 3; // 3
ret |= ((c & 0xF8) == 0xF0) << 2; // 4
return ret;
}

// utf8_valid is extremely slow
static inline bool utf8_valid(const char *s)
{
const size_t len = strlen(s);

for (size_t i = 0; i < len;) {
const unsigned width = utf8_char_width((uint8_t)s[i]);

if (width == 0) {
return false;
}

if (i + width > len) {
return false;
}
// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.

// State values of the UTF-8 decoding automaton driven by utf8_decode().
// Decoding starts in UTF8_ACCEPT, and callers treat a string as valid when
// the automaton is back in UTF8_ACCEPT after its last byte.
#define UTF8_ACCEPT 0
// Row 1 of the transition table in utf8d maps every character class back to
// state 1, so once this state is entered it is never left.
#define UTF8_REJECT 1

// Lookup table for the UTF-8 decoding automaton, consumed by utf8_decode().
// Entries 0..255 map each input byte to a character class (0x0..0xb).
// Entries from index 256 onward form the state transition table, 16 slots
// per state, addressed as utf8d[256 + state * 16 + class]; the value found
// there is the next state.
static const uint8_t utf8d[] = {
// Byte -> character class.
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
// (state, character class) -> next state; one 16-entry row per state.
0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

for (unsigned j = 1; j < width; ++j)
if ((s[i + j] & (128 + 64)) != 128) {
return false;
}
static inline uint32_t utf8_decode(uint32_t *state, uint32_t *codep,
const uint32_t byte)
{
uint32_t type = utf8d[byte];
*codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
: (0xff >> type) & (byte);

i += width;
}
*state = utf8d[256 + (*state << 4) + type];
return *state;
}

return true;
// Feeds the NUL-terminated byte string `s` through the UTF-8 automaton.
// *count is reset to zero and then incremented each time utf8_decode()
// reports UTF8_ACCEPT, i.e. once per completed code point.
// Returns true when the automaton is in UTF8_ACCEPT after the last byte,
// false otherwise.
static bool utf8_count_codepoints(size_t *count, const uint8_t *s)
{
	uint32_t dfa_state = UTF8_ACCEPT;
	uint32_t decoded; // written by utf8_decode(); the value itself is not used here

	*count = 0;
	while (*s != 0) {
		if (utf8_decode(&dfa_state, &decoded, *s) == UTF8_ACCEPT) {
			*count += 1;
		}
		++s;
	}

	return dfa_state == UTF8_ACCEPT;
}

bool mregexp_check_utf8(const char *s)
// End - Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>

bool mregexp_valid_utf8(const char *s)
{
return utf8_valid(s);
size_t count;
bool valid = utf8_count_codepoints(&count, (const uint8_t *)s);
return valid;
}

static const int utf8_peek_mods[] = {0, 127, 31, 15, 7};
static inline uint32_t utf8_peek(const char *s)
{
if (*s == 0)
return 0;

const unsigned width = utf8_char_width(s[0]);
size_t ret = 0;

ret = s[0] & utf8_peek_mods[width];

for (unsigned i = 1; i < width; ++i) {
ret <<= 6;
ret += s[i] & 63;
}

return ret;
uint32_t state = UTF8_ACCEPT, codepoint;
utf8_decode(&state, &codepoint, (uint8_t)s[0]);
return codepoint;
}

static inline const char *utf8_next(const char *s)
Expand Down Expand Up @@ -853,7 +860,7 @@ MRegexp *mregexp_compile(const char *re)
return NULL;
}

if (!utf8_valid(re)) {
if (!mregexp_valid_utf8(re)) {
CompileException.err = MREGEXP_INVALID_UTF8;
CompileException.s = NULL;
return NULL;
Expand Down