Safer UTF-8 decoder

This commit is contained in:
Clownacy 2019-01-25 11:50:42 +00:00
parent acdf8aa622
commit 414fe76abc

View file

@ -38,25 +38,47 @@ static unsigned long UTF8ToCode(const unsigned char *string, unsigned int *bytes
unsigned int length; unsigned int length;
unsigned long charcode; unsigned long charcode;
if ((string[0] & 0x80) == 0) unsigned int zero_bit = 0;
for (unsigned char lead_byte = string[0]; zero_bit < 5 && (lead_byte & 0x80); ++zero_bit, lead_byte <<= 1);
switch (zero_bit)
{ {
length = 1; case 0:
charcode = string[0] & 0x7F; // Single-byte character
} length = 1;
else if ((string[0] & 0xE0) == 0xC0) charcode = string[0];
{ break;
length = 2;
charcode = ((string[0] & ~0xE0) << 6) | (string[1] & 0x3F); case 2:
} case 3:
else if ((string[0] & 0xF0) == 0xE0) case 4:
{ length = zero_bit;
length = 3; charcode = string[0] & (1 << (8 - zero_bit)) - 1;
charcode = ((string[0] & ~0xF0) << (6 * 2)) | ((string[1] & 0x3F) << 6) | (string[2] & 0x3F);
} for (unsigned int i = 1; i < zero_bit; ++i)
else //if (string[0] & 0xF8 == 0xF0) {
{ if ((string[i] & 0xC0) == 0x80)
length = 4; {
charcode = ((string[0] & ~0xF8) << (6 * 3)) | ((string[1] & 0x3F) << (6 * 2)) | ((string[2] & 0x3F) << 6) | (string[3] & 0x3F); charcode <<= 6;
charcode |= string[i] & ~0xC0;
}
else
{
// Error: Invalid continuation byte
length = 1;
charcode = 0xFFFD;
break;
}
}
break;
default:
// Error: Invalid lead byte
length = 1;
charcode = 0xFFFD;
break;
} }
if (bytes_read) if (bytes_read)