unrealircd

- supernets unrealircd source & configuration
git clone git://git.acid.vegas/unrealircd.git
utf8.c (6567B)
      1 #include "unrealircd.h"
      2 
      3 /**************** UTF8 HELPER FUNCTIONS START HERE *****************/
      4 
      5 /* Operations on UTF-8 strings.
      6  * This part is taken from "glib" with the following copyright:
      7  * Copyright (C) 1999 Tom Tromey
      8  * Copyright (C) 2000 Red Hat, Inc.
      9  * Taken from the master snapshot on Oct 23, 2018, glib/gutf8.c.
     10  * The library uses LGPL 2.1. From what I understand this allows me to
     11  * use this code in a GPLv2-compatible way which fits the rest of
     12  * the UnrealIRCd project.
     13  *
     14  * Code stripped and converted heavily to fit in UnrealIRCd by
     15  * Bram Matthys ("Syzop") in 2019. Thanks to i <info@servx.org>
     16  * for all the directions and help with regards to UTF8 handling.
     17  *
     18  * Note that with UnrealIRCd, a char is always unsigned char,
     19  * which allows us to cut some corners and make more readable
     20  * code without 100 casts.
     21  */
     22 
     23 #define VALIDATE_BYTE(mask, expect) \
     24   do {                              \
     25     if ((*p & (mask)) != (expect))  \
     26       goto error;                   \
     27   } while(0)
     28 
     29 /* see IETF RFC 3629 Section 4 */
     30 
     31 static const char *fast_validate(const char *str)
     32 {
     33 	const char *p;
     34 
     35 	for (p = str; *p; p++)
     36 	{
     37 		if (*p >= 128)
     38 		{
     39 			const char *last;
     40 
     41 			last = p;
     42 			if (*p < 0xe0) /* 110xxxxx */
     43 			{
     44 				// ehm.. did you forget a ++p ? ;) or whatever
     45 				if (*p < 0xc2)
     46 				{
     47 					goto error;
     48 				}
     49 			}
     50 			else
     51 			{
     52 				if (*p < 0xf0) /* 1110xxxx */
     53 				{
     54 					switch (*p++ & 0x0f)
     55 					{
     56 						case 0:
     57 							VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */
     58 							break;
     59 						case 0x0d:
     60 							VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */
     61 							break;
     62 						default:
     63 							VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
     64 					}
     65 				}
     66 				else if (*p < 0xf5) /* 11110xxx excluding out-of-range */
     67 				{
     68 					switch (*p++ & 0x07)
     69 					{
     70 						case 0:
     71 							VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
     72 							if ((*p & 0x30) == 0)
     73 								goto error;
     74 							break;
     75 						case 4:
     76 							VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */
     77 							break;
     78 						default:
     79 							VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
     80 					}
     81 					p++;
     82 					VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
     83 				}
     84 				else
     85 				{
     86 					goto error;
     87 				}
     88 			}
     89 
     90 			p++;
     91 			VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */
     92 
     93 			continue;
     94 
     95 error:
     96 			return last;
     97 		}
     98 	}
     99 
    100 	return p;
    101 }
    102 
    103 /** Check if a string is valid UTF8.
    104  * @param str   The string to validate
    105  * @param end   Pointer to char *, as explained in notes below.
    106  * @returns 1 if the string is valid UTF8, 0 if not.
    107  * @note  The variable *end will be set to the first invalid UTF8 sequence.
    108  *        If no invalid UTF8 sequence is encountered then it points to the NUL byte.
    109  */
    110 int unrl_utf8_validate(const char *str, const char **end)
    111 {
    112 	const char *p;
    113 
    114 	p = fast_validate(str);
    115 
    116 	if (end)
    117 		*end = p;
    118 
    119 	if (*p != '\0')
    120 		return 0;
    121 	else
    122 		return 1;
    123 }
    124 
    125 /** Go backwards in a string until we are at the end of an UTF8 sequence.
    126  * Or more accurately: skip sequences that are part of an UTF8 sequence.
    127  * @param begin   The string to check
    128  * @param p       Where to start backtracking
    129  * @returns Byte that is not in the middle of an UTF8 sequence,
    130  *          or NULL if we reached the beginning and that isn't valid either.
    131  */
    132 char *unrl_utf8_find_prev_char (const char *begin, const char *p)
    133 {
    134 	for (--p; p >= begin; --p)
    135 	{
    136 		if ((*p & 0xc0) != 0x80)
    137 			return (char *)p;
    138 	}
    139 	return NULL;
    140 }
    141 
    142 /** Return a valid UTF8 string based on the input.
    143  * @param str		The input string
    144  * @param outputbuf	The output buffer
    145  * @param outputbuflen	Length of the output buffer
    146  * @param strictlen	If set to 1 we never return more than
    147  *                      outputbuflen-1 characters.
    148  *                      If set to 0, we may do that, if the
    149  *                      input string was already 100% valid UTF8.
    150  * @retval Returns a valid UTF8 string, either the input buffer
    151  *         (if it was already valid UTF8) or the output buffer.
    152  *         NULL is returned if either 'str' was NULL or outputlen is zero.
    153  * @notes The 'outputbuf' is unused if the string is already valid UTF8.
    154  *        So don't rely on it being always set, use the returned string.
    155  */
    156 char *unrl_utf8_make_valid(const char *str, char *outputbuf, size_t outputbuflen, int strictlen)
    157 {
    158 	const char *remainder, *invalid;
    159 	int remaining_bytes, valid_bytes, len;
    160 	int replaced = 0; /**< UTF8 string needed replacement (was invalid) */
    161 
    162 	if (!str || !outputbuflen)
    163 		return NULL;
    164 
    165 	len = strlen(str);
    166 
    167 	*outputbuf = '\0';
    168 	remainder = str;
    169 	remaining_bytes = len;
    170 
    171 	while (remaining_bytes != 0)
    172 	{
    173 		if (unrl_utf8_validate(remainder, &invalid))
    174 		{
    175 			if (!replaced)
    176 			{
    177 				if (strictlen)
    178 				{
    179 					/* Caller wants us to go through the 'replaced' branch */
    180 					strlcpy(outputbuf, str, outputbuflen);
    181 					replaced = 1;
    182 				}
    183 				break;
    184 			} else {
    185 				/* We already replaced earlier, now just put the rest at the end. */
    186 				strlcat(outputbuf, remainder, outputbuflen);
    187 				break;
    188 			}
    189 		}
    190 		replaced = 1;
    191 		valid_bytes = invalid - remainder;
    192 
    193 		strlncat(outputbuf, remainder, outputbuflen, valid_bytes); /*g_string_append_len(string, remainder, valid_bytes);*/
    194 		strlcat(outputbuf, "\357\277\275", outputbuflen);
    195 
    196 		remaining_bytes -= valid_bytes + 1;
    197 		remainder = invalid + 1;
    198 	}
    199 
    200 	if (!replaced)
    201 		return (char *)str; /* return original string (no changes needed) */
    202 
    203 	/* If we took up all the space, then backtrack one character and cut
    204 	 * things off from there. This to ensure that we don't end up with
    205 	 * invalid UTF8 due to cutting half-way a UTF8 byte sequence.
    206 	 * NOTE: This may cause us to remove 1 character needlessly at the
    207 	 *       end even though there was still (some) space. So be it.
    208 	 */
    209 	if (strlen(outputbuf) == outputbuflen-1)
    210 	{
    211 		char *cut_at = unrl_utf8_find_prev_char(outputbuf, outputbuf+outputbuflen-1);
    212 		if (cut_at)
    213 			*cut_at = '\0';
    214 	}
    215 
    216 #ifdef DEBUGMODE
    217 	if (!unrl_utf8_validate(outputbuf, NULL))
    218 		abort(); /* this should never happen, it means our conversion resulted in an invalid UTF8 string */
    219 #endif
    220 
    221 	return outputbuf;
    222 }
    223 
    224 /**************** END OF UTF8 HELPER FUNCTIONS *****************/
    225 
    226 /** This is just for internal testing */
    227 void utf8_test(void)
    228 {
    229 	char buf[1024];
    230 	char *res;
    231 	int cnt = 0;
    232 	char *heapbuf; /* for strict OOB testing with ASan */
    233 	char *workbuf = safe_alloc(500);
    234 	size_t workbuflen = 500;
    235 
    236 	while ((fgets(buf, sizeof(buf), stdin)))
    237 	{
    238 		stripcrlf(buf);
    239 		heapbuf = strdup(buf);
    240 		res = unrl_utf8_make_valid(heapbuf, workbuf, workbuflen, 1);
    241 		if (heapbuf == res)
    242 		{
    243 			printf("    %s\n", res);
    244 		} else {
    245 			printf("[!] %s\n", res);
    246 		}
    247 		free(heapbuf);
    248 	}
    249 	safe_free(workbuf);
    250 }