unrealircd- supernets unrealircd source & configuration |
git clone git://git.acid.vegas/unrealircd.git |
Log | Files | Refs | Archive | README | LICENSE |
utf8.c (6567B)
1 #include "unrealircd.h" 2 3 /**************** UTF8 HELPER FUNCTIONS START HERE *****************/ 4 5 /* Operations on UTF-8 strings. 6 * This part is taken from "glib" with the following copyright: 7 * Copyright (C) 1999 Tom Tromey 8 * Copyright (C) 2000 Red Hat, Inc. 9 * Taken from the master snapshot on Oct 23, 2018, glib/gutf8.c. 10 * The library uses LGPL 2.1. From what I understand this allows me to 11 * use this code in a GPLv2-compatible way which fits the rest of 12 * the UnrealIRCd project. 13 * 14 * Code stripped and converted heavily to fit in UnrealIRCd by 15 * Bram Matthys ("Syzop") in 2019. Thanks to i <info@servx.org> 16 * for all the directions and help with regards to UTF8 handling. 17 * 18 * Note that with UnrealIRCd, a char is always unsigned char, 19 * which allows us to cut some corners and make more readable 20 * code without 100 casts. 21 */ 22 23 #define VALIDATE_BYTE(mask, expect) \ 24 do { \ 25 if ((*p & (mask)) != (expect)) \ 26 goto error; \ 27 } while(0) 28 29 /* see IETF RFC 3629 Section 4 */ 30 31 static const char *fast_validate(const char *str) 32 { 33 const char *p; 34 35 for (p = str; *p; p++) 36 { 37 if (*p >= 128) 38 { 39 const char *last; 40 41 last = p; 42 if (*p < 0xe0) /* 110xxxxx */ 43 { 44 // ehm.. did you forget a ++p ? ;) or whatever 45 if (*p < 0xc2) 46 { 47 goto error; 48 } 49 } 50 else 51 { 52 if (*p < 0xf0) /* 1110xxxx */ 53 { 54 switch (*p++ & 0x0f) 55 { 56 case 0: 57 VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */ 58 break; 59 case 0x0d: 60 VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */ 61 break; 62 default: 63 VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ 64 } 65 } 66 else if (*p < 0xf5) /* 11110xxx excluding out-of-range */ 67 { 68 switch (*p++ & 0x07) 69 { 70 case 0: 71 VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ 72 if ((*p & 0x30) == 0) 73 goto error; 74 break; 75 case 4: 76 VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */ 77 break; 78 default: 79 VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ 80 } 81 p++; 82 VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ 83 } 84 else 85 { 86 goto error; 87 } 88 } 89 90 p++; 91 VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */ 92 93 continue; 94 95 error: 96 return last; 97 } 98 } 99 100 return p; 101 } 102 103 /** Check if a string is valid UTF8. 104 * @param str The string to validate 105 * @param end Pointer to char *, as explained in notes below. 106 * @returns 1 if the string is valid UTF8, 0 if not. 107 * @note The variable *end will be set to the first invalid UTF8 sequence. 108 * If no invalid UTF8 sequence is encountered then it points to the NUL byte. 109 */ 110 int unrl_utf8_validate(const char *str, const char **end) 111 { 112 const char *p; 113 114 p = fast_validate(str); 115 116 if (end) 117 *end = p; 118 119 if (*p != '\0') 120 return 0; 121 else 122 return 1; 123 } 124 125 /** Go backwards in a string until we are at the end of an UTF8 sequence. 126 * Or more accurately: skip sequences that are part of an UTF8 sequence. 127 * @param begin The string to check 128 * @param p Where to start backtracking 129 * @returns Byte that is not in the middle of an UTF8 sequence, 130 * or NULL if we reached the beginning and that isn't valid either. 131 */ 132 char *unrl_utf8_find_prev_char (const char *begin, const char *p) 133 { 134 for (--p; p >= begin; --p) 135 { 136 if ((*p & 0xc0) != 0x80) 137 return (char *)p; 138 } 139 return NULL; 140 } 141 142 /** Return a valid UTF8 string based on the input. 143 * @param str The input string 144 * @param outputbuf The output buffer 145 * @param outputbuflen Length of the output buffer 146 * @param strictlen If set to 1 we never return more than 147 * outputbuflen-1 characters. 148 * If set to 0, we may do that, if the 149 * input string was already 100% valid UTF8. 150 * @retval Returns a valid UTF8 string, either the input buffer 151 * (if it was already valid UTF8) or the output buffer. 152 * NULL is returned if either 'str' was NULL or outputlen is zero. 153 * @notes The 'outputbuf' is unused if the string is already valid UTF8. 154 * So don't rely on it being always set, use the returned string. 155 */ 156 char *unrl_utf8_make_valid(const char *str, char *outputbuf, size_t outputbuflen, int strictlen) 157 { 158 const char *remainder, *invalid; 159 int remaining_bytes, valid_bytes, len; 160 int replaced = 0; /**< UTF8 string needed replacement (was invalid) */ 161 162 if (!str || !outputbuflen) 163 return NULL; 164 165 len = strlen(str); 166 167 *outputbuf = '\0'; 168 remainder = str; 169 remaining_bytes = len; 170 171 while (remaining_bytes != 0) 172 { 173 if (unrl_utf8_validate(remainder, &invalid)) 174 { 175 if (!replaced) 176 { 177 if (strictlen) 178 { 179 /* Caller wants us to go through the 'replaced' branch */ 180 strlcpy(outputbuf, str, outputbuflen); 181 replaced = 1; 182 } 183 break; 184 } else { 185 /* We already replaced earlier, now just put the rest at the end. */ 186 strlcat(outputbuf, remainder, outputbuflen); 187 break; 188 } 189 } 190 replaced = 1; 191 valid_bytes = invalid - remainder; 192 193 strlncat(outputbuf, remainder, outputbuflen, valid_bytes); /*g_string_append_len(string, remainder, valid_bytes);*/ 194 strlcat(outputbuf, "\357\277\275", outputbuflen); 195 196 remaining_bytes -= valid_bytes + 1; 197 remainder = invalid + 1; 198 } 199 200 if (!replaced) 201 return (char *)str; /* return original string (no changes needed) */ 202 203 /* If we took up all the space, then backtrack one character and cut 204 * things off from there. This to ensure that we don't end up with 205 * invalid UTF8 due to cutting half-way a UTF8 byte sequence. 206 * NOTE: This may cause us to remove 1 character needlessly at the 207 * end even though there was still (some) space. So be it. 208 */ 209 if (strlen(outputbuf) == outputbuflen-1) 210 { 211 char *cut_at = unrl_utf8_find_prev_char(outputbuf, outputbuf+outputbuflen-1); 212 if (cut_at) 213 *cut_at = '\0'; 214 } 215 216 #ifdef DEBUGMODE 217 if (!unrl_utf8_validate(outputbuf, NULL)) 218 abort(); /* this should never happen, it means our conversion resulted in an invalid UTF8 string */ 219 #endif 220 221 return outputbuf; 222 } 223 224 /**************** END OF UTF8 HELPER FUNCTIONS *****************/ 225 226 /** This is just for internal testing */ 227 void utf8_test(void) 228 { 229 char buf[1024]; 230 char *res; 231 int cnt = 0; 232 char *heapbuf; /* for strict OOB testing with ASan */ 233 char *workbuf = safe_alloc(500); 234 size_t workbuflen = 500; 235 236 while ((fgets(buf, sizeof(buf), stdin))) 237 { 238 stripcrlf(buf); 239 heapbuf = strdup(buf); 240 res = unrl_utf8_make_valid(heapbuf, workbuf, workbuflen, 1); 241 if (heapbuf == res) 242 { 243 printf(" %s\n", res); 244 } else { 245 printf("[!] %s\n", res); 246 } 247 free(heapbuf); 248 } 249 safe_free(workbuf); 250 }