unrealircd

- supernets unrealircd source & configuration
git clone git://git.acid.vegas/unrealircd.git
match.c (23974B)
      1 /*
      2  *   Unreal Internet Relay Chat Daemon, src/match.c
      3  *   Copyright (C) 1990 Jarkko Oikarinen
      4  *
      5  *   This program is free software; you can redistribute it and/or modify
      6  *   it under the terms of the GNU General Public License as published by
      7  *   the Free Software Foundation; either version 1, or (at your option)
      8  *   any later version.
      9  *
     10  *   This program is distributed in the hope that it will be useful,
     11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
     12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13  *   GNU General Public License for more details.
     14  *
     15  *   You should have received a copy of the GNU General Public License
     16  *   along with this program; if not, write to the Free Software
     17  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
     18  */
     19 
     20 
     21 #include "unrealircd.h"
     22 
     23 ID_Copyright("(C) 1990 Jarkko Oikarinen");
     24 
/*
 *  Wildcard matching: compare if a given string (name) matches the given
 *  mask, which can contain wild cards: '*' - match any number of chars,
 *  '?' - match any single character.
 *
 *  NOTE: the matchers defined in this file (match_esc() and match_simple())
 *  return 1 if the name matches the mask and 0 if it does not.
 */

/* Declarations of the case mapping tables; the initialized definitions
 * appear further down in this file.
 */
u_char touppertab[], tolowertab[];
/* lc(x) looks up x in tolowertab. The argument is used directly as an array
 * index (and is not parenthesized), so pass a simple unsigned character value.
 */
#define tolowertab2 tolowertab
#define lc(x) tolowertab2[x]
     37 
     38 /* Match routine for special cases where escaping is needed in a normal fashion.
     39  * Checks a string ('name') against a globbing(+more) pattern ('mask').
     40  * Original by Douglas A Lewis (dalewis@acsu.buffalo.edu).
     41  * Code based on hybrid7's version (match_esc()).
     42  * Various modifications by Bram Matthys (Syzop).
     43  * Returns 1 on match and 0 for no match.
     44  * Instead of our previous code, this one is less optimized but actually  _readable_ ;).
     45  * Modifications I (Syzop) had to do vs the hybrid7 code:
     46  * - Got rid of (u_char *) casts, since we already compile with
     47  *   chars defaulting to unsigned [or else major things break] ;).
     48  * - Support for '_'.
     49  * - Rip out support for '#'.
     50  */
     51 int match_esc(const char *mask, const char *name)
     52 {
     53 	const u_char *m = mask;
     54 	const u_char *n = name;
     55 	const u_char *ma = NULL;
     56 	const u_char *na = name;
     57 
     58 	while(1)
     59 	{
     60 		if (*m == '*')
     61 		{
     62 			while (*m == '*') /* collapse.. */
     63 				m++;
     64 			ma = m; 
     65 			na = n;
     66 		}
     67 		
     68 		if (!*m)
     69 		{
     70 			if (!*n)
     71 				return 1;
     72 			if (!ma)
     73 				return 0;
     74 			for (m--; (m > (const u_char *)mask) && (*m == '?'); m--);
     75 			if (*m == '*')
     76 				return 1;
     77 			m = ma;
     78 			n = ++na;
     79 		} else
     80 		if (!*n)
     81 		{
     82 			while (*m == '*') /* collapse.. */
     83 				m++;
     84 			return (*m == 0);
     85 		}
     86 		
     87 		if (*m != '?')
     88 		{
     89 			if (*m == '\\')
     90 				if (!*++m)
     91 					return 0; /* unfinished escape sequence */
     92 			if ((lc(*m) != lc(*n)) && !((*m == '_') && (*n == ' ')))
     93 			{
     94 				if (!ma)
     95 					return 0;
     96 				m = ma;
     97 				n = ++na;
     98 			} else
     99 			{
    100 				m++;
    101 				n++;
    102 			}
    103 		} else
    104 		{
    105 			m++;
    106 			n++;
    107 		}
    108 	}
    109 	return 0;
    110 }
    111 
    112 /** Same credit/copyright as match_esc() applies, except escaping removed.. ;p */
    113 int match_simple(const char *mask, const char *name)
    114 {
    115 	const u_char *m = mask;
    116 	const u_char *n = name;
    117 	const u_char *ma = NULL;
    118 	const u_char *na = name;
    119 
    120 	while(1)
    121 	{
    122 		if (*m == '*')
    123 		{
    124 			while (*m == '*') /* collapse.. */
    125 				m++;
    126 			ma = m; 
    127 			na = n;
    128 		}
    129 		
    130 		if (!*m)
    131 		{
    132 			if (!*n)
    133 				return 1;
    134 			if (!ma)
    135 				return 0;
    136 			for (m--; (m > (const u_char *)mask) && (*m == '?'); m--);
    137 			if (*m == '*')
    138 				return 1;
    139 			m = ma;
    140 			n = ++na;
    141 		} else
    142 		if (!*n)
    143 		{
    144 			while (*m == '*') /* collapse.. */
    145 				m++;
    146 			return (*m == 0);
    147 		}
    148 		
    149 		if ((lc(*m) != lc(*n)) && !((*m == '_') && (*n == ' ')) && (*m != '?'))
    150 		{
    151 			if (!ma)
    152 				return 0;
    153 			m = ma;
    154 			n = ++na;
    155 		} else
    156 		{
    157 			m++;
    158 			n++;
    159 		}
    160 	}
    161 	return 0;
    162 }
    163 
    164 /*
    165  * collapse a pattern string into minimal components.
    166  * This particular version is "in place", so that it changes the pattern
    167  * which is to be reduced to a "minimal" size.
    168  */
    169 char *collapse(char *pattern)
    170 {
    171 	char *s;
    172 	char *s1;
    173 	char *t;
    174 
    175 	s = pattern;
    176 
    177 	if (BadPtr(pattern))
    178 		return pattern;
    179 	/*
    180 	 * Collapse all \** into \*, \*[?]+\** into \*[?]+
    181 	 */
    182 	for (; *s; s++)
    183 		if (*s == '\\')
    184 		{
    185 			if (!*(s + 1))
    186 				break;
    187 			else
    188 				s++;
    189 		}
    190 		else if (*s == '*')
    191 		{
    192 			if (*(t = s1 = s + 1) == '*')
    193 				while (*t == '*')
    194 					t++;
    195 			else if (*t == '?')
    196 				for (t++, s1++; *t == '*' || *t == '?'; t++)
    197 					if (*t == '?')
    198 						*s1++ = *t;
    199 			while ((*s1++ = *t++))
    200 				;
    201 		}
    202 	return pattern;
    203 }
    204 
    205 
    206 /* Case insensitive comparison of two NULL terminated strings,
    207  * using the "IRC nick comparison" rules. Or, well, partially
    208  * anyway.
    209  * Should be used for NICK-related comparisons. And probably
    210  * not even then, since this does not deal with multibyte.
    211  * @returns 	 0, if s1 equal to s2
    212  *		<0, if s1 lexicographically less than s2
    213  *		>0, if s1 lexicographically greater than s2
    214  */
    215 int  smycmp(const char *s1, const char *s2)
    216 {
    217 	u_char *str1;
    218 	u_char *str2;
    219 	int  res;
    220 
    221 	str1 = (u_char *)s1;
    222 	str2 = (u_char *)s2;
    223 
    224 	while ((res = toupper(*str1) - toupper(*str2)) == 0)
    225 	{
    226 		if (*str1 == '\0')
    227 			return 0;
    228 		str1++;
    229 		str2++;
    230 	}
    231 	return (res);
    232 }
    233 
/* Lowercase mapping table, indexed by unsigned character value.
 * 'A'..'Z' map to 'a'..'z'; every other entry maps to itself
 * (so '[', '\\', ']' and '^' are NOT folded to '{', '|', '}' and '~').
 * This is the table behind the lc() macro used by the matchers.
 */
u_char tolowertab[] = {
	0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa,
	0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14,
	0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
	0x1e, 0x1f,
	' ', '!', '"', '#', '$', '%', '&', 0x27, '(', ')',
	'*', '+', ',', '-', '.', '/',
	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
	':', ';', '<', '=', '>', '?',
	'@', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
	'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
	't', 'u', 'v', 'w', 'x', 'y', 'z', '[', '\\', ']', '^',
	'_',
	'`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
	'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
	't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~',
	0x7f,
	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
	0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
	0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9,
	0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9,
	0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9,
	0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
	0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9,
	0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
    268 
/* Uppercase mapping table, indexed by unsigned character value.
 * 'a'..'z' map to 'A'..'Z'; every other entry maps to itself
 * (so '{', '|', '}' and '~' are NOT folded to '[', '\\', ']' and '^').
 */
u_char touppertab[] = {
	0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa,
	0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14,
	0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
	0x1e, 0x1f,
	' ', '!', '"', '#', '$', '%', '&', 0x27, '(', ')',
	'*', '+', ',', '-', '.', '/',
	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
	':', ';', '<', '=', '>', '?',
	'@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
	'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
	'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^',
	0x5f,
	'`', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
	'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
	'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '{', '|', '}', '~',
	0x7f,
	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
	0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
	0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9,
	0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9,
	0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9,
	0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
	0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9,
	0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
    303 
/* Character attribute table, indexed by unsigned character value.
 * Each entry is a bitwise OR of the CNTRL, SPACE, PRINT, DIGIT, ALPHA and
 * ALLOW flags (defined outside this file).
 * In this table ALLOW is set for the digits, the ASCII letters and for
 * '-', '.' and '_'. DEL (0x7f) and all values from 0x80 upwards have no
 * flags set.
 * NOTE(review): ALLOW presumably marks characters accepted in some kind of
 * name (host/user?) - confirm against the users of this flag.
 */
u_char char_atribs[] = {
/* 0-7 */ CNTRL, CNTRL, CNTRL, CNTRL, CNTRL, CNTRL, CNTRL, CNTRL,
/* 8-12 */ CNTRL, CNTRL | SPACE, CNTRL | SPACE, CNTRL | SPACE,
	CNTRL | SPACE,
/* 13-15 */ CNTRL | SPACE, CNTRL, CNTRL,
/* 16-23 */ CNTRL, CNTRL, CNTRL, CNTRL, CNTRL, CNTRL, CNTRL, CNTRL,
/* 24-31 */ CNTRL, CNTRL, CNTRL, CNTRL, CNTRL, CNTRL, CNTRL, CNTRL,
/* space */ PRINT | SPACE,
/* !"#$%&'( */ PRINT, PRINT, PRINT, PRINT, PRINT, PRINT, PRINT, PRINT,
/* )*+,-./ */ PRINT, PRINT, PRINT, PRINT, PRINT | ALLOW, PRINT | ALLOW,
	PRINT,
/* 012 */ PRINT | DIGIT | ALLOW, PRINT | DIGIT | ALLOW,
	PRINT | DIGIT | ALLOW,
/* 345 */ PRINT | DIGIT | ALLOW, PRINT | DIGIT | ALLOW,
	PRINT | DIGIT | ALLOW,
/* 678 */ PRINT | DIGIT | ALLOW, PRINT | DIGIT | ALLOW,
	PRINT | DIGIT | ALLOW,
/* 9:; */ PRINT | DIGIT | ALLOW, PRINT, PRINT,
/* <=>? */ PRINT, PRINT, PRINT, PRINT,
/* @ */ PRINT,
/* ABC */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* DEF */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* GHI */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* JKL */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* MNO */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* PQR */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* STU */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* VWX */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* YZ[ */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW, PRINT,
/* \]^ */ PRINT, PRINT, PRINT,
/* _`  */ PRINT | ALLOW, PRINT,
/* abc */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* def */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* ghi */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* jkl */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* mno */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* pqr */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* stu */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* vwx */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW,
	PRINT | ALPHA | ALLOW,
/* yz{ */ PRINT | ALPHA | ALLOW, PRINT | ALPHA | ALLOW, PRINT,
/* |}~ */ PRINT, PRINT, PRINT,
/* del */ 0,
/* 80-8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 90-9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* a0-af */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* b0-bf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* c0-cf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* d0-df */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* e0-ef */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* f0-ff */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
    371 
    372 /** Free up all resources of an Match entry (including the struct itself).
    373  * NOTE: this function may (also) be called for Match structs that have only been
    374  *       setup half-way, so use special care when accessing members (NULL checks!)
    375  */
    376 void unreal_delete_match(Match *m)
    377 {
    378 	safe_free(m->str);
    379 	if (m->type == MATCH_PCRE_REGEX)
    380 	{
    381 		if (m->ext.pcre2_expr)
    382 			pcre2_code_free(m->ext.pcre2_expr);
    383 	}
    384 	safe_free(m);
    385 }
    386 
    387 Match *unreal_create_match(MatchType type, const char *str, char **error)
    388 {
    389 	Match *m = safe_alloc(sizeof(Match));
    390 	static char errorbuf[512];
    391 
    392 	*errorbuf = '\0';
    393 
    394 	safe_strdup(m->str, str);
    395 	m->type = type;
    396 	
    397 	if (m->type == MATCH_SIMPLE)
    398 	{
    399 		/* Nothing to do */
    400 	}
    401 	else if (m->type == MATCH_PCRE_REGEX)
    402 	{
    403 		int errorcode = 0;
    404 		PCRE2_SIZE erroroffset = 0;
    405 		int options = 0;
    406 		char buf2[512];
    407 		
    408 		if (iConf.spamfilter_utf8)
    409 			options = PCRE2_CASELESS|PCRE2_MATCH_INVALID_UTF;
    410 		else
    411 			options = PCRE2_CASELESS|PCRE2_NEVER_UTF|PCRE2_NEVER_UCP;
    412 		
    413 		m->ext.pcre2_expr = pcre2_compile(str, PCRE2_ZERO_TERMINATED, options, &errorcode, &erroroffset, NULL);
    414 		if (m->ext.pcre2_expr == NULL)
    415 		{
    416 			*buf2 = '\0';
    417 			pcre2_get_error_message(errorcode, buf2, sizeof(buf2));
    418 			if (error)
    419 			{
    420 				if (erroroffset > 0)
    421 					snprintf(errorbuf, sizeof(errorbuf), "%s (at character #%d)", buf2, (int)erroroffset);
    422 				else
    423 					strlcpy(errorbuf, buf2, sizeof(errorbuf));
    424 				*error = errorbuf;
    425 			}
    426 			unreal_delete_match(m);
    427 			return NULL;
    428 		}
    429 		pcre2_jit_compile(m->ext.pcre2_expr, PCRE2_JIT_COMPLETE);
    430 		return m;
    431 	}
    432 	else {
    433 		/* Unknown type, how did that happen ? */
    434 		unreal_delete_match(m);
    435 		return NULL;
    436 	}
    437 	return m;
    438 }
    439 
    440 /** Try to match an Match entry ('m') against a string ('str').
    441  * @returns 1 if matched, 0 if not.
    442  * @note These (more logical) return values are opposite to the match_simple() function.
    443  */
    444 int unreal_match(Match *m, const char *str)
    445 {
    446 	if (m->type == MATCH_SIMPLE)
    447 	{
    448 		if (match_simple(m->str, str))
    449 			return 1;
    450 		return 0;
    451 	}
    452 	
    453 	if (m->type == MATCH_PCRE_REGEX)
    454 	{
    455 		pcre2_match_data *md = pcre2_match_data_create(9, NULL);
    456 		int ret;
    457 		
    458 		ret = pcre2_match(m->ext.pcre2_expr, str, PCRE2_ZERO_TERMINATED, 0, 0, md, NULL); /* run the regex */
    459 		pcre2_match_data_free(md); /* yeah, we never use it. unfortunately argument must be non-NULL for pcre2_match() */
    460 		
    461 		if (ret > 0)
    462 			return 1; /* MATCH */		
    463 		return 0; /* NO MATCH */
    464 	}
    465 
    466 	return 0;
    467 }
    468 
    469 int unreal_match_method_strtoval(const char *str)
    470 {
    471 	if (!strcmp(str, "regex") || !strcmp(str, "pcre"))
    472 		return MATCH_PCRE_REGEX;
    473 	if (!strcmp(str, "simple") || !strcmp(str, "glob"))
    474 		return MATCH_SIMPLE;
    475 	return 0;
    476 }
    477 
    478 char *unreal_match_method_valtostr(int val)
    479 {
    480 	if (val == MATCH_PCRE_REGEX)
    481 		return "regex";
    482 	if (val == MATCH_SIMPLE)
    483 		return "simple";
    484 	
    485 	return "unknown";
    486 }
    487 
/* It is unfortunate that we have 2 matching/replace systems.
 * However, the code above is for spamfilter matching and similar,
 * while the code below is for matching on WORDS, which does specific
 * things like replacement on word boundaries etc.
 * Moved here from the censor channel and user mode module
 * (previously was present in both modules, code duplication)
 */
    495 int fast_badword_match(ConfigItem_badword *badword, const char *line)
    496 {
    497 	const char *p;
    498 	int bwlen = strlen(badword->word);
    499 
    500 	if ((badword->type & BADW_TYPE_FAST_L) && (badword->type & BADW_TYPE_FAST_R))
    501 		return (our_strcasestr(line, badword->word) ? 1 : 0);
    502 
    503 	p = line;
    504 	while((p = our_strcasestr(p, badword->word)))
    505 	{
    506 		if (!(badword->type & BADW_TYPE_FAST_L))
    507 		{
    508 			if ((p != line) && !iswseperator(*(p - 1))) /* aaBLA but no *BLA */
    509 				goto next;
    510 		}
    511 		if (!(badword->type & BADW_TYPE_FAST_R))
    512 		{
    513 			if (!iswseperator(*(p + bwlen)))  /* BLAaa but no BLA* */
    514 				goto next;
    515 		}
    516 		/* Looks like it matched */
    517 		return 1;
    518 next:
    519 		p += bwlen;
    520 	}
    521 	return 0;
    522 }
    523 
/* fast_badword_replace:
 * A fast replace routine written by Syzop used for replacing badwords.
 * This searches in line for the bad word and replaces it.
 * The whole word that contains the match (from the preceding separator up
 * to the next separator or end of line) is replaced by badword->replace,
 * or by REPLACEWORD if no replacement is set.
 * buf is used for the result and max is sizeof(buf).
 * Assumptions[!]: max > 0 AND max > strlen(line)+1
 * Returns 1 if something was replaced (also when the output had to be cut
 * off because buf is full), 0 if nothing was replaced. In the latter case
 * buf still receives a (possibly truncated) copy of line.
 */
int fast_badword_replace(ConfigItem_badword *badword, const char *line, char *buf, int max)
{
	/* Some aliases ;P */
	char *replacew = badword->replace ? badword->replace : REPLACEWORD;
	const char *pold = line; /* pointer to the old string */
	const char *poldx = line; /* start of the part of line that is not copied to buf yet */
	char *pnew = buf; /* pointer to the new string */
	int replacen = -1; /* Only calculated if needed. w00t! saves us a few nanosecs? lol */
	int searchn = -1; /* length of badword->word, also only calculated if needed */
	const char *startw, *endw; /* start and end of the word */
	char *c_eol = buf + max - 1; /* Cached end of (new) line */
	int run = 1; /* never cleared: the loop is left via break/return */
	int cleaned = 0; /* set to 1 once a word is accepted for replacement */

	while(run) {
		pold = our_strcasestr(pold, badword->word);
		if (!pold)
			break;
		if (replacen == -1)
			replacen = strlen(replacew);
		if (searchn == -1)
			searchn = strlen(badword->word);
		/* Hunt for start of word */
		if (pold > line) {
			for (startw = pold; (!iswseperator(*startw) && (startw != line)); startw--);
			if (iswseperator(*startw))
				startw++; /* Don't point at the space/seperator but at the word! */
		} else {
			startw = pold;
		}

		/* Without FAST_L the badword must be at the very start of the word */
		if (!(badword->type & BADW_TYPE_FAST_L) && (pold != startw)) {
			/* not matched */
			pold++;
			continue;
		}

		/* Hunt for end of word
		 * Fix for bug #4909: word will be at least 'searchn' long so we can skip
		 * 'searchn' bytes and avoid stopping half-way the badword.
		 */
		for (endw = pold+searchn; ((*endw != '\0') && (!iswseperator(*endw))); endw++);

		/* Without FAST_R the badword must be at the very end of the word */
		if (!(badword->type & BADW_TYPE_FAST_R) && (pold+searchn != endw)) {
			/* not matched */
			pold++;
			continue;
		}

		cleaned = 1; /* still too soon? Syzop/20050227 */

		/* Do we have any not-copied-yet data? */
		if (poldx != startw) {
			int tmp_n = startw - poldx;
			if (pnew + tmp_n >= c_eol) {
				/* Partial copy and return... */
				memcpy(pnew, poldx, c_eol - pnew);
				*c_eol = '\0';
				return 1;
			}

			memcpy(pnew, poldx, tmp_n);
			pnew += tmp_n;
		}
		/* Now update the word in buf (pnew is now something like startw-in-new-buffer */

		if (replacen) {
			if ((pnew + replacen) >= c_eol) {
				/* Partial copy and return... */
				memcpy(pnew, replacew, c_eol - pnew);
				*c_eol = '\0';
				return 1;
			}
			memcpy(pnew, replacew, replacen);
			pnew += replacen;
		}
		/* Continue the search (and the pending copy) right after the replaced word */
		poldx = pold = endw;
	}
	/* Copy the last part */
	if (*poldx) {
		strncpy(pnew, poldx, c_eol - pnew);
		*(c_eol) = '\0'; /* strncpy() does not terminate when it truncates */
	} else {
		*pnew = '\0';
	}
	return cleaned;
}
    617 
/*
 * Returns a string, which has been filtered by the words in the badword
 * list that starts at 'start_bw'.  Its primary use is to filter swearing
 * in both private and public messages.
 *
 * The filtering is done on a copy of 'str' from which control codes have
 * been stripped (StripControlCodes()).
 *
 * *blocked is always set: to 1 if a badword with action BADWORD_BLOCK
 * matched (the function then returns NULL), to 0 otherwise.
 *
 * Return value:
 * - NULL if the text is blocked (see above)
 * - 'str' itself if there are no badwords or if nothing was replaced
 * - otherwise a pointer to a static buffer holding the filtered text,
 *   cut off at 511 characters. It is overwritten by the next call.
 */
const char *stripbadwords(const char *str, ConfigItem_badword *start_bw, int *blocked)
{
	static char cleanstr[4096];
	char buf[4096];
	char *ptr;
	int matchlen, m, stringlen, cleaned;
	ConfigItem_badword *this_word;

	*blocked = 0;

	if (!start_bw)
		return str;

	/*
	 * work on a copy
	 */
	stringlen = strlcpy(cleanstr, StripControlCodes(str), sizeof cleanstr);
	matchlen = 0; /* total length of all regex matches replaced so far */
	buf[0] = '\0';
	cleaned = 0;

	for (this_word = start_bw; this_word; this_word = this_word->next)
	{
		if (this_word->type & BADW_TYPE_FAST)
		{
			if (this_word->action == BADWORD_BLOCK)
			{
				if (fast_badword_match(this_word, cleanstr))
				{
					*blocked = 1;
					return NULL;
				}
			}
			else
			{
				int n;
				/* fast_badword_replace() does size checking so we can use 512 here instead of 4096 */
				n = fast_badword_replace(this_word, cleanstr, buf, 512);
				if (!cleaned && n)
					cleaned = n;
				/* The (possibly replaced) text becomes the input for the next badword */
				strcpy(cleanstr, buf);
				memset(buf, 0, sizeof(buf)); /* regexp likes this somehow */
			}
		} else
		if (this_word->type & BADW_TYPE_REGEX)
		{
			if (this_word->action == BADWORD_BLOCK)
			{
				pcre2_match_data *md = pcre2_match_data_create(9, NULL);
				int ret;

				ret = pcre2_match(this_word->pcre2_expr, cleanstr, PCRE2_ZERO_TERMINATED, 0, 0, md, NULL); /* run the regex */
				pcre2_match_data_free(md); /* yeah, we never use it. unfortunately argument must be non-NULL for pcre2_match() */
				if (ret > 0)
				{
					*blocked = 1;
					return NULL;
				}
			}
			else
			{
				pcre2_match_data *md;
				int ret;
				PCRE2_SIZE *dd;
				int start, end;

				/* Replace every match: each round matches against the
				 * remainder of the text (ptr) and appends the part before
				 * the match plus the replacement word to buf.
				 */
				ptr = cleanstr; /* set pointer to start of string */
				while(1) {
					md = pcre2_match_data_create(9, NULL);
					/* ^^ we need to free 'md' in ALL circumstances.
					 * remember this if you break or continue in this loop!
					 */
					ret = pcre2_match(this_word->pcre2_expr, ptr, PCRE2_ZERO_TERMINATED, 0, 0, md, NULL); /* run the regex */
					if (ret > 0)
					{
						/* dd[0] and dd[1] are the start and end offset of the match within ptr */
						dd = pcre2_get_ovector_pointer(md);
						start = (int)dd[0];
						end = (int)dd[1];
						if ((start < 0) || (end < 0) || (start > strlen(ptr)) || (end > strlen(ptr)+1))
						{
							unreal_log(ULOG_FATAL, "main", "BUG_STRIPBADWORDS_PCRE2_MATCH_OOB", NULL,
							           "[BUG] pcre2_match() returned an ovector with OOB start/end: $start/$end, len $length: '$buf'",
							           log_data_integer("start", start),
							           log_data_integer("end", end),
							           log_data_integer("length", strlen(ptr)),
							           log_data_string("buf", ptr));
							abort();
						}
						m = end - start;
						if (m == 0)
						{
							/* An empty match would never advance ptr */
							pcre2_match_data_free(md);
							break; /* anti-loop */
						}
						cleaned = 1;
						matchlen += m;
						strlncat(buf, ptr, sizeof buf, start);
						if (this_word->replace)
							strlcat(buf, this_word->replace, sizeof buf); 
						else
							strlcat(buf, REPLACEWORD, sizeof buf);
						ptr += end; /* Set pointer after the match pos */
						pcre2_match_data_free(md);
						continue; /* next! */
					}
					pcre2_match_data_free(md);
					break; /* NOMATCH: we are done! */
				}
				/* All the better to eat you with! */
				strlcat(buf, ptr, sizeof buf);	
				memcpy(cleanstr, buf, sizeof cleanstr);
				memset(buf, 0, sizeof(buf));
				/* Stop looking at further badwords once the accumulated
				 * match length equals the length of the original copy.
				 */
				if (matchlen == stringlen)
					break;
			}
		}
	}

	cleanstr[511] = '\0'; /* cutoff, just to be sure */

	return (cleaned) ? cleanstr : str;
}
    745 
    746 /** Checks if the specified regex (or fast badwords) is valid.
    747  * returns NULL in case of success [!],
    748  * pointer to buffer with error message otherwise
    749  * if check_broadness is 1, the function will attempt to determine
    750  * if the given regex string is too broad (i.e. matches everything)
    751  */
    752 const char *badword_config_check_regex(const char *str, int fastsupport, int check_broadness)
    753 {
    754 	int regex=0;
    755 	const char *tmp;
    756 	static char errorbuf[512];
    757 
    758 	if (fastsupport)
    759 	{
    760 		for (tmp = str; *tmp; tmp++) {
    761 			if (!isalnum(*tmp) && !(*tmp >= 128)) {
    762 				if ((str == tmp) && (*tmp == '*'))
    763 					continue;
    764 				if ((*(tmp + 1) == '\0') && (*tmp == '*'))
    765 					continue;
    766 				regex = 1;
    767 				break;
    768 			}
    769 		}
    770 	}
    771 	if (!fastsupport || regex)
    772 	{
    773 		int errorcode = 0;
    774 		PCRE2_SIZE erroroffset = 0;
    775 		pcre2_code *expr;
    776 		int options = 0;
    777 		char buf2[512];
    778 
    779 		options = PCRE2_CASELESS|PCRE2_NEVER_UTF|PCRE2_NEVER_UCP;
    780 
    781 		expr = pcre2_compile(str, PCRE2_ZERO_TERMINATED, options, &errorcode, &erroroffset, NULL);
    782 		if (expr == NULL)
    783 		{
    784 			pcre2_get_error_message(errorcode, buf2, sizeof(buf2));
    785 			if (erroroffset > 0)
    786 				snprintf(errorbuf, sizeof(errorbuf), "%s (at character #%d)", buf2, (int)erroroffset);
    787 			else
    788 				strlcpy(errorbuf, buf2, sizeof(errorbuf));
    789 			return errorbuf;
    790 		}
    791 		pcre2_code_free(expr);
    792 	}
    793 	return NULL;
    794 }
    795 
    796 int badword_config_process(ConfigItem_badword *ca, const char *str)
    797 {
    798 	const char *tmp;
    799 	short regex = 0;
    800 	int ast_l = 0, ast_r = 0;
    801 
    802 	/* The fast badwords routine can do: "blah" "*blah" "blah*" and "*blah*",
    803 	 * in all other cases use regex.
    804 	 */
    805 	for (tmp = str; *tmp; tmp++) {
    806 		if (!isalnum(*tmp) && !(*tmp >= 128)) {
    807 			if ((str == tmp) && (*tmp == '*')) {
    808 				ast_l = 1; /* Asterisk at the left */
    809 				continue;
    810 			}
    811 			if ((*(tmp + 1) == '\0') && (*tmp == '*')) {
    812 				ast_r = 1; /* Asterisk at the right */
    813 				continue;
    814 			}
    815 			regex = 1;
    816 			break;
    817 		}
    818 	}
    819 	if (regex)
    820 	{
    821 		int errorcode = 0;
    822 		PCRE2_SIZE erroroffset = 0;
    823 		int options = 0;
    824 
    825 		ca->type = BADW_TYPE_REGEX;
    826 		safe_strdup(ca->word, str);
    827 
    828 		options = PCRE2_CASELESS|PCRE2_NEVER_UTF|PCRE2_NEVER_UCP;
    829 
    830 		ca->pcre2_expr = pcre2_compile(str, PCRE2_ZERO_TERMINATED, options, &errorcode, &erroroffset, NULL);
    831 		if (ca->pcre2_expr == NULL)
    832 		{
    833 			/* This cannot happen since badword_config_check_regex()
    834 			 * should be called from config_test on each regex.
    835 			 */
    836 			config_error("badword_config_process(): failed to compile regex '%s', this is impossible!", str);
    837 			abort();
    838 		}
    839 		pcre2_jit_compile(ca->pcre2_expr, PCRE2_JIT_COMPLETE);
    840 	}
    841 	else
    842 	{
    843 		char *tmpw;
    844 		ca->type = BADW_TYPE_FAST;
    845 		ca->word = tmpw = safe_alloc(strlen(str) - ast_l - ast_r + 1);
    846 		/* Copy except for asterisks */
    847 		for (tmp = str; *tmp; tmp++)
    848 			if (*tmp != '*')
    849 				*tmpw++ = *tmp;
    850 		*tmpw = '\0';
    851 		if (ast_l)
    852 			ca->type |= BADW_TYPE_FAST_L;
    853 		if (ast_r)
    854 			ca->type |= BADW_TYPE_FAST_R;
    855 	}
    856 
    857 	return 1;
    858 }
    859 
    860 /** Frees a ConfigItem_badword item.
    861  * Note that it does NOT remove from the list, you need
    862  * to do this BEFORE calling this function.
    863  */
    864 void badword_config_free(ConfigItem_badword *e)
    865 {
    866 	safe_free(e->word);
    867 	safe_free(e->replace);
    868 	if (e->pcre2_expr)
    869 		pcre2_code_free(e->pcre2_expr);
    870 	safe_free(e);
    871 }