unrealircd

- supernets unrealircd source & configuration
git clone git://git.acid.vegas/unrealircd.git
Log | Files | Refs | Archive

antimixedutf8.c (9733B)

      1 /*
      2  * Anti mixed UTF8 - a filter written by Bram Matthys ("Syzop").
      3  * Reported by Mr_Smoke in https://bugs.unrealircd.org/view.php?id=5163
      4  * Tested by PeGaSuS (The_Myth) with some of the most used spam lines.
      5  * Help with testing and fixing Cyrillic from 'i' <info@servx.org>
      6  *
      7  * ==[ ABOUT ]==
      8  * This module will detect and stop spam consisting of characters of
      9  * mixed "scripts", where some characters are in Latin script and other
     10  * characters are in Cyrillic.
     11  * This unusual behavior can be detected easily and action can be taken.
     12  *
     13  * ==[ MODULE LOADING AND CONFIGURATION ]==
     14  * loadmodule "antimixedutf8";
     15  * set {
     16  *         antimixedutf8 {
     17  *                 score 10;
     18  *                 ban-action block;
     19  *                 ban-reason "Possible mixed character spam";
     20  *                 ban-time 4h; // For other types
     21  *                 except {
     22  *                 }
     23  *         };
     24  * };
     25  *
     26  * ==[ LICENSE AND PORTING ]==
     27  * Feel free to copy/move the idea or code to other IRCds.
     28  * The license is GPLv1 (or later, at your option):
     29  *
     30  * This program is free software; you can redistribute it and/or modify
     31  * it under the terms of the GNU General Public License as published by
     32  * the Free Software Foundation; either version 1, or (at your option)
     33  * any later version.
     34  *
     35  * This program is distributed in the hope that it will be useful,
     36  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     37  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     38  * GNU General Public License for more details.
     39  *
     40  * You should have received a copy of the GNU General Public License
     41  * along with this program; if not, write to the Free Software
     42  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
     43  */
     44 
     45 #include "unrealircd.h"
     46 
     47 ModuleHeader MOD_HEADER
     48 = {
     49 	"antimixedutf8",
     50 	"1.0",
     51 	"Mixed UTF8 character filter (look-alike character spam) - by Syzop",
     52 	"UnrealIRCd Team",
     53 	"unrealircd-6",
     54 };
     55 
     56 struct {
     57 	int score;
     58 	BanAction *ban_action;
     59 	char *ban_reason;
     60 	long ban_time;
     61 	SecurityGroup *except;
     62 } cfg;
     63 
     64 static void free_config(void);
     65 static void init_config(void);
     66 int antimixedutf8_config_test(ConfigFile *, ConfigEntry *, int, int *);
     67 int antimixedutf8_config_run(ConfigFile *, ConfigEntry *, int);
     68 
     69 #define SCRIPT_UNDEFINED	0
     70 #define SCRIPT_LATIN		1
     71 #define SCRIPT_CYRILLIC		2
     72 #define SCRIPT_CJK		3
     73 #define SCRIPT_HANGUL		4
     74 #define SCRIPT_CANADIAN		5
     75 #define SCRIPT_TELUGU		6
     76 
     77 /**** the detection algorithm follows first, the module/config code is at the end ****/
     78 
     79 /** Detect which script the current character is,
     80  * such as latin script or cyrillic script.
     81  * @retval See SCRIPT_*
     82  */
     83 int detect_script(const char *t)
     84 {
     85 	/* Safety: as long as *t is never \0 then at worst
     86 	 * the character after this will be \0 and since we
     87 	 * only look at 2 characters (at most) at a time
     88 	 * this will be safe.
     89 	 */
     90 
     91 	/* Currently we only detect cyrillic and call all the
     92 	 * rest latin (which is not true). This can always
     93 	 * be enhanced later.
     94 	 */
     95 
     96 	if ((t[0] == 0xd0) && (t[1] >= 0x80) && (t[1] <= 0xbf))
     97 		return SCRIPT_CYRILLIC;
     98 	else if ((t[0] == 0xd1) && (t[1] >= 0x80) && (t[1] <= 0xbf))
     99 		return SCRIPT_CYRILLIC;
    100 	else if ((t[0] == 0xd2) && (t[1] >= 0x80) && (t[1] <= 0xbf))
    101 		return SCRIPT_CYRILLIC;
    102 	else if ((t[0] == 0xd3) && (t[1] >= 0x80) && (t[1] <= 0xbf))
    103 		return SCRIPT_CYRILLIC;
    104 
    105 	if ((t[0] == 0xe4) && (t[1] >= 0xb8) && (t[1] <= 0xbf))
    106 		return SCRIPT_CJK;
    107 	else if ((t[0] >= 0xe5) && (t[0] <= 0xe9) && (t[1] >= 0x80) && (t[1] <= 0xbf))
    108 		return SCRIPT_CJK;
    109 
    110 	if ((t[0] == 0xea) && (t[1] >= 0xb0) && (t[1] <= 0xbf))
    111 		return SCRIPT_HANGUL;
    112 	else if ((t[0] >= 0xeb) && (t[0] <= 0xec) && (t[1] >= 0x80) && (t[1] <= 0xbf))
    113 		return SCRIPT_HANGUL;
    114 	else if ((t[0] == 0xed) && (t[1] >= 0x80) && (t[1] <= 0x9f))
    115 		return SCRIPT_HANGUL;
    116 
    117 	if ((t[0] == 0xe1) && (t[1] >= 0x90) && (t[1] <= 0x99))
    118 		return SCRIPT_CANADIAN;
    119 
    120 	if ((t[0] == 0xe0) && (t[1] >= 0xb0) && (t[1] <= 0xb1))
    121 		return SCRIPT_TELUGU;
    122 
    123 	if ((t[0] >= 'a') && (t[0] <= 'z'))
    124 		return SCRIPT_LATIN;
    125 	if ((t[0] >= 'A') && (t[0] <= 'Z'))
    126 		return SCRIPT_LATIN;
    127 
    128 	return SCRIPT_UNDEFINED;
    129 }
    130 
    131 /** Returns length of an (UTF8) character. May return <1 for error conditions.
    132  * Made by i <info@servx.org>
    133  */
    134 static int utf8_charlen(const char *str)
    135 {
    136 	struct { char mask; char val; } t[4] =
    137 	{ { 0x80, 0x00 }, { 0xE0, 0xC0 }, { 0xF0, 0xE0 }, { 0xF8, 0xF0 } };
    138 	unsigned k, j;
    139 
    140 	for (k = 0; k < 4; k++)
    141 	{
    142 		if ((*str & t[k].mask) == t[k].val)
    143 		{
    144 			for (j = 0; j < k; j++)
    145 			{
    146 				if ((*(++str) & 0xC0) != 0x80)
    147 					return -1;
    148 			}
    149 			return k + 1;
    150 		}
    151 	}
    152 	return 1;
    153 }
    154 
    155 int lookalikespam_score(const char *text)
    156 {
    157 	const char *p;
    158 	int last_script = SCRIPT_UNDEFINED;
    159 	int current_script;
    160 	int points = 0;
    161 	int last_character_was_word_separator = 0;
    162 	int skip = 0;
    163 
    164 	for (p = text; *p; p++)
    165 	{
    166 		current_script = detect_script(p);
    167 
    168 		if (current_script != SCRIPT_UNDEFINED)
    169 		{
    170 			if ((current_script != last_script) && (last_script != SCRIPT_UNDEFINED))
    171 			{
    172 				/* A script change = 1 point */
    173 				points++;
    174 
    175 				/* Give an additional point if the script change happened
    176 				 * within the same word, as that would be rather unusual
    177 				 * in normal cases.
    178 				 */
    179 				if (!last_character_was_word_separator)
    180 					points++;
    181 			}
    182 			last_script = current_script;
    183 		}
    184 
    185 		if (strchr("., ", *p))
    186 			last_character_was_word_separator = 1;
    187 		else
    188 			last_character_was_word_separator = 0;
    189 
    190 		skip = utf8_charlen(p);
    191 		if (skip > 1)
    192 			p += skip - 1;
    193 	}
    194 
    195 	return points;
    196 }
    197 
    198 CMD_OVERRIDE_FUNC(override_msg)
    199 {
    200 	int score, retval;
    201 
    202 	if (!MyUser(client) || (parc < 3) || BadPtr(parv[2]) ||
    203 	    user_allowed_by_security_group(client, cfg.except))
    204 	{
    205 		/* Short circuit for: remote clients, insufficient parameters,
    206 		 * antimixedutf8::except.
    207 		 */
    208 		CALL_NEXT_COMMAND_OVERRIDE();
    209 		return;
    210 	}
    211 
    212 	score = lookalikespam_score(StripControlCodes(parv[2]));
    213 	if ((score >= cfg.score) && !find_tkl_exception(TKL_ANTIMIXEDUTF8, client))
    214 	{
    215 		unreal_log(ULOG_INFO, "antimixedutf8", "ANTIMIXEDUTF8_HIT", client,
    216 		           "[antimixedutf8] Client $client.details hit score $score -- taking action",
    217 		           log_data_integer("score", score));
    218 		/* Take the action */
    219 		retval = take_action(client, cfg.ban_action, cfg.ban_reason, cfg.ban_time, 0, NULL);
    220 		if ((retval == BAN_ACT_WARN) || (retval == BAN_ACT_SOFT_WARN))
    221 		{
    222 			/* no action */
    223 		} else
    224 		if ((retval == BAN_ACT_BLOCK) || (retval == BAN_ACT_SOFT_BLOCK))
    225 		{
    226 			sendnotice(client, "%s", cfg.ban_reason);
    227 			return;
    228 		} else if (retval > 0)
    229 		{
    230 			return;
    231 		}
    232 		/* fallthrough for retval <=0 */
    233 	}
    234 
    235 	CALL_NEXT_COMMAND_OVERRIDE();
    236 }
    237 
    238 /*** rest is module and config stuff ****/
    239 
    240 MOD_TEST()
    241 {
    242 	HookAdd(modinfo->handle, HOOKTYPE_CONFIGTEST, 0, antimixedutf8_config_test);
    243 	return MOD_SUCCESS;
    244 }
    245 
    246 MOD_INIT()
    247 {
    248 	MARK_AS_OFFICIAL_MODULE(modinfo);
    249 
    250 	init_config();
    251 	HookAdd(modinfo->handle, HOOKTYPE_CONFIGRUN, 0, antimixedutf8_config_run);
    252 	return MOD_SUCCESS;
    253 }
    254 
    255 MOD_LOAD()
    256 {
    257 	if (!CommandOverrideAdd(modinfo->handle, "PRIVMSG", 0, override_msg))
    258 		return MOD_FAILED;
    259 
    260 	if (!CommandOverrideAdd(modinfo->handle, "NOTICE", 0, override_msg))
    261 		return MOD_FAILED;
    262 
    263 	return MOD_SUCCESS;
    264 }
    265 
    266 MOD_UNLOAD()
    267 {
    268 	free_config();
    269 	return MOD_SUCCESS;
    270 }
    271 
    272 static void init_config(void)
    273 {
    274 	memset(&cfg, 0, sizeof(cfg));
    275 	/* Default values */
    276 	cfg.score = 10;
    277 	safe_strdup(cfg.ban_reason, "Possible mixed character spam");
    278 	cfg.ban_action = banact_value_to_struct(BAN_ACT_BLOCK);
    279 	cfg.ban_time = 60 * 60 * 4; /* irrelevant for block, but some default for others */
    280 }
    281 
    282 static void free_config(void)
    283 {
    284 	safe_free(cfg.ban_reason);
    285 	free_security_group(cfg.except);
    286 	safe_free_all_ban_actions(cfg.ban_action);
    287 	memset(&cfg, 0, sizeof(cfg)); /* needed! */
    288 }
    289 
    290 int antimixedutf8_config_test(ConfigFile *cf, ConfigEntry *ce, int type, int *errs)
    291 {
    292 	int errors = 0;
    293 	ConfigEntry *cep;
    294 
    295 	if (type != CONFIG_SET)
    296 		return 0;
    297 
    298 	/* We are only interrested in set::antimixedutf8... */
    299 	if (!ce || !ce->name || strcmp(ce->name, "antimixedutf8"))
    300 		return 0;
    301 
    302 	for (cep = ce->items; cep; cep = cep->next)
    303 	{
    304 		if (!cep->value)
    305 		{
    306 			config_error("%s:%i: set::antimixedutf8::%s with no value",
    307 				cep->file->filename, cep->line_number, cep->name);
    308 			errors++;
    309 		} else
    310 		if (!strcmp(cep->name, "score"))
    311 		{
    312 			int v = atoi(cep->value);
    313 			if ((v < 1) || (v > 99))
    314 			{
    315 				config_error("%s:%i: set::antimixedutf8::score: must be between 1 - 99 (got: %d)",
    316 					cep->file->filename, cep->line_number, v);
    317 				errors++;
    318 			}
    319 		} else
    320 		if (!strcmp(cep->name, "ban-action"))
    321 		{
    322 			errors += test_ban_action_config(cep);
    323 		} else
    324 		if (!strcmp(cep->name, "ban-reason"))
    325 		{
    326 		} else
    327 		if (!strcmp(cep->name, "ban-time"))
    328 		{
    329 		} else
    330 		if (!strcmp(cep->name, "except"))
    331 		{
    332 			test_match_block(cf, cep, &errors);
    333 		} else
    334 		{
    335 			config_error("%s:%i: unknown directive set::antimixedutf8::%s",
    336 				cep->file->filename, cep->line_number, cep->name);
    337 			errors++;
    338 		}
    339 	}
    340 	*errs = errors;
    341 	return errors ? -1 : 1;
    342 }
    343 
    344 int antimixedutf8_config_run(ConfigFile *cf, ConfigEntry *ce, int type)
    345 {
    346 	ConfigEntry *cep;
    347 
    348 	if (type != CONFIG_SET)
    349 		return 0;
    350 
    351 	/* We are only interrested in set::antimixedutf8... */
    352 	if (!ce || !ce->name || strcmp(ce->name, "antimixedutf8"))
    353 		return 0;
    354 
    355 	for (cep = ce->items; cep; cep = cep->next)
    356 	{
    357 		if (!strcmp(cep->name, "score"))
    358 		{
    359 			cfg.score = atoi(cep->value);
    360 		} else
    361 		if (!strcmp(cep->name, "ban-action"))
    362 		{
    363 			parse_ban_action_config(cep, &cfg.ban_action);
    364 		} else
    365 		if (!strcmp(cep->name, "ban-reason"))
    366 		{
    367 			safe_strdup(cfg.ban_reason, cep->value);
    368 		} else
    369 		if (!strcmp(cep->name, "ban-time"))
    370 		{
    371 			cfg.ban_time = config_checkval(cep->value, CFG_TIME);
    372 		} else
    373 		if (!strcmp(cep->name, "except"))
    374 		{
    375 			conf_match_block(cf, cep, &cfg.except);
    376 		}
    377 	}
    378 	return 1;
    379 }