unrealircd- supernets unrealircd source & configuration |
git clone git://git.acid.vegas/unrealircd.git |
Log | Files | Refs | Archive |
antimixedutf8.c (9733B)
1 /* 2 * Anti mixed UTF8 - a filter written by Bram Matthys ("Syzop"). 3 * Reported by Mr_Smoke in https://bugs.unrealircd.org/view.php?id=5163 4 * Tested by PeGaSuS (The_Myth) with some of the most used spam lines. 5 * Help with testing and fixing Cyrillic from 'i' <info@servx.org> 6 * 7 * ==[ ABOUT ]== 8 * This module will detect and stop spam containing of characters of 9 * mixed "scripts", where some characters are in Latin script and other 10 * characters are in Cyrillic. 11 * This unusual behavior can be detected easily and action can be taken. 12 * 13 * ==[ MODULE LOADING AND CONFIGURATION ]== 14 * loadmodule "antimixedutf8"; 15 * set { 16 * antimixedutf8 { 17 * score 10; 18 * ban-action block; 19 * ban-reason "Possible mixed character spam"; 20 * ban-time 4h; // For other types 21 * except { 22 * } 23 * }; 24 * }; 25 * 26 * ==[ LICENSE AND PORTING ]== 27 * Feel free to copy/move the idea or code to other IRCds. 28 * The license is GPLv1 (or later, at your option): 29 * 30 * This program is free software; you can redistribute it and/or modify 31 * it under the terms of the GNU General Public License as published by 32 * the Free Software Foundation; either version 1, or (at your option) 33 * any later version. 34 * 35 * This program is distributed in the hope that it will be useful, 36 * but WITHOUT ANY WARRANTY; without even the implied warranty of 37 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 38 * GNU General Public License for more details. 39 * 40 * You should have received a copy of the GNU General Public License 41 * along with this program; if not, write to the Free Software 42 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 43 */ 44 45 #include "unrealircd.h" 46 47 ModuleHeader MOD_HEADER 48 = { 49 "antimixedutf8", 50 "1.0", 51 "Mixed UTF8 character filter (look-alike character spam) - by Syzop", 52 "UnrealIRCd Team", 53 "unrealircd-6", 54 }; 55 56 struct { 57 int score; 58 BanAction *ban_action; 59 char *ban_reason; 60 long ban_time; 61 SecurityGroup *except; 62 } cfg; 63 64 static void free_config(void); 65 static void init_config(void); 66 int antimixedutf8_config_test(ConfigFile *, ConfigEntry *, int, int *); 67 int antimixedutf8_config_run(ConfigFile *, ConfigEntry *, int); 68 69 #define SCRIPT_UNDEFINED 0 70 #define SCRIPT_LATIN 1 71 #define SCRIPT_CYRILLIC 2 72 #define SCRIPT_CJK 3 73 #define SCRIPT_HANGUL 4 74 #define SCRIPT_CANADIAN 5 75 #define SCRIPT_TELUGU 6 76 77 /**** the detection algorithm follows first, the module/config code is at the end ****/ 78 79 /** Detect which script the current character is, 80 * such as latin script or cyrillic script. 81 * @retval See SCRIPT_* 82 */ 83 int detect_script(const char *t) 84 { 85 /* Safety: as long as *t is never \0 then at worst 86 * the character after this will be \0 and since we 87 * only look at 2 characters (at most) at a time 88 * this will be safe. 89 */ 90 91 /* Currently we only detect cyrillic and call all the 92 * rest latin (which is not true). This can always 93 * be enhanced later. 94 */ 95 96 if ((t[0] == 0xd0) && (t[1] >= 0x80) && (t[1] <= 0xbf)) 97 return SCRIPT_CYRILLIC; 98 else if ((t[0] == 0xd1) && (t[1] >= 0x80) && (t[1] <= 0xbf)) 99 return SCRIPT_CYRILLIC; 100 else if ((t[0] == 0xd2) && (t[1] >= 0x80) && (t[1] <= 0xbf)) 101 return SCRIPT_CYRILLIC; 102 else if ((t[0] == 0xd3) && (t[1] >= 0x80) && (t[1] <= 0xbf)) 103 return SCRIPT_CYRILLIC; 104 105 if ((t[0] == 0xe4) && (t[1] >= 0xb8) && (t[1] <= 0xbf)) 106 return SCRIPT_CJK; 107 else if ((t[0] >= 0xe5) && (t[0] <= 0xe9) && (t[1] >= 0x80) && (t[1] <= 0xbf)) 108 return SCRIPT_CJK; 109 110 if ((t[0] == 0xea) && (t[1] >= 0xb0) && (t[1] <= 0xbf)) 111 return SCRIPT_HANGUL; 112 else if ((t[0] >= 0xeb) && (t[0] <= 0xec) && (t[1] >= 0x80) && (t[1] <= 0xbf)) 113 return SCRIPT_HANGUL; 114 else if ((t[0] == 0xed) && (t[1] >= 0x80) && (t[1] <= 0x9f)) 115 return SCRIPT_HANGUL; 116 117 if ((t[0] == 0xe1) && (t[1] >= 0x90) && (t[1] <= 0x99)) 118 return SCRIPT_CANADIAN; 119 120 if ((t[0] == 0xe0) && (t[1] >= 0xb0) && (t[1] <= 0xb1)) 121 return SCRIPT_TELUGU; 122 123 if ((t[0] >= 'a') && (t[0] <= 'z')) 124 return SCRIPT_LATIN; 125 if ((t[0] >= 'A') && (t[0] <= 'Z')) 126 return SCRIPT_LATIN; 127 128 return SCRIPT_UNDEFINED; 129 } 130 131 /** Returns length of an (UTF8) character. May return <1 for error conditions. 132 * Made by i <info@servx.org> 133 */ 134 static int utf8_charlen(const char *str) 135 { 136 struct { char mask; char val; } t[4] = 137 { { 0x80, 0x00 }, { 0xE0, 0xC0 }, { 0xF0, 0xE0 }, { 0xF8, 0xF0 } }; 138 unsigned k, j; 139 140 for (k = 0; k < 4; k++) 141 { 142 if ((*str & t[k].mask) == t[k].val) 143 { 144 for (j = 0; j < k; j++) 145 { 146 if ((*(++str) & 0xC0) != 0x80) 147 return -1; 148 } 149 return k + 1; 150 } 151 } 152 return 1; 153 } 154 155 int lookalikespam_score(const char *text) 156 { 157 const char *p; 158 int last_script = SCRIPT_UNDEFINED; 159 int current_script; 160 int points = 0; 161 int last_character_was_word_separator = 0; 162 int skip = 0; 163 164 for (p = text; *p; p++) 165 { 166 current_script = detect_script(p); 167 168 if (current_script != SCRIPT_UNDEFINED) 169 { 170 if ((current_script != last_script) && (last_script != SCRIPT_UNDEFINED)) 171 { 172 /* A script change = 1 point */ 173 points++; 174 175 /* Give an additional point if the script change happened 176 * within the same word, as that would be rather unusual 177 * in normal cases. 178 */ 179 if (!last_character_was_word_separator) 180 points++; 181 } 182 last_script = current_script; 183 } 184 185 if (strchr("., ", *p)) 186 last_character_was_word_separator = 1; 187 else 188 last_character_was_word_separator = 0; 189 190 skip = utf8_charlen(p); 191 if (skip > 1) 192 p += skip - 1; 193 } 194 195 return points; 196 } 197 198 CMD_OVERRIDE_FUNC(override_msg) 199 { 200 int score, retval; 201 202 if (!MyUser(client) || (parc < 3) || BadPtr(parv[2]) || 203 user_allowed_by_security_group(client, cfg.except)) 204 { 205 /* Short circuit for: remote clients, insufficient parameters, 206 * antimixedutf8::except. 207 */ 208 CALL_NEXT_COMMAND_OVERRIDE(); 209 return; 210 } 211 212 score = lookalikespam_score(StripControlCodes(parv[2])); 213 if ((score >= cfg.score) && !find_tkl_exception(TKL_ANTIMIXEDUTF8, client)) 214 { 215 unreal_log(ULOG_INFO, "antimixedutf8", "ANTIMIXEDUTF8_HIT", client, 216 "[antimixedutf8] Client $client.details hit score $score -- taking action", 217 log_data_integer("score", score)); 218 /* Take the action */ 219 retval = take_action(client, cfg.ban_action, cfg.ban_reason, cfg.ban_time, 0, NULL); 220 if ((retval == BAN_ACT_WARN) || (retval == BAN_ACT_SOFT_WARN)) 221 { 222 /* no action */ 223 } else 224 if ((retval == BAN_ACT_BLOCK) || (retval == BAN_ACT_SOFT_BLOCK)) 225 { 226 sendnotice(client, "%s", cfg.ban_reason); 227 return; 228 } else if (retval > 0) 229 { 230 return; 231 } 232 /* fallthrough for retval <=0 */ 233 } 234 235 CALL_NEXT_COMMAND_OVERRIDE(); 236 } 237 238 /*** rest is module and config stuff ****/ 239 240 MOD_TEST() 241 { 242 HookAdd(modinfo->handle, HOOKTYPE_CONFIGTEST, 0, antimixedutf8_config_test); 243 return MOD_SUCCESS; 244 } 245 246 MOD_INIT() 247 { 248 MARK_AS_OFFICIAL_MODULE(modinfo); 249 250 init_config(); 251 HookAdd(modinfo->handle, HOOKTYPE_CONFIGRUN, 0, antimixedutf8_config_run); 252 return MOD_SUCCESS; 253 } 254 255 MOD_LOAD() 256 { 257 if (!CommandOverrideAdd(modinfo->handle, "PRIVMSG", 0, override_msg)) 258 return MOD_FAILED; 259 260 if (!CommandOverrideAdd(modinfo->handle, "NOTICE", 0, override_msg)) 261 return MOD_FAILED; 262 263 return MOD_SUCCESS; 264 } 265 266 MOD_UNLOAD() 267 { 268 free_config(); 269 return MOD_SUCCESS; 270 } 271 272 static void init_config(void) 273 { 274 memset(&cfg, 0, sizeof(cfg)); 275 /* Default values */ 276 cfg.score = 10; 277 safe_strdup(cfg.ban_reason, "Possible mixed character spam"); 278 cfg.ban_action = banact_value_to_struct(BAN_ACT_BLOCK); 279 cfg.ban_time = 60 * 60 * 4; /* irrelevant for block, but some default for others */ 280 } 281 282 static void free_config(void) 283 { 284 safe_free(cfg.ban_reason); 285 free_security_group(cfg.except); 286 safe_free_all_ban_actions(cfg.ban_action); 287 memset(&cfg, 0, sizeof(cfg)); /* needed! */ 288 } 289 290 int antimixedutf8_config_test(ConfigFile *cf, ConfigEntry *ce, int type, int *errs) 291 { 292 int errors = 0; 293 ConfigEntry *cep; 294 295 if (type != CONFIG_SET) 296 return 0; 297 298 /* We are only interrested in set::antimixedutf8... */ 299 if (!ce || !ce->name || strcmp(ce->name, "antimixedutf8")) 300 return 0; 301 302 for (cep = ce->items; cep; cep = cep->next) 303 { 304 if (!cep->value) 305 { 306 config_error("%s:%i: set::antimixedutf8::%s with no value", 307 cep->file->filename, cep->line_number, cep->name); 308 errors++; 309 } else 310 if (!strcmp(cep->name, "score")) 311 { 312 int v = atoi(cep->value); 313 if ((v < 1) || (v > 99)) 314 { 315 config_error("%s:%i: set::antimixedutf8::score: must be between 1 - 99 (got: %d)", 316 cep->file->filename, cep->line_number, v); 317 errors++; 318 } 319 } else 320 if (!strcmp(cep->name, "ban-action")) 321 { 322 errors += test_ban_action_config(cep); 323 } else 324 if (!strcmp(cep->name, "ban-reason")) 325 { 326 } else 327 if (!strcmp(cep->name, "ban-time")) 328 { 329 } else 330 if (!strcmp(cep->name, "except")) 331 { 332 test_match_block(cf, cep, &errors); 333 } else 334 { 335 config_error("%s:%i: unknown directive set::antimixedutf8::%s", 336 cep->file->filename, cep->line_number, cep->name); 337 errors++; 338 } 339 } 340 *errs = errors; 341 return errors ? -1 : 1; 342 } 343 344 int antimixedutf8_config_run(ConfigFile *cf, ConfigEntry *ce, int type) 345 { 346 ConfigEntry *cep; 347 348 if (type != CONFIG_SET) 349 return 0; 350 351 /* We are only interrested in set::antimixedutf8... */ 352 if (!ce || !ce->name || strcmp(ce->name, "antimixedutf8")) 353 return 0; 354 355 for (cep = ce->items; cep; cep = cep->next) 356 { 357 if (!strcmp(cep->name, "score")) 358 { 359 cfg.score = atoi(cep->value); 360 } else 361 if (!strcmp(cep->name, "ban-action")) 362 { 363 parse_ban_action_config(cep, &cfg.ban_action); 364 } else 365 if (!strcmp(cep->name, "ban-reason")) 366 { 367 safe_strdup(cfg.ban_reason, cep->value); 368 } else 369 if (!strcmp(cep->name, "ban-time")) 370 { 371 cfg.ban_time = config_checkval(cep->value, CFG_TIME); 372 } else 373 if (!strcmp(cep->name, "except")) 374 { 375 conf_match_block(cf, cep, &cfg.except); 376 } 377 } 378 return 1; 379 }