unrealircd- supernets unrealircd source & configuration |
git clone git://git.acid.vegas/unrealircd.git |
Log | Files | Refs | Archive | README | LICENSE |
antimixedutf8.c (9765B)
1 /* 2 * Anti mixed UTF8 - a filter written by Bram Matthys ("Syzop"). 3 * Reported by Mr_Smoke in https://bugs.unrealircd.org/view.php?id=5163 4 * Tested by PeGaSuS (The_Myth) with some of the most used spam lines. 5 * Help with testing and fixing Cyrillic from 'i' <info@servx.org> 6 * 7 * ==[ ABOUT ]== 8 * This module will detect and stop spam containing of characters of 9 * mixed "scripts", where some characters are in Latin script and other 10 * characters are in Cyrillic. 11 * This unusual behavior can be detected easily and action can be taken. 12 * 13 * ==[ MODULE LOADING AND CONFIGURATION ]== 14 * loadmodule "antimixedutf8"; 15 * set { 16 * antimixedutf8 { 17 * score 10; 18 * ban-action block; 19 * ban-reason "Possible mixed character spam"; 20 * ban-time 4h; // For other types 21 * except { 22 * } 23 * }; 24 * }; 25 * 26 * ==[ LICENSE AND PORTING ]== 27 * Feel free to copy/move the idea or code to other IRCds. 28 * The license is GPLv1 (or later, at your option): 29 * 30 * This program is free software; you can redistribute it and/or modify 31 * it under the terms of the GNU General Public License as published by 32 * the Free Software Foundation; either version 1, or (at your option) 33 * any later version. 34 * 35 * This program is distributed in the hope that it will be useful, 36 * but WITHOUT ANY WARRANTY; without even the implied warranty of 37 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 38 * GNU General Public License for more details. 39 * 40 * You should have received a copy of the GNU General Public License 41 * along with this program; if not, write to the Free Software 42 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 43 */ 44 45 #include "unrealircd.h" 46 47 ModuleHeader MOD_HEADER 48 = { 49 "antimixedutf8", 50 "1.0", 51 "Mixed UTF8 character filter (look-alike character spam) - by Syzop", 52 "UnrealIRCd Team", 53 "unrealircd-6", 54 }; 55 56 struct { 57 int score; 58 BanAction ban_action; 59 char *ban_reason; 60 long ban_time; 61 SecurityGroup *except; 62 } cfg; 63 64 static void free_config(void); 65 static void init_config(void); 66 int antimixedutf8_config_test(ConfigFile *, ConfigEntry *, int, int *); 67 int antimixedutf8_config_run(ConfigFile *, ConfigEntry *, int); 68 69 #define SCRIPT_UNDEFINED 0 70 #define SCRIPT_LATIN 1 71 #define SCRIPT_CYRILLIC 2 72 #define SCRIPT_CJK 3 73 #define SCRIPT_HANGUL 4 74 #define SCRIPT_CANADIAN 5 75 #define SCRIPT_TELUGU 6 76 77 /**** the detection algorithm follows first, the module/config code is at the end ****/ 78 79 /** Detect which script the current character is, 80 * such as latin script or cyrillic script. 81 * @retval See SCRIPT_* 82 */ 83 int detect_script(const char *t) 84 { 85 /* Safety: as long as *t is never \0 then at worst 86 * the character after this will be \0 and since we 87 * only look at 2 characters (at most) at a time 88 * this will be safe. 89 */ 90 91 /* Currently we only detect cyrillic and call all the 92 * rest latin (which is not true). This can always 93 * be enhanced later. 94 */ 95 96 if ((t[0] == 0xd0) && (t[1] >= 0x80) && (t[1] <= 0xbf)) 97 return SCRIPT_CYRILLIC; 98 else if ((t[0] == 0xd1) && (t[1] >= 0x80) && (t[1] <= 0xbf)) 99 return SCRIPT_CYRILLIC; 100 else if ((t[0] == 0xd2) && (t[1] >= 0x80) && (t[1] <= 0xbf)) 101 return SCRIPT_CYRILLIC; 102 else if ((t[0] == 0xd3) && (t[1] >= 0x80) && (t[1] <= 0xbf)) 103 return SCRIPT_CYRILLIC; 104 105 if ((t[0] == 0xe4) && (t[1] >= 0xb8) && (t[1] <= 0xbf)) 106 return SCRIPT_CJK; 107 else if ((t[0] >= 0xe5) && (t[0] <= 0xe9) && (t[1] >= 0x80) && (t[1] <= 0xbf)) 108 return SCRIPT_CJK; 109 110 if ((t[0] == 0xea) && (t[1] >= 0xb0) && (t[1] <= 0xbf)) 111 return SCRIPT_HANGUL; 112 else if ((t[0] >= 0xeb) && (t[0] <= 0xec) && (t[1] >= 0x80) && (t[1] <= 0xbf)) 113 return SCRIPT_HANGUL; 114 else if ((t[0] == 0xed) && (t[1] >= 0x80) && (t[1] <= 0x9f)) 115 return SCRIPT_HANGUL; 116 117 if ((t[0] == 0xe1) && (t[1] >= 0x90) && (t[1] <= 0x99)) 118 return SCRIPT_CANADIAN; 119 120 if ((t[0] == 0xe0) && (t[1] >= 0xb0) && (t[1] <= 0xb1)) 121 return SCRIPT_TELUGU; 122 123 if ((t[0] >= 'a') && (t[0] <= 'z')) 124 return SCRIPT_LATIN; 125 if ((t[0] >= 'A') && (t[0] <= 'Z')) 126 return SCRIPT_LATIN; 127 128 return SCRIPT_UNDEFINED; 129 } 130 131 /** Returns length of an (UTF8) character. May return <1 for error conditions. 132 * Made by i <info@servx.org> 133 */ 134 static int utf8_charlen(const char *str) 135 { 136 struct { char mask; char val; } t[4] = 137 { { 0x80, 0x00 }, { 0xE0, 0xC0 }, { 0xF0, 0xE0 }, { 0xF8, 0xF0 } }; 138 unsigned k, j; 139 140 for (k = 0; k < 4; k++) 141 { 142 if ((*str & t[k].mask) == t[k].val) 143 { 144 for (j = 0; j < k; j++) 145 { 146 if ((*(++str) & 0xC0) != 0x80) 147 return -1; 148 } 149 return k + 1; 150 } 151 } 152 return 1; 153 } 154 155 int lookalikespam_score(const char *text) 156 { 157 const char *p; 158 int last_script = SCRIPT_UNDEFINED; 159 int current_script; 160 int points = 0; 161 int last_character_was_word_separator = 0; 162 int skip = 0; 163 164 for (p = text; *p; p++) 165 { 166 current_script = detect_script(p); 167 168 if (current_script != SCRIPT_UNDEFINED) 169 { 170 if ((current_script != last_script) && (last_script != SCRIPT_UNDEFINED)) 171 { 172 /* A script change = 1 point */ 173 points++; 174 175 /* Give an additional point if the script change happened 176 * within the same word, as that would be rather unusual 177 * in normal cases. 178 */ 179 if (!last_character_was_word_separator) 180 points++; 181 } 182 last_script = current_script; 183 } 184 185 if (strchr("., ", *p)) 186 last_character_was_word_separator = 1; 187 else 188 last_character_was_word_separator = 0; 189 190 skip = utf8_charlen(p); 191 if (skip > 1) 192 p += skip - 1; 193 } 194 195 return points; 196 } 197 198 CMD_OVERRIDE_FUNC(override_msg) 199 { 200 int score, ret; 201 202 if (!MyUser(client) || (parc < 3) || BadPtr(parv[2]) || 203 user_allowed_by_security_group(client, cfg.except)) 204 { 205 /* Short circuit for: remote clients, insufficient parameters, 206 * antimixedutf8::except. 207 */ 208 CALL_NEXT_COMMAND_OVERRIDE(); 209 return; 210 } 211 212 score = lookalikespam_score(StripControlCodes(parv[2])); 213 if ((score >= cfg.score) && !find_tkl_exception(TKL_ANTIMIXEDUTF8, client)) 214 { 215 unreal_log(ULOG_INFO, "antimixedutf8", "ANTIMIXEDUTF8_HIT", client, 216 "[antimixedutf8] Client $client.details hit score $score -- taking action", 217 log_data_integer("score", score)); 218 if ((cfg.ban_action == BAN_ACT_BLOCK) || 219 ((cfg.ban_action == BAN_ACT_SOFT_BLOCK) && !IsLoggedIn(client))) 220 { 221 sendnotice(client, "%s", cfg.ban_reason); 222 return; 223 } else { 224 if (place_host_ban(client, cfg.ban_action, cfg.ban_reason, cfg.ban_time)) 225 return; 226 /* a return value of 0 means the user is exempted, so fallthrough.. */ 227 } 228 } 229 230 CALL_NEXT_COMMAND_OVERRIDE(); 231 } 232 233 /*** rest is module and config stuff ****/ 234 235 MOD_TEST() 236 { 237 HookAdd(modinfo->handle, HOOKTYPE_CONFIGTEST, 0, antimixedutf8_config_test); 238 return MOD_SUCCESS; 239 } 240 241 MOD_INIT() 242 { 243 MARK_AS_OFFICIAL_MODULE(modinfo); 244 245 init_config(); 246 HookAdd(modinfo->handle, HOOKTYPE_CONFIGRUN, 0, antimixedutf8_config_run); 247 return MOD_SUCCESS; 248 } 249 250 MOD_LOAD() 251 { 252 if (!CommandOverrideAdd(modinfo->handle, "PRIVMSG", 0, override_msg)) 253 return MOD_FAILED; 254 255 if (!CommandOverrideAdd(modinfo->handle, "NOTICE", 0, override_msg)) 256 return MOD_FAILED; 257 258 return MOD_SUCCESS; 259 } 260 261 MOD_UNLOAD() 262 { 263 free_config(); 264 return MOD_SUCCESS; 265 } 266 267 static void init_config(void) 268 { 269 memset(&cfg, 0, sizeof(cfg)); 270 /* Default values */ 271 cfg.score = 10; 272 safe_strdup(cfg.ban_reason, "Possible mixed character spam"); 273 cfg.ban_action = BAN_ACT_BLOCK; 274 cfg.ban_time = 60 * 60 * 4; /* irrelevant for block, but some default for others */ 275 } 276 277 static void free_config(void) 278 { 279 safe_free(cfg.ban_reason); 280 free_security_group(cfg.except); 281 memset(&cfg, 0, sizeof(cfg)); /* needed! */ 282 } 283 284 int antimixedutf8_config_test(ConfigFile *cf, ConfigEntry *ce, int type, int *errs) 285 { 286 int errors = 0; 287 ConfigEntry *cep; 288 289 if (type != CONFIG_SET) 290 return 0; 291 292 /* We are only interrested in set::antimixedutf8... */ 293 if (!ce || !ce->name || strcmp(ce->name, "antimixedutf8")) 294 return 0; 295 296 for (cep = ce->items; cep; cep = cep->next) 297 { 298 if (!cep->value) 299 { 300 config_error("%s:%i: set::antimixedutf8::%s with no value", 301 cep->file->filename, cep->line_number, cep->name); 302 errors++; 303 } else 304 if (!strcmp(cep->name, "score")) 305 { 306 int v = atoi(cep->value); 307 if ((v < 1) || (v > 99)) 308 { 309 config_error("%s:%i: set::antimixedutf8::score: must be between 1 - 99 (got: %d)", 310 cep->file->filename, cep->line_number, v); 311 errors++; 312 } 313 } else 314 if (!strcmp(cep->name, "ban-action")) 315 { 316 if (!banact_stringtoval(cep->value)) 317 { 318 config_error("%s:%i: set::antimixedutf8::ban-action: unknown action '%s'", 319 cep->file->filename, cep->line_number, cep->value); 320 errors++; 321 } 322 } else 323 if (!strcmp(cep->name, "ban-reason")) 324 { 325 } else 326 if (!strcmp(cep->name, "ban-time")) 327 { 328 } else 329 if (!strcmp(cep->name, "except")) 330 { 331 test_match_block(cf, cep, &errors); 332 } else 333 { 334 config_error("%s:%i: unknown directive set::antimixedutf8::%s", 335 cep->file->filename, cep->line_number, cep->name); 336 errors++; 337 } 338 } 339 *errs = errors; 340 return errors ? -1 : 1; 341 } 342 343 int antimixedutf8_config_run(ConfigFile *cf, ConfigEntry *ce, int type) 344 { 345 ConfigEntry *cep; 346 347 if (type != CONFIG_SET) 348 return 0; 349 350 /* We are only interrested in set::antimixedutf8... */ 351 if (!ce || !ce->name || strcmp(ce->name, "antimixedutf8")) 352 return 0; 353 354 for (cep = ce->items; cep; cep = cep->next) 355 { 356 if (!strcmp(cep->name, "score")) 357 { 358 cfg.score = atoi(cep->value); 359 } else 360 if (!strcmp(cep->name, "ban-action")) 361 { 362 cfg.ban_action = banact_stringtoval(cep->value); 363 } else 364 if (!strcmp(cep->name, "ban-reason")) 365 { 366 safe_strdup(cfg.ban_reason, cep->value); 367 } else 368 if (!strcmp(cep->name, "ban-time")) 369 { 370 cfg.ban_time = config_checkval(cep->value, CFG_TIME); 371 } else 372 if (!strcmp(cep->name, "except")) 373 { 374 conf_match_block(cf, cep, &cfg.except); 375 } 376 } 377 return 1; 378 }