| asciiblaster- draw irc art in your web browser | 
| git clone git://git.acid.vegas/asciiblaster.git | 
| Log | Files | Refs | Archive | README | 
unicode.js (19722B)
// unicode: Unicode block table plus helpers for listing the characters of a
// block, grouping code points by block, and converting text to and from
// "\xHH"-escaped UTF-8 byte strings.
//
// Exposed as a single global-style object built by an IIFE:
//   raw, lookup, index, range, block, findGroups, paginate,
//   escapeToEscapedBytes, unescapeFromEscapedBytes
var unicode = (function(){
  // Flat table of triples: first code point, last code point, block name.
  // Entries are sorted by first code point; findGroups relies on that order.
  var UNICODE_BLOCK_LIST = [
    0x0020, 0x007F, "Basic Latin",
    0x0080, 0x00FF, "Latin-1 Supplement",
    0x0100, 0x017F, "Latin Extended-A",
    0x0180, 0x024F, "Latin Extended-B",
    0x0250, 0x02AF, "IPA Extensions",
    0x02B0, 0x02FF, "Spacing Modifier Letters",
    0x0300, 0x036F, "Combining Diacritical Marks",
    0x0370, 0x03FF, "Greek and Coptic",
    0x0400, 0x04FF, "Cyrillic",
    0x0500, 0x052F, "Cyrillic Supplement",
    0x0530, 0x058F, "Armenian",
    0x0590, 0x05FF, "Hebrew",
    0x0600, 0x06FF, "Arabic",
    0x0700, 0x074F, "Syriac",
    0x0750, 0x077F, "Arabic Supplement",
    0x0780, 0x07BF, "Thaana",
    0x07C0, 0x07FF, "NKo",
    0x0800, 0x083F, "Samaritan",
    0x0840, 0x085F, "Mandaic",
    0x08A0, 0x08FF, "Arabic Extended-A",
    0x0900, 0x097F, "Devanagari",
    0x0980, 0x09FF, "Bengali",
    0x0A00, 0x0A7F, "Gurmukhi",
    0x0A80, 0x0AFF, "Gujarati",
    0x0B00, 0x0B7F, "Oriya",
    0x0B80, 0x0BFF, "Tamil",
    0x0C00, 0x0C7F, "Telugu",
    0x0C80, 0x0CFF, "Kannada",
    0x0D00, 0x0D7F, "Malayalam",
    0x0D80, 0x0DFF, "Sinhala",
    0x0E00, 0x0E7F, "Thai",
    0x0E80, 0x0EFF, "Lao",
    0x0F00, 0x0FFF, "Tibetan",
    0x1000, 0x109F, "Myanmar",
    0x10A0, 0x10FF, "Georgian",
    0x1100, 0x11FF, "Hangul Jamo",
    0x1200, 0x137F, "Ethiopic",
    0x1380, 0x139F, "Ethiopic Supplement",
    0x13A0, 0x13FF, "Cherokee",
    0x1400, 0x167F, "Unified Canadian Aboriginal Syllabics",
    0x1680, 0x169F, "Ogham",
    0x16A0, 0x16FF, "Runic",
    0x1700, 0x171F, "Tagalog",
    0x1720, 0x173F, "Hanunoo",
    0x1740, 0x175F, "Buhid",
    0x1760, 0x177F, "Tagbanwa",
    0x1780, 0x17FF, "Khmer",
    0x1800, 0x18AF, "Mongolian",
    0x18B0, 0x18FF, "Unified Canadian Aboriginal Syllabics Extended",
    0x1900, 0x194F, "Limbu",
    0x1950, 0x197F, "Tai Le",
    0x1980, 0x19DF, "New Tai Lue",
    0x19E0, 0x19FF, "Khmer Symbols",
    0x1A00, 0x1A1F, "Buginese",
    0x1A20, 0x1AAF, "Tai Tham",
    0x1AB0, 0x1AFF, "Combining Diacritical Marks Extended",
    0x1B00, 0x1B7F, "Balinese",
    0x1B80, 0x1BBF, "Sundanese",
    0x1BC0, 0x1BFF, "Batak",
    0x1C00, 0x1C4F, "Lepcha",
    0x1C50, 0x1C7F, "Ol Chiki",
    0x1CC0, 0x1CCF, "Sundanese Supplement",
    0x1CD0, 0x1CFF, "Vedic Extensions",
    0x1D00, 0x1D7F, "Phonetic Extensions",
    0x1D80, 0x1DBF, "Phonetic Extensions Supplement",
    0x1DC0, 0x1DFF, "Combining Diacritical Marks Supplement",
    0x1E00, 0x1EFF, "Latin Extended Additional",
    0x1F00, 0x1FFF, "Greek Extended",
    0x2000, 0x206F, "General Punctuation",
    0x2070, 0x209F, "Superscripts and Subscripts",
    0x20A0, 0x20CF, "Currency Symbols",
    0x20D0, 0x20FF, "Combining Diacritical Marks for Symbols",
    0x2100, 0x214F, "Letterlike Symbols",
    0x2150, 0x218F, "Number Forms",
    0x2190, 0x21FF, "Arrows",
    0x2200, 0x22FF, "Mathematical Operators",
    0x2300, 0x23FF, "Miscellaneous Technical",
    0x2400, 0x243F, "Control Pictures",
    0x2440, 0x245F, "Optical Character Recognition",
    0x2460, 0x24FF, "Enclosed Alphanumerics",
    0x2500, 0x257F, "Box Drawing",
    0x2580, 0x259F, "Block Elements",
    0x25A0, 0x25FF, "Geometric Shapes",
    0x2600, 0x26FF, "Miscellaneous Symbols",
    0x2700, 0x27BF, "Dingbats",
    0x27C0, 0x27EF, "Miscellaneous Mathematical Symbols-A",
    0x27F0, 0x27FF, "Supplemental Arrows-A",
    0x2800, 0x28FF, "Braille Patterns",
    0x2900, 0x297F, "Supplemental Arrows-B",
    0x2980, 0x29FF, "Miscellaneous Mathematical Symbols-B",
    0x2A00, 0x2AFF, "Supplemental Mathematical Operators",
    0x2B00, 0x2BFF, "Miscellaneous Symbols and Arrows",
    0x2C00, 0x2C5F, "Glagolitic",
    0x2C60, 0x2C7F, "Latin Extended-C",
    0x2C80, 0x2CFF, "Coptic",
    0x2D00, 0x2D2F, "Georgian Supplement",
    0x2D30, 0x2D7F, "Tifinagh",
    0x2D80, 0x2DDF, "Ethiopic Extended",
    0x2DE0, 0x2DFF, "Cyrillic Extended-A",
    0x2E00, 0x2E7F, "Supplemental Punctuation",
    0x2E80, 0x2EFF, "CJK Radicals Supplement",
    0x2F00, 0x2FDF, "Kangxi Radicals",
    0x2FF0, 0x2FFF, "Ideographic Description Characters",
    0x3000, 0x303F, "CJK Symbols and Punctuation",
    0x3040, 0x309F, "Hiragana",
    0x30A0, 0x30FF, "Katakana",
    0x3100, 0x312F, "Bopomofo",
    0x3130, 0x318F, "Hangul Compatibility Jamo",
    0x3190, 0x319F, "Kanbun",
    0x31A0, 0x31BF, "Bopomofo Extended",
    0x31C0, 0x31EF, "CJK Strokes",
    0x31F0, 0x31FF, "Katakana Phonetic Extensions",
    0x3200, 0x32FF, "Enclosed CJK Letters and Months",
    0x3300, 0x33FF, "CJK Compatibility",
    0x3400, 0x4DBF, "CJK Unified Ideographs Extension A",
    0x4DC0, 0x4DFF, "Yijing Hexagram Symbols",
    0x4E00, 0x9FFF, "CJK Unified Ideographs",
    0xA000, 0xA48F, "Yi Syllables",
    0xA490, 0xA4CF, "Yi Radicals",
    0xA4D0, 0xA4FF, "Lisu",
    0xA500, 0xA63F, "Vai",
    0xA640, 0xA69F, "Cyrillic Extended-B",
    0xA6A0, 0xA6FF, "Bamum",
    0xA700, 0xA71F, "Modifier Tone Letters",
    0xA720, 0xA7FF, "Latin Extended-D",
    0xA800, 0xA82F, "Syloti Nagri",
    0xA830, 0xA83F, "Common Indic Number Forms",
    0xA840, 0xA87F, "Phags-pa",
    0xA880, 0xA8DF, "Saurashtra",
    0xA8E0, 0xA8FF, "Devanagari Extended",
    0xA900, 0xA92F, "Kayah Li",
    0xA930, 0xA95F, "Rejang",
    0xA960, 0xA97F, "Hangul Jamo Extended-A",
    0xA980, 0xA9DF, "Javanese",
    0xA9E0, 0xA9FF, "Myanmar Extended-B",
    0xAA00, 0xAA5F, "Cham",
    0xAA60, 0xAA7F, "Myanmar Extended-A",
    0xAA80, 0xAADF, "Tai Viet",
    0xAAE0, 0xAAFF, "Meetei Mayek Extensions",
    0xAB00, 0xAB2F, "Ethiopic Extended-A",
    0xAB30, 0xAB6F, "Latin Extended-E",
    0xABC0, 0xABFF, "Meetei Mayek",
    0xAC00, 0xD7AF, "Hangul Syllables",
    0xD7B0, 0xD7FF, "Hangul Jamo Extended-B",
    0xD800, 0xDB7F, "High Surrogates",
    0xDB80, 0xDBFF, "High Private Use Surrogates",
    0xDC00, 0xDFFF, "Low Surrogates",
    0xE000, 0xF8FF, "Private Use Area",
    0xF900, 0xFAFF, "CJK Compatibility Ideographs",
    0xFB00, 0xFB4F, "Alphabetic Presentation Forms",
    0xFB50, 0xFDFF, "Arabic Presentation Forms-A",
    0xFE00, 0xFE0F, "Variation Selectors",
    0xFE10, 0xFE1F, "Vertical Forms",
    0xFE20, 0xFE2F, "Combining Half Marks",
    0xFE30, 0xFE4F, "CJK Compatibility Forms",
    0xFE50, 0xFE6F, "Small Form Variants",
    0xFE70, 0xFEFF, "Arabic Presentation Forms-B",
    0xFF00, 0xFFEF, "Halfwidth and Fullwidth Forms",
    0xFFF0, 0xFFFF, "Specials",
    0x10000, 0x1007F, "Linear B Syllabary",
    0x10080, 0x100FF, "Linear B Ideograms",
    0x10100, 0x1013F, "Aegean Numbers",
    0x10140, 0x1018F, "Ancient Greek Numbers",
    0x10190, 0x101CF, "Ancient Symbols",
    0x101D0, 0x101FF, "Phaistos Disc",
    0x10280, 0x1029F, "Lycian",
    0x102A0, 0x102DF, "Carian",
    0x102E0, 0x102FF, "Coptic Epact Numbers",
    0x10300, 0x1032F, "Old Italic",
    0x10330, 0x1034F, "Gothic",
    0x10350, 0x1037F, "Old Permic",
    0x10380, 0x1039F, "Ugaritic",
    0x103A0, 0x103DF, "Old Persian",
    0x10400, 0x1044F, "Deseret",
    0x10450, 0x1047F, "Shavian",
    0x10480, 0x104AF, "Osmanya",
    0x10500, 0x1052F, "Elbasan",
    0x10530, 0x1056F, "Caucasian Albanian",
    0x10600, 0x1077F, "Linear A",
    0x10800, 0x1083F, "Cypriot Syllabary",
    0x10840, 0x1085F, "Imperial Aramaic",
    0x10860, 0x1087F, "Palmyrene",
    0x10880, 0x108AF, "Nabataean",
    0x10900, 0x1091F, "Phoenician",
    0x10920, 0x1093F, "Lydian",
    0x10980, 0x1099F, "Meroitic Hieroglyphs",
    0x109A0, 0x109FF, "Meroitic Cursive",
    0x10A00, 0x10A5F, "Kharoshthi",
    0x10A60, 0x10A7F, "Old South Arabian",
    0x10A80, 0x10A9F, "Old North Arabian",
    0x10AC0, 0x10AFF, "Manichaean",
    0x10B00, 0x10B3F, "Avestan",
    0x10B40, 0x10B5F, "Inscriptional Parthian",
    0x10B60, 0x10B7F, "Inscriptional Pahlavi",
    0x10B80, 0x10BAF, "Psalter Pahlavi",
    0x10C00, 0x10C4F, "Old Turkic",
    0x10E60, 0x10E7F, "Rumi Numeral Symbols",
    0x11000, 0x1107F, "Brahmi",
    0x11080, 0x110CF, "Kaithi",
    0x110D0, 0x110FF, "Sora Sompeng",
    0x11100, 0x1114F, "Chakma",
    0x11150, 0x1117F, "Mahajani",
    0x11180, 0x111DF, "Sharada",
    0x111E0, 0x111FF, "Sinhala Archaic Numbers",
    0x11200, 0x1124F, "Khojki",
    0x112B0, 0x112FF, "Khudawadi",
    0x11300, 0x1137F, "Grantha",
    0x11480, 0x114DF, "Tirhuta",
    0x11580, 0x115FF, "Siddham",
    0x11600, 0x1165F, "Modi",
    0x11680, 0x116CF, "Takri",
    0x118A0, 0x118FF, "Warang Citi",
    0x11AC0, 0x11AFF, "Pau Cin Hau",
    0x12000, 0x123FF, "Cuneiform",
    0x12400, 0x1247F, "Cuneiform Numbers and Punctuation",
    0x13000, 0x1342F, "Egyptian Hieroglyphs",
    0x16800, 0x16A3F, "Bamum Supplement",
    0x16A40, 0x16A6F, "Mro",
    0x16AD0, 0x16AFF, "Bassa Vah",
    0x16B00, 0x16B8F, "Pahawh Hmong",
    0x16F00, 0x16F9F, "Miao",
    0x1B000, 0x1B0FF, "Kana Supplement",
    0x1BC00, 0x1BC9F, "Duployan",
    0x1BCA0, 0x1BCAF, "Shorthand Format Controls",
    0x1D000, 0x1D0FF, "Byzantine Musical Symbols",
    0x1D100, 0x1D1FF, "Musical Symbols",
    0x1D200, 0x1D24F, "Ancient Greek Musical Notation",
    0x1D300, 0x1D35F, "Tai Xuan Jing Symbols",
    0x1D360, 0x1D37F, "Counting Rod Numerals",
    0x1D400, 0x1D7FF, "Mathematical Alphanumeric Symbols",
    0x1E800, 0x1E8DF, "Mende Kikakui",
    0x1EE00, 0x1EEFF, "Arabic Mathematical Alphabetic Symbols",
    0x1F000, 0x1F02F, "Mahjong Tiles",
    0x1F030, 0x1F09F, "Domino Tiles",
    0x1F0A0, 0x1F0FF, "Playing Cards",
    0x1F100, 0x1F1FF, "Enclosed Alphanumeric Supplement",
    0x1F200, 0x1F2FF, "Enclosed Ideographic Supplement",
    0x1F300, 0x1F5FF, "Miscellaneous Symbols and Pictographs",
    0x1F600, 0x1F64F, "Emoticons",
    0x1F650, 0x1F67F, "Ornamental Dingbats",
    0x1F680, 0x1F6FF, "Transport and Map Symbols",
    0x1F700, 0x1F77F, "Alchemical Symbols",
    0x1F780, 0x1F7FF, "Geometric Shapes Extended",
    0x1F800, 0x1F8FF, "Supplemental Arrows-C",
    0x20000, 0x2A6DF, "CJK Unified Ideographs Extension B",
    0x2A700, 0x2B73F, "CJK Unified Ideographs Extension C",
    0x2B740, 0x2B81F, "CJK Unified Ideographs Extension D",
    0x2F800, 0x2FA1F, "CJK Compatibility Ideographs Supplement",
    0xE0000, 0xE007F, "Tags",
    0xE0100, 0xE01EF, "Variation Selectors Supplement",
    0xF0000, 0xFFFFF, "Supplementary Private Use Area-A",
    0x100000, 0x10FFFF, "Supplementary Private Use Area-B",
  ]
  var UNICODE_BLOCK_COUNT = UNICODE_BLOCK_LIST.length / 3
  // Block name => [ first code point, last code point ]
  var UNICODE_LOOKUP = {}
  for (var i = 0, len = UNICODE_BLOCK_LIST.length; i < len; i += 3) {
    UNICODE_LOOKUP[ UNICODE_BLOCK_LIST[i+2] ] = [ UNICODE_BLOCK_LIST[i], UNICODE_BLOCK_LIST[i+1] ]
  }

  // Returns a fresh row for the j-th block:
  // [ first code point, last code point, name, [] ]
  // The trailing empty array is a per-call list that findGroups fills in.
  function index (j) {
    var base = j * 3
    return [ UNICODE_BLOCK_LIST[base], UNICODE_BLOCK_LIST[base+1], UNICODE_BLOCK_LIST[base+2], [] ]
  }

  // Returns the inclusive list of integers m..n, or [] when m > n.
  function range (m, n) {
    if (m > n) return []
    var a = []
    for (var j = m; j <= n; j++) {
      a.push(j)
    }
    return a
  }

  // Splits array `a` into consecutive pages of `n` items each.
  // At most `max_pages` pages are returned (default 100, the historical cap);
  // items beyond the cap are left out.
  function paginate (a, n, max_pages) {
    if (max_pages === undefined) max_pages = 100
    var pages = [], page
    for (var i = 0; i < max_pages; i++) {
      page = a.slice(i * n, (i+1) * n)
      if (! page.length) break
      pages.push(page)
    }
    return pages
  }

  // Returns an array with one string per code point of the named block,
  // or "" when the name is unknown. The second parameter is unused and is
  // kept only so existing call sites keep working.
  function block (name, n) {
    // own-property check so names like "constructor" do not hit Object.prototype
    if (! Object.prototype.hasOwnProperty.call(UNICODE_LOOKUP, name)) return ""
    var b = UNICODE_LOOKUP[name]
    return range(b[0], b[1]).map(function(code){
      // String.fromCharCode alone truncates code points above 0xFFFF, so go
      // through the surrogate-pair aware converter.
      return convertUnicodeCodePointsToString([ code ])
    })
  }

  // Renders an array of code-point arrays as HTML numeric entities,
  // one inner array per line, lines joined with <br>.
  function entities (a) {
    return a.map(function(k){ return "&#" + k.join(";&#") + ";" }).join("<br>")
  }

  // Groups code points by Unicode block.
  // `chars` is an array of numeric code points, expected in ascending order.
  // Returns rows of the form produced by index(), each with its fourth
  // element holding the code points that landed in that block. A code point
  // is assigned to the last block whose start is <= the code point, so values
  // falling in a gap between blocks go to the preceding block and values
  // below 0x20 go to the first block.
  function findGroups (chars){
    var groups = [], list = null, row
    var j = -1, next = -1
    for (var i = 0, len = chars.length; i < len; i++) {
      if (list === null || chars[i] >= next) {
        // Walk forward to the block containing chars[i]. `next` is the start
        // of the following block; past the last block it is Infinity so the
        // walk can never run off the end of the table.
        do {
          j += 1
          next = (j + 1 < UNICODE_BLOCK_COUNT) ? UNICODE_BLOCK_LIST[(j+1)*3] : Infinity
        } while (chars[i] >= next)
        row = index(j)
        list = row[3]
        groups.push( row )
      }
      list.push(chars[i])
    }
    return groups
  }

  // encodes unicode characters as escaped utf16 - \uFFFF
  // encodes ONLY non-ascii characters; each UTF-16 code unit is escaped
  // separately using four lowercase hex digits
  function escapeToUtf16 (txt) {
    var escaped_txt = "", kode
    for (var i = 0; i < txt.length; i++) {
      kode = txt.charCodeAt(i)
      if (kode > 0x7f) {
        escaped_txt += "\\u" + ("0000" + kode.toString(16)).slice(-4)
      }
      else {
        escaped_txt += txt.charAt(i)
      }
    }
    return escaped_txt
  }

  // encodes unicode characters as escaped UTF-8 bytes - \xFF
  // encodes ONLY non-ascii characters; ASCII is copied through unchanged
  function escapeToEscapedBytes (txt) {
    var escaped_txt = "", kode, low
    for (var i = 0; i < txt.length; i++) {
      kode = txt.charCodeAt(i)
      if (kode <= 0x7f) {
        escaped_txt += txt.charAt(i)
        continue
      }
      // Combine a valid surrogate pair into one code point so it is emitted
      // as a single 4-byte UTF-8 sequence rather than two 3-byte ones.
      if (kode >= 0xD800 && kode <= 0xDBFF && i + 1 < txt.length) {
        low = txt.charCodeAt(i + 1)
        if (low >= 0xDC00 && low <= 0xDFFF) {
          kode = ((kode - 0xD800) << 10) + (low - 0xDC00) + 0x10000
          i++
        }
      }
      escaped_txt += convertBytesToEscapedString(convertUnicodeCodePointToUtf8Bytes(kode), 16)
    }
    return escaped_txt
  }

  // encodes unicode characters as escaped bytes - \xFF (base 16, the default)
  // or \377 (any other base is treated as octal by the formatter)
  // encodes an ENTIRE string, ASCII included
  function escapeAllToEscapedBytes (str, base) {
    var unicode_codes = convertStringToUnicodeCodePoints(str)
    var data_bytes = convertUnicodeCodePointsToBytes(unicode_codes)
    return convertBytesToEscapedString(data_bytes, base === undefined ? 16 : base)
  }

  // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ], 16 => '\xE3\x81\x82\xE3\x81\x84'
  // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ], 8  => '\343\201\202\343\201\204'
  function convertBytesToEscapedString (data_bytes, base) {
    var is_hex = base == 16
    var prefix = is_hex ? "\\x" : "\\"
    var num_digits = is_hex ? 2 : 3
    var escaped = ''
    for (var i = 0; i < data_bytes.length; ++i) {
      escaped += prefix + formatNumber(data_bytes[i], base, num_digits)
    }
    return escaped
  }

  // [ 0x3042, 0x3044 ] => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
  function convertUnicodeCodePointsToBytes (unicode_codes) {
    var utf8_bytes = []
    for (var i = 0; i < unicode_codes.length; ++i) {
      var bytes = convertUnicodeCodePointToUtf8Bytes(unicode_codes[i])
      for (var k = 0; k < bytes.length; ++k) {
        utf8_bytes.push(bytes[k])
      }
    }
    return utf8_bytes
  }

  // 0x3042 => [ 0xE3, 0x81, 0x82 ]
  // Values of 1 << 21 and above produce an empty array.
  function convertUnicodeCodePointToUtf8Bytes (unicode_code) {
    var utf8_bytes = []
    if (unicode_code < 0x80) { // 1-byte
      utf8_bytes.push(unicode_code)
    } else if (unicode_code < (1 << 11)) { // 2-byte
      utf8_bytes.push((unicode_code >>> 6) | 0xC0)
      utf8_bytes.push((unicode_code & 0x3F) | 0x80)
    } else if (unicode_code < (1 << 16)) { // 3-byte
      utf8_bytes.push((unicode_code >>> 12) | 0xE0)
      utf8_bytes.push(((unicode_code >> 6) & 0x3F) | 0x80)
      utf8_bytes.push((unicode_code & 0x3F) | 0x80)
    } else if (unicode_code < (1 << 21)) { // 4-byte
      utf8_bytes.push((unicode_code >>> 18) | 0xF0)
      utf8_bytes.push(((unicode_code >> 12) & 0x3F) | 0x80)
      utf8_bytes.push(((unicode_code >> 6) & 0x3F) | 0x80)
      utf8_bytes.push((unicode_code & 0x3F) | 0x80)
    }
    return utf8_bytes
  }

  // "あい" => [ 0x3042, 0x3044 ]
  // Surrogate pairs are combined into one code point. A high surrogate that
  // is not followed by a low surrogate is dropped; the code unit after it is
  // still processed normally.
  function convertStringToUnicodeCodePoints (str) {
    var unicode_codes = []
    var pending_high = 0
    for (var i = 0; i < str.length; ++i) {
      var unit = str.charCodeAt(i)
      if (pending_high !== 0) {
        var high = pending_high
        pending_high = 0
        if (unit >= 0xDC00 && unit <= 0xDFFF) {
          unicode_codes.push((high - 0xD800) * (1 << 10) + (1 << 16) + (unit - 0xDC00))
          continue
        }
        // Dangling high surrogate ignored; fall through and handle `unit`.
      }
      if (unit >= 0xD800 && unit <= 0xDBFF) {
        pending_high = unit
      } else {
        unicode_codes.push(unit)
      }
    }
    return unicode_codes
  }

  // 0xff, 16, 2 => "FF"
  // 0xff, 8, 3  => "377"
  function formatNumber (number, base, num_digits) {
    var str = number.toString(base).toUpperCase()
    while (str.length < num_digits) {
      str = "0" + str
    }
    return str
  }

  // convert \xFF\xFF\xFF to unicode
  // Text that is not part of a \xHH escape is kept as-is, so this undoes
  // escapeToEscapedBytes for mixed ASCII / escaped input.
  function unescapeFromEscapedBytes (str) {
    var data_bytes = convertEscapedBytesToBytes(str)
    var unicode_codes = convertUtf8BytesToUnicodeCodePoints(data_bytes)
    return convertUnicodeCodePointsToString(unicode_codes)
  }

  // '\xE3\x81\x82\xE3\x81\x84' => [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ]
  // 'a\xE3\x81\x82'            => [ 0x61, 0xE3, 0x81, 0x82 ]
  // Only hex escapes (\x followed by one or two hex digits) are decoded;
  // everything else contributes its own UTF-8 bytes.
  function convertEscapedBytesToBytes (str) {
    // Fresh regex per call: a shared /g regex would carry lastIndex state.
    var pattern = /\\x([0-9A-Fa-f]{1,2})/g
    var codes = []
    var cursor = 0
    var match
    while ((match = pattern.exec(str)) !== null) {
      if (match.index > cursor) {
        pushLiteralBytes(codes, str.slice(cursor, match.index))
      }
      codes.push(parseInt(match[1], 16))
      cursor = pattern.lastIndex
    }
    if (cursor < str.length) {
      pushLiteralBytes(codes, str.slice(cursor))
    }
    return codes
  }

  // Appends the UTF-8 bytes of unescaped `text` to `codes`.
  function pushLiteralBytes (codes, text) {
    var bytes = convertUnicodeCodePointsToBytes(convertStringToUnicodeCodePoints(text))
    for (var i = 0; i < bytes.length; ++i) {
      codes.push(bytes[i])
    }
  }

  // [ 0xE3, 0x81, 0x82, 0xE3, 0x81, 0x84 ] => [ 0x3042, 0x3044 ]
  // Malformed input is skipped rather than reported: values outside 0..255,
  // stray continuation bytes, truncated sequences and invalid lead bytes
  // (0xF8..0xFF) produce no output.
  function convertUtf8BytesToUnicodeCodePoints (utf8_bytes) {
    var unicode_codes = []
    var unicode_code = 0
    var num_followed = 0 // continuation bytes still expected
    for (var i = 0; i < utf8_bytes.length; ++i) {
      var utf8_byte = utf8_bytes[i]
      if (!(utf8_byte >= 0 && utf8_byte < 0x100)) {
        // Not a byte value; ignored.
        continue
      }
      if ((utf8_byte & 0xC0) == 0x80) { // continuation byte
        if (num_followed > 0) {
          unicode_code = (unicode_code << 6) | (utf8_byte & 0x3F)
          num_followed -= 1
          if (num_followed == 0) {
            unicode_codes.push(unicode_code)
          }
        }
        continue
      }
      // A lead byte abandons any sequence that was still incomplete.
      num_followed = 0
      if (utf8_byte < 0x80) { // 1-byte
        unicode_codes.push(utf8_byte)
      } else if ((utf8_byte & 0xE0) == 0xC0) { // 2-byte
        unicode_code = utf8_byte & 0x1F
        num_followed = 1
      } else if ((utf8_byte & 0xF0) == 0xE0) { // 3-byte
        unicode_code = utf8_byte & 0x0F
        num_followed = 2
      } else if ((utf8_byte & 0xF8) == 0xF0) { // 4-byte
        unicode_code = utf8_byte & 0x07
        num_followed = 3
      }
    }
    return unicode_codes
  }

  // [ 0x3042, 0x3044 ] => [ 0x3042, 0x3044 ]
  // [ 0x2000B ] => [ 0xD840, 0xDC0B ] // A surrogate pair.
  // Values above 0x10FFFF cannot be represented in UTF-16 and are skipped.
  function convertUnicodeCodePointsToUtf16Codes (unicode_codes) {
    var utf16_codes = []
    for (var i = 0; i < unicode_codes.length; ++i) {
      var unicode_code = unicode_codes[i]
      if (unicode_code < (1 << 16)) {
        utf16_codes.push(unicode_code)
      } else if (unicode_code <= 0x10FFFF) {
        var offset = unicode_code - (1 << 16)
        // integer shift/mask: plain division would leave a fractional part
        utf16_codes.push((offset >> 10) + 0xD800)
        utf16_codes.push((offset & 0x3FF) + 0xDC00)
      }
    }
    return utf16_codes
  }

  // [ 0x3042, 0x3044 ] => "あい"
  function convertUnicodeCodePointsToString (unicode_codes) {
    var utf16_codes = convertUnicodeCodePointsToUtf16Codes(unicode_codes)
    return convertUtf16CodesToString(utf16_codes)
  }

  // [ 0x3042, 0x3044 ] => "あい"
  function convertUtf16CodesToString (utf16_codes) {
    var unescaped = ''
    for (var i = 0; i < utf16_codes.length; ++i) {
      unescaped += String.fromCharCode(utf16_codes[i])
    }
    return unescaped
  }

  return {
    raw: UNICODE_BLOCK_LIST,
    lookup: UNICODE_LOOKUP,
    index: index,
    range: range,
    block: block,
    findGroups: findGroups,
    paginate: paginate,
    escapeToEscapedBytes: escapeToEscapedBytes,
    unescapeFromEscapedBytes: unescapeFromEscapedBytes,
  }
})()

