//
// SFML - Simple and Fast Multimedia Library
// Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com)
//
// This software is provided 'as-is', without any express or implied warranty.
// In no event will the authors be held liable for any damages arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it freely,
// subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented;
//    you must not claim that you wrote the original software.
//    If you use this software in a product, an acknowledgment
//    in the product documentation would be appreciated but is not required.
//
// 2. Altered source versions must be plainly marked as such,
//    and must not be misrepresented as being the original software.
//
// 3. This notice may not be removed or altered from any source distribution.
00022 // 00024 00025 00027 template <typename In> 00028 In Utf<8>::Decode(In begin, In end, Uint32& output, Uint32 replacement) 00029 { 00030 // Some useful precomputed data 00031 static const int trailing[256] = 00032 { 00033 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00034 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00035 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00036 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00037 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00038 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00039 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00040 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 00041 }; 00042 static const Uint32 offsets[6] = 00043 { 00044 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 00045 }; 00046 00047 // Decode the character 00048 int trailingBytes = trailing[static_cast<int>(*begin)]; 00049 if (begin + trailingBytes < end) 00050 { 00051 output = 0; 00052 switch (trailingBytes) 00053 { 00054 case 5 : output += *begin++; output <<= 6; 00055 case 4 : output += *begin++; output <<= 6; 00056 case 3 : output += *begin++; output <<= 6; 00057 case 2 : output += *begin++; output <<= 6; 00058 case 1 : output += *begin++; output <<= 6; 00059 case 0 : output += *begin++; 00060 } 00061 output -= offsets[trailingBytes]; 00062 } 00063 else 00064 { 00065 // Incomplete character 00066 begin = end; 00067 output = replacement; 00068 } 00069 00070 return begin; 00071 } 00072 00073 00075 template <typename Out> 00076 Out Utf<8>::Encode(Uint32 input, Out output, Uint8 replacement) 00077 { 00078 // Some useful 
precomputed data 00079 static const Uint8 firstBytes[7] = 00080 { 00081 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC 00082 }; 00083 00084 // Encode the character 00085 if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF))) 00086 { 00087 // Invalid character 00088 if (replacement) 00089 *output++ = replacement; 00090 } 00091 else 00092 { 00093 // Valid character 00094 00095 // Get the number of bytes to write 00096 int bytesToWrite = 1; 00097 if (input < 0x80) bytesToWrite = 1; 00098 else if (input < 0x800) bytesToWrite = 2; 00099 else if (input < 0x10000) bytesToWrite = 3; 00100 else if (input <= 0x0010FFFF) bytesToWrite = 4; 00101 00102 // Extract the bytes to write 00103 Uint8 bytes[4]; 00104 switch (bytesToWrite) 00105 { 00106 case 4 : bytes[3] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; 00107 case 3 : bytes[2] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; 00108 case 2 : bytes[1] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; 00109 case 1 : bytes[0] = static_cast<Uint8> (input | firstBytes[bytesToWrite]); 00110 } 00111 00112 // Add them to the output 00113 const Uint8* currentByte = bytes; 00114 switch (bytesToWrite) 00115 { 00116 case 4 : *output++ = *currentByte++; 00117 case 3 : *output++ = *currentByte++; 00118 case 2 : *output++ = *currentByte++; 00119 case 1 : *output++ = *currentByte++; 00120 } 00121 } 00122 00123 return output; 00124 } 00125 00126 00128 template <typename In> 00129 In Utf<8>::Next(In begin, In end) 00130 { 00131 Uint32 codepoint; 00132 return Decode(begin, end, codepoint); 00133 } 00134 00135 00137 template <typename In> 00138 std::size_t Utf<8>::Count(In begin, In end) 00139 { 00140 std::size_t length = 0; 00141 while (begin < end) 00142 { 00143 begin = Next(begin, end); 00144 ++length; 00145 } 00146 00147 return length; 00148 } 00149 00150 00152 template <typename In, typename Out> 00153 Out Utf<8>::FromAnsi(In begin, In end, Out output, const std::locale& locale) 00154 { 00155 while 
(begin < end) 00156 { 00157 Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale); 00158 output = Encode(codepoint, output); 00159 } 00160 00161 return output; 00162 } 00163 00164 00166 template <typename In, typename Out> 00167 Out Utf<8>::FromWide(In begin, In end, Out output) 00168 { 00169 while (begin < end) 00170 { 00171 Uint32 codepoint = Utf<32>::DecodeWide(*begin++); 00172 output = Encode(codepoint, output); 00173 } 00174 00175 return output; 00176 } 00177 00178 00180 template <typename In, typename Out> 00181 Out Utf<8>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) 00182 { 00183 while (begin < end) 00184 { 00185 Uint32 codepoint; 00186 begin = Decode(begin, end, codepoint); 00187 output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale); 00188 } 00189 00190 return output; 00191 } 00192 00193 00195 template <typename In, typename Out> 00196 Out Utf<8>::ToWide(In begin, In end, Out output, wchar_t replacement) 00197 { 00198 while (begin < end) 00199 { 00200 Uint32 codepoint; 00201 begin = Decode(begin, end, codepoint); 00202 output = Utf<32>::EncodeWide(codepoint, output, replacement); 00203 } 00204 00205 return output; 00206 } 00207 00208 00210 template <typename In, typename Out> 00211 Out Utf<8>::ToUtf8(In begin, In end, Out output) 00212 { 00213 while (begin < end) 00214 *output++ = *begin++; 00215 00216 return output; 00217 } 00218 00219 00221 template <typename In, typename Out> 00222 Out Utf<8>::ToUtf16(In begin, In end, Out output) 00223 { 00224 while (begin < end) 00225 { 00226 Uint32 codepoint; 00227 begin = Decode(begin, end, codepoint); 00228 output = Utf<16>::Encode(codepoint, output); 00229 } 00230 00231 return output; 00232 } 00233 00234 00236 template <typename In, typename Out> 00237 Out Utf<8>::ToUtf32(In begin, In end, Out output) 00238 { 00239 while (begin < end) 00240 { 00241 Uint32 codepoint; 00242 begin = Decode(begin, end, codepoint); 00243 *output++ = codepoint; 00244 } 00245 00246 
return output; 00247 } 00248 00249 00251 template <typename In> 00252 In Utf<16>::Decode(In begin, In end, Uint32& output, Uint32 replacement) 00253 { 00254 Uint16 first = *begin++; 00255 00256 // If it's a surrogate pair, first convert to a single UTF-32 character 00257 if ((first >= 0xD800) && (first <= 0xDBFF)) 00258 { 00259 if (begin < end) 00260 { 00261 Uint32 second = *begin++; 00262 if ((second >= 0xDC00) && (second <= 0xDFFF)) 00263 { 00264 // The second element is valid: convert the two elements to a UTF-32 character 00265 output = static_cast<Uint32>(((first - 0xD800) << 10) + (second - 0xDC00) + 0x0010000); 00266 } 00267 else 00268 { 00269 // Invalid character 00270 output = replacement; 00271 } 00272 } 00273 else 00274 { 00275 // Invalid character 00276 begin = end; 00277 output = replacement; 00278 } 00279 } 00280 else 00281 { 00282 // We can make a direct copy 00283 output = first; 00284 } 00285 00286 return begin; 00287 } 00288 00289 00291 template <typename Out> 00292 Out Utf<16>::Encode(Uint32 input, Out output, Uint16 replacement) 00293 { 00294 if (input < 0xFFFF) 00295 { 00296 // The character can be copied directly, we just need to check if it's in the valid range 00297 if ((input >= 0xD800) && (input <= 0xDFFF)) 00298 { 00299 // Invalid character (this range is reserved) 00300 if (replacement) 00301 *output++ = replacement; 00302 } 00303 else 00304 { 00305 // Valid character directly convertible to a single UTF-16 character 00306 *output++ = static_cast<Uint16>(input); 00307 } 00308 } 00309 else if (input > 0x0010FFFF) 00310 { 00311 // Invalid character (greater than the maximum unicode value) 00312 if (replacement) 00313 *output++ = replacement; 00314 } 00315 else 00316 { 00317 // The input character will be converted to two UTF-16 elements 00318 input -= 0x0010000; 00319 *output++ = static_cast<Uint16>((input >> 10) + 0xD800); 00320 *output++ = static_cast<Uint16>((input & 0x3FFUL) + 0xDC00); 00321 } 00322 00323 return output; 00324 } 00325 
00326 00328 template <typename In> 00329 In Utf<16>::Next(In begin, In end) 00330 { 00331 Uint32 codepoint; 00332 return Decode(begin, end, codepoint); 00333 } 00334 00335 00337 template <typename In> 00338 std::size_t Utf<16>::Count(In begin, In end) 00339 { 00340 std::size_t length = 0; 00341 while (begin < end) 00342 { 00343 begin = Next(begin, end); 00344 ++length; 00345 } 00346 00347 return length; 00348 } 00349 00350 00352 template <typename In, typename Out> 00353 Out Utf<16>::FromAnsi(In begin, In end, Out output, const std::locale& locale) 00354 { 00355 while (begin < end) 00356 { 00357 Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale); 00358 output = Encode(codepoint, output); 00359 } 00360 00361 return output; 00362 } 00363 00364 00366 template <typename In, typename Out> 00367 Out Utf<16>::FromWide(In begin, In end, Out output) 00368 { 00369 while (begin < end) 00370 { 00371 Uint32 codepoint = Utf<32>::DecodeWide(*begin++); 00372 output = Encode(codepoint, output); 00373 } 00374 00375 return output; 00376 } 00377 00378 00380 template <typename In, typename Out> 00381 Out Utf<16>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) 00382 { 00383 while (begin < end) 00384 { 00385 Uint32 codepoint; 00386 begin = Decode(begin, end, codepoint); 00387 output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale); 00388 } 00389 00390 return output; 00391 } 00392 00393 00395 template <typename In, typename Out> 00396 Out Utf<16>::ToWide(In begin, In end, Out output, wchar_t replacement) 00397 { 00398 while (begin < end) 00399 { 00400 Uint32 codepoint; 00401 begin = Decode(begin, end, codepoint); 00402 output = Utf<32>::EncodeWide(codepoint, output, replacement); 00403 } 00404 00405 return output; 00406 } 00407 00408 00410 template <typename In, typename Out> 00411 Out Utf<16>::ToUtf8(In begin, In end, Out output) 00412 { 00413 while (begin < end) 00414 { 00415 Uint32 codepoint; 00416 begin = Decode(begin, end, 
codepoint); 00417 output = Utf<8>::Encode(codepoint, output); 00418 } 00419 00420 return output; 00421 } 00422 00423 00425 template <typename In, typename Out> 00426 Out Utf<16>::ToUtf16(In begin, In end, Out output) 00427 { 00428 while (begin < end) 00429 *output++ = *begin++; 00430 00431 return output; 00432 } 00433 00434 00436 template <typename In, typename Out> 00437 Out Utf<16>::ToUtf32(In begin, In end, Out output) 00438 { 00439 while (begin < end) 00440 { 00441 Uint32 codepoint; 00442 begin = Decode(begin, end, codepoint); 00443 *output++ = codepoint; 00444 } 00445 00446 return output; 00447 } 00448 00449 00451 template <typename In> 00452 In Utf<32>::Decode(In begin, In end, Uint32& output, Uint32) 00453 { 00454 output = *begin++; 00455 return begin; 00456 } 00457 00458 00460 template <typename Out> 00461 Out Utf<32>::Encode(Uint32 input, Out output, Uint32 replacement) 00462 { 00463 *output++ = input; 00464 return output; 00465 } 00466 00467 00469 template <typename In> 00470 In Utf<32>::Next(In begin, In end) 00471 { 00472 return ++begin; 00473 } 00474 00475 00477 template <typename In> 00478 std::size_t Utf<32>::Count(In begin, In end) 00479 { 00480 return begin - end; 00481 } 00482 00483 00485 template <typename In, typename Out> 00486 Out Utf<32>::FromAnsi(In begin, In end, Out output, const std::locale& locale) 00487 { 00488 while (begin < end) 00489 *output++ = DecodeAnsi(*begin++, locale); 00490 00491 return output; 00492 } 00493 00494 00496 template <typename In, typename Out> 00497 Out Utf<32>::FromWide(In begin, In end, Out output) 00498 { 00499 while (begin < end) 00500 *output++ = DecodeWide(*begin++); 00501 00502 return output; 00503 } 00504 00505 00507 template <typename In, typename Out> 00508 Out Utf<32>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) 00509 { 00510 while (begin < end) 00511 output = EncodeAnsi(*begin++, output, replacement, locale); 00512 00513 return output; 00514 } 00515 00516 00518 
template <typename In, typename Out> 00519 Out Utf<32>::ToWide(In begin, In end, Out output, wchar_t replacement) 00520 { 00521 while (begin < end) 00522 output = EncodeWide(*begin++, output, replacement); 00523 00524 return output; 00525 } 00526 00527 00529 template <typename In, typename Out> 00530 Out Utf<32>::ToUtf8(In begin, In end, Out output) 00531 { 00532 while (begin < end) 00533 output = Utf<8>::Encode(*begin++, output); 00534 00535 return output; 00536 } 00537 00539 template <typename In, typename Out> 00540 Out Utf<32>::ToUtf16(In begin, In end, Out output) 00541 { 00542 while (begin < end) 00543 output = Utf<16>::Encode(*begin++, output); 00544 00545 return output; 00546 } 00547 00548 00550 template <typename In, typename Out> 00551 Out Utf<32>::ToUtf32(In begin, In end, Out output) 00552 { 00553 while (begin < end) 00554 *output++ = *begin++; 00555 00556 return output; 00557 } 00558 00559 00561 template <typename In> 00562 Uint32 Utf<32>::DecodeAnsi(In input, const std::locale& locale) 00563 { 00564 #ifdef __MINGW32__ 00565 00566 // MinGW has almost no support for unicode stuff 00567 // As a consequence, the MinGW version of this function can only use the default locale 00568 // and ignores the one passed as parameter 00569 00570 wchar_t character = 0; 00571 mbtowc(&character, &input, 1); 00572 return static_cast<Uint32>(character); 00573 00574 #else 00575 00576 // Get the facet of the locale which deals with character conversion 00577 const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale); 00578 00579 // Use the facet to convert each character of the input string 00580 return static_cast<Uint32>(facet.widen(input)); 00581 00582 #endif 00583 } 00584 00585 00587 template <typename In> 00588 Uint32 Utf<32>::DecodeWide(In input) 00589 { 00590 // The encoding of wide characters is not well defined and is left to the system; 00591 // however we can safely assume that it is UCS-2 on Windows and 00592 // UCS-4 on Unix systems. 
00593 // In both cases, a simple copy is enough (UCS-2 is a subset of UCS-4, 00594 // and UCS-4 *is* UTF-32). 00595 00596 return input; 00597 } 00598 00599 00601 template <typename Out> 00602 Out Utf<32>::EncodeAnsi(Uint32 codepoint, Out output, char replacement, const std::locale& locale) 00603 { 00604 #ifdef __MINGW32__ 00605 00606 // MinGW has almost no support for unicode stuff 00607 // As a consequence, the MinGW version of this function can only use the default locale 00608 // and ignores the one passed as parameter 00609 00610 char character = 0; 00611 if (wctomb(&character, static_cast<wchar_t>(codepoint)) >= 0) 00612 *output++ = character; 00613 else if (replacement) 00614 *output++ = replacement; 00615 00616 return output; 00617 00618 #else 00619 00620 // Get the facet of the locale which deals with character conversion 00621 const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale); 00622 00623 // Use the facet to convert each character of the input string 00624 *output++ = facet.narrow(static_cast<wchar_t>(codepoint), replacement); 00625 00626 return output; 00627 00628 #endif 00629 } 00630 00631 00633 template <typename Out> 00634 Out Utf<32>::EncodeWide(Uint32 codepoint, Out output, wchar_t replacement) 00635 { 00636 // The encoding of wide characters is not well defined and is left to the system; 00637 // however we can safely assume that it is UCS-2 on Windows and 00638 // UCS-4 on Unix systems. 00639 // For UCS-2 we need to check if the source characters fits in (UCS-2 is a subset of UCS-4). 00640 // For UCS-4 we can do a direct copy (UCS-4 *is* UTF-32). 
00641 00642 switch (sizeof(wchar_t)) 00643 { 00644 case 4: 00645 { 00646 *output++ = static_cast<wchar_t>(codepoint); 00647 break; 00648 } 00649 00650 default: 00651 { 00652 if ((codepoint <= 0xFFFF) && ((codepoint < 0xD800) || (codepoint > 0xDFFF))) 00653 { 00654 *output++ = static_cast<wchar_t>(codepoint); 00655 } 00656 else if (replacement) 00657 { 00658 *output++ = replacement; 00659 } 00660 break; 00661 } 00662 } 00663 00664 return output; 00665 }
:: Copyright © 2007-2008 Laurent Gomila, all rights reserved :: Documentation generated by doxygen 1.5.2 ::