SFML logo
  • Main Page
  • Namespaces
  • Classes
  • Files
  • File List

Utf.inl

00001 
00002 //
00003 // SFML - Simple and Fast Multimedia Library
00004 // Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com)
00005 //
00006 // This software is provided 'as-is', without any express or implied warranty.
00007 // In no event will the authors be held liable for any damages arising from the use of this software.
00008 //
00009 // Permission is granted to anyone to use this software for any purpose,
00010 // including commercial applications, and to alter it and redistribute it freely,
00011 // subject to the following restrictions:
00012 //
00013 // 1. The origin of this software must not be misrepresented;
00014 //    you must not claim that you wrote the original software.
00015 //    If you use this software in a product, an acknowledgment
00016 //    in the product documentation would be appreciated but is not required.
00017 //
00018 // 2. Altered source versions must be plainly marked as such,
00019 //    and must not be misrepresented as being the original software.
00020 //
00021 // 3. This notice may not be removed or altered from any source distribution.
00022 //
00024 
00025 
00027 template <typename In>
00028 In Utf<8>::Decode(In begin, In end, Uint32& output, Uint32 replacement)
00029 {
00030     // Some useful precomputed data
00031     static const int trailing[256] =
00032     {
00033         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00034         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00035         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00036         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00037         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00038         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00039         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00040         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
00041     };
00042     static const Uint32 offsets[6] =
00043     {
00044         0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080
00045     };
00046 
00047     // Decode the character
00048     int trailingBytes = trailing[static_cast<int>(*begin)];
00049     if (begin + trailingBytes < end)
00050     {
00051         output = 0;
00052         switch (trailingBytes)
00053         {
00054             case 5 : output += *begin++; output <<= 6;
00055             case 4 : output += *begin++; output <<= 6;
00056             case 3 : output += *begin++; output <<= 6;
00057             case 2 : output += *begin++; output <<= 6;
00058             case 1 : output += *begin++; output <<= 6;
00059             case 0 : output += *begin++;
00060         }
00061         output -= offsets[trailingBytes];
00062     }
00063     else
00064     {
00065         // Incomplete character
00066         begin = end;
00067         output = replacement;
00068     }
00069 
00070     return begin;
00071 }
00072 
00073 
00075 template <typename Out>
00076 Out Utf<8>::Encode(Uint32 input, Out output, Uint8 replacement)
00077 {
00078     // Some useful precomputed data
00079     static const Uint8 firstBytes[7] =
00080     {
00081         0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
00082     };
00083 
00084     // Encode the character
00085     if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF)))
00086     {
00087         // Invalid character
00088         if (replacement)
00089             *output++ = replacement;
00090     }
00091     else
00092     {
00093         // Valid character
00094 
00095         // Get the number of bytes to write
00096         int bytesToWrite = 1;
00097         if      (input <  0x80)       bytesToWrite = 1;
00098         else if (input <  0x800)      bytesToWrite = 2;
00099         else if (input <  0x10000)    bytesToWrite = 3;
00100         else if (input <= 0x0010FFFF) bytesToWrite = 4;
00101 
00102         // Extract the bytes to write
00103         Uint8 bytes[4];
00104         switch (bytesToWrite)
00105         {
00106             case 4 : bytes[3] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
00107             case 3 : bytes[2] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
00108             case 2 : bytes[1] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
00109             case 1 : bytes[0] = static_cast<Uint8> (input | firstBytes[bytesToWrite]);
00110         }
00111 
00112         // Add them to the output
00113         const Uint8* currentByte = bytes;
00114         switch (bytesToWrite)
00115         {
00116             case 4 : *output++ = *currentByte++;
00117             case 3 : *output++ = *currentByte++;
00118             case 2 : *output++ = *currentByte++;
00119             case 1 : *output++ = *currentByte++;
00120         }
00121     }
00122 
00123     return output;
00124 }
00125 
00126 
00128 template <typename In>
00129 In Utf<8>::Next(In begin, In end)
00130 {
00131     Uint32 codepoint;
00132     return Decode(begin, end, codepoint);
00133 }
00134 
00135 
00137 template <typename In>
00138 std::size_t Utf<8>::Count(In begin, In end)
00139 {
00140     std::size_t length = 0;
00141     while (begin < end)
00142     {
00143         begin = Next(begin, end);
00144         ++length;
00145     }
00146 
00147     return length;
00148 }
00149 
00150 
00152 template <typename In, typename Out>
00153 Out Utf<8>::FromAnsi(In begin, In end, Out output, const std::locale& locale)
00154 {
00155     while (begin < end)
00156     {
00157         Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale);
00158         output = Encode(codepoint, output);
00159     }
00160 
00161     return output;
00162 }
00163 
00164 
00166 template <typename In, typename Out>
00167 Out Utf<8>::FromWide(In begin, In end, Out output)
00168 {
00169     while (begin < end)
00170     {
00171         Uint32 codepoint = Utf<32>::DecodeWide(*begin++);
00172         output = Encode(codepoint, output);
00173     }
00174 
00175     return output;
00176 }
00177 
00178 
00180 template <typename In, typename Out>
00181 Out Utf<8>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
00182 {
00183     while (begin < end)
00184     {
00185         Uint32 codepoint;
00186         begin = Decode(begin, end, codepoint);
00187         output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale);
00188     }
00189 
00190     return output;
00191 }
00192 
00193 
00195 template <typename In, typename Out>
00196 Out Utf<8>::ToWide(In begin, In end, Out output, wchar_t replacement)
00197 {
00198     while (begin < end)
00199     {
00200         Uint32 codepoint;
00201         begin = Decode(begin, end, codepoint);
00202         output = Utf<32>::EncodeWide(codepoint, output, replacement);
00203     }
00204 
00205     return output;
00206 }
00207 
00208 
00210 template <typename In, typename Out>
00211 Out Utf<8>::ToUtf8(In begin, In end, Out output)
00212 {
00213     while (begin < end)
00214         *output++ = *begin++;
00215 
00216     return output;
00217 }
00218 
00219 
00221 template <typename In, typename Out>
00222 Out Utf<8>::ToUtf16(In begin, In end, Out output)
00223 {
00224     while (begin < end)
00225     {
00226         Uint32 codepoint;
00227         begin = Decode(begin, end, codepoint);
00228         output = Utf<16>::Encode(codepoint, output);
00229     }
00230 
00231     return output;
00232 }
00233 
00234 
00236 template <typename In, typename Out>
00237 Out Utf<8>::ToUtf32(In begin, In end, Out output)
00238 {
00239     while (begin < end)
00240     {
00241         Uint32 codepoint;
00242         begin = Decode(begin, end, codepoint);
00243         *output++ = codepoint;
00244     }
00245 
00246     return output;
00247 }
00248 
00249 
00251 template <typename In>
00252 In Utf<16>::Decode(In begin, In end, Uint32& output, Uint32 replacement)
00253 {
00254     Uint16 first = *begin++;
00255 
00256     // If it's a surrogate pair, first convert to a single UTF-32 character
00257     if ((first >= 0xD800) && (first <= 0xDBFF))
00258     {
00259         if (begin < end)
00260         {
00261             Uint32 second = *begin++;
00262             if ((second >= 0xDC00) && (second <= 0xDFFF))
00263             {
00264                 // The second element is valid: convert the two elements to a UTF-32 character
00265                 output = static_cast<Uint32>(((first - 0xD800) << 10) + (second - 0xDC00) + 0x0010000);
00266             }
00267             else
00268             {
00269                 // Invalid character
00270                 output = replacement;
00271             }
00272         }
00273         else
00274         {
00275             // Invalid character
00276             begin = end;
00277             output = replacement;
00278         }
00279     }
00280     else
00281     {
00282         // We can make a direct copy
00283         output = first;
00284     }
00285 
00286     return begin;
00287 }
00288 
00289 
00291 template <typename Out>
00292 Out Utf<16>::Encode(Uint32 input, Out output, Uint16 replacement)
00293 {
00294     if (input < 0xFFFF)
00295     {
00296         // The character can be copied directly, we just need to check if it's in the valid range
00297         if ((input >= 0xD800) && (input <= 0xDFFF))
00298         {
00299             // Invalid character (this range is reserved)
00300             if (replacement)
00301                 *output++ = replacement;
00302         }
00303         else
00304         {
00305             // Valid character directly convertible to a single UTF-16 character
00306             *output++ = static_cast<Uint16>(input);
00307         }
00308     }
00309     else if (input > 0x0010FFFF)
00310     {
00311         // Invalid character (greater than the maximum unicode value)
00312         if (replacement)
00313             *output++ = replacement;
00314     }
00315     else
00316     {
00317         // The input character will be converted to two UTF-16 elements
00318         input -= 0x0010000;
00319         *output++ = static_cast<Uint16>((input >> 10)     + 0xD800);
00320         *output++ = static_cast<Uint16>((input & 0x3FFUL) + 0xDC00);
00321     }
00322 
00323     return output;
00324 }
00325 
00326 
00328 template <typename In>
00329 In Utf<16>::Next(In begin, In end)
00330 {
00331     Uint32 codepoint;
00332     return Decode(begin, end, codepoint);
00333 }
00334 
00335 
00337 template <typename In>
00338 std::size_t Utf<16>::Count(In begin, In end)
00339 {
00340     std::size_t length = 0;
00341     while (begin < end)
00342     {
00343         begin = Next(begin, end);
00344         ++length;
00345     }
00346 
00347     return length;
00348 }
00349 
00350 
00352 template <typename In, typename Out>
00353 Out Utf<16>::FromAnsi(In begin, In end, Out output, const std::locale& locale)
00354 {
00355     while (begin < end)
00356     {
00357         Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale);
00358         output = Encode(codepoint, output);
00359     }
00360 
00361     return output;
00362 }
00363 
00364 
00366 template <typename In, typename Out>
00367 Out Utf<16>::FromWide(In begin, In end, Out output)
00368 {
00369     while (begin < end)
00370     {
00371         Uint32 codepoint = Utf<32>::DecodeWide(*begin++);
00372         output = Encode(codepoint, output);
00373     }
00374 
00375     return output;
00376 }
00377 
00378 
00380 template <typename In, typename Out>
00381 Out Utf<16>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
00382 {
00383     while (begin < end)
00384     {
00385         Uint32 codepoint;
00386         begin = Decode(begin, end, codepoint);
00387         output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale);
00388     }
00389 
00390     return output;
00391 }
00392 
00393 
00395 template <typename In, typename Out>
00396 Out Utf<16>::ToWide(In begin, In end, Out output, wchar_t replacement)
00397 {
00398     while (begin < end)
00399     {
00400         Uint32 codepoint;
00401         begin = Decode(begin, end, codepoint);
00402         output = Utf<32>::EncodeWide(codepoint, output, replacement);
00403     }
00404 
00405     return output;
00406 }
00407 
00408 
00410 template <typename In, typename Out>
00411 Out Utf<16>::ToUtf8(In begin, In end, Out output)
00412 {
00413     while (begin < end)
00414     {
00415         Uint32 codepoint;
00416         begin = Decode(begin, end, codepoint);
00417         output = Utf<8>::Encode(codepoint, output);
00418     }
00419 
00420     return output;
00421 }
00422 
00423 
00425 template <typename In, typename Out>
00426 Out Utf<16>::ToUtf16(In begin, In end, Out output)
00427 {
00428     while (begin < end)
00429         *output++ = *begin++;
00430 
00431     return output;
00432 }
00433 
00434 
00436 template <typename In, typename Out>
00437 Out Utf<16>::ToUtf32(In begin, In end, Out output)
00438 {
00439     while (begin < end)
00440     {
00441         Uint32 codepoint;
00442         begin = Decode(begin, end, codepoint);
00443         *output++ = codepoint;
00444     }
00445 
00446     return output;
00447 }
00448 
00449 
00451 template <typename In>
00452 In Utf<32>::Decode(In begin, In end, Uint32& output, Uint32)
00453 {
00454     output = *begin++;
00455     return begin;
00456 }
00457 
00458 
00460 template <typename Out>
00461 Out Utf<32>::Encode(Uint32 input, Out output, Uint32 replacement)
00462 {
00463     *output++ = input;
00464     return output;
00465 }
00466 
00467 
00469 template <typename In>
00470 In Utf<32>::Next(In begin, In end)
00471 {
00472     return ++begin;
00473 }
00474 
00475 
00477 template <typename In>
00478 std::size_t Utf<32>::Count(In begin, In end)
00479 {
00480     return begin - end;
00481 }
00482 
00483 
00485 template <typename In, typename Out>
00486 Out Utf<32>::FromAnsi(In begin, In end, Out output, const std::locale& locale)
00487 {
00488     while (begin < end)
00489         *output++ = DecodeAnsi(*begin++, locale);
00490 
00491     return output;
00492 }
00493 
00494 
00496 template <typename In, typename Out>
00497 Out Utf<32>::FromWide(In begin, In end, Out output)
00498 {
00499     while (begin < end)
00500         *output++ = DecodeWide(*begin++);
00501 
00502     return output;
00503 }
00504 
00505 
00507 template <typename In, typename Out>
00508 Out Utf<32>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
00509 {
00510     while (begin < end)
00511         output = EncodeAnsi(*begin++, output, replacement, locale);
00512 
00513     return output;
00514 }
00515 
00516 
00518 template <typename In, typename Out>
00519 Out Utf<32>::ToWide(In begin, In end, Out output, wchar_t replacement)
00520 {
00521     while (begin < end)
00522         output = EncodeWide(*begin++, output, replacement);
00523 
00524     return output;
00525 }
00526 
00527 
00529 template <typename In, typename Out>
00530 Out Utf<32>::ToUtf8(In begin, In end, Out output)
00531 {
00532     while (begin < end)
00533         output = Utf<8>::Encode(*begin++, output);
00534 
00535     return output;
00536 }
00537 
00539 template <typename In, typename Out>
00540 Out Utf<32>::ToUtf16(In begin, In end, Out output)
00541 {
00542     while (begin < end)
00543         output = Utf<16>::Encode(*begin++, output);
00544 
00545     return output;
00546 }
00547 
00548 
00550 template <typename In, typename Out>
00551 Out Utf<32>::ToUtf32(In begin, In end, Out output)
00552 {
00553     while (begin < end)
00554         *output++ = *begin++;
00555 
00556     return output;
00557 }
00558 
00559 
00561 template <typename In>
00562 Uint32 Utf<32>::DecodeAnsi(In input, const std::locale& locale)
00563 {
00564     #ifdef __MINGW32__
00565 
00566         // MinGW has almost no support for unicode stuff
00567         // As a consequence, the MinGW version of this function can only use the default locale
00568         // and ignores the one passed as parameter
00569 
00570         wchar_t character = 0;
00571         mbtowc(&character, &input, 1);
00572         return static_cast<Uint32>(character);
00573 
00574     #else
00575 
00576         // Get the facet of the locale which deals with character conversion
00577         const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale);
00578 
00579         // Use the facet to convert each character of the input string
00580         return static_cast<Uint32>(facet.widen(input));
00581 
00582     #endif
00583 }
00584 
00585 
00587 template <typename In>
00588 Uint32 Utf<32>::DecodeWide(In input)
00589 {
00590     // The encoding of wide characters is not well defined and is left to the system;
00591     // however we can safely assume that it is UCS-2 on Windows and
00592     // UCS-4 on Unix systems.
00593     // In both cases, a simple copy is enough (UCS-2 is a subset of UCS-4,
00594     // and UCS-4 *is* UTF-32).
00595 
00596     return input;
00597 }
00598 
00599 
00601 template <typename Out>
00602 Out Utf<32>::EncodeAnsi(Uint32 codepoint, Out output, char replacement, const std::locale& locale)
00603 {
00604     #ifdef __MINGW32__
00605 
00606         // MinGW has almost no support for unicode stuff
00607         // As a consequence, the MinGW version of this function can only use the default locale
00608         // and ignores the one passed as parameter
00609 
00610         char character = 0;
00611         if (wctomb(&character, static_cast<wchar_t>(codepoint)) >= 0)
00612             *output++ = character;
00613         else if (replacement)
00614             *output++ = replacement;
00615 
00616         return output;
00617 
00618     #else
00619 
00620         // Get the facet of the locale which deals with character conversion
00621         const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale);
00622 
00623         // Use the facet to convert each character of the input string
00624         *output++ = facet.narrow(static_cast<wchar_t>(codepoint), replacement);
00625 
00626         return output;
00627 
00628     #endif
00629 }
00630 
00631 
00633 template <typename Out>
00634 Out Utf<32>::EncodeWide(Uint32 codepoint, Out output, wchar_t replacement)
00635 {
00636     // The encoding of wide characters is not well defined and is left to the system;
00637     // however we can safely assume that it is UCS-2 on Windows and
00638     // UCS-4 on Unix systems.
00639     // For UCS-2 we need to check if the source characters fits in (UCS-2 is a subset of UCS-4).
00640     // For UCS-4 we can do a direct copy (UCS-4 *is* UTF-32).
00641 
00642     switch (sizeof(wchar_t))
00643     {
00644         case 4:
00645         {
00646             *output++ = static_cast<wchar_t>(codepoint);
00647             break;
00648         }
00649 
00650         default:
00651         {
00652             if ((codepoint <= 0xFFFF) && ((codepoint < 0xD800) || (codepoint > 0xDFFF)))
00653             {
00654                 *output++ = static_cast<wchar_t>(codepoint);
00655             }
00656             else if (replacement)
00657             {
00658                 *output++ = replacement;
00659             }
00660             break;
00661         }
00662     }
00663 
00664     return output;
00665 }

 ::  Copyright © 2007-2008 Laurent Gomila, all rights reserved  ::  Documentation generated by doxygen 1.5.2  ::