Unicode.inl
1
2//
3// SFML - Simple and Fast Multimedia Library
4// Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com)
5//
6// This software is provided 'as-is', without any express or implied warranty.
7// In no event will the authors be held liable for any damages arising from the use of this software.
8//
9// Permission is granted to anyone to use this software for any purpose,
10// including commercial applications, and to alter it and redistribute it freely,
11// subject to the following restrictions:
12//
13// 1. The origin of this software must not be misrepresented;
14// you must not claim that you wrote the original software.
15// If you use this software in a product, an acknowledgment
16// in the product documentation would be appreciated but is not required.
17//
18// 2. Altered source versions must be plainly marked as such,
19// and must not be misrepresented as being the original software.
20//
21// 3. This notice may not be removed or altered from any source distribution.
22//
24
25
30template <typename In, typename Out>
31inline Out Unicode::UTF32ToANSI(In Begin, In End, Out Output, char Replacement, const std::locale& Locale)
32{
33 #ifdef __MINGW32__
34
35 // MinGW has a almost no support for unicode stuff
36 // As a consequence, the MinGW version of this function can only use the default locale
37 // and ignores the one passed as parameter
38 while (Begin < End)
39 {
40 char Char = 0;
41 if (wctomb(&Char, static_cast<wchar_t>(*Begin++)) >= 0)
42 *Output++ = Char;
43 else if (Replacement)
44 *Output++ = Replacement;
45 }
46
47 #else
48
49 // Get the facet of the locale which deals with character conversion
50 const std::ctype<wchar_t>& Facet = std::use_facet< std::ctype<wchar_t> >(Locale);
51
52 // Use the facet to convert each character of the input string
53 while (Begin < End)
54 *Output++ = Facet.narrow(static_cast<wchar_t>(*Begin++), Replacement);
55
56 #endif
57
58 return Output;
59}
60
61
66template <typename In, typename Out>
67inline Out Unicode::ANSIToUTF32(In Begin, In End, Out Output, const std::locale& Locale)
68{
69 #ifdef __MINGW32__
70
71 // MinGW has a almost no support for unicode stuff
72 // As a consequence, the MinGW version of this function can only use the default locale
73 // and ignores the one passed as parameter
74 while (Begin < End)
75 {
76 wchar_t Char = 0;
77 mbtowc(&Char, &*Begin, 1);
78 Begin++;
79 *Output++ = static_cast<Uint32>(Char);
80 }
81
82 #else
83
84 // Get the facet of the locale which deals with character conversion
85 const std::ctype<wchar_t>& Facet = std::use_facet< std::ctype<wchar_t> >(Locale);
86
87 // Use the facet to convert each character of the input string
88 while (Begin < End)
89 *Output++ = static_cast<Uint32>(Facet.widen(*Begin++));
90
91 #endif
92
93 return Output;
94}
95
96
101template <typename In, typename Out>
102inline Out Unicode::UTF8ToUTF16(In Begin, In End, Out Output, Uint16 Replacement)
103{
104 while (Begin < End)
105 {
106 Uint32 c = 0;
107 int TrailingBytes = UTF8TrailingBytes[static_cast<int>(*Begin)];
108 if (Begin + TrailingBytes < End)
109 {
110 // First decode the UTF-8 character
111 switch (TrailingBytes)
112 {
113 case 5 : c += *Begin++; c <<= 6;
114 case 4 : c += *Begin++; c <<= 6;
115 case 3 : c += *Begin++; c <<= 6;
116 case 2 : c += *Begin++; c <<= 6;
117 case 1 : c += *Begin++; c <<= 6;
118 case 0 : c += *Begin++;
119 }
120 c -= UTF8Offsets[TrailingBytes];
121
122 // Then encode it in UTF-16
123 if (c < 0xFFFF)
124 {
125 // Character can be converted directly to 16 bits, just need to check it's in the valid range
126 if ((c >= 0xD800) && (c <= 0xDFFF))
127 {
128 // Invalid character (this range is reserved)
129 if (Replacement)
130 *Output++ = Replacement;
131 }
132 else
133 {
134 // Valid character directly convertible to 16 bits
135 *Output++ = static_cast<Uint16>(c);
136 }
137 }
138 else if (c > 0x0010FFFF)
139 {
140 // Invalid character (greater than the maximum unicode value)
141 if (Replacement)
142 *Output++ = Replacement;
143 }
144 else
145 {
146 // Character will be converted to 2 UTF-16 elements
147 c -= 0x0010000;
148 *Output++ = static_cast<Uint16>((c >> 10) + 0xD800);
149 *Output++ = static_cast<Uint16>((c & 0x3FFUL) + 0xDC00);
150 }
151 }
152 }
153
154 return Output;
155}
156
157
162template <typename In, typename Out>
163inline Out Unicode::UTF8ToUTF32(In Begin, In End, Out Output, Uint32 Replacement)
164{
165 while (Begin < End)
166 {
167 Uint32 c = 0;
168 int TrailingBytes = UTF8TrailingBytes[static_cast<int>(*Begin)];
169 if (Begin + TrailingBytes < End)
170 {
171 // First decode the UTF-8 character
172 switch (TrailingBytes)
173 {
174 case 5 : c += *Begin++; c <<= 6;
175 case 4 : c += *Begin++; c <<= 6;
176 case 3 : c += *Begin++; c <<= 6;
177 case 2 : c += *Begin++; c <<= 6;
178 case 1 : c += *Begin++; c <<= 6;
179 case 0 : c += *Begin++;
180 }
181 c -= UTF8Offsets[TrailingBytes];
182
183 // Then write it if valid
184 if ((c < 0xD800) || (c > 0xDFFF))
185 {
186 // Valid UTF-32 character
187 *Output++ = c;
188 }
189 else
190 {
191 // Invalid UTF-32 character
192 if (Replacement)
193 *Output++ = Replacement;
194 }
195 }
196 }
197
198 return Output;
199}
200
201
206template <typename In, typename Out>
207inline Out Unicode::UTF16ToUTF8(In Begin, In End, Out Output, Uint8 Replacement)
208{
209 while (Begin < End)
210 {
211 Uint32 c = *Begin++;
212
213 // If it's a surrogate pair, first convert to a single UTF-32 character
214 if ((c >= 0xD800) && (c <= 0xDBFF))
215 {
216 if (Begin < End)
217 {
218 // The second element is valid : convert the two elements to a UTF-32 character
219 Uint32 d = *Begin++;
220 if ((d >= 0xDC00) && (d <= 0xDFFF))
221 c = static_cast<Uint32>(((c - 0xD800) << 10) + (d - 0xDC00) + 0x0010000);
222 }
223 else
224 {
225 // Invalid second element
226 if (Replacement)
227 *Output++ = Replacement;
228 }
229 }
230
231 // Then convert to UTF-8
232 if (c > 0x0010FFFF)
233 {
234 // Invalid character (greater than the maximum unicode value)
235 if (Replacement)
236 *Output++ = Replacement;
237 }
238 else
239 {
240 // Valid character
241
242 // Get number of bytes to write
243 int BytesToWrite = 1;
244 if (c < 0x80) BytesToWrite = 1;
245 else if (c < 0x800) BytesToWrite = 2;
246 else if (c < 0x10000) BytesToWrite = 3;
247 else if (c <= 0x0010FFFF) BytesToWrite = 4;
248
249 // Extract bytes to write
250 Uint8 Bytes[4];
251 switch (BytesToWrite)
252 {
253 case 4 : Bytes[3] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
254 case 3 : Bytes[2] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
255 case 2 : Bytes[1] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
256 case 1 : Bytes[0] = static_cast<Uint8> (c | UTF8FirstBytes[BytesToWrite]);
257 }
258
259 // Add them to the output
260 const Uint8* CurByte = Bytes;
261 switch (BytesToWrite)
262 {
263 case 4 : *Output++ = *CurByte++;
264 case 3 : *Output++ = *CurByte++;
265 case 2 : *Output++ = *CurByte++;
266 case 1 : *Output++ = *CurByte++;
267 }
268 }
269 }
270
271 return Output;
272}
273
274
279template <typename In, typename Out>
280inline Out Unicode::UTF16ToUTF32(In Begin, In End, Out Output, Uint32 Replacement)
281{
282 while (Begin < End)
283 {
284 Uint16 c = *Begin++;
285 if ((c >= 0xD800) && (c <= 0xDBFF))
286 {
287 // We have a surrogate pair, ie. a character composed of two elements
288 if (Begin < End)
289 {
290 Uint16 d = *Begin++;
291 if ((d >= 0xDC00) && (d <= 0xDFFF))
292 {
293 // The second element is valid : convert the two elements to a UTF-32 character
294 *Output++ = static_cast<Uint32>(((c - 0xD800) << 10) + (d - 0xDC00) + 0x0010000);
295 }
296 else
297 {
298 // Invalid second element
299 if (Replacement)
300 *Output++ = Replacement;
301 }
302 }
303 }
304 else if ((c >= 0xDC00) && (c <= 0xDFFF))
305 {
306 // Invalid character
307 if (Replacement)
308 *Output++ = Replacement;
309 }
310 else
311 {
312 // Valid character directly convertible to UTF-32
313 *Output++ = static_cast<Uint32>(c);
314 }
315 }
316
317 return Output;
318}
319
320
325template <typename In, typename Out>
326inline Out Unicode::UTF32ToUTF8(In Begin, In End, Out Output, Uint8 Replacement)
327{
328 while (Begin < End)
329 {
330 Uint32 c = *Begin++;
331 if (c > 0x0010FFFF)
332 {
333 // Invalid character (greater than the maximum unicode value)
334 if (Replacement)
335 *Output++ = Replacement;
336 }
337 else
338 {
339 // Valid character
340
341 // Get number of bytes to write
342 int BytesToWrite = 1;
343 if (c < 0x80) BytesToWrite = 1;
344 else if (c < 0x800) BytesToWrite = 2;
345 else if (c < 0x10000) BytesToWrite = 3;
346 else if (c <= 0x0010FFFF) BytesToWrite = 4;
347
348 // Extract bytes to write
349 Uint8 Bytes[4];
350 switch (BytesToWrite)
351 {
352 case 4 : Bytes[3] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
353 case 3 : Bytes[2] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
354 case 2 : Bytes[1] = static_cast<Uint8>((c | 0x80) & 0xBF); c >>= 6;
355 case 1 : Bytes[0] = static_cast<Uint8> (c | UTF8FirstBytes[BytesToWrite]);
356 }
357
358 // Add them to the output
359 const Uint8* CurByte = Bytes;
360 switch (BytesToWrite)
361 {
362 case 4 : *Output++ = *CurByte++;
363 case 3 : *Output++ = *CurByte++;
364 case 2 : *Output++ = *CurByte++;
365 case 1 : *Output++ = *CurByte++;
366 }
367 }
368 }
369
370 return Output;
371}
372
373
378template <typename In, typename Out>
379inline Out Unicode::UTF32ToUTF16(In Begin, In End, Out Output, Uint16 Replacement)
380{
381 while (Begin < End)
382 {
383 Uint32 c = *Begin++;
384 if (c < 0xFFFF)
385 {
386 // Character can be converted directly to 16 bits, just need to check it's in the valid range
387 if ((c >= 0xD800) && (c <= 0xDFFF))
388 {
389 // Invalid character (this range is reserved)
390 if (Replacement)
391 *Output++ = Replacement;
392 }
393 else
394 {
395 // Valid character directly convertible to 16 bits
396 *Output++ = static_cast<Uint16>(c);
397 }
398 }
399 else if (c > 0x0010FFFF)
400 {
401 // Invalid character (greater than the maximum unicode value)
402 if (Replacement)
403 *Output++ = Replacement;
404 }
405 else
406 {
407 // Character will be converted to 2 UTF-16 elements
408 c -= 0x0010000;
409 *Output++ = static_cast<Uint16>((c >> 10) + 0xD800);
410 *Output++ = static_cast<Uint16>((c & 0x3FFUL) + 0xDC00);
411 }
412 }
413
414 return Output;
415}
416
417
421template <typename In>
422inline std::size_t Unicode::GetUTF8Length(In Begin, In End)
423{
424 std::size_t Length = 0;
425 while (Begin < End)
426 {
427 int NbBytes = UTF8TrailingBytes[static_cast<int>(*Begin)];
428 if (Begin + NbBytes < End)
429 ++Length;
430
431 Begin += NbBytes + 1;
432 }
433
434 return Length;
435}
436
437
441template <typename In>
442inline std::size_t Unicode::GetUTF16Length(In Begin, In End)
443{
444 std::size_t Length = 0;
445 while (Begin < End)
446 {
447 if ((*Begin >= 0xD800) && (*Begin <= 0xDBFF))
448 {
449 ++Begin;
450 if ((Begin < End) && ((*Begin >= 0xDC00) && (*Begin <= 0xDFFF)))
451 {
452 ++Length;
453 }
454 }
455 else
456 {
457 ++Length;
458 }
459
460 ++Begin;
461 }
462
463 return Length;
464}
465
466
470template <typename In>
471inline std::size_t Unicode::GetUTF32Length(In Begin, In End)
472{
473 return End - Begin;
474}
static Out ANSIToUTF32(In Begin, In End, Out Output, const std::locale &Locale=GetDefaultLocale())
Generic function to convert an ANSI characters range to an UTF-32 characters range,...
Definition Unicode.hpp:68
static std::size_t GetUTF16Length(In Begin, In End)
Get the number of characters composing an UTF-16 string.
Definition Unicode.hpp:443
static Out UTF8ToUTF16(In Begin, In End, Out Output, Uint16 Replacement='?')
Generic function to convert an UTF-8 characters range to an UTF-16 characters range,...
Definition Unicode.hpp:103
static Out UTF32ToUTF8(In Begin, In End, Out Output, Uint8 Replacement='?')
Generic function to convert an UTF-32 characters range to an UTF-8 characters range,...
Definition Unicode.hpp:327
static Out UTF16ToUTF32(In Begin, In End, Out Output, Uint32 Replacement='?')
Generic function to convert an UTF-16 characters range to an UTF-32 characters range,...
Definition Unicode.hpp:281
static std::size_t GetUTF32Length(In Begin, In End)
Get the number of characters composing an UTF-32 string.
Definition Unicode.hpp:472
static Out UTF16ToUTF8(In Begin, In End, Out Output, Uint8 Replacement='?')
Generic function to convert an UTF-16 characters range to an UTF-8 characters range,...
Definition Unicode.hpp:208
static Out UTF32ToANSI(In Begin, In End, Out Output, char Replacement='?', const std::locale &Locale=GetDefaultLocale())
Generic function to convert an UTF-32 characters range to an ANSI characters range,...
Definition Unicode.hpp:32
static Out UTF32ToUTF16(In Begin, In End, Out Output, Uint16 Replacement='?')
Generic function to convert an UTF-32 characters range to an UTF-16 characters range,...
Definition Unicode.hpp:380
static std::size_t GetUTF8Length(In Begin, In End)
Get the number of characters composing an UTF-8 string.
Definition Unicode.hpp:423
static Out UTF8ToUTF32(In Begin, In End, Out Output, Uint32 Replacement='?')
Generic function to convert an UTF-8 characters range to an UTF-32 characters range,...
Definition Unicode.hpp:164