MyGUI  3.2.2
MyGUI_UString.cpp
Go to the documentation of this file.
1 /*
2  * This source file is part of MyGUI. For the latest info, see http://mygui.info/
3  * Distributed under the MIT License
4  * (See accompanying file COPYING.MIT or copy at http://opensource.org/licenses/MIT)
5  */
6 
7 #include "MyGUI_Precompiled.h"
8 #include "MyGUI_UString.h"
9 
10 namespace MyGUI
11 {
12 
13  //--------------------------------------------------------------------------
15  {
16  mString = 0;
17  }
18  //--------------------------------------------------------------------------
20  {
21  mIter += c;
22  }
23  //--------------------------------------------------------------------------
25  {
26  mIter -= c;
27  }
28  //--------------------------------------------------------------------------
30  {
31  mIter = i.mIter;
32  mString = i.mString;
33  }
34  //--------------------------------------------------------------------------
36  {
37  return mIter == mString->mData.begin();
38  }
39  //--------------------------------------------------------------------------
41  {
42  return mIter == mString->mData.end();
43  }
44  //--------------------------------------------------------------------------
46  {
47  return mIter - mString->mData.begin();
48  }
49  //--------------------------------------------------------------------------
51  {
52  mIter = mString->mData.begin() + index;
53  }
54  //--------------------------------------------------------------------------
56  {
57  size_type current_index = _get_index();
58  return mString->getChar( current_index );
59  }
60  //--------------------------------------------------------------------------
62  {
63  size_type current_index = _get_index();
64  int change = mString->setChar( current_index, uc );
65  _jump_to( current_index );
66  return change;
67  }
68  //--------------------------------------------------------------------------
70  {
71  _seekFwd( 1 ); // move 1 code point forward
72  if ( _test_end() ) return; // exit if we hit the end
73  if ( _utf16_surrogate_follow( mIter[0] ) ) {
74  // landing on a follow code point means we might be part of a bigger character
75  // so we test for that
76  code_point lead_half = 0;
77  //NB: we can't possibly be at the beginning here, so no need to test
78  lead_half = mIter[-1]; // check the previous code point to see if we're part of a surrogate pair
79  if ( _utf16_surrogate_lead( lead_half ) ) {
80  _seekFwd( 1 ); // if so, then advance 1 more code point
81  }
82  }
83  }
84  //--------------------------------------------------------------------------
86  {
87  _seekRev( 1 ); // move 1 code point backwards
88  if ( _test_begin() ) return; // exit if we hit the beginning
89  if ( _utf16_surrogate_follow( mIter[0] ) ) {
90  // landing on a follow code point means we might be part of a bigger character
91  // so we test for that
92  code_point lead_half = 0;
93  lead_half = mIter[-1]; // check the previous character to see if we're part of a surrogate pair
94  if ( _utf16_surrogate_lead( lead_half ) ) {
95  _seekRev( 1 ); // if so, then rewind 1 more code point
96  }
97  }
98  }
99  //--------------------------------------------------------------------------
100  //--------------------------------------------------------------------------
101  //--------------------------------------------------------------------------
102  //--------------------------------------------------------------------------
104  {
105 
106  }
107  //--------------------------------------------------------------------------
109  {
110  _become( i );
111  }
112  //--------------------------------------------------------------------------
114  {
115  _seekFwd( 1 );
116  return *this;
117  }
118  //--------------------------------------------------------------------------
120  {
121  _fwd_iterator tmp( *this );
122  _seekFwd( 1 );
123  return tmp;
124  }
125  //--------------------------------------------------------------------------
127  {
128  _seekRev( 1 );
129  return *this;
130  }
131  //--------------------------------------------------------------------------
133  {
134  _fwd_iterator tmp( *this );
135  _seekRev( 1 );
136  return tmp;
137  }
138  //--------------------------------------------------------------------------
140  {
141  _fwd_iterator tmp( *this );
142  if ( n < 0 )
143  tmp._seekRev( -n );
144  else
145  tmp._seekFwd( n );
146  return tmp;
147  }
148  //--------------------------------------------------------------------------
150  {
151  _fwd_iterator tmp( *this );
152  if ( n < 0 )
153  tmp._seekFwd( -n );
154  else
155  tmp._seekRev( n );
156  return tmp;
157  }
158  //--------------------------------------------------------------------------
160  {
161  if ( n < 0 )
162  _seekRev( -n );
163  else
164  _seekFwd( n );
165  return *this;
166  }
167  //--------------------------------------------------------------------------
169  {
170  if ( n < 0 )
171  _seekFwd( -n );
172  else
173  _seekRev( n );
174  return *this;
175  }
176  //--------------------------------------------------------------------------
178  {
179  return *mIter;
180  }
181  //--------------------------------------------------------------------------
183  {
184  _fwd_iterator tmp( *this );
185  tmp += n;
186  return *tmp;
187  }
188  //--------------------------------------------------------------------------
190  {
191  _moveNext();
192  return *this;
193  }
194  //--------------------------------------------------------------------------
196  {
197  _movePrev();
198  return *this;
199  }
200  //--------------------------------------------------------------------------
202  {
203  return _getCharacter();
204  }
205  //--------------------------------------------------------------------------
207  {
208  return _setCharacter( uc );
209  }
210  //--------------------------------------------------------------------------
211  //--------------------------------------------------------------------------
212  //--------------------------------------------------------------------------
213  //--------------------------------------------------------------------------
215  {
216 
217  }
218  //--------------------------------------------------------------------------
220  {
221  _become( i );
222  }
223  //--------------------------------------------------------------------------
225  {
226  _become( i );
227  }
228  //--------------------------------------------------------------------------
230  {
231  _seekFwd( 1 );
232  return *this;
233  }
234  //--------------------------------------------------------------------------
236  {
237  _const_fwd_iterator tmp( *this );
238  _seekFwd( 1 );
239  return tmp;
240  }
241  //--------------------------------------------------------------------------
243  {
244  _seekRev( 1 );
245  return *this;
246  }
247  //--------------------------------------------------------------------------
249  {
250  _const_fwd_iterator tmp( *this );
251  _seekRev( 1 );
252  return tmp;
253  }
254  //--------------------------------------------------------------------------
256  {
257  _const_fwd_iterator tmp( *this );
258  if ( n < 0 )
259  tmp._seekRev( -n );
260  else
261  tmp._seekFwd( n );
262  return tmp;
263  }
264  //--------------------------------------------------------------------------
266  {
267  _const_fwd_iterator tmp( *this );
268  if ( n < 0 )
269  tmp._seekFwd( -n );
270  else
271  tmp._seekRev( n );
272  return tmp;
273  }
274  //--------------------------------------------------------------------------
276  {
277  if ( n < 0 )
278  _seekRev( -n );
279  else
280  _seekFwd( n );
281  return *this;
282  }
283  //--------------------------------------------------------------------------
285  {
286  if ( n < 0 )
287  _seekFwd( -n );
288  else
289  _seekRev( n );
290  return *this;
291  }
292  //--------------------------------------------------------------------------
294  {
295  return *mIter;
296  }
297  //--------------------------------------------------------------------------
299  {
300  _const_fwd_iterator tmp( *this );
301  tmp += n;
302  return *tmp;
303  }
304  //--------------------------------------------------------------------------
306  {
307  _moveNext();
308  return *this;
309  }
310  //--------------------------------------------------------------------------
312  {
313  _movePrev();
314  return *this;
315  }
316  //--------------------------------------------------------------------------
318  {
319  return _getCharacter();
320  }
321  //--------------------------------------------------------------------------
322  //--------------------------------------------------------------------------
323  //--------------------------------------------------------------------------
324  //--------------------------------------------------------------------------
326  {
327 
328  }
329  //--------------------------------------------------------------------------
331  {
332  _become( i );
333  }
334  //--------------------------------------------------------------------------
336  {
337  _seekRev( 1 );
338  return *this;
339  }
340  //--------------------------------------------------------------------------
342  {
343  _rev_iterator tmp( *this );
344  _seekRev( 1 );
345  return tmp;
346  }
347  //--------------------------------------------------------------------------
349  {
350  _seekFwd( 1 );
351  return *this;
352  }
353  //--------------------------------------------------------------------------
355  {
356  _rev_iterator tmp( *this );
357  _seekFwd( 1 );
358  return tmp;
359  }
360  //--------------------------------------------------------------------------
362  {
363  _rev_iterator tmp( *this );
364  if ( n < 0 )
365  tmp._seekFwd( -n );
366  else
367  tmp._seekRev( n );
368  return tmp;
369  }
370  //--------------------------------------------------------------------------
372  {
373  _rev_iterator tmp( *this );
374  if ( n < 0 )
375  tmp._seekRev( -n );
376  else
377  tmp._seekFwd( n );
378  return tmp;
379  }
380  //--------------------------------------------------------------------------
382  {
383  if ( n < 0 )
384  _seekFwd( -n );
385  else
386  _seekRev( n );
387  return *this;
388  }
389  //--------------------------------------------------------------------------
391  {
392  if ( n < 0 )
393  _seekRev( -n );
394  else
395  _seekFwd( n );
396  return *this;
397  }
398  //--------------------------------------------------------------------------
400  {
401  return mIter[-1];
402  }
403  //--------------------------------------------------------------------------
405  {
406  _rev_iterator tmp( *this );
407  tmp -= n;
408  return *tmp;
409  }
410  //--------------------------------------------------------------------------
411  //--------------------------------------------------------------------------
412  //--------------------------------------------------------------------------
413  //--------------------------------------------------------------------------
415  {
416 
417  }
418  //--------------------------------------------------------------------------
420  {
421  _become( i );
422  }
423  //--------------------------------------------------------------------------
425  {
426  _become( i );
427  }
428  //--------------------------------------------------------------------------
430  {
431  _seekRev( 1 );
432  return *this;
433  }
434  //--------------------------------------------------------------------------
436  {
437  _const_rev_iterator tmp( *this );
438  _seekRev( 1 );
439  return tmp;
440  }
441  //--------------------------------------------------------------------------
443  {
444  _seekFwd( 1 );
445  return *this;
446  }
447  //--------------------------------------------------------------------------
449  {
450  _const_rev_iterator tmp( *this );
451  _seekFwd( 1 );
452  return tmp;
453  }
454  //--------------------------------------------------------------------------
456  {
457  _const_rev_iterator tmp( *this );
458  if ( n < 0 )
459  tmp._seekFwd( -n );
460  else
461  tmp._seekRev( n );
462  return tmp;
463  }
464  //--------------------------------------------------------------------------
466  {
467  _const_rev_iterator tmp( *this );
468  if ( n < 0 )
469  tmp._seekRev( -n );
470  else
471  tmp._seekFwd( n );
472  return tmp;
473  }
474  //--------------------------------------------------------------------------
476  {
477  if ( n < 0 )
478  _seekFwd( -n );
479  else
480  _seekRev( n );
481  return *this;
482  }
483  //--------------------------------------------------------------------------
485  {
486  if ( n < 0 )
487  _seekRev( -n );
488  else
489  _seekFwd( n );
490  return *this;
491  }
492  //--------------------------------------------------------------------------
494  {
495  return mIter[-1];
496  }
497  //--------------------------------------------------------------------------
499  {
500  _const_rev_iterator tmp( *this );
501  tmp -= n;
502  return *tmp;
503  }
504  //--------------------------------------------------------------------------
505  //--------------------------------------------------------------------------
506  //--------------------------------------------------------------------------
507  //--------------------------------------------------------------------------
509  {
510  _init();
511  }
512  //--------------------------------------------------------------------------
513  UString::UString( const UString& copy )
514  {
515  _init();
516  mData = copy.mData;
517  }
518  //--------------------------------------------------------------------------
520  {
521  _init();
522  assign( length, ch );
523  }
524  //--------------------------------------------------------------------------
526  {
527  _init();
528  assign( str );
529  }
530  //--------------------------------------------------------------------------
532  {
533  _init();
534  assign( str, length );
535  }
536  //--------------------------------------------------------------------------
538  {
539  _init();
540  assign( str, index, length );
541  }
542  //--------------------------------------------------------------------------
543 #if MYGUI_IS_NATIVE_WCHAR_T
544  UString::UString( const wchar_t* w_str )
545  {
546  _init();
547  assign( w_str );
548  }
549  //--------------------------------------------------------------------------
550  UString::UString( const wchar_t* w_str, size_type length )
551  {
552  _init();
553  assign( w_str, length );
554  }
555 #endif
556  //--------------------------------------------------------------------------
557  UString::UString( const std::wstring& wstr )
558  {
559  _init();
560  assign( wstr );
561  }
562  //--------------------------------------------------------------------------
563  UString::UString( const char* c_str )
564  {
565  _init();
566  assign( c_str );
567  }
568  //--------------------------------------------------------------------------
569  UString::UString( const char* c_str, size_type length )
570  {
571  _init();
572  assign( c_str, length );
573  }
574  //--------------------------------------------------------------------------
575  UString::UString( const std::string& str )
576  {
577  _init();
578  assign( str );
579  }
580  //--------------------------------------------------------------------------
582  {
583  _cleanBuffer();
584  }
585  //--------------------------------------------------------------------------
587  {
588  return mData.size();
589  }
590  //--------------------------------------------------------------------------
592  {
593  return size();
594  }
595  //--------------------------------------------------------------------------
597  {
598  const_iterator i = begin(), ie = end();
599  size_type c = 0;
600  while ( i != ie ) {
601  i.moveNext();
602  ++c;
603  }
604  return c;
605  }
606  //--------------------------------------------------------------------------
608  {
609  return mData.max_size();
610  }
611  //--------------------------------------------------------------------------
613  {
614  mData.reserve( size );
615  }
616  //--------------------------------------------------------------------------
617  void UString::resize( size_type num, const code_point& val /*= 0 */ )
618  {
619  mData.resize( num, val );
620  }
621  //--------------------------------------------------------------------------
622  void UString::swap( UString& from )
623  {
624  mData.swap( from.mData );
625  }
626  //--------------------------------------------------------------------------
627  bool UString::empty() const
628  {
629  return mData.empty();
630  }
631  //--------------------------------------------------------------------------
633  {
634  return mData.c_str();
635  }
636  //--------------------------------------------------------------------------
638  {
639  return c_str();
640  }
641  //--------------------------------------------------------------------------
643  {
644  return mData.capacity();
645  }
646  //--------------------------------------------------------------------------
648  {
649  mData.clear();
650  }
651  //--------------------------------------------------------------------------
652  UString UString::substr( size_type index, size_type num /*= npos */ ) const
653  {
654  // this could avoid the extra copy if we used a private specialty constructor
655  dstring data = mData.substr( index, num );
656  UString tmp;
657  tmp.mData.swap( data );
658  return tmp;
659  }
660  //--------------------------------------------------------------------------
662  {
663  code_point cp[2];
664  size_t c = _utf32_to_utf16( val, cp );
665  if ( c > 0 ) push_back( cp[0] );
666  if ( c > 1 ) push_back( cp[1] );
667  }
668  //--------------------------------------------------------------------------
669 #if MYGUI_IS_NATIVE_WCHAR_T
670  void UString::push_back( wchar_t val )
671  {
672  // we do this because the Unicode method still preserves UTF-16 code points
673  mData.push_back( static_cast<code_point>( val ) );
674  }
675 #endif
676  //--------------------------------------------------------------------------
678  {
679  mData.push_back( val );
680  }
681 
682  void UString::push_back( char val )
683  {
684  mData.push_back( static_cast<code_point>( val ) );
685  }
686 
688  {
689  const_iterator i, ie = end();
690  for ( i = begin(); i != ie; i.moveNext() ) {
691  if ( i.getCharacter() == ch )
692  return true;
693  }
694  return false;
695  }
696 
697  const std::string& UString::asUTF8() const
698  {
699  _load_buffer_UTF8();
700  return *m_buffer.mStrBuffer;
701  }
702 
703  const char* UString::asUTF8_c_str() const
704  {
705  _load_buffer_UTF8();
706  return m_buffer.mStrBuffer->c_str();
707  }
708 
710  {
711  _load_buffer_UTF32();
712  return *m_buffer.mUTF32StrBuffer;
713  }
714 
716  {
717  _load_buffer_UTF32();
718  return m_buffer.mUTF32StrBuffer->c_str();
719  }
720 
721  const std::wstring& UString::asWStr() const
722  {
723  _load_buffer_WStr();
724  return *m_buffer.mWStrBuffer;
725  }
726 
727  const wchar_t* UString::asWStr_c_str() const
728  {
729  _load_buffer_WStr();
730  return m_buffer.mWStrBuffer->c_str();
731  }
732 
734  {
735  return mData.at( loc );
736  }
737 
739  {
740  return mData.at( loc );
741  }
742 
744  {
745  const code_point* ptr = c_str();
746  unicode_char uc;
747  size_t l = _utf16_char_length( ptr[loc] );
748  code_point cp[2] = { /* blame the code beautifier */
749  0, 0
750  };
751  cp[0] = ptr[loc];
752 
753  if ( l == 2 && ( loc + 1 ) < mData.length() ) {
754  cp[1] = ptr[loc+1];
755  }
756  _utf16_to_utf32( cp, uc );
757  return uc;
758  }
759 
761  {
762  code_point cp[2] = { /* blame the code beautifier */
763  0, 0
764  };
765  size_t l = _utf32_to_utf16( ch, cp );
766  unicode_char existingChar = getChar( loc );
767  size_t existingSize = _utf16_char_length( existingChar );
768  size_t newSize = _utf16_char_length( ch );
769 
770  if ( newSize > existingSize ) {
771  at( loc ) = cp[0];
772  insert( loc + 1, 1, cp[1] );
773  return 1;
774  }
775  if ( newSize < existingSize ) {
776  erase( loc, 1 );
777  at( loc ) = cp[0];
778  return -1;
779  }
780 
781  // newSize == existingSize
782  at( loc ) = cp[0];
783  if ( l == 2 ) at( loc + 1 ) = cp[1];
784  return 0;
785  }
786 
788  {
789  iterator i;
790  i.mIter = mData.begin();
791  i.mString = this;
792  return i;
793  }
794 
796  {
797  const_iterator i;
798  i.mIter = const_cast<UString*>( this )->mData.begin();
799  i.mString = const_cast<UString*>( this );
800  return i;
801  }
802 
804  {
805  iterator i;
806  i.mIter = mData.end();
807  i.mString = this;
808  return i;
809  }
810 
812  {
813  const_iterator i;
814  i.mIter = const_cast<UString*>( this )->mData.end();
815  i.mString = const_cast<UString*>( this );
816  return i;
817  }
818 
820  {
822  i.mIter = mData.end();
823  i.mString = this;
824  return i;
825  }
826 
828  {
830  i.mIter = const_cast<UString*>( this )->mData.end();
831  i.mString = const_cast<UString*>( this );
832  return i;
833  }
834 
836  {
838  i.mIter = mData.begin();
839  i.mString = this;
840  return i;
841  }
842 
844  {
846  i.mIter = const_cast<UString*>( this )->mData.begin();
847  i.mString = const_cast<UString*>( this );
848  return i;
849  }
850 
852  {
853  mData.assign( start.mIter, end.mIter );
854  return *this;
855  }
856 
858  {
859  mData.assign( str.mData );
860  return *this;
861  }
862 
864  {
865  mData.assign( str );
866  return *this;
867  }
868 
870  {
871  mData.assign( str, num );
872  return *this;
873  }
874 
876  {
877  mData.assign( str.mData, index, len );
878  return *this;
879  }
880 
882  {
883  mData.assign( num, ch );
884  return *this;
885  }
886 
887  UString& UString::assign( const std::wstring& wstr )
888  {
889  mData.clear();
890  mData.reserve( wstr.length() ); // best guess bulk allocate
891 #ifdef WCHAR_UTF16 // if we're already working in UTF-16, this is easy
892  code_point tmp;
893  std::wstring::const_iterator i, ie = wstr.end();
894  for ( i = wstr.begin(); i != ie; i++ ) {
895  tmp = static_cast<code_point>( *i );
896  mData.push_back( tmp );
897  }
898 #else // otherwise we do it the safe way (which is still 100% safe to pass UTF-16 through, just slower)
899  code_point cp[3] = {0, 0, 0};
900  unicode_char tmp;
901  std::wstring::const_iterator i, ie = wstr.end();
902  for ( i = wstr.begin(); i != ie; i++ ) {
903  tmp = static_cast<unicode_char>( *i );
904  size_t l = _utf32_to_utf16( tmp, cp );
905  if ( l > 0 ) mData.push_back( cp[0] );
906  if ( l > 1 ) mData.push_back( cp[1] );
907  }
908 #endif
909  return *this;
910  }
911 
912 #if MYGUI_IS_NATIVE_WCHAR_T
913  UString& UString::assign( const wchar_t* w_str )
914  {
915  std::wstring tmp;
916  tmp.assign( w_str );
917  return assign( tmp );
918  }
919 
920  UString& UString::assign( const wchar_t* w_str, size_type num )
921  {
922  std::wstring tmp;
923  tmp.assign( w_str, num );
924  return assign( tmp );
925  }
926 #endif
927 
928  UString& UString::assign( const std::string& str )
929  {
930  size_type len = _verifyUTF8( str );
931  clear(); // empty our contents, if there are any
932  reserve( len ); // best guess bulk capacity growth
933 
934  // This is a 3 step process, converting each byte in the UTF-8 stream to UTF-32,
935  // then converting it to UTF-16, then finally appending the data buffer
936 
937  unicode_char uc; // temporary Unicode character buffer
938  unsigned char utf8buf[7]; // temporary UTF-8 buffer
939  utf8buf[6] = 0;
940  size_t utf8len; // UTF-8 length
941  code_point utf16buff[3]; // temporary UTF-16 buffer
942  utf16buff[2] = 0;
943  size_t utf16len; // UTF-16 length
944 
945  std::string::const_iterator i, ie = str.end();
946  for ( i = str.begin(); i != ie; i++ ) {
947  utf8len = _utf8_char_length( static_cast<unsigned char>( *i ) ); // estimate bytes to load
948  for ( size_t j = 0; j < utf8len; j++ ) { // load the needed UTF-8 bytes
949  utf8buf[j] = ( static_cast<unsigned char>( *( i + j ) ) ); // we don't increment 'i' here just in case the estimate is wrong (shouldn't happen, but we're being careful)
950  }
951  utf8buf[utf8len] = 0; // nul terminate so we throw an exception before running off the end of the buffer
952  utf8len = _utf8_to_utf32( utf8buf, uc ); // do the UTF-8 -> UTF-32 conversion
953  i += utf8len - 1; // we subtract 1 for the increment of the 'for' loop
954 
955  utf16len = _utf32_to_utf16( uc, utf16buff ); // UTF-32 -> UTF-16 conversion
956  append( utf16buff, utf16len ); // append the characters to the string
957  }
958  return *this;
959  }
960 
961  UString& UString::assign( const char* c_str )
962  {
963  std::string tmp( c_str );
964  return assign( tmp );
965  }
966 
967  UString& UString::assign( const char* c_str, size_type num )
968  {
969  std::string tmp;
970  tmp.assign( c_str, num );
971  return assign( tmp );
972  }
973 
975  {
976  mData.append( str.mData );
977  return *this;
978  }
979 
981  {
982  mData.append( str );
983  return *this;
984  }
985 
987  {
988  mData.append( str.mData, index, len );
989  return *this;
990  }
991 
993  {
994  mData.append( str, num );
995  return *this;
996  }
997 
999  {
1000  mData.append( num, ch );
1001  return *this;
1002  }
1003 
1005  {
1006  mData.append( start.mIter, end.mIter );
1007  return *this;
1008  }
1009 
1010 #if MYGUI_IS_NATIVE_WCHAR_T
1011  UString& UString::append( const wchar_t* w_str, size_type num )
1012  {
1013  std::wstring tmp( w_str, num );
1014  return append( tmp );
1015  }
1016 
1017  UString& UString::append( size_type num, wchar_t ch )
1018  {
1019  return append( num, static_cast<unicode_char>( ch ) );
1020  }
1021 #endif
1023  {
1024  UString tmp( c_str, num );
1025  append( tmp );
1026  return *this;
1027  }
1028 
1030  {
1031  append( num, static_cast<code_point>( ch ) );
1032  return *this;
1033  }
1034 
1036  {
1037  code_point cp[2] = {0, 0};
1038  if ( _utf32_to_utf16( ch, cp ) == 2 ) {
1039  for ( size_type i = 0; i < num; i++ ) {
1040  append( 1, cp[0] );
1041  append( 1, cp[1] );
1042  }
1043  } else {
1044  for ( size_type i = 0; i < num; i++ ) {
1045  append( 1, cp[0] );
1046  }
1047  }
1048  return *this;
1049  }
1050 
1052  {
1053  iterator ret;
1054  ret.mIter = mData.insert( i.mIter, ch );
1055  ret.mString = this;
1056  return ret;
1057  }
1058 
1060  {
1061  mData.insert( index, str.mData );
1062  return *this;
1063  }
1064 
1065  UString& UString::insert( size_type index1, const UString& str, size_type index2, size_type num )
1066  {
1067  mData.insert( index1, str.mData, index2, num );
1068  return *this;
1069  }
1070 
1072  {
1073  mData.insert( i.mIter, start.mIter, end.mIter );
1074  }
1075 
1077  {
1078  mData.insert( index, str, num );
1079  return *this;
1080  }
1081 
1082 #if MYGUI_IS_NATIVE_WCHAR_T
1083  UString& UString::insert( size_type index, const wchar_t* w_str, size_type num )
1084  {
1085  UString tmp( w_str, num );
1086  insert( index, tmp );
1087  return *this;
1088  }
1089 #endif
1090 
1091  UString& UString::insert( size_type index, const char* c_str, size_type num )
1092  {
1093  UString tmp( c_str, num );
1094  insert( index, tmp );
1095  return *this;
1096  }
1097 
1099  {
1100  mData.insert( index, num, ch );
1101  return *this;
1102  }
1103 
1104 #if MYGUI_IS_NATIVE_WCHAR_T
1105  UString& UString::insert( size_type index, size_type num, wchar_t ch )
1106  {
1107  insert( index, num, static_cast<unicode_char>( ch ) );
1108  return *this;
1109  }
1110 #endif
1111 
1112  UString& UString::insert( size_type index, size_type num, char ch )
1113  {
1114  insert( index, num, static_cast<code_point>( ch ) );
1115  return *this;
1116  }
1117 
1119  {
1120  code_point cp[3] = {0, 0, 0};
1121  size_t l = _utf32_to_utf16( ch, cp );
1122  if ( l == 1 ) {
1123  return insert( index, num, cp[0] );
1124  }
1125  for ( size_type c = 0; c < num; c++ ) {
1126  // insert in reverse order to preserve ordering after insert
1127  insert( index, 1, cp[1] );
1128  insert( index, 1, cp[0] );
1129  }
1130  return *this;
1131  }
1132 
1133  void UString::insert( iterator i, size_type num, const code_point& ch )
1134  {
1135  mData.insert( i.mIter, num, ch );
1136  }
1137 #if MYGUI_IS_NATIVE_WCHAR_T
1138  void UString::insert( iterator i, size_type num, const wchar_t& ch )
1139  {
1140  insert( i, num, static_cast<unicode_char>( ch ) );
1141  }
1142 #endif
1143 
1144  void UString::insert( iterator i, size_type num, const char& ch )
1145  {
1146  insert( i, num, static_cast<code_point>( ch ) );
1147  }
1148 
1150  {
1151  code_point cp[3] = {0, 0, 0};
1152  size_t l = _utf32_to_utf16( ch, cp );
1153  if ( l == 1 ) {
1154  insert( i, num, cp[0] );
1155  } else {
1156  for ( size_type c = 0; c < num; c++ ) {
1157  // insert in reverse order to preserve ordering after insert
1158  insert( i, 1, cp[1] );
1159  insert( i, 1, cp[0] );
1160  }
1161  }
1162  }
1163 
1165  {
1166  iterator ret;
1167  ret.mIter = mData.erase( loc.mIter );
1168  ret.mString = this;
1169  return ret;
1170  }
1171 
1173  {
1174  iterator ret;
1175  ret.mIter = mData.erase( start.mIter, end.mIter );
1176  ret.mString = this;
1177  return ret;
1178  }
1179 
1180  UString& UString::erase( size_type index /*= 0*/, size_type num /*= npos */ )
1181  {
1182  if ( num == npos )
1183  mData.erase( index );
1184  else
1185  mData.erase( index, num );
1186  return *this;
1187  }
1188 
1189  UString& UString::replace( size_type index1, size_type num1, const UString& str )
1190  {
1191  mData.replace( index1, num1, str.mData, 0, npos );
1192  return *this;
1193  }
1194 
1195  UString& UString::replace( size_type index1, size_type num1, const UString& str, size_type num2 )
1196  {
1197  mData.replace( index1, num1, str.mData, 0, num2 );
1198  return *this;
1199  }
1200 
1201  UString& UString::replace( size_type index1, size_type num1, const UString& str, size_type index2, size_type num2 )
1202  {
1203  mData.replace( index1, num1, str.mData, index2, num2 );
1204  return *this;
1205  }
1206 
1207  UString& UString::replace( iterator start, iterator end, const UString& str, size_type num /*= npos */ )
1208  {
1209  _const_fwd_iterator st(start); //Work around for gcc, allow it to find correct overload
1210 
1211  size_type index1 = begin() - st;
1212  size_type num1 = end - st;
1213  return replace( index1, num1, str, 0, num );
1214  }
1215 
1217  {
1218  mData.replace( index, num1, num2, ch );
1219  return *this;
1220  }
1221 
1223  {
1224  _const_fwd_iterator st(start); //Work around for gcc, allow it to find correct overload
1225 
1226  size_type index1 = begin() - st;
1227  size_type num1 = end - st;
1228  return replace( index1, num1, num, ch );
1229  }
1230 
1231  int UString::compare( const UString& str ) const
1232  {
1233  return mData.compare( str.mData );
1234  }
1235 
1236  int UString::compare( const code_point* str ) const
1237  {
1238  return mData.compare( str );
1239  }
1240 
1241  int UString::compare( size_type index, size_type length, const UString& str ) const
1242  {
1243  return mData.compare( index, length, str.mData );
1244  }
1245 
1246  int UString::compare( size_type index, size_type length, const UString& str, size_type index2, size_type length2 ) const
1247  {
1248  return mData.compare( index, length, str.mData, index2, length2 );
1249  }
1250 
1251  int UString::compare( size_type index, size_type length, const code_point* str, size_type length2 ) const
1252  {
1253  return mData.compare( index, length, str, length2 );
1254  }
1255 
1256 #if MYGUI_IS_NATIVE_WCHAR_T
1257  int UString::compare( size_type index, size_type length, const wchar_t* w_str, size_type length2 ) const
1258  {
1259  UString tmp( w_str, length2 );
1260  return compare( index, length, tmp );
1261  }
1262 #endif
1263 
1264  int UString::compare( size_type index, size_type length, const char* c_str, size_type length2 ) const
1265  {
1266  UString tmp( c_str, length2 );
1267  return compare( index, length, tmp );
1268  }
1269 
1270  UString::size_type UString::find( const UString& str, size_type index /*= 0 */ ) const
1271  {
1272  return mData.find( str.c_str(), index );
1273  }
1274 
1275  UString::size_type UString::find( const code_point* cp_str, size_type index, size_type length ) const
1276  {
1277  UString tmp( cp_str );
1278  return mData.find( tmp.c_str(), index, length );
1279  }
1280 
1281  UString::size_type UString::find( const char* c_str, size_type index, size_type length ) const
1282  {
1283  UString tmp( c_str );
1284  return mData.find( tmp.c_str(), index, length );
1285  }
1286 
#if MYGUI_IS_NATIVE_WCHAR_T
// Converts w_str into a temporary UString, then searches the underlying data
// for the first `length` code points of the converted text, starting at index.
UString::size_type UString::find( const wchar_t* w_str, size_type index, size_type length ) const
{
    const UString converted( w_str );
    const code_point* needle = converted.c_str();
    return mData.find( needle, index, length );
}
#endif
1294 
1295  UString::size_type UString::find( char ch, size_type index /*= 0 */ ) const
1296  {
1297  return find( static_cast<code_point>( ch ), index );
1298  }
1299 
1301  {
1302  return mData.find( ch, index );
1303  }
1304 
#if MYGUI_IS_NATIVE_WCHAR_T
// wchar_t variant: casts ch to a unicode_char and forwards to that overload.
UString::size_type UString::find( wchar_t ch, size_type index /*= 0 */ ) const
{
    const unicode_char wide = static_cast<unicode_char>( ch );
    return find( wide, index );
}
#endif
1311 
1313  {
1314  code_point cp[3] = {0, 0, 0};
1315  size_t l = _utf32_to_utf16( ch, cp );
1316  return find( UString( cp, l ), index );
1317  }
1318 
1319  UString::size_type UString::rfind( const UString& str, size_type index /*= 0 */ ) const
1320  {
1321  return mData.rfind( str.c_str(), index );
1322  }
1323 
1325  {
1326  UString tmp( cp_str );
1327  return mData.rfind( tmp.c_str(), index, num );
1328  }
1329 
1330  UString::size_type UString::rfind( const char* c_str, size_type index, size_type num ) const
1331  {
1332  UString tmp( c_str );
1333  return mData.rfind( tmp.c_str(), index, num );
1334  }
1335 
#if MYGUI_IS_NATIVE_WCHAR_T
// Converts w_str into a temporary UString and reverse-searches the underlying
// data for the first num code points of the converted text.
UString::size_type UString::rfind( const wchar_t* w_str, size_type index, size_type num ) const
{
    const UString converted( w_str );
    const code_point* needle = converted.c_str();
    return mData.rfind( needle, index, num );
}
#endif
1343 
1344  UString::size_type UString::rfind( char ch, size_type index /*= 0 */ ) const
1345  {
1346  return rfind( static_cast<code_point>( ch ), index );
1347  }
1348 
1350  {
1351  return mData.rfind( ch, index );
1352  }
1353 
#if MYGUI_IS_NATIVE_WCHAR_T
// wchar_t variant: casts ch to a unicode_char and forwards to that overload.
UString::size_type UString::rfind( wchar_t ch, size_type index /*= 0 */ ) const
{
    const unicode_char wide = static_cast<unicode_char>( ch );
    return rfind( wide, index );
}
#endif
1360 
1362  {
1363  code_point cp[3] = {0, 0, 0};
1364  size_t l = _utf32_to_utf16( ch, cp );
1365  return rfind( UString( cp, l ), index );
1366  }
1367 
1368  UString::size_type UString::find_first_of( const UString &str, size_type index /*= 0*/, size_type num /*= npos */ ) const
1369  {
1370  size_type i = 0;
1371  const size_type len = length();
1372  while ( i < num && ( index + i ) < len ) {
1373  unicode_char ch = getChar( index + i );
1374  if ( str.inString( ch ) )
1375  return index + i;
1376  i += _utf16_char_length( ch ); // increment by the Unicode character length
1377  }
1378  return npos;
1379  }
1380 
1382  {
1383  UString tmp;
1384  tmp.assign( 1, ch );
1385  return find_first_of( tmp, index );
1386  }
1387 
1388  UString::size_type UString::find_first_of( char ch, size_type index /*= 0 */ ) const
1389  {
1390  return find_first_of( static_cast<code_point>( ch ), index );
1391  }
1392 
#if MYGUI_IS_NATIVE_WCHAR_T
// wchar_t variant: casts ch to a unicode_char and forwards to that overload.
UString::size_type UString::find_first_of( wchar_t ch, size_type index /*= 0 */ ) const
{
    const unicode_char wide = static_cast<unicode_char>( ch );
    return find_first_of( wide, index );
}
#endif
1399 
1401  {
1402  code_point cp[3] = {0, 0, 0};
1403  size_t l = _utf32_to_utf16( ch, cp );
1404  return find_first_of( UString( cp, l ), index );
1405  }
1406 
1407  UString::size_type UString::find_first_not_of( const UString& str, size_type index /*= 0*/, size_type num /*= npos */ ) const
1408  {
1409  size_type i = 0;
1410  const size_type len = length();
1411  while ( i < num && ( index + i ) < len ) {
1412  unicode_char ch = getChar( index + i );
1413  if ( !str.inString( ch ) )
1414  return index + i;
1415  i += _utf16_char_length( ch ); // increment by the Unicode character length
1416  }
1417  return npos;
1418  }
1419 
1421  {
1422  UString tmp;
1423  tmp.assign( 1, ch );
1424  return find_first_not_of( tmp, index );
1425  }
1426 
1428  {
1429  return find_first_not_of( static_cast<code_point>( ch ), index );
1430  }
1431 
#if MYGUI_IS_NATIVE_WCHAR_T
// wchar_t variant: casts ch to a unicode_char and forwards to that overload.
UString::size_type UString::find_first_not_of( wchar_t ch, size_type index /*= 0 */ ) const
{
    const unicode_char wide = static_cast<unicode_char>( ch );
    return find_first_not_of( wide, index );
}
#endif
1438 
1440  {
1441  code_point cp[3] = {0, 0, 0};
1442  size_t l = _utf32_to_utf16( ch, cp );
1443  return find_first_not_of( UString( cp, l ), index );
1444  }
1445 
1446  UString::size_type UString::find_last_of( const UString& str, size_type index /*= npos*/, size_type num /*= npos */ ) const
1447  {
1448  size_type i = 0;
1449  const size_type len = length();
1450  if ( index > len ) index = len - 1;
1451 
1452  while ( i < num && ( index - i ) != npos ) {
1453  size_type j = index - i;
1454  // careful to step full Unicode characters
1455  if ( j != 0 && _utf16_surrogate_follow( at( j ) ) && _utf16_surrogate_lead( at( j - 1 ) ) ) {
1456  j = index - ++i;
1457  }
1458  // and back to the usual dull test
1459  unicode_char ch = getChar( j );
1460  if ( str.inString( ch ) )
1461  return j;
1462  i++;
1463  }
1464  return npos;
1465  }
1466 
1468  {
1469  UString tmp;
1470  tmp.assign( 1, ch );
1471  return find_last_of( tmp, index );
1472  }
1473 
#if MYGUI_IS_NATIVE_WCHAR_T
// wchar_t variant: casts ch to a unicode_char and forwards to that overload.
UString::size_type UString::find_last_of( wchar_t ch, size_type index /*= npos */ ) const
{
    const unicode_char wide = static_cast<unicode_char>( ch );
    return find_last_of( wide, index );
}
#endif
1480 
1482  {
1483  code_point cp[3] = {0, 0, 0};
1484  size_t l = _utf32_to_utf16( ch, cp );
1485  return find_last_of( UString( cp, l ), index );
1486  }
1487 
1488  UString::size_type UString::find_last_not_of( const UString& str, size_type index /*= npos*/, size_type num /*= npos */ ) const
1489  {
1490  size_type i = 0;
1491  const size_type len = length();
1492  if ( index > len ) index = len - 1;
1493 
1494  while ( i < num && ( index - i ) != npos ) {
1495  size_type j = index - i;
1496  // careful to step full Unicode characters
1497  if ( j != 0 && _utf16_surrogate_follow( at( j ) ) && _utf16_surrogate_lead( at( j - 1 ) ) ) {
1498  j = index - ++i;
1499  }
1500  // and back to the usual dull test
1501  unicode_char ch = getChar( j );
1502  if ( !str.inString( ch ) )
1503  return j;
1504  i++;
1505  }
1506  return npos;
1507  }
1508 
1510  {
1511  UString tmp;
1512  tmp.assign( 1, ch );
1513  return find_last_not_of( tmp, index );
1514  }
1515 
1516  UString::size_type UString::find_last_not_of( char ch, size_type index /*= npos */ ) const
1517  {
1518  return find_last_not_of( static_cast<code_point>( ch ), index );
1519  }
1520 
#if MYGUI_IS_NATIVE_WCHAR_T
// wchar_t variant: casts ch to a unicode_char and forwards to that overload.
UString::size_type UString::find_last_not_of( wchar_t ch, size_type index /*= npos */ ) const
{
    const unicode_char wide = static_cast<unicode_char>( ch );
    return find_last_not_of( wide, index );
}
#endif
1527 
1529  {
1530  code_point cp[3] = {0, 0, 0};
1531  size_t l = _utf32_to_utf16( ch, cp );
1532  return find_last_not_of( UString( cp, l ), index );
1533  }
1534 
1535  bool UString::operator<( const UString& right ) const
1536  {
1537  return compare( right ) < 0;
1538  }
1539 
1540  bool UString::operator<=( const UString& right ) const
1541  {
1542  return compare( right ) <= 0;
1543  }
1544 
1546  {
1547  return assign( s );
1548  }
1549 
1551  {
1552  clear();
1553  return append( 1, ch );
1554  }
1555 
1557  {
1558  clear();
1559  return append( 1, ch );
1560  }
1561 
#if MYGUI_IS_NATIVE_WCHAR_T
// Assigns a single wchar_t: empties the string, then appends one copy of ch.
UString& UString::operator=( wchar_t ch )
{
    this->clear();
    return this->append( 1, ch );
}
#endif
1569 
1571  {
1572  clear();
1573  return append( 1, ch );
1574  }
1575 
1576  bool UString::operator>( const UString& right ) const
1577  {
1578  return compare( right ) > 0;
1579  }
1580 
1581  bool UString::operator>=( const UString& right ) const
1582  {
1583  return compare( right ) >= 0;
1584  }
1585 
1586  bool UString::operator==( const UString& right ) const
1587  {
1588  return compare( right ) == 0;
1589  }
1590 
1591  bool UString::operator!=( const UString& right ) const
1592  {
1593  return !operator==( right );
1594  }
1595 
1597  {
1598  return at( index );
1599  }
1600 
1602  {
1603  return at( index );
1604  }
1605 
1606  UString::operator std::string() const
1607  {
1608  return std::string( asUTF8() );
1609  }
1610 
1612  UString::operator std::wstring() const
1613  {
1614  return std::wstring( asWStr() );
1615  }
1616 
1617 
1619  {
1620  if ( 0xD800 <= cp && cp <= 0xDFFF ) // tests if the cp is within the surrogate pair range
1621  return false; // it matches a surrogate pair signature
1622  return true; // everything else is a standalone code point
1623  }
1624 
1626  {
1627  if ( 0xD800 <= cp && cp <= 0xDBFF ) // tests if the cp is within the 2nd word of a surrogate pair
1628  return true; // it is a 1st word
1629  return false; // it isn't
1630  }
1631 
1633  {
1634  if ( 0xDC00 <= cp && cp <= 0xDFFF ) // tests if the cp is within the 2nd word of a surrogate pair
1635  return true; // it is a 2nd word
1636  return false; // everything else isn't
1637  }
1638 
1640  {
1641  if ( 0xD800 <= cp && cp <= 0xDBFF ) // test if cp is the beginning of a surrogate pair
1642  return 2; // if it is, then we are 2 words long
1643  return 1; // otherwise we are only 1 word long
1644  }
1645 
1647  {
1648  if ( uc > 0xFFFF ) // test if uc is greater than the single word maximum
1649  return 2; // if so, we need a surrogate pair
1650  return 1; // otherwise we can stuff it into a single word
1651  }
1652 
1653  size_t UString::_utf16_to_utf32( const code_point in_cp[2], unicode_char& out_uc )
1654  {
1655  const code_point& cp1 = in_cp[0];
1656  const code_point& cp2 = in_cp[1];
1657  bool wordPair = false;
1658 
1659  // does it look like a surrogate pair?
1660  if ( 0xD800 <= cp1 && cp1 <= 0xDBFF ) {
1661  // looks like one, but does the other half match the algorithm as well?
1662  if ( 0xDC00 <= cp2 && cp2 <= 0xDFFF )
1663  wordPair = true; // yep!
1664  }
1665 
1666  if ( !wordPair ) { // if we aren't a 100% authentic surrogate pair, then just copy the value
1667  out_uc = cp1;
1668  return 1;
1669  }
1670 
1671  unsigned short cU = cp1, cL = cp2; // copy upper and lower words of surrogate pair to writable buffers
1672  cU -= 0xD800; // remove the encoding markers
1673  cL -= 0xDC00;
1674 
1675  out_uc = ( cU & 0x03FF ) << 10; // grab the 10 upper bits and set them in their proper location
1676  out_uc |= ( cL & 0x03FF ); // combine in the lower 10 bits
1677  out_uc += 0x10000; // add back in the value offset
1678 
1679  return 2; // this whole operation takes to words, so that's what we'll return
1680  }
1681 
1682  size_t UString::_utf32_to_utf16( const unicode_char& in_uc, code_point out_cp[2] )
1683  {
1684  if ( in_uc <= 0xFFFF ) { // we blindly preserve sentinel values because our decoder understands them
1685  out_cp[0] = static_cast<code_point>(in_uc);
1686  return 1;
1687  }
1688  unicode_char uc = in_uc; // copy to writable buffer
1689  unsigned short tmp; // single code point buffer
1690  uc -= 0x10000; // subtract value offset
1691 
1692  //process upper word
1693  tmp = static_cast<unsigned short>(( uc >> 10 ) & 0x03FF); // grab the upper 10 bits
1694  tmp += 0xD800; // add encoding offset
1695  out_cp[0] = tmp; // write
1696 
1697  // process lower word
1698  tmp = static_cast<unsigned short>(uc & 0x03FF); // grab the lower 10 bits
1699  tmp += 0xDC00; // add encoding offset
1700  out_cp[1] = tmp; // write
1701 
1702  return 2; // return used word count (2 for surrogate pairs)
1703  }
1704 
1705  bool UString::_utf8_start_char( unsigned char cp )
1706  {
1707  return ( cp & ~_cont_mask ) != _cont;
1708  }
1709 
1710  size_t UString::_utf8_char_length( unsigned char cp )
1711  {
1712  if ( !( cp & 0x80 ) ) return 1;
1713  if (( cp & ~_lead1_mask ) == _lead1 ) return 2;
1714  if (( cp & ~_lead2_mask ) == _lead2 ) return 3;
1715  if (( cp & ~_lead3_mask ) == _lead3 ) return 4;
1716  if (( cp & ~_lead4_mask ) == _lead4 ) return 5;
1717  if (( cp & ~_lead5_mask ) == _lead5 ) return 6;
1718 
1719  return 1;
1720  //throw invalid_data( "invalid UTF-8 sequence header value" );
1721  }
1722 
1724  {
1725  /*
1726  7 bit: U-00000000 - U-0000007F: 0xxxxxxx
1727  11 bit: U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
1728  16 bit: U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
1729  21 bit: U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1730  26 bit: U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1731  31 bit: U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1732  */
1733  if ( !( uc & ~0x0000007F ) ) return 1;
1734  if ( !( uc & ~0x000007FF ) ) return 2;
1735  if ( !( uc & ~0x0000FFFF ) ) return 3;
1736  if ( !( uc & ~0x001FFFFF ) ) return 4;
1737  if ( !( uc & ~0x03FFFFFF ) ) return 5;
1738  if ( !( uc & ~0x7FFFFFFF ) ) return 6;
1739 
1740  return 1;
1741  //throw invalid_data( "invalid UTF-32 value" );
1742  }
1743 
1744  size_t UString::_utf8_to_utf32( const unsigned char in_cp[6], unicode_char& out_uc )
1745  {
1746  size_t len = _utf8_char_length( in_cp[0] );
1747  if ( len == 1 ) { // if we are only 1 byte long, then just grab it and exit
1748  out_uc = in_cp[0];
1749  return 1;
1750  }
1751 
1752  unicode_char c = 0; // temporary buffer
1753  size_t i = 0;
1754  switch ( len ) { // load header byte
1755  case 6:
1756  c = in_cp[i] & _lead5_mask;
1757  break;
1758  case 5:
1759  c = in_cp[i] & _lead4_mask;
1760  break;
1761  case 4:
1762  c = in_cp[i] & _lead3_mask;
1763  break;
1764  case 3:
1765  c = in_cp[i] & _lead2_mask;
1766  break;
1767  case 2:
1768  c = in_cp[i] & _lead1_mask;
1769  break;
1770  }
1771 
1772  // load each continuation byte
1773  for ( ++i; i < len; i++ )
1774  {
1775  if (( in_cp[i] & ~_cont_mask ) != _cont )
1776  {
1777  //throw invalid_data( "bad UTF-8 continuation byte" );
1778  out_uc = in_cp[0];
1779  return 1;
1780  }
1781  c <<= 6;
1782  c |= ( in_cp[i] & _cont_mask );
1783  }
1784 
1785  out_uc = c; // write the final value and return the used byte length
1786  return len;
1787  }
1788 
1789  size_t UString::_utf32_to_utf8( const unicode_char& in_uc, unsigned char out_cp[6] )
1790  {
1791  size_t len = _utf8_char_length( in_uc ); // predict byte length of sequence
1792  unicode_char c = in_uc; // copy to temp buffer
1793 
1794  //stuff all of the lower bits
1795  for ( size_t i = len - 1; i > 0; i-- ) {
1796  out_cp[i] = static_cast<unsigned char>((( c ) & _cont_mask ) | _cont);
1797  c >>= 6;
1798  }
1799 
1800  //now write the header byte
1801  switch ( len ) {
1802  case 6:
1803  out_cp[0] = static_cast<unsigned char>((( c ) & _lead5_mask ) | _lead5);
1804  break;
1805  case 5:
1806  out_cp[0] = static_cast<unsigned char>((( c ) & _lead4_mask ) | _lead4);
1807  break;
1808  case 4:
1809  out_cp[0] = static_cast<unsigned char>((( c ) & _lead3_mask ) | _lead3);
1810  break;
1811  case 3:
1812  out_cp[0] = static_cast<unsigned char>((( c ) & _lead2_mask ) | _lead2);
1813  break;
1814  case 2:
1815  out_cp[0] = static_cast<unsigned char>((( c ) & _lead1_mask ) | _lead1);
1816  break;
1817  case 1:
1818  default:
1819  out_cp[0] = static_cast<unsigned char>(( c ) & 0x7F);
1820  break;
1821  }
1822 
1823  // return the byte length of the sequence
1824  return len;
1825  }
1826 
1828  {
1829  std::string tmp( reinterpret_cast<const char*>( c_str ) );
1830  return _verifyUTF8( tmp );
1831  }
1832 
1833  UString::size_type UString::_verifyUTF8( const std::string& str )
1834  {
1835  std::string::const_iterator i, ie = str.end();
1836  i = str.begin();
1837  size_type length = 0;
1838 
1839  while ( i != ie ) {
1840  // characters pass until we find an extended sequence
1841  if (( *i ) & 0x80 ) {
1842  unsigned char c = ( *i );
1843  size_t contBytes = 0;
1844 
1845  // get continuation byte count and test for overlong sequences
1846  if (( c & ~_lead1_mask ) == _lead1 ) { // 1 additional byte
1847  if ( c == _lead1 )
1848  {
1849  //throw invalid_data( "overlong UTF-8 sequence" );
1850  return str.size();
1851  }
1852  contBytes = 1;
1853 
1854  } else if (( c & ~_lead2_mask ) == _lead2 ) { // 2 additional bytes
1855  contBytes = 2;
1856  if ( c == _lead2 ) { // possible overlong UTF-8 sequence
1857  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
1858  if (( c & _lead2 ) == _cont )
1859  {
1860  //throw invalid_data( "overlong UTF-8 sequence" );
1861  return str.size();
1862  }
1863  }
1864 
1865  } else if (( c & ~_lead3_mask ) == _lead3 ) { // 3 additional bytes
1866  contBytes = 3;
1867  if ( c == _lead3 ) { // possible overlong UTF-8 sequence
1868  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
1869  if (( c & _lead3 ) == _cont )
1870  {
1871  //throw invalid_data( "overlong UTF-8 sequence" );
1872  return str.size();
1873  }
1874  }
1875 
1876  } else if (( c & ~_lead4_mask ) == _lead4 ) { // 4 additional bytes
1877  contBytes = 4;
1878  if ( c == _lead4 ) { // possible overlong UTF-8 sequence
1879  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
1880  if (( c & _lead4 ) == _cont )
1881  {
1882  //throw invalid_data( "overlong UTF-8 sequence" );
1883  return str.size();
1884  }
1885  }
1886 
1887  } else if (( c & ~_lead5_mask ) == _lead5 ) { // 5 additional bytes
1888  contBytes = 5;
1889  if ( c == _lead5 ) { // possible overlong UTF-8 sequence
1890  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
1891  if (( c & _lead5 ) == _cont )
1892  {
1893  //throw invalid_data( "overlong UTF-8 sequence" );
1894  return str.size();
1895  }
1896  }
1897  }
1898 
1899  // check remaining continuation bytes for
1900  while ( contBytes-- ) {
1901  c = ( *( ++i ) ); // get next byte in sequence
1902  if (( c & ~_cont_mask ) != _cont )
1903  {
1904  //throw invalid_data( "bad UTF-8 continuation byte" );
1905  return str.size();
1906  }
1907  }
1908  }
1909  length++;
1910  i++;
1911  }
1912  return length;
1913  }
1914 
1915  void UString::_init()
1916  {
1917  m_buffer.mVoidBuffer = 0;
1918  m_bufferType = bt_none;
1919  m_bufferSize = 0;
1920  }
1921 
1922  void UString::_cleanBuffer() const
1923  {
1924  if ( m_buffer.mVoidBuffer != 0 ) {
1925  switch ( m_bufferType ) {
1926  case bt_string:
1927  delete m_buffer.mStrBuffer;
1928  break;
1929  case bt_wstring:
1930  delete m_buffer.mWStrBuffer;
1931  break;
1932  case bt_utf32string:
1933  delete m_buffer.mUTF32StrBuffer;
1934  break;
1935  case bt_none: // under the worse of circumstances, this is all we can do, and hope it works out
1936  //delete m_buffer.mVoidBuffer;
1937  // delete void* is undefined, don't do that
1938  assert("This should never happen - mVoidBuffer should never contain something if we "
1939  "don't know the type");
1940  break;
1941  }
1942  m_buffer.mVoidBuffer = 0;
1943  m_bufferSize = 0;
1944  m_bufferType = bt_none;
1945  }
1946  }
1947 
1948  void UString::_getBufferStr() const
1949  {
1950  if ( m_bufferType != bt_string ) {
1951  _cleanBuffer();
1952  m_buffer.mStrBuffer = new std::string();
1953  m_bufferType = bt_string;
1954  }
1955  m_buffer.mStrBuffer->clear();
1956  }
1957 
1958  void UString::_getBufferWStr() const
1959  {
1960  if ( m_bufferType != bt_wstring ) {
1961  _cleanBuffer();
1962  m_buffer.mWStrBuffer = new std::wstring();
1963  m_bufferType = bt_wstring;
1964  }
1965  m_buffer.mWStrBuffer->clear();
1966  }
1967 
1968  void UString::_getBufferUTF32Str() const
1969  {
1970  if ( m_bufferType != bt_utf32string ) {
1971  _cleanBuffer();
1972  m_buffer.mUTF32StrBuffer = new utf32string();
1973  m_bufferType = bt_utf32string;
1974  }
1975  m_buffer.mUTF32StrBuffer->clear();
1976  }
1977 
1978  void UString::_load_buffer_UTF8() const
1979  {
1980  _getBufferStr();
1981  std::string& buffer = ( *m_buffer.mStrBuffer );
1982  buffer.reserve( length() );
1983 
1984  unsigned char utf8buf[6];
1985  char* charbuf = ( char* )utf8buf;
1986  unicode_char c;
1987  size_t len;
1988 
1989  const_iterator i, ie = end();
1990  for ( i = begin(); i != ie; i.moveNext() ) {
1991  c = i.getCharacter();
1992  len = _utf32_to_utf8( c, utf8buf );
1993  size_t j = 0;
1994  while ( j < len )
1995  buffer.push_back( charbuf[j++] );
1996  }
1997  }
1998 
1999  void UString::_load_buffer_WStr() const
2000  {
2001  _getBufferWStr();
2002  std::wstring& buffer = ( *m_buffer.mWStrBuffer );
2003  buffer.reserve( length() ); // may over reserve, but should be close enough
2004 #ifdef WCHAR_UTF16 // wchar_t matches UTF-16
2005  const_iterator i, ie = end();
2006  for ( i = begin(); i != ie; ++i ) {
2007  buffer.push_back(( wchar_t )( *i ) );
2008  }
2009 #else // wchar_t fits UTF-32
2010  unicode_char c;
2011  const_iterator i, ie = end();
2012  for ( i = begin(); i != ie; i.moveNext() ) {
2013  c = i.getCharacter();
2014  buffer.push_back(( wchar_t )c );
2015  }
2016 #endif
2017  }
2018 
2019  void UString::_load_buffer_UTF32() const
2020  {
2021  _getBufferUTF32Str();
2022  utf32string& buffer = ( *m_buffer.mUTF32StrBuffer );
2023  buffer.reserve( length() ); // may over reserve, but should be close enough
2024 
2025  unicode_char c;
2026 
2027  const_iterator i, ie = end();
2028  for ( i = begin(); i != ie; i.moveNext() ) {
2029  c = i.getCharacter();
2030  buffer.push_back( c );
2031  }
2032  }
2033 
2034 } // namespace MyGUI
code_point & operator[](size_type index)
code point dereference operator
std::basic_string< code_point > dstring
base iterator class for UString
size_type find_last_of(const UString &str, size_type index=npos, size_type num=npos) const
returns the index of the last character within the current string that matches any character in str...
size_type rfind(const UString &str, size_type index=0) const
returns the location of the first occurrence of str in the current string, doing a reverse search fro...
_const_rev_iterator operator-(difference_type n)
subtraction operator
bool inString(unicode_char ch) const
returns true if the given Unicode character ch is in this string
bool empty() const
returns true if the string has no elements, false otherwise
_const_fwd_iterator & operator--()
pre-decrement
const code_point * c_str() const
returns a pointer to the first character in the current string
reverse_iterator rend()
returns a reverse iterator just past the beginning of the string
void resize(size_type num, const code_point &val=0)
changes the size of the string to size, filling in any new area with val
UString & append(const UString &str)
appends str on to the end of the current string
_rev_iterator & operator--()
pre-decrement
bool operator!=(const UString &right) const
inequality operator
int _setCharacter(unicode_char uc)
static size_t _utf32_to_utf8(const unicode_char &in_uc, unsigned char out_cp[6])
writes the given UTF-32 uc_in to the buffer location out_cp using UTF-8 encoding, returns the number ...
iterator erase(iterator loc)
removes the code point pointed to by loc, returning an iterator to the next character ...
UString()
default constructor, creates an empty string
static size_t _utf8_char_length(unsigned char cp)
estimates the number of UTF-8 code points in the sequence starting with cp
value_type & operator[](difference_type n) const
dereference at offset operator
_const_fwd_iterator operator+(difference_type n)
addition operator
_const_fwd_iterator & operator+=(difference_type n)
addition assignment operator
_const_rev_iterator & operator+=(difference_type n)
addition assignment operator
_rev_iterator operator-(difference_type n)
subtraction operator
size_type length_Characters() const
Returns the number of Unicode characters in the string.
const code_point * data() const
returns a pointer to the first character in the current string
_const_fwd_iterator operator-(difference_type n)
subtraction operator
_fwd_iterator & operator+=(difference_type n)
addition assignment operator
const utf32string & asUTF32() const
returns the current string in UTF-32 form within a utf32string
size_t size_type
size type used to indicate string size and character positions within the string
_const_rev_iterator & operator++()
pre-increment
const value_type & operator*() const
dereference operator
int setChar(size_type loc, unicode_char ch)
sets the value of the character at loc to the Unicode value ch (UTF-32)
unicode_char getCharacter() const
Returns the Unicode value of the character at the current position (decodes surrogate pairs if needed...
static const size_type npos
the usual constant representing: not found, no limit, etc
const value_type & operator[](difference_type n) const
dereference at offset operator
const forward iterator for UString
_fwd_iterator & operator-=(difference_type n)
subtraction assignment operator
_rev_iterator & operator-=(difference_type n)
subtraction assignment operator
_const_rev_iterator operator+(difference_type n)
addition operator
static size_t _utf32_to_utf16(const unicode_char &in_uc, code_point out_cp[2])
writes the given UTF-32 uc_in to the buffer location out_cp using UTF-16 encoding, returns the number of code points used to encode the input (always 1 or 2)
void push_back(unicode_char val)
appends val to the end of the string
std::basic_string< unicode_char > utf32string
string type used for returning UTF-32 formatted data
bool operator<(const UString &right) const
less than operator
void clear()
deletes all of the elements in the string
const reverse iterator for UString
iterator begin()
returns an iterator to the first element of the string
const char * asUTF8_c_str() const
returns the current string in UTF-8 form as a nul-terminated char array
_const_fwd_iterator & movePrev()
rewinds to the previous Unicode character, honoring surrogate pairs in the UTF-16 stream ...
reverse_iterator rbegin()
returns a reverse iterator to the last element of the string
value_type & operator*() const
dereference operator
_const_fwd_iterator & operator++()
pre-increment
uint16 code_point
a single UTF-16 code point
_fwd_iterator operator+(difference_type n)
addition operator
_const_rev_iterator & operator-=(difference_type n)
subtraction assignment operator
bool operator>(const UString &right) const
greater than operator
static bool _utf8_start_char(unsigned char cp)
returns true if cp is the beginning of a UTF-8 sequence
static bool _utf16_surrogate_lead(code_point cp)
returns true if cp matches the signature of a surrogate pair lead character
const value_type & operator[](difference_type n) const
dereference at offset operator
void swap(UString &from)
exchanges the elements of the current string with those of from
static size_t _utf16_to_utf32(const code_point in_cp[2], unicode_char &out_uc)
converts the given UTF-16 character buffer in_cp to a single UTF-32 Unicode character out_uc...
_fwd_iterator operator-(difference_type n)
subtraction operator
const value_type & operator*() const
dereference operator
_rev_iterator operator+(difference_type n)
addition operator
size_type find_last_not_of(const UString &str, size_type index=npos, size_type num=npos) const
returns the index of the last character within the current string that does not match any character i...
size_type find_first_of(const UString &str, size_type index=0, size_type num=npos) const
Returns the index of the first character within the current string that matches any character in str...
unicode_char _getCharacter() const
code_point value_type
value type typedef for use in iterators
const std::string & asUTF8() const
returns the current string in UTF-8 form within a std::string
size_type find(const UString &str, size_type index=0) const
returns the index of the first occurrence of str within the current string, starting at index; return...
void reserve(size_type size)
sets the capacity of the string to at least size code points
_fwd_iterator & movePrev()
rewinds to the previous Unicode character, honoring surrogate pairs in the UTF-16 stream ...
const wchar_t * asWStr_c_str() const
returns the current string in the native form of a nul-terminated wchar_t array
int compare(const UString &str) const
compare str to the current string
forward iterator for UString
uint32 unicode_char
a single 32-bit Unicode character
static bool _utf16_surrogate_follow(code_point cp)
returns true if cp matches the signature of a surrogate pair following character
~UString()
destructor
static size_t _utf16_char_length(code_point cp)
estimates the number of UTF-16 code points in the sequence starting with cp
bool operator>=(const UString &right) const
greater than or equal operator
code_point & at(size_type loc)
returns a reference to the element in the string at index loc
static bool _utf16_independent_char(code_point cp)
returns true if cp does not match the signature for the lead of follow code point of a surrogate pair...
_const_rev_iterator & operator--()
pre-decrement
void _become(const _base_iterator &i)
value_type & operator[](difference_type n) const
dereference at offset operator
_fwd_iterator & operator--()
pre-decrement
size_type size() const
Returns the number of code points in the current string.
size_type find_first_not_of(const UString &str, size_type index=0, size_type num=npos) const
returns the index of the first character within the current string that does not match any character ...
_rev_iterator & operator++()
pre-increment
_fwd_iterator & moveNext()
advances to the next Unicode character, honoring surrogate pairs in the UTF-16 stream ...
forward iterator for UString
static size_type _verifyUTF8(const unsigned char *c_str)
verifies a UTF-8 stream, returning the total number of Unicode characters found
UString & assign(iterator start, iterator end)
gives the current string the values from start to end
int setCharacter(unicode_char uc)
Sets the Unicode value of the character at the current position (adding a surrogate pair if needed); ...
_const_fwd_iterator & moveNext()
advances to the next Unicode character, honoring surrogate pairs in the UTF-16 stream ...
A UTF-16 string with implicit conversion to/from std::string and std::wstring.
_rev_iterator & operator+=(difference_type n)
addition assignment operator
static size_t _utf8_to_utf32(const unsigned char in_cp[6], unicode_char &out_uc)
converts the given UTF-8 character buffer to a single UTF-32 Unicode character, returns the number of...
iterator insert(iterator i, const code_point &ch)
inserts ch before the code point denoted by i
size_type length() const
Returns the number of code points in the current string.
UString & operator=(const UString &s)
assignment operator, implicitly casts all compatible types
iterator end()
returns an iterator just past the end of the string
UString substr(size_type index, size_type num=npos) const
returns a substring of the current string, starting at index, and num characters long.
const std::wstring & asWStr() const
returns the current string in the native form of std::wstring
UString & replace(size_type index1, size_type num1, const UString &str)
replaces up to num1 code points of the current string (starting at index1) with str ...
const unicode_char * asUTF32_c_str() const
returns the current string in UTF-32 form as a nul-terminated unicode_char array
unicode_char getChar(size_type loc) const
returns the data point loc evaluated as a UTF-32 value
value_type & operator*() const
dereference operator
unicode_char getCharacter() const
Returns the Unicode value of the character at the current position (decodes surrogate pairs if needed...
size_type capacity() const
returns the number of elements that the string can hold before it will need to allocate more space ...
float len(float x, float y)
size_type max_size() const
returns the maximum number of UTF-16 code points that the string can hold
bool operator<=(const UString &right) const
less than or equal operator
_const_fwd_iterator & operator-=(difference_type n)
subtraction assignment operator
_fwd_iterator & operator++()
pre-increment
bool operator==(const UString &right) const
equality operator
void _jump_to(size_type index)