Ruby 2.4.2p198 (2017-09-14 revision 59899)
string.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  string.c -
4 
5  $Author: nagachika $
6  created at: Mon Aug 9 17:12:58 JST 1993
7 
8  Copyright (C) 1993-2007 Yukihiro Matsumoto
9  Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10  Copyright (C) 2000 Information-technology Promotion Agency, Japan
11 
12 **********************************************************************/
13 
14 #include "internal.h"
15 #include "ruby/re.h"
16 #include "encindex.h"
17 #include "probes.h"
18 #include "gc.h"
19 #include "ruby_assert.h"
20 #include "id.h"
21 
22 #define BEG(no) (regs->beg[(no)])
23 #define END(no) (regs->end[(no)])
24 
25 #include <math.h>
26 #include <ctype.h>
27 
28 #ifdef HAVE_UNISTD_H
29 #include <unistd.h>
30 #endif
31 
32 #if defined HAVE_CRYPT_R
33 # if defined HAVE_CRYPT_H
34 # include <crypt.h>
35 # endif
36 #elif !defined HAVE_CRYPT
37 # include "missing/crypt.h"
38 # define HAVE_CRYPT_R 1
39 #endif
40 
41 #define STRING_ENUMERATORS_WANTARRAY 0 /* next major */
42 
43 #undef rb_str_new
44 #undef rb_usascii_str_new
45 #undef rb_utf8_str_new
46 #undef rb_enc_str_new
47 #undef rb_str_new_cstr
48 #undef rb_tainted_str_new_cstr
49 #undef rb_usascii_str_new_cstr
50 #undef rb_utf8_str_new_cstr
51 #undef rb_enc_str_new_cstr
52 #undef rb_external_str_new_cstr
53 #undef rb_locale_str_new_cstr
54 #undef rb_str_dup_frozen
55 #undef rb_str_buf_new_cstr
56 #undef rb_str_buf_cat
57 #undef rb_str_buf_cat2
58 #undef rb_str_cat2
59 #undef rb_str_cat_cstr
60 #undef rb_fstring_cstr
61 #undef rb_fstring_enc_cstr
62 
63 static VALUE rb_str_clear(VALUE str);
64 
67 
68 /* FLAGS of RString
69  *
70  * 1: RSTRING_NOEMBED
71  * 2: STR_SHARED (== ELTS_SHARED)
72  * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
73  * 6: STR_IS_SHARED_M (shared, when RSTRING_NOEMBED==1 && klass==0)
74  * 7: STR_TMPLOCK
75  * 8-9: ENC_CODERANGE (2 bits)
76  * 10-16: ENCODING (7 bits == 128)
77  * 17: RSTRING_FSTR
78  * 18: STR_NOFREE
79  * 19: STR_FAKESTR
80  */
81 
/*
 * File-local limit and private flag aliases.  The FL_USER bit numbers agree
 * with the "FLAGS of RString" table above.
 *
 * STR_NOFREE is set by str_new_static() on strings that wrap a
 * caller-supplied buffer; STR_FAKESTR is tested when registering fstrings to
 * recognise temporary fake string headers.
 *
 * NOTE(review): RUBY_MAX_CHAR_LEN is not referenced in this part of the
 * file; presumably an upper bound on the byte length of one character --
 * confirm against its users.
 */
82 #define RUBY_MAX_CHAR_LEN 16
83 #define STR_IS_SHARED_M FL_USER6
84 #define STR_TMPLOCK FL_USER7
85 #define STR_NOFREE FL_USER18
86 #define STR_FAKESTR FL_USER19
87 
/*
 * Mark str as using the heap (non-embedded) representation: sets
 * STR_NOEMBED and zeroes the embedded-length bits.  Per the FLAGS table
 * above, the embedded-length bits share positions with STR_SHARED and
 * STR_IS_SHARED_M, so zeroing them clears those bits as well.
 */
88 #define STR_SET_NOEMBED(str) do {\
89  FL_SET((str), STR_NOEMBED);\
90  STR_SET_EMBED_LEN((str), 0);\
91 } while (0)
/* Mark str as embedded by clearing both STR_NOEMBED and STR_NOFREE. */
92 #define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/*
 * Store n as the embedded length inside the flag word.  n is evaluated once
 * (through tmp_n); str is evaluated more than once.  No range check is done
 * here, so the caller must pass a length that fits the embedded-length bits.
 */
93 #define STR_SET_EMBED_LEN(str, n) do { \
94  long tmp_n = (n);\
95  RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
96  RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
97 } while (0)
98 
/*
 * Set the byte length of str to n in whichever representation is active:
 * the flag bits for an embedded string, as.heap.len otherwise.  Only the
 * length field changes; the buffer and terminator are not touched.
 * str is evaluated more than once.
 */
99 #define STR_SET_LEN(str, n) do { \
100  if (STR_EMBED_P(str)) {\
101  STR_SET_EMBED_LEN((str), (n));\
102  }\
103  else {\
104  RSTRING(str)->as.heap.len = (n);\
105  }\
106 } while (0)
107 
/*
 * Decrease the byte length of str by one, for either representation.
 * No check is made that the current length is positive.
 */
108 #define STR_DEC_LEN(str) do {\
109  if (STR_EMBED_P(str)) {\
110  long n = RSTRING_LEN(str);\
111  n--;\
112  STR_SET_EMBED_LEN((str), n);\
113  }\
114  else {\
115  RSTRING(str)->as.heap.len--;\
116  }\
117 } while (0)
118 
/*
 * Width in bytes of the terminator for str: the minimum character length
 * (rb_enc_mbminlen) of the encoding returned by rb_enc_get(str).
 */
119 #define TERM_LEN(str) rb_enc_mbminlen(rb_enc_get(str))
/*
 * Write a terminator of termlen zero bytes starting at ptr.  The first byte
 * is always cleared directly; memset() is used only when the terminator is
 * wider than one byte, which is hinted as the unlikely case.  Both
 * arguments are evaluated exactly once.
 */
120 #define TERM_FILL(ptr, termlen) do {\
121  char *const term_fill_ptr = (ptr);\
122  const int term_fill_len = (termlen);\
123  *term_fill_ptr = '\0';\
124  if (UNLIKELY(term_fill_len > 1))\
125  memset(term_fill_ptr, 0, term_fill_len);\
126 } while (0)
127 
/* RESIZE_CAPA_TERM() using the terminator length of str's own encoding. */
128 #define RESIZE_CAPA(str,capacity) do {\
129  const int termlen = TERM_LEN(str);\
130  RESIZE_CAPA_TERM(str,capacity,termlen);\
131 } while (0)
/*
 * Change the capacity of str to `capacity` bytes plus `termlen` terminator
 * bytes.
 *
 *  - Embedded string whose new capacity still fits embedded: nothing happens.
 *  - Embedded string that no longer fits: a heap buffer of
 *    capacity + termlen bytes is allocated, the current bytes are copied
 *    into it, and str is switched to the heap representation with the same
 *    length.
 *  - Heap string: asserts that str is not shared, then reallocates the
 *    buffer to capacity + termlen bytes.
 *
 * The length is never changed and no terminator is written here.  The
 * macro arguments are evaluated more than once.
 */
132 #define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
133  if (STR_EMBED_P(str)) {\
134  if (!STR_EMBEDDABLE_P(capacity, termlen)) {\
135  char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
136  const long tlen = RSTRING_LEN(str);\
137  memcpy(tmp, RSTRING_PTR(str), tlen);\
138  RSTRING(str)->as.heap.ptr = tmp;\
139  RSTRING(str)->as.heap.len = tlen;\
140  STR_SET_NOEMBED(str);\
141  RSTRING(str)->as.heap.aux.capa = (capacity);\
142  }\
143  }\
144  else {\
145  assert(!FL_TEST((str), STR_SHARED)); \
146  REALLOC_N(RSTRING(str)->as.heap.ptr, char, (size_t)(capacity) + (termlen));\
147  RSTRING(str)->as.heap.aux.capa = (capacity);\
148  }\
149 } while (0)
150 
/*
 * Record shared_str as the string whose buffer str borrows: stores it in
 * as.heap.aux.shared through RB_OBJ_WRITE and sets STR_SHARED on str.
 * When shared_str has no class (RBASIC_CLASS == 0) it is additionally
 * flagged STR_IS_SHARED_M.  Nothing is done when str is a fake string
 * (STR_FAKESTR).  Both arguments are evaluated more than once.
 */
151 #define STR_SET_SHARED(str, shared_str) do { \
152  if (!FL_TEST(str, STR_FAKESTR)) { \
153  RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
154  FL_SET((str), STR_SHARED); \
155  if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
156  FL_SET_RAW((shared_str), STR_IS_SHARED_M); \
157  } \
158 } while (0)
159 
/* Heap buffer pointer of str; only meaningful for non-embedded strings. */
160 #define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* Heap buffer size in bytes: recorded capacity plus the terminator width. */
161 #define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
162 
/* Shorthand for get_encoding(str). */
163 #define STR_ENC_GET(str) get_encoding(str)
164 
/*
 * SHARABLE_SUBSTRING_P(beg, len, end) decides whether a substring may share
 * its parent's buffer.  With SHARABLE_MIDDLE_SUBSTRING at its default of 0,
 * only a substring that runs up to `end` (beg + len == end) qualifies;
 * when it is non-zero every substring qualifies.
 */
165 #if !defined SHARABLE_MIDDLE_SUBSTRING
166 # define SHARABLE_MIDDLE_SUBSTRING 0
167 #endif
168 #if !SHARABLE_MIDDLE_SUBSTRING
169 #define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
170 #else
171 #define SHARABLE_SUBSTRING_P(beg, len, end) 1
172 #endif
173 
/*
 * True when len content bytes followed by termlen terminator bytes fit in
 * RSTRING_EMBED_LEN_MAX + 1 bytes, i.e. the string can be stored embedded.
 */
174 #define STR_EMBEDDABLE_P(len, termlen) \
175  ((len) <= RSTRING_EMBED_LEN_MAX + 1 - (termlen))
176 
178 static VALUE str_new_shared(VALUE klass, VALUE str);
179 static VALUE str_new_frozen(VALUE klass, VALUE orig);
180 static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
181 static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
182 static inline void str_modifiable(VALUE str);
183 static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
184 
185 static inline void
187 {
188  long len = RSTRING_LEN(str);
189  int termlen = TERM_LEN(str);
190  str_make_independent_expand((str), len, 0L, termlen);
191 }
192 
193 /* symbols for [up|down|swap]case/capitalize options */
195 
196 static rb_encoding *
197 get_actual_encoding(const int encidx, VALUE str)
198 {
199  const unsigned char *q;
200 
201  switch (encidx) {
202  case ENCINDEX_UTF_16:
203  if (RSTRING_LEN(str) < 2) break;
204  q = (const unsigned char *)RSTRING_PTR(str);
205  if (q[0] == 0xFE && q[1] == 0xFF) {
207  }
208  if (q[0] == 0xFF && q[1] == 0xFE) {
210  }
211  return rb_ascii8bit_encoding();
212  case ENCINDEX_UTF_32:
213  if (RSTRING_LEN(str) < 4) break;
214  q = (const unsigned char *)RSTRING_PTR(str);
215  if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) {
217  }
218  if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) {
220  }
221  return rb_ascii8bit_encoding();
222  }
223  return rb_enc_from_index(encidx);
224 }
225 
226 static rb_encoding *
228 {
229  return get_actual_encoding(ENCODING_GET(str), str);
230 }
231 
232 static void
234 {
235  if (is_broken_string(str)) {
236  rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
237  }
238 }
239 
240 static void
242 {
243  rb_encoding *enc = STR_ENC_GET(str);
244  if (rb_enc_mbminlen(enc) > 1) {
245  rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
246  }
247 }
248 
249 static int fstring_cmp(VALUE a, VALUE b);
250 
251 static VALUE register_fstring(VALUE str);
252 
254  fstring_cmp,
255  rb_str_hash,
256 };
257 
258 #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_TAINT|FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
259 
260 static int
262 {
263  VALUE *fstr = (VALUE *)arg;
264  VALUE str = (VALUE)*key;
265 
266  if (existing) {
267  /* because of lazy sweep, str may be unmarked already and swept
268  * at next time */
269 
270  if (rb_objspace_garbage_object_p(str)) {
271  *fstr = Qundef;
272  return ST_DELETE;
273  }
274 
275  *fstr = str;
276  return ST_STOP;
277  }
278  else {
279  if (FL_TEST_RAW(str, STR_FAKESTR)) {
280  str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
281  RSTRING(str)->as.heap.len,
282  ENCODING_GET(str));
283  OBJ_FREEZE_RAW(str);
284  }
285  else {
286  str = str_new_frozen(rb_cString, str);
287  if (STR_SHARED_P(str)) { /* str should not be shared */
288  /* shared substring */
290  assert(OBJ_FROZEN(str));
291  }
292  if (!BARE_STRING_P(str)) {
293  str = str_new_frozen(rb_cString, str);
294  }
295  }
296  RBASIC(str)->flags |= RSTRING_FSTR;
297 
298  *key = *value = *fstr = str;
299  return ST_CONTINUE;
300  }
301 }
302 
304 VALUE
306 {
307  VALUE fstr;
308  int bare;
309 
310  Check_Type(str, T_STRING);
311 
312  if (FL_TEST(str, RSTRING_FSTR))
313  return str;
314 
315  bare = BARE_STRING_P(str);
316  if (STR_EMBED_P(str) && !bare) {
317  OBJ_FREEZE_RAW(str);
318  return str;
319  }
320 
321  fstr = register_fstring(str);
322 
323  if (!bare) {
325  OBJ_FREEZE_RAW(str);
326  return str;
327  }
328  return fstr;
329 }
330 
331 static VALUE
333 {
334  VALUE ret;
335  st_table *frozen_strings = rb_vm_fstring_table();
336 
337  do {
338  ret = str;
339  st_update(frozen_strings, (st_data_t)str,
341  } while (ret == Qundef);
342 
343  assert(OBJ_FROZEN(ret));
345  assert(!FL_TEST_RAW(ret, FL_EXIVAR));
346  assert(!FL_TEST_RAW(ret, FL_TAINT));
347  assert(RBASIC_CLASS(ret) == rb_cString);
348  return ret;
349 }
350 
351 static VALUE
352 setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
353 {
355  /* SHARED to be allocated by the callback */
356 
357  ENCODING_SET_INLINED((VALUE)fake_str, encidx);
358 
360  fake_str->as.heap.len = len;
361  fake_str->as.heap.ptr = (char *)name;
362  fake_str->as.heap.aux.capa = len;
363  return (VALUE)fake_str;
364 }
365 
366 VALUE
367 rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
368 {
369  return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
370 }
371 
372 VALUE
373 rb_fstring_new(const char *ptr, long len)
374 {
375  struct RString fake_str;
376  return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII));
377 }
378 
379 VALUE
380 rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
381 {
382  struct RString fake_str;
383  return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc));
384 }
385 
386 VALUE
387 rb_fstring_cstr(const char *ptr)
388 {
389  return rb_fstring_new(ptr, strlen(ptr));
390 }
391 
392 VALUE
394 {
395  return rb_fstring_enc_new(ptr, strlen(ptr), enc);
396 }
397 
398 static int
400 {
401  RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
402  return ST_CONTINUE;
403 }
404 
405 static int
407 {
408  long alen, blen;
409  const char *aptr, *bptr;
410  RSTRING_GETMEM(a, aptr, alen);
411  RSTRING_GETMEM(b, bptr, blen);
412  return (alen != blen ||
413  ENCODING_GET(a) != ENCODING_GET(b) ||
414  memcmp(aptr, bptr, alen) != 0);
415 }
416 
417 static inline int
419 {
420  rb_encoding *enc;
421 
422  /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
423  if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
424  return 1;
425 
426  enc = STR_ENC_GET(str);
427  if (rb_enc_mbmaxlen(enc) == 1)
428  return 1;
429 
430  /* Conservative. Possibly single byte.
431  * "\xa1" in Shift_JIS for example. */
432  return 0;
433 }
434 
436 
/*
 * Find the first byte in [p, e) that has its high bit (0x80) set.
 *
 * Returns a pointer to that byte, or NULL when every byte in the range is
 * below 0x80.
 *
 * The scan runs in up to three phases:
 *   1. When unaligned word access is not allowed and p is not word-aligned,
 *      the bytes up to the next word boundary are tested one by one.
 *   2. Whole words are tested against NONASCII_MASK.  On a hit, the index
 *      of the offending byte is derived from the masked word with
 *      nlz_intptr() (big-endian) or ntz_intptr() (little-endian), shifted
 *      right by 3 to turn a bit count into a byte count.
 *      NOTE(review): these helpers are defined elsewhere; presumably they
 *      count leading / trailing zero bits -- confirm.
 *   3. The remaining tail of fewer than SIZEOF_VOIDP bytes is tested one by
 *      one.
 *
 * Both byte-wise switches rely on deliberate case fallthrough: entering at
 * `case k` tests the k bytes in order from the lowest address.
 */
437 static inline const char *
438 search_nonascii(const char *p, const char *e)
439 {
440  const uintptr_t *s, *t;
441 #if SIZEOF_VOIDP == 8
442 # define NONASCII_MASK 0x8080808080808080ULL
443 #elif SIZEOF_VOIDP == 4
444 # define NONASCII_MASK 0x80808080UL
445 #endif
446 
/* Word-at-a-time path: taken when at least one full word is available
 * (or always, when unaligned access is permitted). */
447  if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
448 #if !UNALIGNED_WORD_ACCESS
449  if ((uintptr_t)p % SIZEOF_VOIDP) {
/* l = number of bytes before the next word boundary; p is advanced first
 * and the skipped bytes are then checked through negative offsets. */
450  int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
451  p += l;
452  switch (l) {
453  default: UNREACHABLE;
454 #if SIZEOF_VOIDP > 4
455  case 7: if (p[-7]&0x80) return p-7;
456  case 6: if (p[-6]&0x80) return p-6;
457  case 5: if (p[-5]&0x80) return p-5;
458  case 4: if (p[-4]&0x80) return p-4;
459 #endif
460  case 3: if (p[-3]&0x80) return p-3;
461  case 2: if (p[-2]&0x80) return p-2;
462  case 1: if (p[-1]&0x80) return p-1;
463  case 0: break;
464  }
465  }
466 #endif
467  s = (const uintptr_t *)p;
/* t bounds the loop so that every word read lies entirely before e. */
468  t = (const uintptr_t *)(e - (SIZEOF_VOIDP-1));
469  for (;s < t; s++) {
470  if (*s & NONASCII_MASK) {
471 #ifdef WORDS_BIGENDIAN
472  return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
473 #else
474  return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
475 #endif
476  }
477  }
478  p = (const char *)s;
479  }
480 
/* Tail: the bytes left between p and e, addressed backwards from e. */
481  switch (e - p) {
482  default: UNREACHABLE;
483 #if SIZEOF_VOIDP > 4
484  case 7: if (e[-7]&0x80) return e-7;
485  case 6: if (e[-6]&0x80) return e-6;
486  case 5: if (e[-5]&0x80) return e-5;
487  case 4: if (e[-4]&0x80) return e-4;
488 #endif
489  case 3: if (e[-3]&0x80) return e-3;
490  case 2: if (e[-2]&0x80) return e-2;
491  case 1: if (e[-1]&0x80) return e-1;
492  case 0: return NULL;
493  }
494 }
495 
496 static int
497 coderange_scan(const char *p, long len, rb_encoding *enc)
498 {
499  const char *e = p + len;
500 
501  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
502  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
503  p = search_nonascii(p, e);
505  }
506 
507  if (rb_enc_asciicompat(enc)) {
508  p = search_nonascii(p, e);
509  if (!p) return ENC_CODERANGE_7BIT;
510  for (;;) {
511  int ret = rb_enc_precise_mbclen(p, e, enc);
512  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
513  p += MBCLEN_CHARFOUND_LEN(ret);
514  if (p == e) break;
515  p = search_nonascii(p, e);
516  if (!p) break;
517  }
518  }
519  else {
520  while (p < e) {
521  int ret = rb_enc_precise_mbclen(p, e, enc);
522  if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;
523  p += MBCLEN_CHARFOUND_LEN(ret);
524  }
525  }
526  return ENC_CODERANGE_VALID;
527 }
528 
529 long
530 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
531 {
532  const char *p = s;
533 
534  if (*cr == ENC_CODERANGE_BROKEN)
535  return e - s;
536 
537  if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
538  /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
539  if (*cr == ENC_CODERANGE_VALID) return e - s;
540  p = search_nonascii(p, e);
542  return e - s;
543  }
544  else if (rb_enc_asciicompat(enc)) {
545  p = search_nonascii(p, e);
546  if (!p) {
547  if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
548  return e - s;
549  }
550  for (;;) {
551  int ret = rb_enc_precise_mbclen(p, e, enc);
552  if (!MBCLEN_CHARFOUND_P(ret)) {
554  return p - s;
555  }
556  p += MBCLEN_CHARFOUND_LEN(ret);
557  if (p == e) break;
558  p = search_nonascii(p, e);
559  if (!p) break;
560  }
561  }
562  else {
563  while (p < e) {
564  int ret = rb_enc_precise_mbclen(p, e, enc);
565  if (!MBCLEN_CHARFOUND_P(ret)) {
567  return p - s;
568  }
569  p += MBCLEN_CHARFOUND_LEN(ret);
570  }
571  }
572  *cr = ENC_CODERANGE_VALID;
573  return e - s;
574 }
575 
576 static inline void
578 {
579  rb_enc_set_index(str1, ENCODING_GET(str2));
580 }
581 
582 static void
584 {
585  /* this function is designed for copying encoding and coderange
586  * from src to new string "dest" which is made from the part of src.
587  */
588  str_enc_copy(dest, src);
589  if (RSTRING_LEN(dest) == 0) {
590  if (!rb_enc_asciicompat(STR_ENC_GET(src)))
592  else
594  return;
595  }
596  switch (ENC_CODERANGE(src)) {
597  case ENC_CODERANGE_7BIT:
599  break;
600  case ENC_CODERANGE_VALID:
601  if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
604  else
606  break;
607  default:
608  break;
609  }
610 }
611 
612 static void
614 {
615  str_enc_copy(dest, src);
616  ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
617 }
618 
619 int
621 {
622  int cr = ENC_CODERANGE(str);
623 
624  if (cr == ENC_CODERANGE_UNKNOWN) {
625  int encidx = ENCODING_GET(str);
626  rb_encoding *enc = rb_enc_from_index(encidx);
627  if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc)) {
629  }
630  else {
631  cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str),
632  get_actual_encoding(encidx, str));
633  }
634  ENC_CODERANGE_SET(str, cr);
635  }
636  return cr;
637 }
638 
639 int
641 {
642  rb_encoding *enc = STR_ENC_GET(str);
643 
644  if (!rb_enc_asciicompat(enc))
645  return FALSE;
646  else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
647  return TRUE;
648  return FALSE;
649 }
650 
651 static inline void
652 str_mod_check(VALUE s, const char *p, long len)
653 {
654  if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
655  rb_raise(rb_eRuntimeError, "string modified");
656  }
657 }
658 
659 static size_t
660 str_capacity(VALUE str, const int termlen)
661 {
662  if (STR_EMBED_P(str)) {
663  return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
664  }
665  else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
666  return RSTRING(str)->as.heap.len;
667  }
668  else {
669  return RSTRING(str)->as.heap.aux.capa;
670  }
671 }
672 
673 size_t
675 {
676  return str_capacity(str, TERM_LEN(str));
677 }
678 
679 static inline void
680 must_not_null(const char *ptr)
681 {
682  if (!ptr) {
683  rb_raise(rb_eArgError, "NULL pointer given");
684  }
685 }
686 
687 static inline VALUE
689 {
691  return (VALUE)str;
692 }
693 
694 static inline VALUE
696 {
698  return str_alloc(klass);
699 }
700 
701 static VALUE
702 str_new0(VALUE klass, const char *ptr, long len, int termlen)
703 {
704  VALUE str;
705 
706  if (len < 0) {
707  rb_raise(rb_eArgError, "negative string size (or size too big)");
708  }
709 
711 
712  str = str_alloc(klass);
713  if (!STR_EMBEDDABLE_P(len, termlen)) {
714  RSTRING(str)->as.heap.aux.capa = len;
715  RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)len + termlen);
716  STR_SET_NOEMBED(str);
717  }
718  else if (len == 0) {
720  }
721  if (ptr) {
722  memcpy(RSTRING_PTR(str), ptr, len);
723  }
724  STR_SET_LEN(str, len);
725  TERM_FILL(RSTRING_PTR(str) + len, termlen);
726  return str;
727 }
728 
729 static VALUE
730 str_new(VALUE klass, const char *ptr, long len)
731 {
732  return str_new0(klass, ptr, len, 1);
733 }
734 
735 VALUE
736 rb_str_new(const char *ptr, long len)
737 {
738  return str_new(rb_cString, ptr, len);
739 }
740 
741 VALUE
742 rb_usascii_str_new(const char *ptr, long len)
743 {
744  VALUE str = rb_str_new(ptr, len);
746  return str;
747 }
748 
749 VALUE
750 rb_utf8_str_new(const char *ptr, long len)
751 {
752  VALUE str = str_new(rb_cString, ptr, len);
754  return str;
755 }
756 
757 VALUE
758 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
759 {
760  VALUE str;
761 
762  if (!enc) return rb_str_new(ptr, len);
763 
764  str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
765  rb_enc_associate(str, enc);
766  return str;
767 }
768 
769 VALUE
770 rb_str_new_cstr(const char *ptr)
771 {
772  must_not_null(ptr);
773  return rb_str_new(ptr, strlen(ptr));
774 }
775 
776 VALUE
778 {
779  VALUE str = rb_str_new_cstr(ptr);
781  return str;
782 }
783 
784 VALUE
786 {
787  VALUE str = rb_str_new_cstr(ptr);
789  return str;
790 }
791 
792 VALUE
794 {
795  must_not_null(ptr);
796  if (rb_enc_mbminlen(enc) != 1) {
797  rb_raise(rb_eArgError, "wchar encoding given");
798  }
799  return rb_enc_str_new(ptr, strlen(ptr), enc);
800 }
801 
802 static VALUE
803 str_new_static(VALUE klass, const char *ptr, long len, int encindex)
804 {
805  VALUE str;
806 
807  if (len < 0) {
808  rb_raise(rb_eArgError, "negative string size (or size too big)");
809  }
810 
811  if (!ptr) {
812  rb_encoding *enc = rb_enc_get_from_index(encindex);
813  str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
814  }
815  else {
817  str = str_alloc(klass);
818  RSTRING(str)->as.heap.len = len;
819  RSTRING(str)->as.heap.ptr = (char *)ptr;
820  RSTRING(str)->as.heap.aux.capa = len;
821  STR_SET_NOEMBED(str);
822  RBASIC(str)->flags |= STR_NOFREE;
823  }
824  rb_enc_associate_index(str, encindex);
825  return str;
826 }
827 
828 VALUE
829 rb_str_new_static(const char *ptr, long len)
830 {
831  return str_new_static(rb_cString, ptr, len, 0);
832 }
833 
834 VALUE
836 {
837  return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
838 }
839 
840 VALUE
841 rb_utf8_str_new_static(const char *ptr, long len)
842 {
843  return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
844 }
845 
846 VALUE
847 rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
848 {
849  return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
850 }
851 
852 VALUE
853 rb_tainted_str_new(const char *ptr, long len)
854 {
855  VALUE str = rb_str_new(ptr, len);
856 
857  OBJ_TAINT(str);
858  return str;
859 }
860 
861 static VALUE
863 {
864  VALUE str = rb_enc_str_new(ptr, len, enc);
865 
866  OBJ_TAINT(str);
867  return str;
868 }
869 
870 VALUE
872 {
873  VALUE str = rb_str_new_cstr(ptr);
874 
875  OBJ_TAINT(str);
876  return str;
877 }
878 
879 static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
880  rb_encoding *from, rb_encoding *to,
881  int ecflags, VALUE ecopts);
882 
883 VALUE
884 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
885 {
886  long len;
887  const char *ptr;
888  VALUE newstr;
889 
890  if (!to) return str;
891  if (!from) from = rb_enc_get(str);
892  if (from == to) return str;
893  if ((rb_enc_asciicompat(to) && is_ascii_string(str)) ||
894  to == rb_ascii8bit_encoding()) {
895  if (STR_ENC_GET(str) != to) {
896  str = rb_str_dup(str);
897  rb_enc_associate(str, to);
898  }
899  return str;
900  }
901 
902  RSTRING_GETMEM(str, ptr, len);
903  newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
904  from, to, ecflags, ecopts);
905  if (NIL_P(newstr)) {
906  /* some error, return original */
907  return str;
908  }
909  OBJ_INFECT(newstr, str);
910  return newstr;
911 }
912 
913 VALUE
914 rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
915  rb_encoding *from, int ecflags, VALUE ecopts)
916 {
917  long olen;
918 
919  olen = RSTRING_LEN(newstr);
920  if (ofs < -olen || olen <= ofs)
921  rb_raise(rb_eIndexError, "index %ld out of string", ofs);
922  if (ofs < 0) ofs += olen;
923  if (!from) {
924  STR_SET_LEN(newstr, ofs);
925  return rb_str_cat(newstr, ptr, len);
926  }
927 
928  rb_str_modify(newstr);
929  return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
930  rb_enc_get(newstr),
931  ecflags, ecopts);
932 }
933 
934 static VALUE
935 str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
936  rb_encoding *from, rb_encoding *to,
937  int ecflags, VALUE ecopts)
938 {
939  rb_econv_t *ec;
940  rb_econv_result_t ret;
941  long olen;
942  VALUE econv_wrapper;
943  const unsigned char *start, *sp;
944  unsigned char *dest, *dp;
945  size_t converted_output = (size_t)ofs;
946 
947  olen = rb_str_capacity(newstr);
948 
949  econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
950  RBASIC_CLEAR_CLASS(econv_wrapper);
951  ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
952  if (!ec) return Qnil;
953  DATA_PTR(econv_wrapper) = ec;
954 
955  sp = (unsigned char*)ptr;
956  start = sp;
957  while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
958  (dp = dest + converted_output),
959  (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
961  /* destination buffer short */
962  size_t converted_input = sp - start;
963  size_t rest = len - converted_input;
964  converted_output = dp - dest;
965  rb_str_set_len(newstr, converted_output);
966  if (converted_input && converted_output &&
967  rest < (LONG_MAX / converted_output)) {
968  rest = (rest * converted_output) / converted_input;
969  }
970  else {
971  rest = olen;
972  }
973  olen += rest < 2 ? 2 : rest;
974  rb_str_resize(newstr, olen);
975  }
976  DATA_PTR(econv_wrapper) = 0;
977  rb_econv_close(ec);
978  rb_gc_force_recycle(econv_wrapper);
979  switch (ret) {
980  case econv_finished:
981  len = dp - (unsigned char*)RSTRING_PTR(newstr);
982  rb_str_set_len(newstr, len);
983  rb_enc_associate(newstr, to);
984  return newstr;
985 
986  default:
987  return Qnil;
988  }
989 }
990 
991 VALUE
993 {
994  return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
995 }
996 
997 VALUE
999 {
1000  VALUE str;
1001 
1002  str = rb_tainted_str_new_with_enc(ptr, len, eenc);
1003  return rb_external_str_with_enc(str, eenc);
1004 }
1005 
1006 VALUE
1008 {
1009  int eidx = rb_enc_to_index(eenc);
1010  if (eidx == rb_usascii_encindex() &&
1013  return str;
1014  }
1015  rb_enc_associate_index(str, eidx);
1016  return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1017 }
1018 
1019 VALUE
1020 rb_external_str_new(const char *ptr, long len)
1021 {
1023 }
1024 
1025 VALUE
1027 {
1029 }
1030 
1031 VALUE
1032 rb_locale_str_new(const char *ptr, long len)
1033 {
1035 }
1036 
1037 VALUE
1039 {
1041 }
1042 
1043 VALUE
1044 rb_filesystem_str_new(const char *ptr, long len)
1045 {
1047 }
1048 
1049 VALUE
1051 {
1053 }
1054 
1055 VALUE
1057 {
1059 }
1060 
1061 VALUE
1063 {
1064  return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
1065 }
1066 
1067 VALUE
1069 {
1070  return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1071 }
1072 
1073 static VALUE
1075 {
1076  const int termlen = TERM_LEN(str);
1077  char *ptr;
1078  long len;
1079 
1080  RSTRING_GETMEM(str, ptr, len);
1081  if (STR_EMBEDDABLE_P(len, termlen)) {
1082  char *ptr2 = RSTRING(str2)->as.ary;
1083  STR_SET_EMBED(str2);
1084  memcpy(ptr2, RSTRING_PTR(str), len);
1085  STR_SET_EMBED_LEN(str2, len);
1086  TERM_FILL(ptr2+len, termlen);
1087  }
1088  else {
1089  str = rb_str_new_frozen(str);
1090  FL_SET(str2, STR_NOEMBED);
1091  RSTRING_GETMEM(str, ptr, len);
1092  RSTRING(str2)->as.heap.len = len;
1093  RSTRING(str2)->as.heap.ptr = ptr;
1094  STR_SET_SHARED(str2, str);
1095  }
1096  return str2;
1097 }
1098 
1099 static VALUE
1101 {
1102  str_replace_shared_without_enc(str2, str);
1103  rb_enc_cr_str_exact_copy(str2, str);
1104  return str2;
1105 }
1106 
1107 static VALUE
1109 {
1110  return str_replace_shared(str_alloc(klass), str);
1111 }
1112 
1113 VALUE
1115 {
1116  VALUE str2 = str_new_shared(rb_obj_class(str), str);
1117 
1118  OBJ_INFECT(str2, str);
1119  return str2;
1120 }
1121 
1122 VALUE
1124 {
1125  VALUE str;
1126 
1127  if (OBJ_FROZEN(orig)) return orig;
1128 
1129  str = str_new_frozen(rb_obj_class(orig), orig);
1130  OBJ_INFECT(str, orig);
1131  return str;
1132 }
1133 
1134 VALUE
1136 {
1137  VALUE tmp;
1138 
1139  if (OBJ_FROZEN_RAW(orig)) return orig;
1140 
1141  tmp = str_new_frozen(0, orig);
1142  OBJ_INFECT(tmp, orig);
1143 
1144  return tmp;
1145 }
1146 
1147 void
1149 {
1150  if (RBASIC_CLASS(tmp) != 0)
1151  return;
1152 
1153  if (STR_EMBED_P(tmp)) {
1154  assert(OBJ_FROZEN_RAW(tmp));
1155  rb_gc_force_recycle(tmp);
1156  }
1157  else if (FL_TEST_RAW(orig, STR_SHARED) &&
1159  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1160 
1161  if (shared == tmp && !FL_TEST_RAW(tmp, STR_IS_SHARED_M)) {
1162  FL_UNSET_RAW(orig, STR_SHARED);
1163  assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1164  assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1165  RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1166  RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1167  assert(OBJ_FROZEN_RAW(tmp));
1168  rb_gc_force_recycle(tmp);
1169  }
1170  }
1171 }
1172 
1173 static VALUE
1175 {
1176  VALUE str;
1177 
1178  if (STR_EMBED_P(orig)) {
1179  str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
1180  }
1181  else {
1182  if (FL_TEST_RAW(orig, STR_SHARED)) {
1183  VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1184  long ofs = RSTRING(orig)->as.heap.ptr - RSTRING(shared)->as.heap.ptr;
1185  long rest = RSTRING(shared)->as.heap.len - ofs - RSTRING(orig)->as.heap.len;
1186  assert(!STR_EMBED_P(shared));
1187  assert(OBJ_FROZEN(shared));
1188 
1189  if ((ofs > 0) || (rest > 0) ||
1190  (klass != RBASIC(shared)->klass) ||
1191  ((RBASIC(shared)->flags ^ RBASIC(orig)->flags) & FL_TAINT) ||
1192  ENCODING_GET(shared) != ENCODING_GET(orig)) {
1193  str = str_new_shared(klass, shared);
1194  RSTRING(str)->as.heap.ptr += ofs;
1195  RSTRING(str)->as.heap.len -= ofs + rest;
1196  }
1197  else {
1198  if (RBASIC_CLASS(shared) == 0)
1199  FL_SET_RAW(shared, STR_IS_SHARED_M);
1200  return shared;
1201  }
1202  }
1203  else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1204  str = str_alloc(klass);
1205  STR_SET_EMBED(str);
1206  memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1207  STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1208  TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1209  }
1210  else {
1211  str = str_alloc(klass);
1212  STR_SET_NOEMBED(str);
1213  RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1214  RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1215  RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1216  RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1217  RBASIC(orig)->flags &= ~STR_NOFREE;
1218  STR_SET_SHARED(orig, str);
1219  if (klass == 0)
1221  }
1222  }
1223 
1224  rb_enc_cr_str_exact_copy(str, orig);
1225  OBJ_FREEZE(str);
1226  return str;
1227 }
1228 
1229 VALUE
1230 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1231 {
1232  return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1233 }
1234 
1235 static VALUE
1237 {
1238  VALUE v = rb_str_new_with_class(str, 0, 0);
1239  rb_enc_copy(v, str);
1240  OBJ_INFECT(v, str);
1241  return v;
1242 }
1243 
1244 #define STR_BUF_MIN_SIZE 127
1245 
1246 VALUE
1248 {
1249  VALUE str = str_alloc(rb_cString);
1250 
1251  if (capa < STR_BUF_MIN_SIZE) {
1252  capa = STR_BUF_MIN_SIZE;
1253  }
1254  FL_SET(str, STR_NOEMBED);
1255  RSTRING(str)->as.heap.aux.capa = capa;
1256  RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1257  RSTRING(str)->as.heap.ptr[0] = '\0';
1258 
1259  return str;
1260 }
1261 
1262 VALUE
1264 {
1265  VALUE str;
1266  long len = strlen(ptr);
1267 
1268  str = rb_str_buf_new(len);
1269  rb_str_buf_cat(str, ptr, len);
1270 
1271  return str;
1272 }
1273 
1274 VALUE
1276 {
1277  return str_new(0, 0, len);
1278 }
1279 
1280 void
1282 {
1283  if (FL_TEST(str, RSTRING_FSTR)) {
1284  st_data_t fstr = (st_data_t)str;
1285  st_delete(rb_vm_fstring_table(), &fstr, NULL);
1286  }
1287 
1288  if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1290  }
1291 }
1292 
1293 RUBY_FUNC_EXPORTED size_t
1295 {
1297  return STR_HEAP_SIZE(str);
1298  }
1299  else {
1300  return 0;
1301  }
1302 }
1303 
1304 VALUE
1306 {
1307  return rb_convert_type(str, T_STRING, "String", "to_str");
1308 }
1309 
1310 static inline void str_discard(VALUE str);
1311 static void str_shared_replace(VALUE str, VALUE str2);
1312 
1313 void
1315 {
1316  if (str != str2) str_shared_replace(str, str2);
1317 }
1318 
1319 static void
1321 {
1322  rb_encoding *enc;
1323  int cr;
1324  int termlen;
1325 
1326  ASSUME(str2 != str);
1327  enc = STR_ENC_GET(str2);
1328  cr = ENC_CODERANGE(str2);
1329  str_discard(str);
1330  OBJ_INFECT(str, str2);
1331  termlen = rb_enc_mbminlen(enc);
1332 
1333  if (STR_EMBEDDABLE_P(RSTRING_LEN(str2), termlen)) {
1334  STR_SET_EMBED(str);
1335  memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1336  STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1337  rb_enc_associate(str, enc);
1338  ENC_CODERANGE_SET(str, cr);
1339  }
1340  else {
1341  STR_SET_NOEMBED(str);
1342  FL_UNSET(str, STR_SHARED);
1343  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1344  RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1345 
1346  if (FL_TEST(str2, STR_SHARED)) {
1347  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1348  STR_SET_SHARED(str, shared);
1349  }
1350  else {
1351  RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1352  }
1353 
1354  /* abandon str2 */
1355  STR_SET_EMBED(str2);
1356  RSTRING_PTR(str2)[0] = 0;
1357  STR_SET_EMBED_LEN(str2, 0);
1358  rb_enc_associate(str, enc);
1359  ENC_CODERANGE_SET(str, cr);
1360  }
1361 }
1362 
1363 VALUE
1365 {
1366  VALUE str;
1367 
1368  if (RB_TYPE_P(obj, T_STRING)) {
1369  return obj;
1370  }
1371  str = rb_funcall(obj, idTo_s, 0);
1372  if (!RB_TYPE_P(str, T_STRING))
1373  return rb_any_to_s(obj);
1374  if (!FL_TEST_RAW(str, RSTRING_FSTR) && FL_ABLE(obj))
1375  /* fstring must not be tainted, at least */
1376  OBJ_INFECT_RAW(str, obj);
1377  return str;
1378 }
1379 
1380 static VALUE
1382 {
1383  long len;
1384 
1385  len = RSTRING_LEN(str2);
1386  if (STR_SHARED_P(str2)) {
1387  VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1388  assert(OBJ_FROZEN(shared));
1389  STR_SET_NOEMBED(str);
1390  RSTRING(str)->as.heap.len = len;
1391  RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1392  STR_SET_SHARED(str, shared);
1393  rb_enc_cr_str_exact_copy(str, str2);
1394  }
1395  else {
1396  str_replace_shared(str, str2);
1397  }
1398 
1399  OBJ_INFECT(str, str2);
1400  return str;
1401 }
1402 
1403 static inline VALUE
1405 {
1406  enum {embed_size = RSTRING_EMBED_LEN_MAX + 1};
1407  const VALUE flag_mask =
1411  ;
1412  VALUE flags = FL_TEST_RAW(str, flag_mask);
1413  VALUE dup = str_alloc(klass);
1414  MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1415  char, embed_size);
1416  if (flags & STR_NOEMBED) {
1417  if (UNLIKELY(!(flags & FL_FREEZE))) {
1418  str = str_new_frozen(klass, str);
1419  FL_SET_RAW(str, flags & FL_TAINT);
1420  flags = FL_TEST_RAW(str, flag_mask);
1421  }
1422  if (flags & STR_NOEMBED) {
1423  RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, str);
1424  flags |= STR_SHARED;
1425  }
1426  else {
1427  MEMCPY(RSTRING(dup)->as.ary, RSTRING(str)->as.ary,
1428  char, embed_size);
1429  }
1430  }
1431  FL_SET_RAW(dup, flags & ~FL_FREEZE);
1432  return dup;
1433 }
1434 
1435 VALUE
1437 {
1438  return str_duplicate(rb_obj_class(str), str);
1439 }
1440 
1441 VALUE
1443 {
1445  return str_duplicate(rb_cString, str);
1446 }
1447 
1448 /*
1449  * call-seq:
1450  * String.new(str="") -> new_str
1451  * String.new(str="", encoding: enc) -> new_str
1452  * String.new(str="", capacity: size) -> new_str
1453  *
1454  * Returns a new string object containing a copy of <i>str</i>.
1455  *
1456  * The optional <i>enc</i> argument specifies the encoding of the new string.
1457  * If not specified, the encoding of <i>str</i> (or ASCII-8BIT, if <i>str</i>
1458  * is not specified) is used.
1459  *
1460  * The optional <i>size</i> argument specifies the size of the internal buffer.
1461  * This may improve performance when the string will be appended to many
1462  * times (which would otherwise cause many reallocations).
1463  */
1464 
1465 static VALUE
1467 {
1468  static ID keyword_ids[2];
1469  VALUE orig, opt, venc, vcapa;
1470  VALUE kwargs[2];
1471  rb_encoding *enc = 0;
1472  int n;
1473 
1474  if (!keyword_ids[0]) {
1475  keyword_ids[0] = rb_id_encoding();
1476  CONST_ID(keyword_ids[1], "capacity");
1477  }
1478 
1479  n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1480  if (!NIL_P(opt)) {
1481  rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1482  venc = kwargs[0];
1483  vcapa = kwargs[1];
1484  if (venc != Qundef && !NIL_P(venc)) {
1485  enc = rb_to_encoding(venc);
1486  }
1487  if (vcapa != Qundef && !NIL_P(vcapa)) {
1488  long capa = NUM2LONG(vcapa);
1489  long len = 0;
1490  int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1491 
1492  if (capa < STR_BUF_MIN_SIZE) {
1493  capa = STR_BUF_MIN_SIZE;
1494  }
1495  if (n == 1) {
1496  StringValue(orig);
1497  len = RSTRING_LEN(orig);
1498  if (capa < len) {
1499  capa = len;
1500  }
1501  if (orig == str) n = 0;
1502  }
1503  str_modifiable(str);
1504  if (STR_EMBED_P(str)) { /* make noembed always */
1505  RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + termlen);
1506  }
1507  else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1508  REALLOC_N(RSTRING(str)->as.heap.ptr, char, (size_t)capa + termlen);
1509  }
1510  RSTRING(str)->as.heap.len = len;
1511  TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1512  if (n == 1) {
1513  memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1514  rb_enc_cr_str_exact_copy(str, orig);
1515  }
1516  FL_SET(str, STR_NOEMBED);
1517  RSTRING(str)->as.heap.aux.capa = capa;
1518  }
1519  else if (n == 1) {
1520  rb_str_replace(str, orig);
1521  }
1522  if (enc) {
1523  rb_enc_associate(str, enc);
1524  ENC_CODERANGE_CLEAR(str);
1525  }
1526  }
1527  else if (n == 1) {
1528  rb_str_replace(str, orig);
1529  }
1530  return str;
1531 }
1532 
1533 #ifdef NONASCII_MASK
1534 #define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1535 
1536 /*
1537  * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1538  * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
1539  * Therefore, the following pseudocode can detect UTF-8 leading bytes.
1540  *
1541  * if (!(byte & 0x80))
1542  * byte |= 0x40; // turn on bit6
1543  * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
1544  *
1545  * This function calculates whether a byte is leading or not for all bytes
1546  * in the argument word by concurrently using the above logic, and then
1547  * adds up the number of leading bytes in the word.
1548  */
1549 static inline uintptr_t
1550 count_utf8_lead_bytes_with_word(const uintptr_t *s)
1551 {
1552  uintptr_t d = *s;
1553 
1554  /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1555  d = (d>>6) | (~d>>7);
1556  d &= NONASCII_MASK >> 7;
1557 
1558  /* Gather all bytes. */
1559 #if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1560  /* use only if it can use POPCNT */
1561  return rb_popcount_intptr(d);
1562 #else
1563  d += (d>>8);
1564  d += (d>>16);
1565 # if SIZEOF_VOIDP == 8
1566  d += (d>>32);
1567 # endif
1568  return (d&0xF);
1569 #endif
1570 }
1571 #endif
1572 
1573 static inline long
1574 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1575 {
1576  long c;
1577  const char *q;
1578 
1579  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1580  long diff = (long)(e - p);
1581  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1582  }
1583 #ifdef NONASCII_MASK
1584  else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1585  uintptr_t len = 0;
1586  if ((int)sizeof(uintptr_t) * 2 < e - p) {
1587  const uintptr_t *s, *t;
1588  const uintptr_t lowbits = sizeof(uintptr_t) - 1;
1589  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
1590  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
1591  while (p < (const char *)s) {
1592  if (is_utf8_lead_byte(*p)) len++;
1593  p++;
1594  }
1595  while (s < t) {
1596  len += count_utf8_lead_bytes_with_word(s);
1597  s++;
1598  }
1599  p = (const char *)s;
1600  }
1601  while (p < e) {
1602  if (is_utf8_lead_byte(*p)) len++;
1603  p++;
1604  }
1605  return (long)len;
1606  }
1607 #endif
1608  else if (rb_enc_asciicompat(enc)) {
1609  c = 0;
1610  if (ENC_CODERANGE_CLEAN_P(cr)) {
1611  while (p < e) {
1612  if (ISASCII(*p)) {
1613  q = search_nonascii(p, e);
1614  if (!q)
1615  return c + (e - p);
1616  c += q - p;
1617  p = q;
1618  }
1619  p += rb_enc_fast_mbclen(p, e, enc);
1620  c++;
1621  }
1622  }
1623  else {
1624  while (p < e) {
1625  if (ISASCII(*p)) {
1626  q = search_nonascii(p, e);
1627  if (!q)
1628  return c + (e - p);
1629  c += q - p;
1630  p = q;
1631  }
1632  p += rb_enc_mbclen(p, e, enc);
1633  c++;
1634  }
1635  }
1636  return c;
1637  }
1638 
1639  for (c=0; p<e; c++) {
1640  p += rb_enc_mbclen(p, e, enc);
1641  }
1642  return c;
1643 }
1644 
1645 long
1646 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
1647 {
1648  return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
1649 }
1650 
1651 /* To get strlen with cr
1652  * Note that given cr is not used.
1653  */
1654 long
1655 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
1656 {
1657  long c;
1658  const char *q;
1659  int ret;
1660 
1661  *cr = 0;
1662  if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1663  long diff = (long)(e - p);
1664  return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1665  }
1666  else if (rb_enc_asciicompat(enc)) {
1667  c = 0;
1668  while (p < e) {
1669  if (ISASCII(*p)) {
1670  q = search_nonascii(p, e);
1671  if (!q) {
1672  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1673  return c + (e - p);
1674  }
1675  c += q - p;
1676  p = q;
1677  }
1678  ret = rb_enc_precise_mbclen(p, e, enc);
1679  if (MBCLEN_CHARFOUND_P(ret)) {
1680  *cr |= ENC_CODERANGE_VALID;
1681  p += MBCLEN_CHARFOUND_LEN(ret);
1682  }
1683  else {
1684  *cr = ENC_CODERANGE_BROKEN;
1685  p++;
1686  }
1687  c++;
1688  }
1689  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1690  return c;
1691  }
1692 
1693  for (c=0; p<e; c++) {
1694  ret = rb_enc_precise_mbclen(p, e, enc);
1695  if (MBCLEN_CHARFOUND_P(ret)) {
1696  *cr |= ENC_CODERANGE_VALID;
1697  p += MBCLEN_CHARFOUND_LEN(ret);
1698  }
1699  else {
1700  *cr = ENC_CODERANGE_BROKEN;
1701  if (p + rb_enc_mbminlen(enc) <= e)
1702  p += rb_enc_mbminlen(enc);
1703  else
1704  p = e;
1705  }
1706  }
1707  if (!*cr) *cr = ENC_CODERANGE_7BIT;
1708  return c;
1709 }
1710 
1711 /* enc must be str's enc or rb_enc_check(str, str2) */
1712 static long
1714 {
1715  const char *p, *e;
1716  int cr;
1717 
1718  if (single_byte_optimizable(str)) return RSTRING_LEN(str);
1719  if (!enc) enc = STR_ENC_GET(str);
1720  p = RSTRING_PTR(str);
1721  e = RSTRING_END(str);
1722  cr = ENC_CODERANGE(str);
1723 
1724  if (cr == ENC_CODERANGE_UNKNOWN) {
1725  long n = rb_enc_strlen_cr(p, e, enc, &cr);
1726  if (cr) ENC_CODERANGE_SET(str, cr);
1727  return n;
1728  }
1729  else {
1730  return enc_strlen(p, e, enc, cr);
1731  }
1732 }
1733 
1734 long
1736 {
1737  return str_strlen(str, NULL);
1738 }
1739 
1740 /*
1741  * call-seq:
1742  * str.length -> integer
1743  * str.size -> integer
1744  *
1745  * Returns the character length of <i>str</i>.
1746  */
1747 
1748 VALUE
1750 {
1751  return LONG2NUM(str_strlen(str, NULL));
1752 }
1753 
1754 /*
1755  * call-seq:
1756  * str.bytesize -> integer
1757  *
1758  * Returns the length of +str+ in bytes.
1759  *
1760  * "\x80\u3042".bytesize #=> 4
1761  * "hello".bytesize #=> 5
1762  */
1763 
1764 static VALUE
1766 {
1767  return LONG2NUM(RSTRING_LEN(str));
1768 }
1769 
1770 /*
1771  * call-seq:
1772  * str.empty? -> true or false
1773  *
1774  * Returns <code>true</code> if <i>str</i> has a length of zero.
1775  *
1776  * "hello".empty? #=> false
1777  * " ".empty? #=> false
1778  * "".empty? #=> true
1779  */
1780 
1781 static VALUE
1783 {
1784  if (RSTRING_LEN(str) == 0)
1785  return Qtrue;
1786  return Qfalse;
1787 }
1788 
1789 /*
1790  * call-seq:
1791  * str + other_str -> new_str
1792  *
1793  * Concatenation---Returns a new <code>String</code> containing
1794  * <i>other_str</i> concatenated to <i>str</i>.
1795  *
1796  * "Hello from " + self.to_s #=> "Hello from main"
1797  */
1798 
1799 VALUE
1801 {
1802  VALUE str3;
1803  rb_encoding *enc;
1804  char *ptr1, *ptr2, *ptr3;
1805  long len1, len2;
1806  int termlen;
1807 
1808  StringValue(str2);
1809  enc = rb_enc_check_str(str1, str2);
1810  RSTRING_GETMEM(str1, ptr1, len1);
1811  RSTRING_GETMEM(str2, ptr2, len2);
1812  termlen = rb_enc_mbminlen(enc);
1813  if (len1 > LONG_MAX - len2) {
1814  rb_raise(rb_eArgError, "string size too big");
1815  }
1816  str3 = str_new0(rb_cString, 0, len1+len2, termlen);
1817  ptr3 = RSTRING_PTR(str3);
1818  memcpy(ptr3, ptr1, len1);
1819  memcpy(ptr3+len1, ptr2, len2);
1820  TERM_FILL(&ptr3[len1+len2], termlen);
1821 
1822  FL_SET_RAW(str3, OBJ_TAINTED_RAW(str1) | OBJ_TAINTED_RAW(str2));
1825  RB_GC_GUARD(str1);
1826  RB_GC_GUARD(str2);
1827  return str3;
1828 }
1829 
1830 /*
1831  * call-seq:
1832  * str * integer -> new_str
1833  *
1834  * Copy --- Returns a new String containing +integer+ copies of the receiver.
1835  * +integer+ must be greater than or equal to 0.
1836  *
1837  * "Ho! " * 3 #=> "Ho! Ho! Ho! "
1838  * "Ho! " * 0 #=> ""
1839  */
1840 
1841 VALUE
1843 {
1844  VALUE str2;
1845  long n, len;
1846  char *ptr2;
1847  int termlen;
1848 
1849  if (times == INT2FIX(1)) {
1850  return rb_str_dup(str);
1851  }
1852  if (times == INT2FIX(0)) {
1853  str2 = str_alloc(rb_obj_class(str));
1854  rb_enc_copy(str2, str);
1855  OBJ_INFECT(str2, str);
1856  return str2;
1857  }
1858  len = NUM2LONG(times);
1859  if (len < 0) {
1860  rb_raise(rb_eArgError, "negative argument");
1861  }
1862  if (len && LONG_MAX/len < RSTRING_LEN(str)) {
1863  rb_raise(rb_eArgError, "argument too big");
1864  }
1865 
1866  len *= RSTRING_LEN(str);
1867  termlen = TERM_LEN(str);
1868  str2 = str_new0(rb_obj_class(str), 0, len, termlen);
1869  ptr2 = RSTRING_PTR(str2);
1870  if (len) {
1871  n = RSTRING_LEN(str);
1872  memcpy(ptr2, RSTRING_PTR(str), n);
1873  while (n <= len/2) {
1874  memcpy(ptr2 + n, ptr2, n);
1875  n *= 2;
1876  }
1877  memcpy(ptr2 + n, ptr2, len-n);
1878  }
1879  STR_SET_LEN(str2, len);
1880  TERM_FILL(&ptr2[len], termlen);
1881  OBJ_INFECT(str2, str);
1882  rb_enc_cr_str_copy_for_substr(str2, str);
1883 
1884  return str2;
1885 }
1886 
1887 /*
1888  * call-seq:
1889  * str % arg -> new_str
1890  *
1891  * Format---Uses <i>str</i> as a format specification, and returns the result
1892  * of applying it to <i>arg</i>. If the format specification contains more than
1893  * one substitution, then <i>arg</i> must be an <code>Array</code> or <code>Hash</code>
1894  * containing the values to be substituted. See <code>Kernel::sprintf</code> for
1895  * details of the format string.
1896  *
1897  * "%05d" % 123 #=> "00123"
1898  * "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6"
1899  * "foo = %{foo}" % { :foo => 'bar' } #=> "foo = bar"
1900  */
1901 
1902 static VALUE
1904 {
1905  VALUE tmp = rb_check_array_type(arg);
1906 
1907  if (!NIL_P(tmp)) {
1908  VALUE rv = rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
1909  RB_GC_GUARD(tmp);
1910  return rv;
1911  }
1912  return rb_str_format(1, &arg, str);
1913 }
1914 
1915 static inline void
1917 {
1918  if (FL_TEST(str, STR_TMPLOCK)) {
1919  rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
1920  }
1921 }
1922 
1923 static inline void
1925 {
1926  rb_check_lockedtmp(str);
1927  rb_check_frozen(str);
1928 }
1929 
1930 static inline int
1932 {
1933  if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1934  return 0;
1935  }
1936  else {
1937  return 1;
1938  }
1939 }
1940 
1941 static inline int
1943 {
1944  str_modifiable(str);
1945  return !str_dependent_p(str);
1946 }
1947 
1948 static void
1949 str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
1950 {
1951  char *ptr;
1952  const char *oldptr;
1953  long capa = len + expand;
1954 
1955  if (len > capa) len = capa;
1956 
1957  if (!STR_EMBED_P(str) && STR_EMBEDDABLE_P(capa, termlen)) {
1958  ptr = RSTRING(str)->as.heap.ptr;
1959  STR_SET_EMBED(str);
1960  memcpy(RSTRING(str)->as.ary, ptr, len);
1961  TERM_FILL(RSTRING(str)->as.ary + len, termlen);
1962  STR_SET_EMBED_LEN(str, len);
1963  return;
1964  }
1965 
1966  ptr = ALLOC_N(char, (size_t)capa + termlen);
1967  oldptr = RSTRING_PTR(str);
1968  if (oldptr) {
1969  memcpy(ptr, oldptr, len);
1970  }
1971  STR_SET_NOEMBED(str);
1973  TERM_FILL(ptr + len, termlen);
1974  RSTRING(str)->as.heap.ptr = ptr;
1975  RSTRING(str)->as.heap.len = len;
1976  RSTRING(str)->as.heap.aux.capa = capa;
1977 }
1978 
1979 void
1981 {
1982  if (!str_independent(str))
1983  str_make_independent(str);
1984  ENC_CODERANGE_CLEAR(str);
1985 }
1986 
1987 void
1988 rb_str_modify_expand(VALUE str, long expand)
1989 {
1990  int termlen = TERM_LEN(str);
1991  long len = RSTRING_LEN(str);
1992 
1993  if (expand < 0) {
1994  rb_raise(rb_eArgError, "negative expanding string size");
1995  }
1996  if (expand > LONG_MAX - len) {
1997  rb_raise(rb_eArgError, "string size too big");
1998  }
1999 
2000  if (!str_independent(str)) {
2001  str_make_independent_expand(str, len, expand, termlen);
2002  }
2003  else if (expand > 0) {
2004  RESIZE_CAPA_TERM(str, len + expand, termlen);
2005  }
2006  ENC_CODERANGE_CLEAR(str);
2007 }
2008 
2009 /* As rb_str_modify(), but don't clear coderange */
2010 static void
2012 {
2013  if (!str_independent(str))
2014  str_make_independent(str);
2015  if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
2016  /* Force re-scan later */
2017  ENC_CODERANGE_CLEAR(str);
2018 }
2019 
2020 static inline void
2022 {
2023  str_modifiable(str);
2024  if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2026  RSTRING(str)->as.heap.ptr = 0;
2027  RSTRING(str)->as.heap.len = 0;
2028  }
2029 }
2030 
2031 void
2033 {
2034  rb_encoding *enc = rb_enc_get(str);
2035  if (!rb_enc_asciicompat(enc)) {
2036  rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2037  }
2038 }
2039 
2040 VALUE
2042 {
2043  VALUE s = *ptr;
2044  if (!RB_TYPE_P(s, T_STRING)) {
2045  s = rb_str_to_str(s);
2046  *ptr = s;
2047  }
2048  return s;
2049 }
2050 
2051 char *
2053 {
2054  VALUE str = rb_string_value(ptr);
2055  return RSTRING_PTR(str);
2056 }
2057 
/*
 * Returns 1 when the first n bytes at s are all zero (trivially so for
 * n <= 0), and 0 as soon as a non-zero byte is found.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != 0) return 0;
    }
    return 1;
}
2066 
2067 static const char *
2068 str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2069 {
2070  const char *e = s + len;
2071 
2072  for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2073  if (zero_filled(s, minlen)) return s;
2074  }
2075  return 0;
2076 }
2077 
2078 static char *
2079 str_fill_term(VALUE str, char *s, long len, int termlen)
2080 {
2081  long capa = str_capacity(str, termlen);
2082 
2083  /* This function assumes that (capa + termlen) bytes of memory
2084  * is allocated, like many other functions in this file.
2085  */
2086 
2087  if (capa < len) {
2088  rb_check_lockedtmp(str);
2089  str_make_independent_expand(str, len, 0L, termlen);
2090  }
2091  else if (str_dependent_p(str)) {
2092  if (!zero_filled(s + len, termlen))
2093  str_make_independent_expand(str, len, 0L, termlen);
2094  }
2095  else {
2096  TERM_FILL(s + len, termlen);
2097  return s;
2098  }
2099  return RSTRING_PTR(str);
2100 }
2101 
2102 void
2103 rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2104 {
2105  long capa = str_capacity(str, oldtermlen) + oldtermlen;
2106  long len = RSTRING_LEN(str);
2107 
2108  assert(capa >= len);
2109  if (capa - len < termlen) {
2110  rb_check_lockedtmp(str);
2111  str_make_independent_expand(str, len, 0L, termlen);
2112  }
2113  else if (str_dependent_p(str)) {
2114  if (termlen > oldtermlen)
2115  str_make_independent_expand(str, len, 0L, termlen);
2116  }
2117  else {
2118  if (!STR_EMBED_P(str)) {
2119  /* modify capa instead of realloc */
2120  assert(!FL_TEST((str), STR_SHARED));
2121  RSTRING(str)->as.heap.aux.capa = capa - termlen;
2122  }
2123  if (termlen > oldtermlen) {
2124  TERM_FILL(RSTRING_PTR(str) + len, termlen);
2125  }
2126  }
2127 
2128  return;
2129 }
2130 
2131 char *
2133 {
2134  VALUE str = rb_string_value(ptr);
2135  char *s = RSTRING_PTR(str);
2136  long len = RSTRING_LEN(str);
2137  rb_encoding *enc = rb_enc_get(str);
2138  const int minlen = rb_enc_mbminlen(enc);
2139 
2140  if (minlen > 1) {
2141  if (str_null_char(s, len, minlen, enc)) {
2142  rb_raise(rb_eArgError, "string contains null char");
2143  }
2144  return str_fill_term(str, s, len, minlen);
2145  }
2146  if (!s || memchr(s, 0, len)) {
2147  rb_raise(rb_eArgError, "string contains null byte");
2148  }
2149  if (s[len]) {
2150  s = str_fill_term(str, s, len, minlen);
2151  }
2152  return s;
2153 }
2154 
2155 char *
2156 rb_str_fill_terminator(VALUE str, const int newminlen)
2157 {
2158  char *s = RSTRING_PTR(str);
2159  long len = RSTRING_LEN(str);
2160  return str_fill_term(str, s, len, newminlen);
2161 }
2162 
2163 VALUE
2165 {
2166  str = rb_check_convert_type(str, T_STRING, "String", "to_str");
2167  return str;
2168 }
2169 
2170 /*
2171  * call-seq:
2172  * String.try_convert(obj) -> string or nil
2173  *
2174  * Try to convert <i>obj</i> into a String, using to_str method.
2175  * Returns converted string or nil if <i>obj</i> cannot be converted
2176  * for any reason.
2177  *
2178  * String.try_convert("str") #=> "str"
2179  * String.try_convert(/re/) #=> nil
2180  */
2181 static VALUE
2183 {
2184  return rb_check_string_type(str);
2185 }
2186 
2187 static char*
2188 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2189 {
2190  long nth = *nthp;
2191  if (rb_enc_mbmaxlen(enc) == 1) {
2192  p += nth;
2193  }
2194  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2195  p += nth * rb_enc_mbmaxlen(enc);
2196  }
2197  else if (rb_enc_asciicompat(enc)) {
2198  const char *p2, *e2;
2199  int n;
2200 
2201  while (p < e && 0 < nth) {
2202  e2 = p + nth;
2203  if (e < e2) {
2204  *nthp = nth;
2205  return (char *)e;
2206  }
2207  if (ISASCII(*p)) {
2208  p2 = search_nonascii(p, e2);
2209  if (!p2) {
2210  nth -= e2 - p;
2211  *nthp = nth;
2212  return (char *)e2;
2213  }
2214  nth -= p2 - p;
2215  p = p2;
2216  }
2217  n = rb_enc_mbclen(p, e, enc);
2218  p += n;
2219  nth--;
2220  }
2221  *nthp = nth;
2222  if (nth != 0) {
2223  return (char *)e;
2224  }
2225  return (char *)p;
2226  }
2227  else {
2228  while (p < e && nth--) {
2229  p += rb_enc_mbclen(p, e, enc);
2230  }
2231  }
2232  if (p > e) p = e;
2233  *nthp = nth;
2234  return (char*)p;
2235 }
2236 
2237 char*
2238 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2239 {
2240  return str_nth_len(p, e, &nth, enc);
2241 }
2242 
2243 static char*
2244 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2245 {
2246  if (singlebyte)
2247  p += nth;
2248  else {
2249  p = str_nth_len(p, e, &nth, enc);
2250  }
2251  if (!p) return 0;
2252  if (p > e) p = e;
2253  return (char *)p;
2254 }
2255 
2256 /* char offset to byte offset */
2257 static long
2258 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2259 {
2260  const char *pp = str_nth(p, e, nth, enc, singlebyte);
2261  if (!pp) return e - p;
2262  return pp - p;
2263 }
2264 
2265 long
2266 rb_str_offset(VALUE str, long pos)
2267 {
2268  return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2270 }
2271 
2272 #ifdef NONASCII_MASK
2273 static char *
2274 str_utf8_nth(const char *p, const char *e, long *nthp)
2275 {
2276  long nth = *nthp;
2277  if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2278  const uintptr_t *s, *t;
2279  const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2280  s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2281  t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2282  while (p < (const char *)s) {
2283  if (is_utf8_lead_byte(*p)) nth--;
2284  p++;
2285  }
2286  do {
2287  nth -= count_utf8_lead_bytes_with_word(s);
2288  s++;
2289  } while (s < t && (int)SIZEOF_VOIDP <= nth);
2290  p = (char *)s;
2291  }
2292  while (p < e) {
2293  if (is_utf8_lead_byte(*p)) {
2294  if (nth == 0) break;
2295  nth--;
2296  }
2297  p++;
2298  }
2299  *nthp = nth;
2300  return (char *)p;
2301 }
2302 
2303 static long
2304 str_utf8_offset(const char *p, const char *e, long nth)
2305 {
2306  const char *pp = str_utf8_nth(p, e, &nth);
2307  return pp - p;
2308 }
2309 #endif
2310 
2311 /* byte offset to char offset */
2312 long
2313 rb_str_sublen(VALUE str, long pos)
2314 {
2315  if (single_byte_optimizable(str) || pos < 0)
2316  return pos;
2317  else {
2318  char *p = RSTRING_PTR(str);
2319  return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2320  }
2321 }
2322 
2323 VALUE
2324 rb_str_subseq(VALUE str, long beg, long len)
2325 {
2326  VALUE str2;
2327 
2328  if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2329  SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2330  long olen;
2331  str2 = rb_str_new_shared(rb_str_new_frozen(str));
2332  RSTRING(str2)->as.heap.ptr += beg;
2333  olen = RSTRING(str2)->as.heap.len;
2334  if (olen > len) RSTRING(str2)->as.heap.len = len;
2335  }
2336  else {
2337  str2 = rb_str_new_with_class(str, RSTRING_PTR(str)+beg, len);
2338  RB_GC_GUARD(str);
2339  }
2340 
2341  rb_enc_cr_str_copy_for_substr(str2, str);
2342  OBJ_INFECT(str2, str);
2343 
2344  return str2;
2345 }
2346 
2347 char *
2348 rb_str_subpos(VALUE str, long beg, long *lenp)
2349 {
2350  long len = *lenp;
2351  long slen = -1L;
2352  long blen = RSTRING_LEN(str);
2353  rb_encoding *enc = STR_ENC_GET(str);
2354  char *p, *s = RSTRING_PTR(str), *e = s + blen;
2355 
2356  if (len < 0) return 0;
2357  if (!blen) {
2358  len = 0;
2359  }
2360  if (single_byte_optimizable(str)) {
2361  if (beg > blen) return 0;
2362  if (beg < 0) {
2363  beg += blen;
2364  if (beg < 0) return 0;
2365  }
2366  if (len > blen - beg)
2367  len = blen - beg;
2368  if (len < 0) return 0;
2369  p = s + beg;
2370  goto end;
2371  }
2372  if (beg < 0) {
2373  if (len > -beg) len = -beg;
2374  if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2375  beg = -beg;
2376  while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2377  p = e;
2378  if (!p) return 0;
2379  while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2380  if (!p) return 0;
2381  len = e - p;
2382  goto end;
2383  }
2384  else {
2385  slen = str_strlen(str, enc);
2386  beg += slen;
2387  if (beg < 0) return 0;
2388  p = s + beg;
2389  if (len == 0) goto end;
2390  }
2391  }
2392  else if (beg > 0 && beg > RSTRING_LEN(str)) {
2393  return 0;
2394  }
2395  if (len == 0) {
2396  if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2397  p = s + beg;
2398  }
2399 #ifdef NONASCII_MASK
2400  else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2401  enc == rb_utf8_encoding()) {
2402  p = str_utf8_nth(s, e, &beg);
2403  if (beg > 0) return 0;
2404  len = str_utf8_offset(p, e, len);
2405  }
2406 #endif
2407  else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2408  int char_sz = rb_enc_mbmaxlen(enc);
2409 
2410  p = s + beg * char_sz;
2411  if (p > e) {
2412  return 0;
2413  }
2414  else if (len * char_sz > e - p)
2415  len = e - p;
2416  else
2417  len *= char_sz;
2418  }
2419  else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2420  if (beg > 0) return 0;
2421  len = 0;
2422  }
2423  else {
2424  len = str_offset(p, e, len, enc, 0);
2425  }
2426  end:
2427  *lenp = len;
2428  RB_GC_GUARD(str);
2429  return p;
2430 }
2431 
2432 static VALUE str_substr(VALUE str, long beg, long len, int empty);
2433 
2434 VALUE
2435 rb_str_substr(VALUE str, long beg, long len)
2436 {
2437  return str_substr(str, beg, len, TRUE);
2438 }
2439 
2440 static VALUE
2441 str_substr(VALUE str, long beg, long len, int empty)
2442 {
2443  VALUE str2;
2444  char *p = rb_str_subpos(str, beg, &len);
2445 
2446  if (!p) return Qnil;
2447  if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) &&
2448  SHARABLE_SUBSTRING_P(p, len, RSTRING_END(str))) {
2449  long ofs = p - RSTRING_PTR(str);
2450  str2 = rb_str_new_frozen(str);
2451  str2 = str_new_shared(rb_obj_class(str2), str2);
2452  RSTRING(str2)->as.heap.ptr += ofs;
2453  RSTRING(str2)->as.heap.len = len;
2454  }
2455  else {
2456  if (!len && !empty) return Qnil;
2457  str2 = rb_str_new_with_class(str, p, len);
2458  OBJ_INFECT(str2, str);
2459  RB_GC_GUARD(str);
2460  }
2461  rb_enc_cr_str_copy_for_substr(str2, str);
2462 
2463  return str2;
2464 }
2465 
2466 VALUE
2468 {
2469  if (OBJ_FROZEN(str)) return str;
2470  rb_str_resize(str, RSTRING_LEN(str));
2471  return rb_obj_freeze(str);
2472 }
2473 
2474 
2475 /*
2476  * call-seq:
2477  * +str -> str (mutable)
2478  *
2479  * If the string is frozen, then return duplicated mutable string.
2480  *
2481  * If the string is not frozen, then return the string itself.
2482  */
2483 static VALUE
2485 {
2486  if (OBJ_FROZEN(str)) {
2487  return rb_str_dup(str);
2488  }
2489  else {
2490  return str;
2491  }
2492 }
2493 
2494 /*
2495  * call-seq:
2496  * -str -> str (frozen)
2497  *
2498  * If the string is frozen, then return the string itself.
2499  *
2500  * If the string is not frozen, then duplicate the string
2501  * freeze it and return it.
2502  */
2503 static VALUE
2505 {
2506  if (OBJ_FROZEN(str)) {
2507  return str;
2508  }
2509  else {
2510  return rb_str_freeze(rb_str_dup(str));
2511  }
2512 }
2513 
2515 #define rb_str_dup_frozen rb_str_new_frozen
2516 
2517 VALUE
2518 rb_str_locktmp(VALUE str)
2519 {
2520  if (FL_TEST(str, STR_TMPLOCK)) {
2521  rb_raise(rb_eRuntimeError, "temporal locking already locked string");
2522  }
2523  FL_SET(str, STR_TMPLOCK);
2524  return str;
2525 }
2526 
2527 VALUE
2529 {
2530  if (!FL_TEST(str, STR_TMPLOCK)) {
2531  rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
2532  }
2533  FL_UNSET(str, STR_TMPLOCK);
2534  return str;
2535 }
2536 
2539 {
2540  rb_str_locktmp(str);
2541  return rb_ensure(func, arg, rb_str_unlocktmp, str);
2542 }
2543 
2544 void
2546 {
2547  long capa;
2548  const int termlen = TERM_LEN(str);
2549 
2550  str_modifiable(str);
2551  if (STR_SHARED_P(str)) {
2552  rb_raise(rb_eRuntimeError, "can't set length of shared string");
2553  }
2554  if (len > (capa = (long)str_capacity(str, termlen))) {
2555  rb_bug("probable buffer overflow: %ld for %ld", len, capa);
2556  }
2557  STR_SET_LEN(str, len);
2558  TERM_FILL(&RSTRING_PTR(str)[len], termlen);
2559 }
2560 
2561 VALUE
2563 {
2564  long slen;
2565  int independent;
2566 
2567  if (len < 0) {
2568  rb_raise(rb_eArgError, "negative string size (or size too big)");
2569  }
2570 
2571  independent = str_independent(str);
2572  ENC_CODERANGE_CLEAR(str);
2573  slen = RSTRING_LEN(str);
2574 
2575  {
2576  long capa;
2577  const int termlen = TERM_LEN(str);
2578  if (STR_EMBED_P(str)) {
2579  if (len == slen) return str;
2580  if (STR_EMBEDDABLE_P(len, termlen)) {
2581  STR_SET_EMBED_LEN(str, len);
2582  TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2583  return str;
2584  }
2585  str_make_independent_expand(str, slen, len - slen, termlen);
2586  }
2587  else if (STR_EMBEDDABLE_P(len, termlen)) {
2588  char *ptr = STR_HEAP_PTR(str);
2589  STR_SET_EMBED(str);
2590  if (slen > len) slen = len;
2591  if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
2592  TERM_FILL(RSTRING(str)->as.ary + len, termlen);
2593  STR_SET_EMBED_LEN(str, len);
2594  if (independent) ruby_xfree(ptr);
2595  return str;
2596  }
2597  else if (!independent) {
2598  if (len == slen) return str;
2599  str_make_independent_expand(str, slen, len - slen, termlen);
2600  }
2601  else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
2602  (capa - len) > (len < 1024 ? len : 1024)) {
2603  REALLOC_N(RSTRING(str)->as.heap.ptr, char, (size_t)len + termlen);
2604  RSTRING(str)->as.heap.aux.capa = len;
2605  }
2606  else if (len == slen) return str;
2607  RSTRING(str)->as.heap.len = len;
2608  TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
2609  }
2610  return str;
2611 }
2612 
2613 static VALUE
2614 str_buf_cat(VALUE str, const char *ptr, long len)
2615 {
2616  long capa, total, olen, off = -1;
2617  char *sptr;
2618  const int termlen = TERM_LEN(str);
2619  assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
2620 
2621  RSTRING_GETMEM(str, sptr, olen);
2622  if (ptr >= sptr && ptr <= sptr + olen) {
2623  off = ptr - sptr;
2624  }
2625  rb_str_modify(str);
2626  if (len == 0) return 0;
2627  if (STR_EMBED_P(str)) {
2628  capa = RSTRING_EMBED_LEN_MAX + 1 - termlen;
2629  sptr = RSTRING(str)->as.ary;
2630  olen = RSTRING_EMBED_LEN(str);
2631  }
2632  else {
2633  capa = RSTRING(str)->as.heap.aux.capa;
2634  sptr = RSTRING(str)->as.heap.ptr;
2635  olen = RSTRING(str)->as.heap.len;
2636  }
2637  if (olen > LONG_MAX - len) {
2638  rb_raise(rb_eArgError, "string sizes too big");
2639  }
2640  total = olen + len;
2641  if (capa < total) {
2642  if (total >= LONG_MAX / 2) {
2643  capa = total;
2644  }
2645  while (total > capa) {
2646  capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
2647  }
2648  RESIZE_CAPA_TERM(str, capa, termlen);
2649  sptr = RSTRING_PTR(str);
2650  }
2651  if (off != -1) {
2652  ptr = sptr + off;
2653  }
2654  memcpy(sptr + olen, ptr, len);
2655  STR_SET_LEN(str, total);
2656  TERM_FILL(sptr + total, termlen); /* sentinel */
2657 
2658  return str;
2659 }
2660 
2661 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
2662 
2663 VALUE
2664 rb_str_cat(VALUE str, const char *ptr, long len)
2665 {
2666  if (len == 0) return str;
2667  if (len < 0) {
2668  rb_raise(rb_eArgError, "negative string size (or size too big)");
2669  }
2670  return str_buf_cat(str, ptr, len);
2671 }
2672 
2673 VALUE
2674 rb_str_cat_cstr(VALUE str, const char *ptr)
2675 {
2676  must_not_null(ptr);
2677  return rb_str_buf_cat(str, ptr, strlen(ptr));
2678 }
2679 
2680 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
2681 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
2682 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
2683 
2684 static VALUE
2685 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
2686  int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
2687 {
2688  int str_encindex = ENCODING_GET(str);
2689  int res_encindex;
2690  int str_cr, res_cr;
2691  rb_encoding *str_enc, *ptr_enc;
2692 
2693  str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
2694 
2695  if (str_encindex == ptr_encindex) {
2696  if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
2697  ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
2698  }
2699  }
2700  else {
2701  str_enc = rb_enc_from_index(str_encindex);
2702  ptr_enc = rb_enc_from_index(ptr_encindex);
2703  if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
2704  if (len == 0)
2705  return str;
2706  if (RSTRING_LEN(str) == 0) {
2707  rb_str_buf_cat(str, ptr, len);
2708  ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
2709  return str;
2710  }
2711  goto incompatible;
2712  }
2713  if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
2714  ptr_cr = coderange_scan(ptr, len, ptr_enc);
2715  }
2716  if (str_cr == ENC_CODERANGE_UNKNOWN) {
2717  if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
2718  str_cr = rb_enc_str_coderange(str);
2719  }
2720  }
2721  }
2722  if (ptr_cr_ret)
2723  *ptr_cr_ret = ptr_cr;
2724 
2725  if (str_encindex != ptr_encindex &&
2726  str_cr != ENC_CODERANGE_7BIT &&
2727  ptr_cr != ENC_CODERANGE_7BIT) {
2728  str_enc = rb_enc_from_index(str_encindex);
2729  ptr_enc = rb_enc_from_index(ptr_encindex);
2730  incompatible:
2731  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
2732  rb_enc_name(str_enc), rb_enc_name(ptr_enc));
2733  }
2734 
2735  if (str_cr == ENC_CODERANGE_UNKNOWN) {
2736  res_encindex = str_encindex;
2737  res_cr = ENC_CODERANGE_UNKNOWN;
2738  }
2739  else if (str_cr == ENC_CODERANGE_7BIT) {
2740  if (ptr_cr == ENC_CODERANGE_7BIT) {
2741  res_encindex = str_encindex;
2742  res_cr = ENC_CODERANGE_7BIT;
2743  }
2744  else {
2745  res_encindex = ptr_encindex;
2746  res_cr = ptr_cr;
2747  }
2748  }
2749  else if (str_cr == ENC_CODERANGE_VALID) {
2750  res_encindex = str_encindex;
2751  if (ENC_CODERANGE_CLEAN_P(ptr_cr))
2752  res_cr = str_cr;
2753  else
2754  res_cr = ptr_cr;
2755  }
2756  else { /* str_cr == ENC_CODERANGE_BROKEN */
2757  res_encindex = str_encindex;
2758  res_cr = str_cr;
2759  if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
2760  }
2761 
2762  if (len < 0) {
2763  rb_raise(rb_eArgError, "negative string size (or size too big)");
2764  }
2765  str_buf_cat(str, ptr, len);
2766  ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
2767  return str;
2768 }
2769 
2770 VALUE
2771 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
2772 {
2773  return rb_enc_cr_str_buf_cat(str, ptr, len,
2775 }
2776 
2777 VALUE
2778 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
2779 {
2780  /* ptr must reference NUL terminated ASCII string. */
2781  int encindex = ENCODING_GET(str);
2782  rb_encoding *enc = rb_enc_from_index(encindex);
2783  if (rb_enc_asciicompat(enc)) {
2784  return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
2785  encindex, ENC_CODERANGE_7BIT, 0);
2786  }
2787  else {
2788  char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
2789  while (*ptr) {
2790  unsigned int c = (unsigned char)*ptr;
2791  int len = rb_enc_codelen(c, enc);
2792  rb_enc_mbcput(c, buf, enc);
2793  rb_enc_cr_str_buf_cat(str, buf, len,
2794  encindex, ENC_CODERANGE_VALID, 0);
2795  ptr++;
2796  }
2797  return str;
2798  }
2799 }
2800 
2801 VALUE
2803 {
2804  int str2_cr;
2805 
2806  str2_cr = ENC_CODERANGE(str2);
2807 
2808  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
2809  ENCODING_GET(str2), str2_cr, &str2_cr);
2810 
2811  OBJ_INFECT(str, str2);
2812  ENC_CODERANGE_SET(str2, str2_cr);
2813 
2814  return str;
2815 }
2816 
2817 VALUE
2819 {
2820  StringValue(str2);
2821  return rb_str_buf_append(str, str2);
2822 }
2823 
2824 VALUE
2825 rb_str_concat_literals(size_t num, const VALUE *strary)
2826 {
2827  VALUE str;
2828  size_t i;
2829 
2830  if (!num) return rb_str_new(0, 0);
2831  str = rb_str_resurrect(strary[0]);
2832  for (i = 1; i < num; ++i) {
2833  const VALUE v = strary[i];
2834  int encidx = ENCODING_GET(v);
2835 
2836  rb_enc_cr_str_buf_cat(str, RSTRING_PTR(v), RSTRING_LEN(v),
2837  encidx, ENC_CODERANGE(v), NULL);
2838  OBJ_INFECT_RAW(str, v);
2839  if (encidx != ENCINDEX_US_ASCII) {
2841  rb_enc_set_index(str, encidx);
2842  }
2843  }
2844  return str;
2845 }
2846 
2847 /*
2848  * Document-method: String#<<
2849  * Document-method: String#concat
2850  *
2851  * call-seq:
2852  * str << integer -> str
2853  * str.concat(integer1, integer2,...) -> str
2854  * str << obj -> str
2855  * str.concat(obj1, obj2,...) -> str
2856  *
2857  * Append---Concatenates the given object to <i>str</i>. If the object is an
2858  * <code>Integer</code>, it is considered as a codepoint, and is converted
2859  * to a character before concatenation. Concat can take multiple arguments.
2860  * All the arguments are concatenated in order.
2861  *
2862  * a = "hello "
2863  * a << "world" #=> "hello world"
2864  * a.concat(33) #=> "hello world!"
2865  * a #=> "hello world!"
2866  *
2867  * b = "sn"
2868  * b.concat(b, b) #=> "snsnsn"
2869  */
2870 
2871 static VALUE
2873 {
2874  str_modifiable(str);
2875 
2876  if (argc > 0) {
2877  int i;
2878  VALUE arg_str = rb_str_tmp_new(0);
2879  rb_enc_copy(arg_str, str);
2880  for (i = 0; i < argc; i++) {
2881  rb_str_concat(arg_str, argv[i]);
2882  }
2883  rb_str_buf_append(str, arg_str);
2884  }
2885 
2886  return str;
2887 }
2888 
2889 VALUE
2891 {
2892  unsigned int code;
2893  rb_encoding *enc = STR_ENC_GET(str1);
2894  int encidx;
2895 
2896  if (RB_INTEGER_TYPE_P(str2)) {
2897  if (rb_num_to_uint(str2, &code) == 0) {
2898  }
2899  else if (FIXNUM_P(str2)) {
2900  rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
2901  }
2902  else {
2903  rb_raise(rb_eRangeError, "bignum out of char range");
2904  }
2905  }
2906  else {
2907  return rb_str_append(str1, str2);
2908  }
2909 
2910  encidx = rb_enc_to_index(enc);
2911  if (encidx == ENCINDEX_ASCII || encidx == ENCINDEX_US_ASCII) {
2912  /* US-ASCII automatically extended to ASCII-8BIT */
2913  char buf[1];
2914  buf[0] = (char)code;
2915  if (code > 0xFF) {
2916  rb_raise(rb_eRangeError, "%u out of char range", code);
2917  }
2918  rb_str_cat(str1, buf, 1);
2919  if (encidx == ENCINDEX_US_ASCII && code > 127) {
2922  }
2923  }
2924  else {
2925  long pos = RSTRING_LEN(str1);
2926  int cr = ENC_CODERANGE(str1);
2927  int len;
2928  char *buf;
2929 
2930  switch (len = rb_enc_codelen(code, enc)) {
2932  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2933  break;
2935  case 0:
2936  rb_raise(rb_eRangeError, "%u out of char range", code);
2937  break;
2938  }
2939  buf = ALLOCA_N(char, len + 1);
2940  rb_enc_mbcput(code, buf, enc);
2941  if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
2942  rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
2943  }
2944  rb_str_resize(str1, pos+len);
2945  memcpy(RSTRING_PTR(str1) + pos, buf, len);
2946  if (cr == ENC_CODERANGE_7BIT && code > 127)
2947  cr = ENC_CODERANGE_VALID;
2948  ENC_CODERANGE_SET(str1, cr);
2949  }
2950  return str1;
2951 }
2952 
2953 /*
2954  * call-seq:
2955  * str.prepend(other_str1, other_str2,...) -> str
2956  *
2957  * Prepend---Prepend the given strings to <i>str</i>.
2958  *
2959  * a = "!"
2960  * a.prepend("hello ", "world") #=> "hello world!"
2961  * a #=> "hello world!"
2962  *
2963  * See also String#concat.
2964  */
2965 
2966 static VALUE
2968 {
2969  str_modifiable(str);
2970 
2971  if (argc > 0) {
2972  int i;
2973  VALUE arg_str = rb_str_tmp_new(0);
2974  rb_enc_copy(arg_str, str);
2975  for (i = 0; i < argc; i++) {
2976  rb_str_append(arg_str, argv[i]);
2977  }
2978  rb_str_update(str, 0L, 0L, arg_str);
2979  }
2980 
2981  return str;
2982 }
2983 
2984 st_index_t
2986 {
2987  int e = ENCODING_GET(str);
2988  if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
2989  e = 0;
2990  }
2991  return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
2992 }
2993 
2994 int
2996 {
2997  long len1, len2;
2998  const char *ptr1, *ptr2;
2999  RSTRING_GETMEM(str1, ptr1, len1);
3000  RSTRING_GETMEM(str2, ptr2, len2);
3001  return (len1 != len2 ||
3002  !rb_str_comparable(str1, str2) ||
3003  memcmp(ptr1, ptr2, len1) != 0);
3004 }
3005 
3006 /*
3007  * call-seq:
3008  * str.hash -> integer
3009  *
3010  * Return a hash based on the string's length, content and encoding.
3011  *
3012  * See also Object#hash.
3013  */
3014 
3015 static VALUE
3017 {
3018  st_index_t hval = rb_str_hash(str);
3019  return ST2FIX(hval);
3020 }
3021 
3022 #define lesser(a,b) (((a)>(b))?(b):(a))
3023 
3024 int
3026 {
3027  int idx1, idx2;
3028  int rc1, rc2;
3029 
3030  if (RSTRING_LEN(str1) == 0) return TRUE;
3031  if (RSTRING_LEN(str2) == 0) return TRUE;
3032  idx1 = ENCODING_GET(str1);
3033  idx2 = ENCODING_GET(str2);
3034  if (idx1 == idx2) return TRUE;
3035  rc1 = rb_enc_str_coderange(str1);
3036  rc2 = rb_enc_str_coderange(str2);
3037  if (rc1 == ENC_CODERANGE_7BIT) {
3038  if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3040  return TRUE;
3041  }
3042  if (rc2 == ENC_CODERANGE_7BIT) {
3044  return TRUE;
3045  }
3046  return FALSE;
3047 }
3048 
3049 int
3051 {
3052  long len1, len2;
3053  const char *ptr1, *ptr2;
3054  int retval;
3055 
3056  if (str1 == str2) return 0;
3057  RSTRING_GETMEM(str1, ptr1, len1);
3058  RSTRING_GETMEM(str2, ptr2, len2);
3059  if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3060  if (len1 == len2) {
3061  if (!rb_str_comparable(str1, str2)) {
3062  if (ENCODING_GET(str1) > ENCODING_GET(str2))
3063  return 1;
3064  return -1;
3065  }
3066  return 0;
3067  }
3068  if (len1 > len2) return 1;
3069  return -1;
3070  }
3071  if (retval > 0) return 1;
3072  return -1;
3073 }
3074 
3075 /* expect tail call optimization */
3076 static VALUE
3077 str_eql(const VALUE str1, const VALUE str2)
3078 {
3079  const long len = RSTRING_LEN(str1);
3080  const char *ptr1, *ptr2;
3081 
3082  if (len != RSTRING_LEN(str2)) return Qfalse;
3083  if (!rb_str_comparable(str1, str2)) return Qfalse;
3084  if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
3085  return Qtrue;
3086  if (memcmp(ptr1, ptr2, len) == 0)
3087  return Qtrue;
3088  return Qfalse;
3089 }
3090 
3091 /*
3092  * call-seq:
3093  * str == obj -> true or false
3094  * str === obj -> true or false
3095  *
3096  * Equality---Returns whether +str+ == +obj+, similar to Object#==.
3097  *
3098  * If +obj+ is not an instance of String but responds to +to_str+, then the
3099  * two strings are compared using <code>obj.==</code>.
3100  *
3101  * Otherwise, returns similarly to String#eql?, comparing length and content.
3102  */
3103 
3104 VALUE
3106 {
3107  if (str1 == str2) return Qtrue;
3108  if (!RB_TYPE_P(str2, T_STRING)) {
3109  if (!rb_respond_to(str2, idTo_str)) {
3110  return Qfalse;
3111  }
3112  return rb_equal(str2, str1);
3113  }
3114  return str_eql(str1, str2);
3115 }
3116 
3117 /*
3118  * call-seq:
3119  * str.eql?(other) -> true or false
3120  *
3121  * Two strings are equal if they have the same length and content.
3122  */
3123 
3124 static VALUE
3126 {
3127  if (str1 == str2) return Qtrue;
3128  if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3129  return str_eql(str1, str2);
3130 }
3131 
3132 /*
3133  * call-seq:
3134  * string <=> other_string -> -1, 0, +1 or nil
3135  *
3136  *
3137  * Comparison---Returns -1, 0, +1 or nil depending on whether +string+ is less
3138  * than, equal to, or greater than +other_string+.
3139  *
3140  * +nil+ is returned if the two values are incomparable.
3141  *
3142  * If the strings are of different lengths, and the strings are equal when
3143  * compared up to the shortest length, then the longer string is considered
3144  * greater than the shorter one.
3145  *
3146  * <code><=></code> is the basis for the methods <code><</code>,
3147  * <code><=</code>, <code>></code>, <code>>=</code>, and
3148  * <code>between?</code>, included from module Comparable. The method
3149  * String#== does not use Comparable#==.
3150  *
3151  * "abcdef" <=> "abcde" #=> 1
3152  * "abcdef" <=> "abcdef" #=> 0
3153  * "abcdef" <=> "abcdefg" #=> -1
3154  * "abcdef" <=> "ABCDEF" #=> 1
3155  * "abcdef" <=> 1 #=> nil
3156  */
3157 
3158 static VALUE
3160 {
3161  int result;
3162 
3163  if (!RB_TYPE_P(str2, T_STRING)) {
3164  VALUE tmp = rb_check_funcall(str2, idTo_str, 0, 0);
3165  if (RB_TYPE_P(tmp, T_STRING)) {
3166  result = rb_str_cmp(str1, tmp);
3167  }
3168  else {
3169  return rb_invcmp(str1, str2);
3170  }
3171  }
3172  else {
3173  result = rb_str_cmp(str1, str2);
3174  }
3175  return INT2FIX(result);
3176 }
3177 
3178 /*
3179  * call-seq:
3180  * str.casecmp(other_str) -> -1, 0, +1 or nil
3181  *
3182  * Case-insensitive version of <code>String#<=></code>.
3183  * Currently, case-insensitivity only works on characters A-Z/a-z,
3184  * not all of Unicode. This is different from <code>casecmp?</code>.
3185  *
3186  * "abcdef".casecmp("abcde") #=> 1
3187  * "aBcDeF".casecmp("abcdef") #=> 0
3188  * "abcdef".casecmp("abcdefg") #=> -1
3189  * "abcdef".casecmp("ABCDEF") #=> 0
3190  */
3191 
3192 static VALUE
3194 {
3195  long len;
3196  rb_encoding *enc;
3197  char *p1, *p1end, *p2, *p2end;
3198 
3199  StringValue(str2);
3200  enc = rb_enc_compatible(str1, str2);
3201  if (!enc) {
3202  return Qnil;
3203  }
3204 
3205  p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3206  p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3207  if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3208  while (p1 < p1end && p2 < p2end) {
3209  if (*p1 != *p2) {
3210  unsigned int c1 = TOUPPER(*p1 & 0xff);
3211  unsigned int c2 = TOUPPER(*p2 & 0xff);
3212  if (c1 != c2)
3213  return INT2FIX(c1 < c2 ? -1 : 1);
3214  }
3215  p1++;
3216  p2++;
3217  }
3218  }
3219  else {
3220  while (p1 < p1end && p2 < p2end) {
3221  int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3222  int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3223 
3224  if (0 <= c1 && 0 <= c2) {
3225  c1 = TOUPPER(c1);
3226  c2 = TOUPPER(c2);
3227  if (c1 != c2)
3228  return INT2FIX(c1 < c2 ? -1 : 1);
3229  }
3230  else {
3231  int r;
3232  l1 = rb_enc_mbclen(p1, p1end, enc);
3233  l2 = rb_enc_mbclen(p2, p2end, enc);
3234  len = l1 < l2 ? l1 : l2;
3235  r = memcmp(p1, p2, len);
3236  if (r != 0)
3237  return INT2FIX(r < 0 ? -1 : 1);
3238  if (l1 != l2)
3239  return INT2FIX(l1 < l2 ? -1 : 1);
3240  }
3241  p1 += l1;
3242  p2 += l2;
3243  }
3244  }
3245  if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3246  if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3247  return INT2FIX(-1);
3248 }
3249 
3250 /*
3251  * call-seq:
3252  * str.casecmp?(other_str) -> true, false, or nil
3253  *
3254  * Returns true if str and other_str are equal after Unicode case folding,
3255  * false if they are not equal, and nil if other_str is not a string.
3256  *
3257  * "abcdef".casecmp?("abcde") #=> false
3258  * "aBcDeF".casecmp?("abcdef") #=> true
3259  * "abcdef".casecmp?("abcdefg") #=> false
3260  * "abcdef".casecmp?("ABCDEF") #=> true
3261  * "\u{e4 f6 fc}".casecmp?("\u{c4 d6 dc}") #=> true
3262  */
3263 
3264 static VALUE
3266 {
3267  rb_encoding *enc;
3268  VALUE folded_str1, folded_str2;
3269  VALUE fold_opt = sym_fold;
3270 
3271  StringValue(str2);
3272  enc = rb_enc_compatible(str1, str2);
3273  if (!enc) {
3274  return Qnil;
3275  }
3276 
3277  folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3278  folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3279 
3280  return rb_str_eql(folded_str1, folded_str2);
3281 }
3282 
3283 #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3284 
3285 static long
3286 rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3287 {
3288  const char *s, *sptr, *e;
3289  long pos, len, slen;
3290  int single_byte = single_byte_optimizable(str);
3291  rb_encoding *enc;
3292 
3293  enc = rb_enc_check(str, sub);
3294  if (is_broken_string(sub)) return -1;
3295 
3296  len = (in_byte || single_byte) ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
3297  slen = in_byte ? RSTRING_LEN(sub) : str_strlen(sub, enc); /* rb_enc_check */
3298  if (offset < 0) {
3299  offset += len;
3300  if (offset < 0) return -1;
3301  }
3302  if (len - offset < slen) return -1;
3303 
3304  s = RSTRING_PTR(str);
3305  e = RSTRING_END(str);
3306  if (offset) {
3307  if (!in_byte) offset = str_offset(s, e, offset, enc, single_byte);
3308  s += offset;
3309  }
3310  if (slen == 0) return offset;
3311  /* need proceed one character at a time */
3312  sptr = RSTRING_PTR(sub);
3313  slen = RSTRING_LEN(sub);
3314  len = RSTRING_LEN(str) - offset;
3315  for (;;) {
3316  const char *t;
3317  pos = rb_memsearch(sptr, slen, s, len, enc);
3318  if (pos < 0) return pos;
3319  t = rb_enc_right_char_head(s, s+pos, e, enc);
3320  if (t == s + pos) break;
3321  len -= t - s;
3322  if (len <= 0) return -1;
3323  offset += t - s;
3324  s = t;
3325  }
3326  return pos + offset;
3327 }
3328 
3329 
3330 /*
3331  * call-seq:
3332  * str.index(substring [, offset]) -> integer or nil
3333  * str.index(regexp [, offset]) -> integer or nil
3334  *
3335  * Returns the index of the first occurrence of the given <i>substring</i> or
3336  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
3337  * found. If the second parameter is present, it specifies the position in the
3338  * string to begin the search.
3339  *
3340  * "hello".index('e') #=> 1
3341  * "hello".index('lo') #=> 3
3342  * "hello".index('a') #=> nil
3343  * "hello".index(?e) #=> 1
3344  * "hello".index(/[aeiou]/, -3) #=> 4
3345  */
3346 
3347 static VALUE
3349 {
3350  VALUE sub;
3351  VALUE initpos;
3352  long pos;
3353 
3354  if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3355  pos = NUM2LONG(initpos);
3356  }
3357  else {
3358  pos = 0;
3359  }
3360  if (pos < 0) {
3361  pos += str_strlen(str, NULL);
3362  if (pos < 0) {
3363  if (RB_TYPE_P(sub, T_REGEXP)) {
3365  }
3366  return Qnil;
3367  }
3368  }
3369 
3370  if (SPECIAL_CONST_P(sub)) goto generic;
3371  switch (BUILTIN_TYPE(sub)) {
3372  case T_REGEXP:
3373  if (pos > str_strlen(str, NULL))
3374  return Qnil;
3375  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3376  rb_enc_check(str, sub), single_byte_optimizable(str));
3377 
3378  pos = rb_reg_search(sub, str, pos, 0);
3379  pos = rb_str_sublen(str, pos);
3380  break;
3381 
3382  generic:
3383  default: {
3384  VALUE tmp;
3385 
3386  tmp = rb_check_string_type(sub);
3387  if (NIL_P(tmp)) {
3388  rb_raise(rb_eTypeError, "type mismatch: %s given",
3389  rb_obj_classname(sub));
3390  }
3391  sub = tmp;
3392  }
3393  /* fall through */
3394  case T_STRING:
3395  pos = rb_str_index(str, sub, pos);
3396  pos = rb_str_sublen(str, pos);
3397  break;
3398  }
3399 
3400  if (pos == -1) return Qnil;
3401  return LONG2NUM(pos);
3402 }
3403 
3404 #ifdef HAVE_MEMRCHR
3405 static long
3406 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3407 {
3408  char *hit, *adjusted;
3409  int c;
3410  long slen, searchlen;
3411  char *sbeg, *e, *t;
3412 
3413  slen = RSTRING_LEN(sub);
3414  if (slen == 0) return pos;
3415  sbeg = RSTRING_PTR(str);
3416  e = RSTRING_END(str);
3417  t = RSTRING_PTR(sub);
3418  c = *t & 0xff;
3419  searchlen = s - sbeg + 1;
3420 
3421  do {
3422  hit = memrchr(sbeg, c, searchlen);
3423  if (!hit) break;
3424  adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
3425  if (hit != adjusted) {
3426  searchlen = adjusted - sbeg;
3427  continue;
3428  }
3429  if (memcmp(hit, t, slen) == 0)
3430  return rb_str_sublen(str, hit - sbeg);
3431  searchlen = adjusted - sbeg;
3432  } while (searchlen > 0);
3433 
3434  return -1;
3435 }
3436 #else
3437 static long
3438 str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
3439 {
3440  long slen;
3441  char *sbeg, *e, *t;
3442 
3443  sbeg = RSTRING_PTR(str);
3444  e = RSTRING_END(str);
3445  t = RSTRING_PTR(sub);
3446  slen = RSTRING_LEN(sub);
3447 
3448  while (s) {
3449  if (memcmp(s, t, slen) == 0) {
3450  return pos;
3451  }
3452  if (pos == 0) break;
3453  pos--;
3454  s = rb_enc_prev_char(sbeg, s, e, enc);
3455  }
3456 
3457  return -1;
3458 }
3459 #endif
3460 
3461 static long
3462 rb_str_rindex(VALUE str, VALUE sub, long pos)
3463 {
3464  long len, slen;
3465  char *sbeg, *s;
3466  rb_encoding *enc;
3467  int singlebyte;
3468 
3469  enc = rb_enc_check(str, sub);
3470  if (is_broken_string(sub)) return -1;
3471  singlebyte = single_byte_optimizable(str);
3472  len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
3473  slen = str_strlen(sub, enc); /* rb_enc_check */
3474 
3475  /* substring longer than string */
3476  if (len < slen) return -1;
3477  if (len - pos < slen) pos = len - slen;
3478  if (len == 0) return pos;
3479 
3480  sbeg = RSTRING_PTR(str);
3481 
3482  if (pos == 0) {
3483  if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
3484  return 0;
3485  else
3486  return -1;
3487  }
3488 
3489  s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
3490  return str_rindex(str, sub, s, pos, enc);
3491 }
3492 
3493 
3494 /*
3495  * call-seq:
3496  * str.rindex(substring [, integer]) -> integer or nil
3497  * str.rindex(regexp [, integer]) -> integer or nil
3498  *
3499  * Returns the index of the last occurrence of the given <i>substring</i> or
3500  * pattern (<i>regexp</i>) in <i>str</i>. Returns <code>nil</code> if not
3501  * found. If the second parameter is present, it specifies the position in the
3502  * string to end the search---characters beyond this point will not be
3503  * considered.
3504  *
3505  * "hello".rindex('e') #=> 1
3506  * "hello".rindex('l') #=> 3
3507  * "hello".rindex('a') #=> nil
3508  * "hello".rindex(?e) #=> 1
3509  * "hello".rindex(/[aeiou]/, -2) #=> 1
3510  */
3511 
3512 static VALUE
3514 {
3515  VALUE sub;
3516  VALUE vpos;
3517  rb_encoding *enc = STR_ENC_GET(str);
3518  long pos, len = str_strlen(str, enc); /* str's enc */
3519 
3520  if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
3521  pos = NUM2LONG(vpos);
3522  if (pos < 0) {
3523  pos += len;
3524  if (pos < 0) {
3525  if (RB_TYPE_P(sub, T_REGEXP)) {
3527  }
3528  return Qnil;
3529  }
3530  }
3531  if (pos > len) pos = len;
3532  }
3533  else {
3534  pos = len;
3535  }
3536 
3537  if (SPECIAL_CONST_P(sub)) goto generic;
3538  switch (BUILTIN_TYPE(sub)) {
3539  case T_REGEXP:
3540  /* enc = rb_get_check(str, sub); */
3541  pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3542  enc, single_byte_optimizable(str));
3543 
3544  pos = rb_reg_search(sub, str, pos, 1);
3545  pos = rb_str_sublen(str, pos);
3546  if (pos >= 0) return LONG2NUM(pos);
3547  break;
3548 
3549  generic:
3550  default: {
3551  VALUE tmp;
3552 
3553  tmp = rb_check_string_type(sub);
3554  if (NIL_P(tmp)) {
3555  rb_raise(rb_eTypeError, "type mismatch: %s given",
3556  rb_obj_classname(sub));
3557  }
3558  sub = tmp;
3559  }
3560  /* fall through */
3561  case T_STRING:
3562  pos = rb_str_rindex(str, sub, pos);
3563  if (pos >= 0) return LONG2NUM(pos);
3564  break;
3565  }
3566  return Qnil;
3567 }
3568 
3569 /*
3570  * call-seq:
3571  * str =~ obj -> integer or nil
3572  *
3573  * Match---If <i>obj</i> is a <code>Regexp</code>, uses it as a pattern to match
3574  * against <i>str</i>, and returns the position where the match starts, or
3575  * <code>nil</code> if there is no match. Otherwise, invokes
3576  * <i>obj.=~</i>, passing <i>str</i> as an argument. The default
3577  * <code>=~</code> in <code>Object</code> returns <code>nil</code>.
3578  *
3579  * Note: <code>str =~ regexp</code> is not the same as
3580  * <code>regexp =~ str</code>. Strings captured from named capture groups
3581  * are assigned to local variables only in the second case.
3582  *
3583  * "cat o' 9 tails" =~ /\d/ #=> 7
3584  * "cat o' 9 tails" =~ 9 #=> nil
3585  */
3586 
3587 static VALUE
3589 {
3590  if (SPECIAL_CONST_P(y)) goto generic;
3591  switch (BUILTIN_TYPE(y)) {
3592  case T_STRING:
3593  rb_raise(rb_eTypeError, "type mismatch: String given");
3594 
3595  case T_REGEXP:
3596  return rb_reg_match(y, x);
3597 
3598  generic:
3599  default:
3600  return rb_funcall(y, idEqTilde, 1, x);
3601  }
3602 }
3603 
3604 
3605 static VALUE get_pat(VALUE);
3606 
3607 
3608 /*
3609  * call-seq:
3610  * str.match(pattern) -> matchdata or nil
3611  * str.match(pattern, pos) -> matchdata or nil
3612  *
3613  * Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
3614  * then invokes its <code>match</code> method on <i>str</i>. If the second
3615  * parameter is present, it specifies the position in the string to begin the
3616  * search.
3617  *
3618  * 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l">
3619  * 'hello'.match('(.)\1')[0] #=> "ll"
3620  * 'hello'.match(/(.)\1/)[0] #=> "ll"
3621  * 'hello'.match('xx') #=> nil
3622  *
3623  * If a block is given, invokes the block with the MatchData if the match succeeds, so
3624  * that you can write
3625  *
3626  * str.match(pat) {|m| ...}
3627  *
3628  * instead of
3629  *
3630  * if m = str.match(pat)
3631  * ...
3632  * end
3633  *
3634  * The return value is a value from block execution in this case.
3635  */
3636 
3637 static VALUE
3639 {
3640  VALUE re, result;
3641  if (argc < 1)
3642  rb_check_arity(argc, 1, 2);
3643  re = argv[0];
3644  argv[0] = str;
3645  result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
3646  if (!NIL_P(result) && rb_block_given_p()) {
3647  return rb_yield(result);
3648  }
3649  return result;
3650 }
3651 
3652 /*
3653  * call-seq:
3654  * str.match?(pattern) -> true or false
3655  * str.match?(pattern, pos) -> true or false
3656  *
3657  * Converts _pattern_ to a +Regexp+ (if it isn't already one), then
3658  * returns +true+ or +false+ to indicate whether the regexp
3659  * matches _str_, without updating <code>$~</code> and other
3660  * related variables. If the second parameter is present, it
3661  * specifies the position in the string to begin the search.
3662  *
3663  * "Ruby".match?(/R.../) #=> true
3664  * "Ruby".match?(/R.../, 1) #=> false
3665  * "Ruby".match?(/P.../) #=> false
3666  * $& #=> nil
3667  */
3668 
3669 static VALUE
3671 {
3672  VALUE re;
3673  rb_check_arity(argc, 1, 2);
3674  re = get_pat(argv[0]);
3675  return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
3676 }
3677 
3682 };
3683 
3684 static enum neighbor_char
3685 enc_succ_char(char *p, long len, rb_encoding *enc)
3686 {
3687  long i;
3688  int l;
3689 
3690  if (rb_enc_mbminlen(enc) > 1) {
3691  /* wchar, trivial case */
3692  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
3693  if (!MBCLEN_CHARFOUND_P(r)) {
3694  return NEIGHBOR_NOT_CHAR;
3695  }
3696  c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
3697  l = rb_enc_code_to_mbclen(c, enc);
3698  if (!l) return NEIGHBOR_NOT_CHAR;
3699  if (l != len) return NEIGHBOR_WRAPPED;
3700  rb_enc_mbcput(c, p, enc);
3701  r = rb_enc_precise_mbclen(p, p + len, enc);
3702  if (!MBCLEN_CHARFOUND_P(r)) {
3703  return NEIGHBOR_NOT_CHAR;
3704  }
3705  return NEIGHBOR_FOUND;
3706  }
3707  while (1) {
3708  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
3709  p[i] = '\0';
3710  if (i < 0)
3711  return NEIGHBOR_WRAPPED;
3712  ++((unsigned char*)p)[i];
3713  l = rb_enc_precise_mbclen(p, p+len, enc);
3714  if (MBCLEN_CHARFOUND_P(l)) {
3715  l = MBCLEN_CHARFOUND_LEN(l);
3716  if (l == len) {
3717  return NEIGHBOR_FOUND;
3718  }
3719  else {
3720  memset(p+l, 0xff, len-l);
3721  }
3722  }
3723  if (MBCLEN_INVALID_P(l) && i < len-1) {
3724  long len2;
3725  int l2;
3726  for (len2 = len-1; 0 < len2; len2--) {
3727  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
3728  if (!MBCLEN_INVALID_P(l2))
3729  break;
3730  }
3731  memset(p+len2+1, 0xff, len-(len2+1));
3732  }
3733  }
3734 }
3735 
3736 static enum neighbor_char
3737 enc_pred_char(char *p, long len, rb_encoding *enc)
3738 {
3739  long i;
3740  int l;
3741  if (rb_enc_mbminlen(enc) > 1) {
3742  /* wchar, trivial case */
3743  int r = rb_enc_precise_mbclen(p, p + len, enc), c;
3744  if (!MBCLEN_CHARFOUND_P(r)) {
3745  return NEIGHBOR_NOT_CHAR;
3746  }
3747  c = rb_enc_mbc_to_codepoint(p, p + len, enc);
3748  if (!c) return NEIGHBOR_NOT_CHAR;
3749  --c;
3750  l = rb_enc_code_to_mbclen(c, enc);
3751  if (!l) return NEIGHBOR_NOT_CHAR;
3752  if (l != len) return NEIGHBOR_WRAPPED;
3753  rb_enc_mbcput(c, p, enc);
3754  r = rb_enc_precise_mbclen(p, p + len, enc);
3755  if (!MBCLEN_CHARFOUND_P(r)) {
3756  return NEIGHBOR_NOT_CHAR;
3757  }
3758  return NEIGHBOR_FOUND;
3759  }
3760  while (1) {
3761  for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
3762  p[i] = '\xff';
3763  if (i < 0)
3764  return NEIGHBOR_WRAPPED;
3765  --((unsigned char*)p)[i];
3766  l = rb_enc_precise_mbclen(p, p+len, enc);
3767  if (MBCLEN_CHARFOUND_P(l)) {
3768  l = MBCLEN_CHARFOUND_LEN(l);
3769  if (l == len) {
3770  return NEIGHBOR_FOUND;
3771  }
3772  else {
3773  memset(p+l, 0, len-l);
3774  }
3775  }
3776  if (MBCLEN_INVALID_P(l) && i < len-1) {
3777  long len2;
3778  int l2;
3779  for (len2 = len-1; 0 < len2; len2--) {
3780  l2 = rb_enc_precise_mbclen(p, p+len2, enc);
3781  if (!MBCLEN_INVALID_P(l2))
3782  break;
3783  }
3784  memset(p+len2+1, 0, len-(len2+1));
3785  }
3786  }
3787 }
3788 
3789 /*
3790  overwrite +p+ by succeeding letter in +enc+ and returns
3791  NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
3792  When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
3793  assuming each ranges are successive, and mbclen
3794  never change in each ranges.
3795  NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
3796  character.
3797  */
3798 static enum neighbor_char
3799 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
3800 {
3801  enum neighbor_char ret;
3802  unsigned int c;
3803  int ctype;
3804  int range;
3805  char save[ONIGENC_CODE_TO_MBC_MAXLEN];
3806 
3807  /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
3808  int try;
3809  const int max_gaps = 1;
3810 
3811  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
3812  if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
3813  ctype = ONIGENC_CTYPE_DIGIT;
3814  else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
3815  ctype = ONIGENC_CTYPE_ALPHA;
3816  else
3817  return NEIGHBOR_NOT_CHAR;
3818 
3819  MEMCPY(save, p, char, len);
3820  for (try = 0; try <= max_gaps; ++try) {
3821  ret = enc_succ_char(p, len, enc);
3822  if (ret == NEIGHBOR_FOUND) {
3823  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
3824  if (rb_enc_isctype(c, ctype, enc))
3825  return NEIGHBOR_FOUND;
3826  }
3827  }
3828  MEMCPY(p, save, char, len);
3829  range = 1;
3830  while (1) {
3831  MEMCPY(save, p, char, len);
3832  ret = enc_pred_char(p, len, enc);
3833  if (ret == NEIGHBOR_FOUND) {
3834  c = rb_enc_mbc_to_codepoint(p, p+len, enc);
3835  if (!rb_enc_isctype(c, ctype, enc)) {
3836  MEMCPY(p, save, char, len);
3837  break;
3838  }
3839  }
3840  else {
3841  MEMCPY(p, save, char, len);
3842  break;
3843  }
3844  range++;
3845  }
3846  if (range == 1) {
3847  return NEIGHBOR_NOT_CHAR;
3848  }
3849 
3850  if (ctype != ONIGENC_CTYPE_DIGIT) {
3851  MEMCPY(carry, p, char, len);
3852  return NEIGHBOR_WRAPPED;
3853  }
3854 
3855  MEMCPY(carry, p, char, len);
3856  enc_succ_char(carry, len, enc);
3857  return NEIGHBOR_WRAPPED;
3858 }
3859 
3860 
3861 static VALUE str_succ(VALUE str);
3862 
3863 /*
3864  * call-seq:
3865  * str.succ -> new_str
3866  * str.next -> new_str
3867  *
3868  * Returns the successor to <i>str</i>. The successor is calculated by
3869  * incrementing characters starting from the rightmost alphanumeric (or
3870  * the rightmost character if there are no alphanumerics) in the
3871  * string. Incrementing a digit always results in another digit, and
3872  * incrementing a letter results in another letter of the same case.
3873  * Incrementing nonalphanumerics uses the underlying character set's
3874  * collating sequence.
3875  *
3876  * If the increment generates a ``carry,'' the character to the left of
3877  * it is incremented. This process repeats until there is no carry,
3878  * adding an additional character if necessary.
3879  *
3880  * "abcd".succ #=> "abce"
3881  * "THX1138".succ #=> "THX1139"
3882  * "<<koala>>".succ #=> "<<koalb>>"
3883  * "1999zzz".succ #=> "2000aaa"
3884  * "ZZZ9999".succ #=> "AAAA0000"
3885  * "***".succ #=> "**+"
3886  */
3887 
3888 VALUE
3890 {
 /*
  * NOTE(review): the declarator line (function name and parameter list,
  * listing line 3889) is missing from this listing; the body shows that a
  * parameter named `orig` is in scope.
  *
  * Works on a copy: a new string is created from orig's bytes with
  * rb_str_new_with_class(), rb_enc_cr_str_copy_for_substr() and
  * OBJ_INFECT() are applied to it, and the copy is then handed to
  * str_succ(), whose result is returned.  orig itself is not written to
  * here.
  */
3891  VALUE str;
3892  str = rb_str_new_with_class(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
3893  rb_enc_cr_str_copy_for_substr(str, orig);
3894  OBJ_INFECT(str, orig);
3895  return str_succ(str);
3896 }
3897 
/*
 * In-place successor computation; modifies and returns str.
 * An empty string is returned unchanged.
 *
 * NOTE(review): listing lines 3899 (the declarator, presumably
 * `str_succ(VALUE str)` as forward-declared earlier in the file), 3926 and
 * 3948 are missing from this listing, so the statements that stood there
 * are not documented.
 *
 * Outline of the visible code:
 *  1. Walk the characters from the end, calling enc_succ_alnum_char() on
 *     each.  Return as soon as one is incremented without wrapping.
 *  2. If no alphanumeric was handled (c is still -1), walk again and
 *     increment arbitrary characters with enc_succ_char().
 *  3. If nothing returned early, insert carry_len bytes of carry at byte
 *     offset carry_pos.
 */
3898 static VALUE
3900 {
3901  rb_encoding *enc;
3902  char *sbeg, *s, *e, *last_alnum = 0;
3903  int c = -1;
3904  long l, slen;
 /* default carry is the single byte 0x01, inserted at offset 0 */
3905  char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
3906  long carry_pos = 0, carry_len = 1;
3907  enum neighbor_char neighbor = NEIGHBOR_FOUND;
3908 
3909  slen = RSTRING_LEN(str);
3910  if (slen == 0) return str;
3911 
3912  enc = STR_ENC_GET(str);
3913  sbeg = RSTRING_PTR(str);
3914  s = e = sbeg + slen;
3915 
 /* pass 1: alphanumerics, from the last character backwards */
3916  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
 /* a non-alnum was skipped after an alnum wrapped: stop when the class
  * flips between alphabetic and digit, and carry at last_alnum */
3917  if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
3918  if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
3919  ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
3920  s = last_alnum;
3921  break;
3922  }
3923  }
3924  l = rb_enc_precise_mbclen(s, e, enc);
3925  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
3927  neighbor = enc_succ_alnum_char(s, l, enc, carry);
3928  switch (neighbor) {
3929  case NEIGHBOR_NOT_CHAR:
3930  continue;
3931  case NEIGHBOR_FOUND:
3932  return str;
3933  case NEIGHBOR_WRAPPED:
3934  last_alnum = s;
3935  break;
3936  }
 /* reached only for NEIGHBOR_WRAPPED: remember where a carry would go */
3937  c = 1;
3938  carry_pos = s - sbeg;
3939  carry_len = l;
3940  }
3941  if (c == -1) { /* str contains no alnum */
3942  s = e;
3943  while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
3944  enum neighbor_char neighbor;
3945  char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
3946  l = rb_enc_precise_mbclen(s, e, enc);
3947  if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
 /* increment a scratch copy and write it back only on FOUND/WRAPPED */
3949  MEMCPY(tmp, s, char, l);
3950  neighbor = enc_succ_char(tmp, l, enc);
3951  switch (neighbor) {
3952  case NEIGHBOR_FOUND:
3953  MEMCPY(s, tmp, char, l);
3954  return str;
3955  break;
3956  case NEIGHBOR_WRAPPED:
3957  MEMCPY(s, tmp, char, l);
3958  break;
3959  case NEIGHBOR_NOT_CHAR:
3960  break;
3961  }
3962  if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
3963  /* wrapped to \0...\0. search next valid char. */
3964  enc_succ_char(s, l, enc);
3965  }
 /* for non-ASCII-compatible encodings the carry is a copy of the
  * current character instead of the default byte */
3966  if (!rb_enc_asciicompat(enc)) {
3967  MEMCPY(carry, s, char, l);
3968  carry_len = l;
3969  }
3970  carry_pos = s - sbeg;
3971  }
3972  }
 /* grow the buffer, shift the tail right and insert the carry */
3973  RESIZE_CAPA(str, slen + carry_len);
3974  sbeg = RSTRING_PTR(str);
3975  s = sbeg + carry_pos;
3976  memmove(s + carry_len, s, slen - carry_pos);
3977  memmove(s, carry, carry_len);
3978  slen += carry_len;
3979  STR_SET_LEN(str, slen);
3980  TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
3981  rb_enc_str_coderange(str);
3982  return str;
3983 }
3984 
3985 
3986 /*
3987  * call-seq:
3988  * str.succ! -> str
3989  * str.next! -> str
3990  *
3991  * Equivalent to <code>String#succ</code>, but modifies the receiver in
3992  * place.
3993  */
3994 
3995 static VALUE
3997 {
 /*
  * NOTE(review): the declarator line (listing line 3996) is missing from
  * this listing; the body uses a parameter named `str`.
  *
  * Calls rb_str_modify(str) first, then runs str_succ() on the receiver
  * itself.  The result of str_succ() is ignored and str is returned.
  */
3998  rb_str_modify(str);
3999  str_succ(str);
4000  return str;
4001 }
4002 
/*
 * Returns 1 when each of the +len+ bytes starting at +s+ satisfies ISDIGIT,
 * and 0 as soon as one byte does not.  A +len+ of zero or less yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4012 
4013 static VALUE str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE);
4014 
/*
 * Iteration callback: yields str to the block and returns 0, i.e. it never
 * asks the caller to stop.
 * NOTE(review): the declarator line (listing line 4016) is missing from
 * this listing; the name and the (VALUE, VALUE) shape are presumed from its
 * use as the `each` argument of str_upto_each().
 */
4015 static int
4017 {
4018  rb_yield(str);
4019  return 0;
4020 }
4021 
4022 /*
4023  * call-seq:
4024  * str.upto(other_str, exclusive=false) {|s| block } -> str
4025  * str.upto(other_str, exclusive=false) -> an_enumerator
4026  *
4027  * Iterates through successive values, starting at <i>str</i> and
4028  * ending at <i>other_str</i> inclusive, passing each value in turn to
4029  * the block. The <code>String#succ</code> method is used to generate
4030  * each value. If optional second argument exclusive is omitted or is false,
4031  * the last value will be included; otherwise it will be excluded.
4032  *
4033  * If no block is given, an enumerator is returned instead.
4034  *
4035  * "a8".upto("b6") {|s| print s, ' ' }
4036  * for s in "a8".."b6"
4037  * print s, ' '
4038  * end
4039  *
4040  * <em>produces:</em>
4041  *
4042  * a8 a9 b0 b1 b2 b3 b4 b5 b6
4043  * a8 a9 b0 b1 b2 b3 b4 b5 b6
4044  *
4045  * If <i>str</i> and <i>other_str</i> contain only ASCII numeric characters,
4046  * both are recognized as decimal numbers. In addition, the width of
4047  * string (e.g. leading zeros) is handled appropriately.
4048  *
4049  * "9".upto("11").to_a #=> ["9", "10", "11"]
4050  * "25".upto("5").to_a #=> []
4051  * "07".upto("11").to_a #=> ["07", "08", "09", "10", "11"]
4052  */
4053 
4054 static VALUE
4056 {
 /*
  * NOTE(review): the declarator line (listing line 4055) is missing from
  * this listing; the body uses argc, argv and beg.
  *
  * Parses one required argument (end) and one optional argument
  * (exclusive) with the "11" format, goes through RETURN_ENUMERATOR, and
  * otherwise delegates to str_upto_each() with str_upto_i as the callback
  * and Qnil as its extra argument.
  */
4057  VALUE end, exclusive;
4058 
4059  rb_scan_args(argc, argv, "11", &end, &exclusive);
4060  RETURN_ENUMERATOR(beg, argc, argv);
4061  return str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4062 }
4063 
/*
 * Iteration engine: calls (*each)(value, arg) for each string from beg up
 * to end and always returns beg.  A non-zero return from the callback stops
 * the iteration.  When excl is non-zero, end itself is not passed to the
 * callback.
 *
 * end is converted with StringValue() and both strings must pass
 * rb_enc_check().  Three strategies are tried in order:
 *  1. both strings are one byte long and ASCII: step the byte value;
 *  2. both strings consist only of ASCII digits: iterate numerically and
 *     format each value with at least as many digits as beg has bytes;
 *  3. otherwise: repeatedly call the "succ" method on the current string.
 */
4064 static VALUE
4065 str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4066 {
4067  VALUE current, after_end;
4068  ID succ;
4069  int n, ascii;
4070  rb_encoding *enc;
4071 
4072  CONST_ID(succ, "succ");
4073  StringValue(end);
4074  enc = rb_enc_check(beg, end);
4075  ascii = (is_ascii_string(beg) && is_ascii_string(end));
4076  /* single character */
4077  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
4078  char c = RSTRING_PTR(beg)[0];
4079  char e = RSTRING_PTR(end)[0];
4080 
4081  if (c > e || (excl && c == e)) return beg;
4082  for (;;) {
4083  if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
4084  if (!excl && c == e) break;
4085  c++;
4086  if (excl && c == e) break;
4087  }
4088  return beg;
4089  }
4090  /* both edges are all digits */
4091  if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
4092  all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
4093  all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
4094  VALUE b, e;
4095  int width;
4096 
 /* width (byte length of beg) is used as the precision of the format,
  * which keeps leading zeros */
4097  width = RSTRING_LENINT(beg);
4098  b = rb_str_to_inum(beg, 10, FALSE);
4099  e = rb_str_to_inum(end, 10, FALSE);
 /* fast path when both bounds are Fixnums: plain C long arithmetic */
4100  if (FIXNUM_P(b) && FIXNUM_P(e)) {
4101  long bi = FIX2LONG(b);
4102  long ei = FIX2LONG(e);
4103  rb_encoding *usascii = rb_usascii_encoding();
4104 
4105  while (bi <= ei) {
4106  if (excl && bi == ei) break;
4107  if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
4108  bi++;
4109  }
4110  }
 /* otherwise compare and step through method calls on the numbers */
4111  else {
4112  ID op = excl ? '<' : idLE;
4113  VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
4114 
4115  args[0] = INT2FIX(width);
4116  while (rb_funcall(b, op, 1, e)) {
4117  args[1] = b;
4118  if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
4119  b = rb_funcallv(b, succ, 0, 0);
4120  }
4121  }
4122  return beg;
4123  }
4124  /* normal case */
4125  n = rb_str_cmp(beg, end);
4126  if (n > 0 || (excl && n == 0)) return beg;
4127 
4128  after_end = rb_funcallv(end, succ, 0, 0);
4129  current = rb_str_dup(beg);
4130  while (!rb_str_equal(current, after_end)) {
 /* the successor is computed before the callback is invoked, and only
  * when another step may be needed */
4131  VALUE next = Qnil;
4132  if (excl || !rb_str_equal(current, end))
4133  next = rb_funcallv(current, succ, 0, 0);
4134  if ((*each)(current, arg)) break;
4135  if (NIL_P(next)) break;
4136  current = next;
4137  StringValue(current);
4138  if (excl && rb_str_equal(current, end)) break;
 /* give up once the successor is longer than end, or empty */
4139  if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
4140  break;
4141  }
4142 
4143  return beg;
4144 }
4145 
/*
 * Iteration callback used for membership tests.  arg is treated as a
 * pointer to a VALUE holding the searched value.  Returns 0 while str is
 * not rb_equal() to it.  On a match the pointed-to VALUE is overwritten
 * with Qnil and 1 is returned.
 * NOTE(review): the declarator line (listing line 4147) is missing from
 * this listing; the body uses parameters named `str` and `arg`.
 */
4146 static int
4148 {
4149  VALUE *argp = (VALUE *)arg;
4150  if (!rb_equal(str, *argp)) return 0;
4151  *argp = Qnil;
4152  return 1;
4153 }
4154 
/*
 * Tests whether val is produced when iterating from beg to end; returns
 * Qtrue or Qfalse.
 *
 * NOTE(review): listing lines 4156 (the declarator) and 4165-4166 (the
 * rest of the `if` condition opened on line 4164) are missing from this
 * listing.  The body uses beg, end, val and exclusive.
 *
 * Visible behaviour:
 *  - beg and end are replaced by frozen strings (end after StringValue);
 *  - a nil val, or a val that rb_check_string_type() rejects, gives Qfalse;
 *  - when both bounds are one byte long, a val of any other length gives
 *    Qfalse, and three ASCII bytes are compared directly;
 *  - otherwise str_upto_each() runs with include_range_i, which sets val
 *    to Qnil on a match.
 */
4155 VALUE
4157 {
4158  beg = rb_str_new_frozen(beg);
4159  StringValue(end);
4160  end = rb_str_new_frozen(end);
4161  if (NIL_P(val)) return Qfalse;
4162  val = rb_check_string_type(val);
4163  if (NIL_P(val)) return Qfalse;
4164  if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
4167  const char *bp = RSTRING_PTR(beg);
4168  const char *ep = RSTRING_PTR(end);
4169  const char *vp = RSTRING_PTR(val);
4170  if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
4171  if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
4172  return Qfalse;
4173  else {
4174  char b = *bp;
4175  char e = *ep;
4176  char v = *vp;
4177 
 /* non-ASCII bytes fall through to the generic iteration below */
4178  if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
4179  if (b <= v && v < e) return Qtrue;
4180  if (!RTEST(exclusive) && v == e) return Qtrue;
4181  return Qfalse;
4182  }
4183  }
4184  }
4185 #if 0
4186  /* both edges are all digits */
4187  if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
4188  all_digits_p(bp, RSTRING_LEN(beg)) &&
4189  all_digits_p(ep, RSTRING_LEN(end))) {
4190  /* TODO */
4191  }
4192 #endif
4193  }
4194  str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
4195 
 /* include_range_i stores Qnil into val when it found a match */
4196  return NIL_P(val) ? Qtrue : Qfalse;
4197 }
4198 
/*
 * Searches str with the regexp re from offset 0.  On a match, backref is
 * resolved to a group number with rb_reg_backref_number() and that group's
 * match is returned via rb_reg_nth_match().  Returns Qnil when the search
 * fails.
 * NOTE(review): listing line 4203 is missing from this listing; it
 * presumably declares and initialises `match`, which is used below without
 * a visible declaration.
 */
4199 static VALUE
4200 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
4201 {
4202  if (rb_reg_search(re, str, 0, 0) >= 0) {
4204  int nth = rb_reg_backref_number(match, backref);
4205  return rb_reg_nth_match(nth, match);
4206  }
4207  return Qnil;
4208 }
4209 
/*
 * Single-argument element reference; dispatches on the type of indx:
 *  - Fixnum: the one-character substring at that index;
 *  - Regexp: rb_str_subpat() with group 0;
 *  - String: a duplicate of indx if it occurs in str, otherwise Qnil;
 *  - anything else: tried as a Range through rb_range_beg_len(), and if
 *    that reports Qfalse the value is converted with NUM2LONG and handled
 *    like an integer index.
 * NOTE(review): the declarator line (listing line 4211) is missing from
 * this listing; the body uses parameters named `str` and `indx`.
 */
4210 static VALUE
4212 {
4213  long idx;
4214 
4215  if (FIXNUM_P(indx)) {
4216  idx = FIX2LONG(indx);
4217  }
4218  else if (RB_TYPE_P(indx, T_REGEXP)) {
4219  return rb_str_subpat(str, indx, INT2FIX(0));
4220  }
4221  else if (RB_TYPE_P(indx, T_STRING)) {
4222  if (rb_str_index(str, indx, 0) != -1)
4223  return rb_str_dup(indx);
4224  return Qnil;
4225  }
4226  else {
4227  /* check if indx is Range */
4228  long beg, len = str_strlen(str, NULL);
4229  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
4230  case Qfalse:
4231  break;
4232  case Qnil:
4233  return Qnil;
4234  default:
4235  return rb_str_substr(str, beg, len);
4236  }
4237  idx = NUM2LONG(indx);
4238  }
4239 
4240  return str_substr(str, idx, 1, FALSE);
4241 }
4242 
4243 
4244 /*
4245  * call-seq:
4246  * str[index] -> new_str or nil
4247  * str[start, length] -> new_str or nil
4248  * str[range] -> new_str or nil
4249  * str[regexp] -> new_str or nil
4250  * str[regexp, capture] -> new_str or nil
4251  * str[match_str] -> new_str or nil
4252  * str.slice(index) -> new_str or nil
4253  * str.slice(start, length) -> new_str or nil
4254  * str.slice(range) -> new_str or nil
4255  * str.slice(regexp) -> new_str or nil
4256  * str.slice(regexp, capture) -> new_str or nil
4257  * str.slice(match_str) -> new_str or nil
4258  *
4259  * Element Reference --- If passed a single +index+, returns a substring of
4260  * one character at that index. If passed a +start+ index and a +length+,
4261  * returns a substring containing +length+ characters starting at the
4262  * +start+ index. If passed a +range+, its beginning and end are interpreted as
4263  * offsets delimiting the substring to be returned.
4264  *
4265  * In these three cases, if an index is negative, it is counted from the end
4266  * of the string. For the +start+ and +range+ cases the starting index
4267  * is just before a character and an index matching the string's size.
4268  * Additionally, an empty string is returned when the starting index for a
4269  * character range is at the end of the string.
4270  *
4271  * Returns +nil+ if the initial index falls outside the string or the length
4272  * is negative.
4273  *
4274  * If a +Regexp+ is supplied, the matching portion of the string is
4275  * returned. If a +capture+ (a capture group index or name) follows the
4276  * regular expression, that component of the MatchData is returned
4277  * instead.
4278  *
4279  * If a +match_str+ is given, that string is returned if it occurs in
4280  * the string.
4281  *
4282  * Returns +nil+ if the regular expression does not match or the match string
4283  * cannot be found.
4284  *
4285  * a = "hello there"
4286  *
4287  * a[1] #=> "e"
4288  * a[2, 3] #=> "llo"
4289  * a[2..3] #=> "ll"
4290  *
4291  * a[-3, 2] #=> "er"
4292  * a[7..-2] #=> "her"
4293  * a[-4..-2] #=> "her"
4294  * a[-2..-4] #=> ""
4295  *
4296  * a[11, 0] #=> ""
4297  * a[11] #=> nil
4298  * a[12, 0] #=> nil
4299  * a[12..-1] #=> nil
4300  *
4301  * a[/[aeiou](.)\1/] #=> "ell"
4302  * a[/[aeiou](.)\1/, 0] #=> "ell"
4303  * a[/[aeiou](.)\1/, 1] #=> "l"
4304  * a[/[aeiou](.)\1/, 2] #=> nil
4305  *
4306  * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "non_vowel"] #=> "l"
4307  * a[/(?<vowel>[aeiou])(?<non_vowel>[^aeiou])/, "vowel"] #=> "e"
4308  *
4309  * a["lo"] #=> "lo"
4310  * a["bye"] #=> nil
4311  */
4312 
4313 static VALUE
4315 {
 /*
  * NOTE(review): the declarator line (listing line 4314) is missing from
  * this listing; the body uses argc, argv and str.
  *
  * With two arguments: a Regexp first argument selects the capture named
  * by the second one through rb_str_subpat(); otherwise both arguments are
  * converted with NUM2LONG and passed to rb_str_substr() as start and
  * length.  Any other argument count is validated by rb_check_arity(1, 2)
  * and the single argument is handled by rb_str_aref().
  */
4316  if (argc == 2) {
4317  if (RB_TYPE_P(argv[0], T_REGEXP)) {
4318  return rb_str_subpat(str, argv[0], argv[1]);
4319  }
4320  else {
4321  long beg = NUM2LONG(argv[0]);
4322  long len = NUM2LONG(argv[1]);
4323  return rb_str_substr(str, beg, len);
4324  }
4325  }
4326  rb_check_arity(argc, 1, 2);
4327  return rb_str_aref(str, argv[0]);
4328 }
4329 
/*
 * Removes the first len bytes of str in place and returns str.
 *
 * len is clamped to the current byte length.  If the remaining nlen bytes
 * satisfy STR_EMBEDDABLE_P, str is switched to embedded storage and the
 * tail is moved into RSTRING(str)->as.ary.  Otherwise the heap pointer is
 * advanced by len and the stored length is reduced.  A zero byte is written
 * after the content and the coderange is cleared.
 */
4330 VALUE
4331 rb_str_drop_bytes(VALUE str, long len)
4332 {
4333  char *ptr = RSTRING_PTR(str);
4334  long olen = RSTRING_LEN(str), nlen;
4335 
4336  str_modifiable(str);
4337  if (len > olen) len = olen;
4338  nlen = olen - len;
4339  if (STR_EMBEDDABLE_P(nlen, TERM_LEN(str))) {
4340  char *oldptr = ptr;
 /* flags are sampled before STR_SET_EMBED changes them */
4341  int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
4342  STR_SET_EMBED(str);
4343  STR_SET_EMBED_LEN(str, nlen);
4344  ptr = RSTRING(str)->as.ary;
4345  memmove(ptr, oldptr + len, nlen);
 /* free the old buffer only when STR_NOEMBED was the sole flag set among
  * the three sampled above */
4346  if (fl == STR_NOEMBED) xfree(oldptr);
4347  }
4348  else {
 /* the result of rb_str_new_frozen() is discarded.
  * NOTE(review): presumably called so that str shares its buffer before
  * the pointer is advanced - confirm against rb_str_new_frozen() */
4349  if (!STR_SHARED_P(str)) rb_str_new_frozen(str);
4350  ptr = RSTRING(str)->as.heap.ptr += len;
4351  RSTRING(str)->as.heap.len = nlen;
4352  }
4353  ptr[nlen] = 0;
4354  ENC_CODERANGE_CLEAR(str);
4355  return str;
4356 }
4357 
4358 static void
4359 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
4360 {
4361  char *sptr;
4362  long slen, vlen = RSTRING_LEN(val);
4363 
4364  if (beg == 0 && vlen == 0) {
4365  rb_str_drop_bytes(str, len);
4366  OBJ_INFECT(str, val);
4367  return;
4368  }
4369 
4370  rb_str_modify(str);
4371  RSTRING_GETMEM(str, sptr, slen);
4372  if (len < vlen) {
4373  /* expand string */
4374  RESIZE_CAPA(str, slen + vlen - len);
4375  sptr = RSTRING_PTR(str);
4376  }
4377 
4378  if (vlen != len) {
4379  memmove(sptr + beg + vlen,
4380  sptr + beg + len,
4381  slen - (beg + len));
4382  }
4383  if (vlen < beg && len < 0) {
4384  MEMZERO(sptr + slen, char, -len);
4385  }
4386  if (vlen > 0) {
4387  memmove(sptr + beg, RSTRING_PTR(val), vlen);
4388  }
4389  slen += vlen - len;
4390  STR_SET_LEN(str, slen);
4391  TERM_FILL(&sptr[slen], TERM_LEN(str));
4392  OBJ_INFECT(str, val);
4393 }
4394 
/*
 * Replaces len characters of str starting at character index beg with val.
 *
 * beg and len are character counts (slen comes from str_strlen); they are
 * converted to byte offsets with str_nth() before rb_str_splice_0() is
 * called.  A negative beg counts from the end of the string.  len is
 * clamped to the characters available after beg.
 *
 * Raises IndexError for a negative len and for a beg outside
 * [-slen, slen].  val is converted with StringValue() and must pass
 * rb_enc_check() against str.
 *
 * NOTE(review): listing line 4435 is missing from this listing; `cr` is
 * declared and read below but its assignment is not visible here.
 */
4395 void
4396 rb_str_update(VALUE str, long beg, long len, VALUE val)
4397 {
4398  long slen;
4399  char *p, *e;
4400  rb_encoding *enc;
4401  int singlebyte = single_byte_optimizable(str);
4402  int cr;
4403 
4404  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
4405 
4406  StringValue(val);
4407  enc = rb_enc_check(str, val);
4408  slen = str_strlen(str, enc); /* rb_enc_check */
4409 
4410  if (slen < beg) {
4411  out_of_range:
4412  rb_raise(rb_eIndexError, "index %ld out of string", beg);
4413  }
4414  if (beg < 0) {
4415  if (beg + slen < 0) {
4416  goto out_of_range;
4417  }
4418  beg += slen;
4419  }
4420  assert(beg >= 0);
4421  assert(beg <= slen);
4422  if (len > slen - beg) {
4423  len = slen - beg;
4424  }
4425  str_modify_keep_cr(str);
 /* locate the byte range [p, e); a NULL from str_nth is replaced by the
  * end of the string */
4426  p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
4427  if (!p) p = RSTRING_END(str);
4428  e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
4429  if (!e) e = RSTRING_END(str);
4430  /* error check */
4431  beg = p - RSTRING_PTR(str); /* physical position */
4432  len = e - p; /* physical length */
4433  rb_str_splice_0(str, beg, len, val);
4434  rb_enc_associate(str, enc);
4436  if (cr != ENC_CODERANGE_BROKEN)
4437  ENC_CODERANGE_SET(str, cr);
4438 }
4439 
4440 #define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
4441 
/*
 * Replaces the part of str matched by one group of the regexp re with val.
 *
 * backref is resolved to a group number with rb_reg_backref_number(); a
 * negative number counts from the end of the group list.  The byte range
 * [BEG(nth), END(nth)) is handed to rb_str_splice_0() and the encoding
 * returned by rb_enc_check_str() is then associated with str.
 *
 * Raises IndexError when the regexp does not match, when the group number
 * is out of range, or when the selected group did not participate in the
 * match.
 *
 * NOTE(review): the declarator line (listing line 4443) is missing from
 * this listing; the body uses str, re, backref and val.
 */
4442 static void
4444 {
4445  int nth;
4446  VALUE match;
4447  long start, end, len;
4448  rb_encoding *enc;
4449  struct re_registers *regs;
4450 
4451  if (rb_reg_search(re, str, 0, 0) < 0) {
4452  rb_raise(rb_eIndexError, "regexp not matched");
4453  }
4454  match = rb_backref_get();
4455  nth = rb_reg_backref_number(match, backref);
4456  regs = RMATCH_REGS(match);
4457  if (nth >= regs->num_regs) {
4458  out_of_range:
4459  rb_raise(rb_eIndexError, "index %d out of regexp", nth);
4460  }
4461  if (nth < 0) {
4462  if (-nth >= regs->num_regs) {
4463  goto out_of_range;
4464  }
4465  nth += regs->num_regs;
4466  }
4467 
4468  start = BEG(nth);
4469  if (start == -1) {
4470  rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
4471  }
4472  end = END(nth);
4473  len = end - start;
4474  StringValue(val);
4475  enc = rb_enc_check_str(str, val);
4476  rb_str_splice_0(str, start, len, val);
4477  rb_enc_associate(str, enc);
4478 }
4479 
/*
 * Single-index element assignment; returns val.  Dispatches on indx:
 *  - Fixnum: replaces one character at that index (rb_str_splice is the
 *    macro alias of rb_str_update defined above);
 *  - Regexp: rb_str_subpat_set() with group 0;
 *  - String: replaces the first occurrence of indx, raising IndexError
 *    ("string not matched") when there is none;
 *  - anything else, including special constants: tried as a Range through
 *    rb_range_beg_len(), and if that fails it is converted with NUM2LONG
 *    and handled like an integer index.
 * NOTE(review): the declarator line (listing line 4481) is missing from
 * this listing; the body uses str, indx and val.
 */
4480 static VALUE
4482 {
4483  long idx, beg;
4484 
4485  if (FIXNUM_P(indx)) {
4486  idx = FIX2LONG(indx);
4487  num_index:
4488  rb_str_splice(str, idx, 1, val);
4489  return val;
4490  }
4491 
4492  if (SPECIAL_CONST_P(indx)) goto generic;
4493  switch (TYPE(indx)) {
4494  case T_REGEXP:
4495  rb_str_subpat_set(str, indx, INT2FIX(0), val);
4496  return val;
4497 
4498  case T_STRING:
4499  beg = rb_str_index(str, indx, 0);
4500  if (beg < 0) {
4501  rb_raise(rb_eIndexError, "string not matched");
4502  }
 /* beg is passed through rb_str_sublen() before being used as the
  * index for rb_str_splice() */
4503  beg = rb_str_sublen(str, beg);
4504  rb_str_splice(str, beg, str_strlen(indx, NULL), val);
4505  return val;
4506 
4507  generic:
4508  default:
4509  /* check if indx is Range */
4510  {
4511  long beg, len;
4512  if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
4513  rb_str_splice(str, beg, len, val);
4514  return val;
4515  }
4516  }
4517  idx = NUM2LONG(indx);
4518  goto num_index;
4519  }
4520 }
4521 
4522 /*
4523  * call-seq:
4524  * str[integer] = new_str
4525  * str[integer, integer] = new_str
4526  * str[range] = aString
4527  * str[regexp] = new_str
4528  * str[regexp, integer] = new_str
4529  * str[regexp, name] = new_str
4530  * str[other_str] = new_str
4531  *
4532  * Element Assignment---Replaces some or all of the content of <i>str</i>. The
4533  * portion of the string affected is determined using the same criteria as
4534  * <code>String#[]</code>. If the replacement string is not the same length as
4535  * the text it is replacing, the string will be adjusted accordingly. If the
4536  * regular expression or string used as the index doesn't match a position
4537  * in the string, <code>IndexError</code> is raised. If the regular expression
4538  * form is used, the optional second <code>Integer</code> allows you to specify
4539  * which portion of the match to replace (effectively using the
4540  * <code>MatchData</code> indexing rules). The forms that take an
4541  * <code>Integer</code> will raise an <code>IndexError</code> if the value is
4542  * out of range; the <code>Range</code> form will raise a
4543  * <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
4544  * will raise an <code>IndexError</code> on negative match.
4545  */
4546 
4547 static VALUE
4549 {
 /*
  * NOTE(review): the declarator line (listing line 4548) is missing from
  * this listing; the body uses argc, argv and str.
  *
  * With three arguments: a Regexp first argument goes to
  * rb_str_subpat_set() with argv[1] as the group; otherwise argv[0] and
  * argv[1] are converted with NUM2LONG and used as start and length for
  * rb_str_splice().  The assigned value argv[2] is returned.
  * Any other argument count is validated by rb_check_arity(2, 3) and
  * handled by rb_str_aset().
  */
4550  if (argc == 3) {
4551  if (RB_TYPE_P(argv[0], T_REGEXP)) {
4552  rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
4553  }
4554  else {
4555  rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
4556  }
4557  return argv[2];
4558  }
4559  rb_check_arity(argc, 2, 3);
4560  return rb_str_aset(str, argv[0], argv[1]);
4561 }
4562 
4563 /*
4564  * call-seq:
4565  * str.insert(index, other_str) -> str
4566  *
4567  * Inserts <i>other_str</i> before the character at the given
4568  * <i>index</i>, modifying <i>str</i>. Negative indices count from the
4569  * end of the string, and insert <em>after</em> the given character.
4570  * The intent is to insert <i>other_str</i> so that it starts at the given
4571  * <i>index</i>.
4572  *
4573  * "abcd".insert(0, 'X') #=> "Xabcd"
4574  * "abcd".insert(3, 'X') #=> "abcXd"
4575  * "abcd".insert(4, 'X') #=> "abcdX"
4576  * "abcd".insert(-3, 'X') #=> "abXcd"
4577  * "abcd".insert(-1, 'X') #=> "abcdX"
4578  */
4579 
4580 static VALUE
4582 {
 /*
  * NOTE(review): the declarator line (listing line 4581) is missing from
  * this listing; the body uses str, idx and str2.
  *
  * idx is converted with NUM2LONG.  An index of -1 appends str2 with
  * rb_str_append().  Any other negative index is incremented by one
  * before the zero-length splice, so that the text is inserted after the
  * addressed character.  Returns str.
  */
4583  long pos = NUM2LONG(idx);
4584 
4585  if (pos == -1) {
4586  return rb_str_append(str, str2);
4587  }
4588  else if (pos < 0) {
4589  pos++;
4590  }
4591  rb_str_splice(str, pos, 0, str2);
4592  return str;
4593 }
4594 
4595 
4596 /*
4597  * call-seq:
4598  * str.slice!(integer) -> new_str or nil
4599  * str.slice!(integer, integer) -> new_str or nil
4600  * str.slice!(range) -> new_str or nil
4601  * str.slice!(regexp) -> new_str or nil
4602  * str.slice!(other_str) -> new_str or nil
4603  *
4604  * Deletes the specified portion from <i>str</i>, and returns the portion
4605  * deleted.
4606  *
4607  * string = "this is a string"
4608  * string.slice!(2) #=> "i"
4609  * string.slice!(3..6) #=> " is "
4610  * string.slice!(/s.*t/) #=> "sa st"
4611  * string.slice!("r") #=> "r"
4612  * string #=> "thing"
4613  */
4614 
4615 static VALUE
4617 {
 /*
  * NOTE(review): the declarator line (listing line 4616) is missing from
  * this listing; the body uses argc, argv and str.
  *
  * Accepts one or two arguments.  They are copied into buf, which has one
  * spare slot.  The selected portion is first read with rb_str_aref_m().
  * If that is not nil, an empty string is appended to buf as the
  * replacement and rb_str_aset_m() is called with argc+1 arguments.
  * Returns the portion that was read (nil if nothing was selected).
  */
4618  VALUE result;
4619  VALUE buf[3];
4620  int i;
4621 
4622  rb_check_arity(argc, 1, 2);
4623  for (i=0; i<argc; i++) {
4624  buf[i] = argv[i];
4625  }
4626  str_modify_keep_cr(str);
4627  result = rb_str_aref_m(argc, buf, str);
4628  if (!NIL_P(result)) {
 /* i equals argc here, so this fills the slot after the copied args */
4629  buf[i] = rb_str_new(0,0);
4630  rb_str_aset_m(argc+1, buf, str);
4631  }
4632  return result;
4633 }
4634 
/*
 * Normalises a pattern argument.  A Regexp is returned as is.  A String is
 * passed to rb_reg_regcomp() and that result is returned.  Any other
 * object, including special constants, is first converted with
 * rb_check_string_type(); when that yields nil, Check_Type(pat, T_REGEXP)
 * is invoked on the original object.
 * NOTE(review): the declarator line (listing line 4636) is missing from
 * this listing; the body uses a parameter named `pat`.
 */
4635 static VALUE
4637 {
4638  VALUE val;
4639 
4640  if (SPECIAL_CONST_P(pat)) goto to_string;
4641  switch (BUILTIN_TYPE(pat)) {
4642  case T_REGEXP:
4643  return pat;
4644 
4645  case T_STRING:
4646  break;
4647 
4648  default:
4649  to_string:
4650  val = rb_check_string_type(pat);
4651  if (NIL_P(val)) {
4652  Check_Type(pat, T_REGEXP);
4653  }
4654  pat = val;
4655  }
4656 
4657  return rb_reg_regcomp(pat);
4658 }
4659 
/*
 * Normalises a pattern argument without compiling strings.  A Regexp is
 * returned as is.  A String, or the result of rb_check_string_type() for
 * other objects, is returned as a String.  When the conversion yields nil,
 * Check_Type(pat, T_REGEXP) is invoked on the original object.
 * NOTE(review): listing line 4682 is missing from this listing, so the
 * action taken when check is non-zero and is_broken_string(pat) is true is
 * not visible here.
 */
4660 static VALUE
4661 get_pat_quoted(VALUE pat, int check)
4662 {
4663  VALUE val;
4664 
4665  if (SPECIAL_CONST_P(pat)) goto to_string;
4666  switch (BUILTIN_TYPE(pat)) {
4667  case T_REGEXP:
4668  return pat;
4669 
4670  case T_STRING:
4671  break;
4672 
4673  default:
4674  to_string:
4675  val = rb_check_string_type(pat);
4676  if (NIL_P(val)) {
4677  Check_Type(pat, T_REGEXP);
4678  }
4679  pat = val;
4680  }
4681  if (check && is_broken_string(pat)) {
4683  }
4684  return pat;
4685 }
4686 
/*
 * Searches str for pat starting at pos and returns the value produced by
 * the underlying search.  Callers in this file treat a negative result as
 * "no match".
 *
 * A String pattern is located with rb_strseq_index().  If set_backref_str
 * is non-zero and a match was found, the back-reference is set from a
 * frozen copy of str with rb_backref_set_string(), and OBJ_INFECT() is
 * applied to the resulting match with pat.  Any other pattern is forwarded
 * to rb_reg_search0().
 *
 * NOTE(review): listing line 4701 is missing from this listing, so what
 * happens when a String pattern is not found and set_backref_str is
 * non-zero is not visible here.
 */
4687 static long
4688 rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
4689 {
4690  if (BUILTIN_TYPE(pat) == T_STRING) {
4691  pos = rb_strseq_index(str, pat, pos, 1);
4692  if (set_backref_str) {
4693  if (pos >= 0) {
4694  VALUE match;
4695  str = rb_str_new_frozen(str);
4696  rb_backref_set_string(str, pos, RSTRING_LEN(pat));
4697  match = rb_backref_get();
4698  OBJ_INFECT(match, pat);
4699  }
4700  else {
4702  }
4703  }
4704  return pos;
4705  }
4706  else {
4707  return rb_reg_search0(pat, str, pos, 0, set_backref_str);
4708  }
4709 }
4710 
4711 
4712 /*
4713  * call-seq:
4714  * str.sub!(pattern, replacement) -> str or nil
4715  * str.sub!(pattern) {|match| block } -> str or nil
4716  *
4717  * Performs the same substitution as String#sub in-place.
4718  *
4719  * Returns +str+ if a substitution was performed or +nil+ if no substitution
4720  * was performed.
4721  */
4722 
4723 static VALUE
4725 {
 /*
  * NOTE(review): the declarator line (listing line 4724) is missing from
  * this listing; the body uses argc, argv and str.
  *
  * Replaces the first match of the pattern argv[0] in str, in place.
  * Returns str when a replacement was made and Qnil when the pattern did
  * not match.
  *
  * The replacement comes from one of three sources:
  *  - block form (one argument): the value yielded back by the block;
  *  - hash form: the hash entry for the matched text;
  *  - string form: argv[1] expanded with rb_reg_regsub().
  * A block makes the second argument optional (min_arity 1 instead of 2).
  */
4726  VALUE pat, repl, hash = Qnil;
4727  int iter = 0;
4728  int tainted = 0;
4729  long plen;
4730  int min_arity = rb_block_given_p() ? 1 : 2;
4731  long beg;
4732 
4733  rb_check_arity(argc, min_arity, 2);
4734  if (argc == 1) {
4735  iter = 1;
4736  }
4737  else {
4738  repl = argv[1];
4739  hash = rb_check_hash_type(argv[1]);
4740  if (NIL_P(hash)) {
4741  StringValue(repl);
4742  }
4743  tainted = OBJ_TAINTED_RAW(repl);
4744  }
4745 
4746  pat = get_pat_quoted(argv[0], 1);
4747 
4748  str_modifiable(str);
4749  beg = rb_pat_search(pat, str, 0, 1);
4750  if (beg >= 0) {
4751  rb_encoding *enc;
4752  int cr = ENC_CODERANGE(str);
4753  long beg0, end0;
4754  VALUE match, match0 = Qnil;
4755  struct re_registers *regs;
4756  char *p, *rp;
4757  long len, rlen;
4758 
4759  match = rb_backref_get();
4760  regs = RMATCH_REGS(match);
 /* [beg0, end0) is the byte range of the whole match */
4761  if (RB_TYPE_P(pat, T_STRING)) {
4762  beg0 = beg;
4763  end0 = beg0 + RSTRING_LEN(pat);
4764  match0 = pat;
4765  }
4766  else {
4767  beg0 = BEG(0);
4768  end0 = END(0);
4769  if (iter) match0 = rb_reg_nth_match(0, match);
4770  }
4771 
4772  if (iter || !NIL_P(hash)) {
 /* pointer and length are captured before user code runs and are
  * handed to str_mod_check() afterwards */
4773  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
4774 
4775  if (iter) {
4776  repl = rb_obj_as_string(rb_yield(match0));
4777  }
4778  else {
4779  repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
4780  repl = rb_obj_as_string(repl);
4781  }
4782  str_mod_check(str, p, len);
4783  rb_check_frozen(str);
4784  }
4785  else {
4786  repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
4787  }
4788 
 /* if no common encoding exists, the replacement's encoding is used, but
  * only when the text before and after the match scans as 7-bit;
  * otherwise EncCompatError is raised */
4789  enc = rb_enc_compatible(str, repl);
4790  if (!enc) {
4791  rb_encoding *str_enc = STR_ENC_GET(str);
4792  p = RSTRING_PTR(str); len = RSTRING_LEN(str);
4793  if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
4794  coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
4795  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
4796  rb_enc_name(str_enc),
4797  rb_enc_name(STR_ENC_GET(repl)));
4798  }
4799  enc = STR_ENC_GET(repl);
4800  }
4801  rb_str_modify(str);
4802  rb_enc_associate(str, enc);
4803  tainted |= OBJ_TAINTED_RAW(repl);
 /* combine the cached coderange of str (captured above) with that of
  * the replacement; this only happens when str's coderange lies strictly
  * between UNKNOWN and BROKEN */
4804  if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
4805  int cr2 = ENC_CODERANGE(repl);
4806  if (cr2 == ENC_CODERANGE_BROKEN ||
4807  (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
4808  cr = ENC_CODERANGE_UNKNOWN;
4809  else
4810  cr = cr2;
4811  }
 /* byte-level splice: grow if needed, shift the tail, copy repl in */
4812  plen = end0 - beg0;
4813  rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
4814  len = RSTRING_LEN(str);
4815  if (rlen > plen) {
4816  RESIZE_CAPA(str, len + rlen - plen);
4817  }
4818  p = RSTRING_PTR(str);
4819  if (rlen != plen) {
4820  memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
4821  }
4822  memcpy(p + beg0, rp, rlen);
4823  len += rlen - plen;
4824  STR_SET_LEN(str, len);
4825  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
4826  ENC_CODERANGE_SET(str, cr);
4827  FL_SET_RAW(str, tainted);
4828 
4829  return str;
4830  }
4831  return Qnil;
4832 }
4833 
4834 
4835 /*
4836  * call-seq:
4837  * str.sub(pattern, replacement) -> new_str
4838  * str.sub(pattern, hash) -> new_str
4839  * str.sub(pattern) {|match| block } -> new_str
4840  *
4841  * Returns a copy of +str+ with the _first_ occurrence of +pattern+
4842  * replaced by the second argument. The +pattern+ is typically a Regexp; if
4843  * given as a String, any regular expression metacharacters it contains will
4844  * be interpreted literally, e.g. <code>'\\\d'</code> will match a backslash
4845  * followed by 'd', instead of a digit.
4846  *
4847  * If +replacement+ is a String it will be substituted for the matched text.
4848  * It may contain back-references to the pattern's capture groups of the form
4849  * <code>"\\d"</code>, where <i>d</i> is a group number, or
4850  * <code>"\\k<n>"</code>, where <i>n</i> is a group name. If it is a
4851  * double-quoted string, both back-references must be preceded by an
4852  * additional backslash. However, within +replacement+ the special match
4853  * variables, such as <code>$&</code>, will not refer to the current match.
4854  * If +replacement+ is a String that looks like a pattern's capture group but
4855  * is actually not a pattern capture group e.g. <code>"\\'"</code>, then it
4856  * will have to be preceded by two backslashes like so <code>"\\\\'"</code>.
4857  *
4858  * If the second argument is a Hash, and the matched text is one of its keys,
4859  * the corresponding value is the replacement string.
4860  *
4861  * In the block form, the current match string is passed in as a parameter,
4862  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
4863  * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
4864  * returned by the block will be substituted for the match on each call.
4865  *
4866  * The result inherits any tainting in the original string or any supplied
4867  * replacement string.
4868  *
4869  * "hello".sub(/[aeiou]/, '*') #=> "h*llo"
4870  * "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo"
4871  * "hello".sub(/./) {|s| s.ord.to_s + ' ' } #=> "104 ello"
4872  * "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo"
4873  * 'Is SHELL your preferred shell?'.sub(/[[:upper:]]{2,}/, ENV)
4874  * #=> "Is /bin/bash your preferred shell?"
4875  */
4876 
4877 static VALUE
4879 {
 /*
  * NOTE(review): the declarator line (listing line 4878) is missing from
  * this listing; the body uses argc, argv and str.
  *
  * Duplicates str, runs rb_str_sub_bang() on the duplicate and returns the
  * duplicate.  The return value of rb_str_sub_bang() is ignored, so the
  * copy is returned whether or not a substitution took place.
  */
4880  str = rb_str_dup(str);
4881  rb_str_sub_bang(argc, argv, str);
4882  return str;
4883 }
4884 
/*
 * Global substitution: builds a new buffer (dest) in which every match of
 * argv[0] in str is replaced.
 *
 * The replacement source is chosen by `mode`:
 *   STR  - argv[1] is a string, expanded with rb_reg_regsub();
 *   ITER - one argument: the block is yielded each match.  Without a
 *          block, RETURN_ENUMERATOR is used;
 *   MAP  - argv[1] is a hash, looked up with the matched text.
 *
 * When bang is non-zero, Qnil is returned if nothing matched; otherwise the
 * contents of str are replaced by dest with str_shared_replace() and str is
 * returned.  When bang is zero, a duplicate of str is returned if nothing
 * matched; otherwise dest is returned with its class set to
 * rb_obj_class(str).
 *
 * NOTE(review): listing line 4933 is missing from this listing, so the
 * statement between rb_enc_associate(dest, str_enc) and the loop is not
 * visible here.
 */
4885 static VALUE
4886 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
4887 {
4888  VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
4889  struct re_registers *regs;
4890  long beg, beg0, end0;
4891  long offset, blen, slen, len, last;
4892  enum {STR, ITER, MAP} mode = STR;
4893  char *sp, *cp;
4894  int tainted = 0;
 /* tri-state: -1 = not yet known, 0 = replacement contains nothing to
  * expand, non-zero = rb_reg_regsub() must run for every match */
4895  int need_backref = -1;
4896  rb_encoding *str_enc;
4897 
4898  switch (argc) {
4899  case 1:
4900  RETURN_ENUMERATOR(str, argc, argv);
4901  mode = ITER;
4902  break;
4903  case 2:
4904  repl = argv[1];
4905  hash = rb_check_hash_type(argv[1]);
4906  if (NIL_P(hash)) {
4907  StringValue(repl);
4908  }
4909  else {
4910  mode = MAP;
4911  }
4912  tainted = OBJ_TAINTED_RAW(repl);
4913  break;
4914  default:
4915  rb_check_arity(argc, 1, 2);
4916  }
4917 
4918  pat = get_pat_quoted(argv[0], 1);
4919  beg = rb_pat_search(pat, str, 0, need_backref);
4920  if (beg < 0) {
4921  if (bang) return Qnil; /* no match, no substitution */
4922  return rb_str_dup(str);
4923  }
4924 
4925  offset = 0;
4926  blen = RSTRING_LEN(str) + 30; /* len + margin */
4927  dest = rb_str_buf_new(blen);
4928  sp = RSTRING_PTR(str);
4929  slen = RSTRING_LEN(str);
4930  cp = sp;
4931  str_enc = STR_ENC_GET(str);
4932  rb_enc_associate(dest, str_enc);
4934 
4935  do {
4936  match = rb_backref_get();
4937  regs = RMATCH_REGS(match);
 /* [beg0, end0) is the byte range of the current match */
4938  if (RB_TYPE_P(pat, T_STRING)) {
4939  beg0 = beg;
4940  end0 = beg0 + RSTRING_LEN(pat);
4941  match0 = pat;
4942  }
4943  else {
4944  beg0 = BEG(0);
4945  end0 = END(0);
4946  if (mode == ITER) match0 = rb_reg_nth_match(0, match);
4947  }
4948 
 /* mode is non-zero for ITER and MAP, i.e. whenever user code runs */
4949  if (mode) {
4950  if (mode == ITER) {
4951  val = rb_obj_as_string(rb_yield(match0));
4952  }
4953  else {
4954  val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
4955  val = rb_obj_as_string(val);
4956  }
4957  str_mod_check(str, sp, slen);
4958  if (val == dest) { /* paranoid check [ruby-dev:24827] */
4959  rb_raise(rb_eRuntimeError, "block should not cheat");
4960  }
4961  }
4962  else if (need_backref) {
4963  val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
 /* first expansion decides: if rb_reg_regsub() handed back repl itself,
  * later matches reuse repl directly and searches are called with 0 */
4964  if (need_backref < 0) {
4965  need_backref = val != repl;
4966  }
4967  }
4968  else {
4969  val = repl;
4970  }
4971 
4972  tainted |= OBJ_TAINTED_RAW(val);
4973 
4974  len = beg0 - offset; /* copy pre-match substr */
4975  if (len) {
4976  rb_enc_str_buf_cat(dest, cp, len, str_enc);
4977  }
4978 
4979  rb_str_buf_append(dest, val);
4980 
 /* `last` keeps the search offset that produced the current match */
4981  last = offset;
4982  offset = end0;
4983  if (beg0 == end0) {
4984  /*
4985  * Always consume at least one character of the input string
4986  * in order to prevent infinite loops.
4987  */
4988  if (RSTRING_LEN(str) <= end0) break;
4989  len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
4990  rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
4991  offset = end0 + len;
4992  }
4993  cp = RSTRING_PTR(str) + offset;
4994  if (offset > RSTRING_LEN(str)) break;
4995  beg = rb_pat_search(pat, str, offset, need_backref);
4996  } while (beg >= 0);
 /* copy whatever follows the final match */
4997  if (RSTRING_LEN(str) > offset) {
4998  rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
4999  }
 /* search once more from `last` with set_backref_str = 1.
  * NOTE(review): presumably leaves the back-reference describing the
  * last successful match - confirm */
5000  rb_pat_search(pat, str, last, 1);
5001  if (bang) {
5002  str_shared_replace(str, dest);
5003  }
5004  else {
5005  RBASIC_SET_CLASS(dest, rb_obj_class(str));
5006  tainted |= OBJ_TAINTED_RAW(str);
5007  str = dest;
5008  }
5009 
5010  FL_SET_RAW(str, tainted);
5011  return str;
5012 }
5013 
5014 
5015 /*
5016  * call-seq:
5017  * str.gsub!(pattern, replacement) -> str or nil
5018  * str.gsub!(pattern, hash) -> str or nil
5019  * str.gsub!(pattern) {|match| block } -> str or nil
5020  * str.gsub!(pattern) -> an_enumerator
5021  *
5022  * Performs the substitutions of <code>String#gsub</code> in place, returning
5023  * <i>str</i>, or <code>nil</code> if no substitutions were performed.
5024  * If no block and no <i>replacement</i> is given, an enumerator is returned instead.
5025  */
5026 
5027 static VALUE
5029 {
 /* NOTE(review): listing line 5028 (function name and parameter list) is absent from this extract. */
 /* Presumably checks that str may be modified and prepares its buffer while keeping the
  * cached code range -- confirm against str_modify_keep_cr's definition. */
5030  str_modify_keep_cr(str);
 /* The trailing 1 is presumably str_gsub's `bang` flag: with it set, str_gsub returns nil
  * when nothing matched and otherwise replaces the contents of str itself. */
5031  return str_gsub(argc, argv, str, 1);
5032 }
5033 
5034 
5035 /*
5036  * call-seq:
5037  * str.gsub(pattern, replacement) -> new_str
5038  * str.gsub(pattern, hash) -> new_str
5039  * str.gsub(pattern) {|match| block } -> new_str
5040  * str.gsub(pattern) -> enumerator
5041  *
5042  * Returns a copy of <i>str</i> with <em>all</em> occurrences of
5043  * <i>pattern</i> substituted for the second argument. The <i>pattern</i> is
5044  * typically a <code>Regexp</code>; if given as a <code>String</code>, any
5045  * regular expression metacharacters it contains will be interpreted
5046  * literally, e.g. <code>'\\\d'</code> will match a backslash followed by 'd',
5047  * instead of a digit.
5048  *
5049  * If <i>replacement</i> is a <code>String</code> it will be substituted for
5050  * the matched text. It may contain back-references to the pattern's capture
5051  * groups of the form <code>\\\d</code>, where <i>d</i> is a group number, or
5052  * <code>\\\k<n></code>, where <i>n</i> is a group name. If it is a
5053  * double-quoted string, both back-references must be preceded by an
5054  * additional backslash. However, within <i>replacement</i> the special match
5055  * variables, such as <code>$&</code>, will not refer to the current match.
5056  *
5057  * If the second argument is a <code>Hash</code>, and the matched text is one
5058  * of its keys, the corresponding value is the replacement string.
5059  *
5060  * In the block form, the current match string is passed in as a parameter,
5061  * and variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
5062  * <code>$&</code>, and <code>$'</code> will be set appropriately. The value
5063  * returned by the block will be substituted for the match on each call.
5064  *
5065  * The result inherits any tainting in the original string or any supplied
5066  * replacement string.
5067  *
5068  * When neither a block nor a second argument is supplied, an
5069  * <code>Enumerator</code> is returned.
5070  *
5071  * "hello".gsub(/[aeiou]/, '*') #=> "h*ll*"
5072  * "hello".gsub(/([aeiou])/, '<\1>') #=> "h<e>ll<o>"
5073  * "hello".gsub(/./) {|s| s.ord.to_s + ' '} #=> "104 101 108 108 111 "
5074  * "hello".gsub(/(?<foo>[aeiou])/, '{\k<foo>}') #=> "h{e}ll{o}"
5075  * 'hello'.gsub(/[eo]/, 'e' => 3, 'o' => '*') #=> "h3ll*"
5076  */
5077 
5078 static VALUE
5080 {
 /* NOTE(review): listing line 5079 (function name and parameter list) is absent from this extract. */
 /* Delegates to str_gsub with a trailing 0, i.e. not the in-place variant (which passes 1);
  * str_gsub then returns a separate result string instead of overwriting str. */
5081  return str_gsub(argc, argv, str, 0);
5082 }
5083 
5084 
5085 /*
5086  * call-seq:
5087  * str.replace(other_str) -> str
5088  *
5089  * Replaces the contents and taintedness of <i>str</i> with the corresponding
5090  * values in <i>other_str</i>.
5091  *
5092  * s = "hello" #=> "hello"
5093  * s.replace "world" #=> "world"
5094  */
5095 
5096 VALUE
5098 {
 /* NOTE(review): listing line 5097 (function name and parameter list) is absent from this extract;
  * the body uses two values, str (the receiver) and str2 (the new contents). */
 /* Presumably raises when str may not be modified -- confirm against str_modifiable. */
5099  str_modifiable(str);
 /* Replacing a string with itself is a no-op. */
5100  if (str == str2) return str;
5101 
 /* Coerce str2 to a String before touching str's storage. */
5102  StringValue(str2);
 /* Presumably releases str's current buffer -- confirm against str_discard. */
5103  str_discard(str);
5104  return str_replace(str, str2);
5105 }
5106 
5107 /*
5108  * call-seq:
5109  * string.clear -> string
5110  *
5111  * Makes string empty.
5112  *
5113  * a = "abcde"
5114  * a.clear #=> ""
5115  */
5116 
5117 static VALUE
5119 {
 /* NOTE(review): listing line 5118 (function name and parameter list) is absent from this extract. */
 /* Presumably releases the current buffer -- confirm against str_discard. */
5120  str_discard(str);
 /* Switch to the embedded representation with length 0 and a NUL first byte. */
5121  STR_SET_EMBED(str);
5122  STR_SET_EMBED_LEN(str, 0);
5123  RSTRING_PTR(str)[0] = 0;
 /* NOTE(review): listing lines 5125 and 5127 (the bodies of the two branches below) are absent
  * from this extract; the branch is chosen by whether the string's encoding is ASCII-compatible. */
5124  if (rb_enc_asciicompat(STR_ENC_GET(str)))
5126  else
5128  return str;
5129 }
5130 
5131 /*
5132  * call-seq:
5133  * string.chr -> string
5134  *
5135  * Returns a one-character string at the beginning of the string.
5136  *
5137  * a = "abcde"
5138  * a.chr #=> "a"
5139  */
5140 
5141 static VALUE
5143 {
 /* NOTE(review): listing line 5142 (function name and parameter list) is absent from this extract. */
 /* Delegates to rb_str_substr with start 0 and length 1. */
5144  return rb_str_substr(str, 0, 1);
5145 }
5146 
5147 /*
5148  * call-seq:
5149  * str.getbyte(index) -> 0 .. 255
5150  *
5151  * returns the <i>index</i>th byte as an integer.
5152  */
5153 static VALUE
5155 {
 /* NOTE(review): listing line 5154 (function name and parameter list) is absent from this extract;
  * the body uses str (the receiver) and index (the requested byte position). */
5156  long pos = NUM2LONG(index);
5157 
 /* A negative index counts back from the end of the string. */
5158  if (pos < 0)
5159  pos += RSTRING_LEN(str);
 /* Still out of range after adjustment: return nil rather than raising. */
5160  if (pos < 0 || RSTRING_LEN(str) <= pos)
5161  return Qnil;
5162 
 /* Cast through unsigned char so the result is always in 0..255. */
5163  return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
5164 }
5165 
5166 /*
5167  * call-seq:
5168  * str.setbyte(index, integer) -> integer
5169  *
5170  * Sets the <i>index</i>th byte of <i>str</i> to <i>integer</i>.
5171  */
5172 static VALUE
5173 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
5174 {
 /*
  * Overwrites one byte of str.
  *   str   - string to modify
  *   index - byte position; a negative value counts from the end
  *   value - new byte value, converted with NUM2INT
  * Returns value unchanged. Raises IndexError when index is outside the string.
  */
5175  long pos = NUM2LONG(index);
5176  int byte = NUM2INT(value);
5177  long len = RSTRING_LEN(str);
5178  char *head, *ptr, *left = 0;
5179  rb_encoding *enc;
5180  int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
5181 
 /* Valid positions are -len .. len-1; the message reports the index as given. */
5182  if (pos < -len || len <= pos)
5183  rb_raise(rb_eIndexError, "index %ld out of string", pos);
5184  if (pos < 0)
5185  pos += len;
5186 
 /* Make sure this string does not share its buffer before writing into it. */
5187  if (!str_independent(str))
5188  str_make_independent(str);
5189  enc = STR_ENC_GET(str);
5190  head = RSTRING_PTR(str);
5191  ptr = &head[pos];
 /* Only for strings too long to embed: try to update the cached code range from the
  * character around pos instead of discarding it. */
5192  if (!STR_EMBEDDABLE_P(len, rb_enc_mbminlen(enc))) {
5193  cr = ENC_CODERANGE(str);
5194  switch (cr) {
5195  case ENC_CODERANGE_7BIT:
5196  left = ptr;
5197  *ptr = byte;
 /* `break` leaves the switch, so the code after it clears the code range and stores
  * the byte a second time. */
5198  if (ISASCII(byte)) break;
5199  nlen = rb_enc_precise_mbclen(left, head+len, enc);
 /* NOTE(review): listing lines 5201 and 5203 (the two branch bodies) are absent from this extract. */
5200  if (!MBCLEN_CHARFOUND_P(nlen))
5202  else
5204  goto end;
5205  case ENC_CODERANGE_VALID:
 /* Measure the character containing pos before and after the write. */
5206  left = rb_enc_left_char_head(head, ptr, head+len, enc);
5207  width = rb_enc_precise_mbclen(left, head+len, enc);
5208  *ptr = byte;
5209  nlen = rb_enc_precise_mbclen(left, head+len, enc);
 /* NOTE(review): listing line 5211 (body of the first branch) is absent from this extract. */
5210  if (!MBCLEN_CHARFOUND_P(nlen))
 /* The character length changed, or an ASCII byte was written: drop the cached code range. */
5212  else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
5213  ENC_CODERANGE_CLEAR(str);
5214  goto end;
5215  }
5216  }
 /* Generic path: forget the cached code range and store the byte. */
5217  ENC_CODERANGE_CLEAR(str);
5218  *ptr = byte;
5219 
5220  end:
5221  return value;
5222 }
5223 
/*
 * Builds the byte-indexed substring of str starting at byte offset beg with at most len bytes.
 *   beg   - byte offset; a negative value counts from the end
 *   len   - requested byte count; clamped to the bytes available after beg
 *   empty - when zero, a result of zero length is reported as nil instead of an empty string
 * Returns nil when beg is past the end, len is negative, or a negative beg reaches before the start.
 */
5224 static VALUE
5225 str_byte_substr(VALUE str, long beg, long len, int empty)
5226 {
5227  char *p, *s = RSTRING_PTR(str);
5228  long n = RSTRING_LEN(str);
5229  VALUE str2;
5230 
5231  if (beg > n || len < 0) return Qnil;
5232  if (beg < 0) {
5233  beg += n;
5234  if (beg < 0) return Qnil;
5235  }
 /* Clamp the length to what is left after beg. */
5236  if (len > n - beg)
5237  len = n - beg;
5238  if (len <= 0) {
5239  if (!empty) return Qnil;
5240  len = 0;
5241  p = 0;
5242  }
5243  else
5244  p = s + beg;
5245 
 /* Too long to embed and eligible for sharing: point the new string into a frozen copy's
  * buffer instead of copying the bytes. */
5246  if (!STR_EMBEDDABLE_P(len, TERM_LEN(str)) && SHARABLE_SUBSTRING_P(beg, len, n)) {
5247  str2 = rb_str_new_frozen(str);
5248  str2 = str_new_shared(rb_obj_class(str2), str2);
5249  RSTRING(str2)->as.heap.ptr += beg;
5250  RSTRING(str2)->as.heap.len = len;
5251  }
5252  else {
5253  str2 = rb_str_new_with_class(str, p, len);
5254  }
5255 
5256  str_enc_copy(str2, str);
5257 
 /* NOTE(review): listing lines 5260, 5262, 5267 and 5270 (the statements inside the branches
  * below) are absent from this extract; they presumably set str2's code range -- confirm. */
5258  if (RSTRING_LEN(str2) == 0) {
5259  if (!rb_enc_asciicompat(STR_ENC_GET(str)))
5261  else
5263  }
5264  else {
5265  switch (ENC_CODERANGE(str)) {
5266  case ENC_CODERANGE_7BIT:
5268  break;
5269  default:
5271  break;
5272  }
5273  }
5274 
5275  OBJ_INFECT_RAW(str2, str);
5276 
5277  return str2;
5278 }
5279 
/*
 * Single-argument byte indexing: indx is either an integer byte offset or a Range of byte offsets.
 * NOTE(review): listing line 5281 (function name and parameter list) is absent from this extract;
 * the body uses str (the receiver) and indx (the index object).
 */
5280 static VALUE
5282 {
5283  long idx;
5284  if (FIXNUM_P(indx)) {
5285  idx = FIX2LONG(indx);
5286  }
5287  else {
5288  /* check if indx is Range */
5289  long beg, len = RSTRING_LEN(str);
5290 
5291  switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
 /* Qfalse: fall through to the integer conversion below. */
5292  case Qfalse:
5293  break;
5294  case Qnil:
5295  return Qnil;
5296  default:
 /* Range case: a zero-length result is returned as an empty string (empty = TRUE). */
5297  return str_byte_substr(str, beg, len, TRUE);
5298  }
5299 
5300  idx = NUM2LONG(indx);
5301  }
 /* Integer case: one byte at idx; a zero-length result becomes nil (empty = FALSE). */
5302  return str_byte_substr(str, idx, 1, FALSE);
5303 }
5304 
5305 /*
5306  * call-seq:
5307  * str.byteslice(integer) -> new_str or nil
5308  * str.byteslice(integer, integer) -> new_str or nil
5309  * str.byteslice(range) -> new_str or nil
5310  *
5311  * Byte Reference---If passed a single <code>Integer</code>, returns a
5312  * substring of one byte at that position. If passed two <code>Integer</code>
5313  * objects, returns a substring starting at the offset given by the first, and
5314  * a length given by the second. If given a <code>Range</code>, a substring containing
5315  * bytes at offsets given by the range is returned. In all three cases, if
5316  * an offset is negative, it is counted from the end of <i>str</i>. Returns
5317  * <code>nil</code> if the initial offset falls outside the string, the length
5318  * is negative, or the beginning of the range is greater than the end.
5319  * The resulting string keeps the encoding of the original string.
5320  *
5321  * "hello".byteslice(1) #=> "e"
5322  * "hello".byteslice(-1) #=> "o"
5323  * "hello".byteslice(1, 2) #=> "el"
5324  * "\x80\u3042".byteslice(1, 3) #=> "\u3042"
5325  * "\x03\u3042\xff".byteslice(1..3) #=> "\u3042"
5326  */
5327 
5328 static VALUE
5330 {
 /* NOTE(review): listing line 5329 (function name and parameter list) is absent from this extract. */
5331  if (argc == 2) {
5332  long beg = NUM2LONG(argv[0]);
 /* Despite its name, `end` is passed as the length argument of str_byte_substr. */
5333  long end = NUM2LONG(argv[1]);
5334  return str_byte_substr(str, beg, end, TRUE);
5335  }
 /* Any argument count other than 1 or 2 is rejected here. */
5336  rb_check_arity(argc, 1, 2);
5337  return str_byte_aref(str, argv[0]);
5338 }
5339 
5340 /*
5341  * call-seq:
5342  * str.reverse -> new_str
5343  *
5344  * Returns a new string with the characters from <i>str</i> in reverse order.
5345  *
5346  * "stressed".reverse #=> "desserts"
5347  */
5348 
5349 static VALUE
5351 {
 /* NOTE(review): listing line 5350 (function name and parameter list) is absent from this extract. */
5352  rb_encoding *enc;
5353  VALUE rev;
5354  char *s, *e, *p;
5355  int cr;
5356 
 /* Strings of length 0 or 1 are their own reverse; return a duplicate. */
5357  if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
5358  enc = STR_ENC_GET(str);
5359  rev = rb_str_new_with_class(str, 0, RSTRING_LEN(str));
 /* s walks the source forwards; p walks the destination backwards from its end. */
5360  s = RSTRING_PTR(str); e = RSTRING_END(str);
5361  p = RSTRING_END(rev);
5362  cr = ENC_CODERANGE(str);
5363 
5364  if (RSTRING_LEN(str) > 1) {
 /* Byte-wise copy when single_byte_optimizable(str) holds. */
5365  if (single_byte_optimizable(str)) {
5366  while (s < e) {
5367  *--p = *s++;
5368  }
5369  }
 /* Code range is VALID: copy character by character using the fast length lookup. */
5370  else if (cr == ENC_CODERANGE_VALID) {
5371  while (s < e) {
5372  int clen = rb_enc_fast_mbclen(s, e, enc);
5373 
5374  p -= clen;
5375  memcpy(p, s, clen);
5376  s += clen;
5377  }
5378  }
5379  else {
 /* NOTE(review): listing line 5381 (the two operands of this conditional expression) is
  * absent from this extract. */
5380  cr = rb_enc_asciicompat(enc) ?
5382  while (s < e) {
5383  int clen = rb_enc_mbclen(s, e, enc);
5384 
 /* Any multibyte character or high-bit byte resets the result's code range to UNKNOWN. */
5385  if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
5386  p -= clen;
5387  memcpy(p, s, clen);
5388  s += clen;
5389  }
5390  }
5391  }
5392  STR_SET_LEN(rev, RSTRING_LEN(str));
5393  OBJ_INFECT_RAW(rev, str);
5394  str_enc_copy(rev, str);
5395  ENC_CODERANGE_SET(rev, cr);
5396 
5397  return rev;
5398 }
5399 
5400 
5401 /*
5402  * call-seq:
5403  * str.reverse! -> str
5404  *
5405  * Reverses <i>str</i> in place.
5406  */
5407 
5408 static VALUE
5410 {
 /* NOTE(review): listing line 5409 (function name and parameter list) is absent from this extract. */
5411  if (RSTRING_LEN(str) > 1) {
5412  if (single_byte_optimizable(str)) {
5413  char *s, *e, c;
5414 
5415  str_modify_keep_cr(str);
 /* Swap bytes from both ends, moving towards the middle. */
5416  s = RSTRING_PTR(str);
5417  e = RSTRING_END(str) - 1;
5418  while (s < e) {
5419  c = *s;
5420  *s++ = *e;
5421  *e-- = c;
5422  }
5423  }
5424  else {
 /* NOTE(review): listing line 5425 (the body of this branch, taken when the byte-wise
  * swap above does not apply) is absent from this extract. */
5426  }
5427  }
5428  else {
 /* Nothing to reverse, but str_modify_keep_cr is still called on the string. */
5429  str_modify_keep_cr(str);
5430  }
5431  return str;
5432 }
5433 
5434 
5435 /*
5436  * call-seq:
5437  * str.include? other_str -> true or false
5438  *
5439  * Returns <code>true</code> if <i>str</i> contains the given string or
5440  * character.
5441  *
5442  * "hello".include? "lo" #=> true
5443  * "hello".include? "ol" #=> false
5444  * "hello".include? ?h #=> true
5445  */
5446 
5447 static VALUE
5449 {
 /* NOTE(review): listing line 5448 (function name and parameter list) is absent from this extract;
  * the body uses str (the receiver) and arg (the string searched for). */
5450  long i;
5451 
 /* Coerce the argument to a String before searching. */
5452  StringValue(arg);
 /* Search from offset 0; a result of -1 is treated as "not found". */
5453  i = rb_str_index(str, arg, 0);
5454 
5455  if (i == -1) return Qfalse;
5456  return Qtrue;
5457 }
5458 
5459 
5460 /*
5461  * call-seq:
5462  * str.to_i(base=10) -> integer
5463  *
5464  * Returns the result of interpreting leading characters in <i>str</i> as an
5465  * integer base <i>base</i> (between 2 and 36). Extraneous characters past the
5466  * end of a valid number are ignored. If there is not a valid number at the
5467  * start of <i>str</i>, <code>0</code> is returned. This method never raises an
5468  * exception when <i>base</i> is valid.
5469  *
5470  * "12345".to_i #=> 12345
5471  * "99 red balloons".to_i #=> 99
5472  * "0a".to_i #=> 0
5473  * "0a".to_i(16) #=> 10
5474  * "hello".to_i #=> 0
5475  * "1100101".to_i(2) #=> 101
5476  * "1100101".to_i(8) #=> 294977
5477  * "1100101".to_i(10) #=> 1100101
5478  * "1100101".to_i(16) #=> 17826049
5479  */
5480 
5481 static VALUE
5483 {
 /* NOTE(review): listing line 5482 (function name and parameter list) is absent from this extract. */
5484  int base;
5485 
 /* Without an argument the base is 10. */
5486  if (argc == 0) base = 10;
5487  else {
5488  VALUE b;
5489 
 /* "01": zero required arguments, at most one optional argument. */
5490  rb_scan_args(argc, argv, "01", &b);
5491  base = NUM2INT(b);
5492  }
 /* Only a negative base is rejected here; any further validation is left to rb_str_to_inum. */
5493  if (base < 0) {
5494  rb_raise(rb_eArgError, "invalid radix %d", base);
5495  }
 /* The trailing FALSE presumably disables strict input checking -- confirm against rb_str_to_inum. */
5496  return rb_str_to_inum(str, base, FALSE);
5497 }
5498 
5499 
5500 /*
5501  * call-seq:
5502  * str.to_f -> float
5503  *
5504  * Returns the result of interpreting leading characters in <i>str</i> as a
5505  * floating point number. Extraneous characters past the end of a valid number
5506  * are ignored. If there is not a valid number at the start of <i>str</i>,
5507  * <code>0.0</code> is returned. This method never raises an exception.
5508  *
5509  * "123.45e1".to_f #=> 1234.5
5510  * "45.67 degrees".to_f #=> 45.67
5511  * "thx1138".to_f #=> 0.0
5512  */
5513 
5514 static VALUE
5516 {
 /* NOTE(review): listing line 5515 (function name and parameter list) is absent from this extract. */
 /* Convert with rb_str_to_dbl and wrap the double as a Ruby number; the FALSE argument
  * presumably disables strict input checking -- confirm against rb_str_to_dbl. */
5517  return DBL2NUM(rb_str_to_dbl(str, FALSE));
5518 }
5519 
5520 
5521 /*
5522  * call-seq:
5523  * str.to_s -> str
5524  * str.to_str -> str
5525  *
5526  * Returns +self+.
5527  *
5528  * If called on a subclass of String, converts the receiver to a String object.
5529  */
5530 
5531 static VALUE
5533 {
 /* NOTE(review): listing line 5532 (function name and parameter list) is absent from this extract. */
 /* A receiver whose class is not exactly String is duplicated as a plain String. */
5534  if (rb_obj_class(str) != rb_cString) {
5535  return str_duplicate(rb_cString, str);
5536  }
 /* A plain String is returned as is. */
5537  return str;
5538 }
5539 
#if 0
/*
 * Encodes code point c in encoding enc and appends the resulting bytes to str.
 * Compiled out by the surrounding "#if 0".
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes;

    /* Byte length of c in enc, determined before the character is encoded. */
    nbytes = rb_enc_codelen(c, enc);
    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
5551 
5552 #define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
5553 
5554 int
5555 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
5556 {
5557  char buf[CHAR_ESC_LEN + 1];
5558  int l;
5559 
5560 #if SIZEOF_INT > 4
5561  c &= 0xffffffff;
5562 #endif
5563  if (unicode_p) {
5564  if (c < 0x7F && ISPRINT(c)) {
5565  snprintf(buf, CHAR_ESC_LEN, "%c", c);
5566  }
5567  else if (c < 0x10000) {
5568  snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
5569  }
5570  else {
5571  snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
5572  }
5573  }
5574  else {
5575  if (c < 0x100) {
5576  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
5577  }
5578  else {
5579  snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
5580  }
5581  }
5582  l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
5583  rb_str_buf_cat(result, buf, l);
5584  return l;
5585 }
5586 
/*
 * Builds an escaped copy of str: bytes that need no escaping are copied through in runs,
 * everything else is replaced by a backslash escape.
 * NOTE(review): listing lines 5588 (function name and parameter list), 5596 (presumably the
 * declaration of `result`) and 5645 are absent from this extract.
 */
5587 VALUE
5589 {
5590  int encidx = ENCODING_GET(str);
5591  rb_encoding *enc = rb_enc_from_index(encidx);
5592  const char *p = RSTRING_PTR(str);
5593  const char *pend = RSTRING_END(str);
 /* prev marks the start of the pending run of bytes that will be copied verbatim. */
5594  const char *prev = p;
5595  char buf[CHAR_ESC_LEN + 1];
5597  int unicode_p = rb_enc_unicode_p(enc);
5598  int asciicompat = rb_enc_asciicompat(enc);
5599 
5600  while (p < pend) {
5601  unsigned int c, cc;
5602  int n = rb_enc_precise_mbclen(p, pend, enc);
 /* No complete character here: flush the pending run, then emit up to mbminlen bytes
  * (bounded by the end of the string) as \xNN each. */
5603  if (!MBCLEN_CHARFOUND_P(n)) {
5604  if (p > prev) str_buf_cat(result, prev, p - prev);
5605  n = rb_enc_mbminlen(enc);
5606  if (pend < p + n)
5607  n = (int)(pend - p);
5608  while (n--) {
5609  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
5610  str_buf_cat(result, buf, strlen(buf));
5611  prev = ++p;
5612  }
5613  continue;
5614  }
5615  n = MBCLEN_CHARFOUND_LEN(n);
5616  c = rb_enc_mbc_to_codepoint(p, pend, enc);
5617  p += n;
 /* Control characters that have a one-letter backslash escape. */
5618  switch (c) {
5619  case '\n': cc = 'n'; break;
5620  case '\r': cc = 'r'; break;
5621  case '\t': cc = 't'; break;
5622  case '\f': cc = 'f'; break;
5623  case '\013': cc = 'v'; break;
5624  case '\010': cc = 'b'; break;
5625  case '\007': cc = 'a'; break;
5626  case 033: cc = 'e'; break;
5627  default: cc = 0; break;
5628  }
5629  if (cc) {
 /* p already points past the character, so the run ends at p - n. */
5630  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5631  buf[0] = '\\';
5632  buf[1] = (char)cc;
5633  str_buf_cat(result, buf, 2);
5634  prev = p;
5635  }
 /* Printable ASCII in an ASCII-compatible encoding stays in the pending run. */
5636  else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
5637  }
5638  else {
 /* Everything else is written as a numeric escape. */
5639  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5640  rb_str_buf_cat_escaped_char(result, c, unicode_p);
5641  prev = p;
5642  }
5643  }
 /* Flush whatever is left in the pending run. */
5644  if (p > prev) str_buf_cat(result, prev, p - prev);
5646 
5647  OBJ_INFECT_RAW(result, str);
5648  return result;
5649 }
5650 
5651 /*
5652  * call-seq:
5653  * str.inspect -> string
5654  *
5655  * Returns a printable version of _str_, surrounded by quote marks,
5656  * with special characters escaped.
5657  *
5658  * str = "hello"
5659  * str[3] = "\b"
5660  * str.inspect #=> "\"hel\\bo\""
5661  */
5662 
5663 VALUE
5665 {
 /* NOTE(review): listing lines 5664 (function name and parameter list) and 5670-5671
  * (presumably the declarations of `result` and `resenc`) are absent from this extract. */
5666  int encidx = ENCODING_GET(str);
5667  rb_encoding *enc = rb_enc_from_index(encidx), *actenc;
5668  const char *p, *pend, *prev;
5669  char buf[CHAR_ESC_LEN + 1];
5672  int unicode_p = rb_enc_unicode_p(enc);
5673  int asciicompat = rb_enc_asciicompat(enc);
5674 
 /* Choose the encoding of the result: fall back to the default external encoding, and to
  * US-ASCII when the candidate is not ASCII-compatible. */
5675  if (resenc == NULL) resenc = rb_default_external_encoding();
5676  if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
5677  rb_enc_associate(result, resenc);
5678  str_buf_cat2(result, "\"");
5679 
 /* prev marks the start of the pending run of bytes that will be copied verbatim. */
5680  p = RSTRING_PTR(str); pend = RSTRING_END(str);
5681  prev = p;
5682  actenc = get_actual_encoding(encidx, str);
5683  if (actenc != enc) {
5684  enc = actenc;
5685  if (unicode_p) unicode_p = rb_enc_unicode_p(enc);
5686  }
5687  while (p < pend) {
5688  unsigned int c, cc;
5689  int n;
5690 
5691  n = rb_enc_precise_mbclen(p, pend, enc);
 /* No complete character here: flush the pending run, then emit up to mbminlen bytes
  * (bounded by the end of the string) as \xNN each. */
5692  if (!MBCLEN_CHARFOUND_P(n)) {
5693  if (p > prev) str_buf_cat(result, prev, p - prev);
5694  n = rb_enc_mbminlen(enc);
5695  if (pend < p + n)
5696  n = (int)(pend - p);
5697  while (n--) {
5698  snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
5699  str_buf_cat(result, buf, strlen(buf));
5700  prev = ++p;
5701  }
5702  continue;
5703  }
5704  n = MBCLEN_CHARFOUND_LEN(n);
5705  c = rb_enc_mbc_to_codepoint(p, pend, enc);
5706  p += n;
 /* A double quote, a backslash, or a '#' followed by '$', '@' or '{' gets a backslash prefix.
  * NOTE(review): listing line 5711 (one more operand of this condition) is absent from this extract. */
5707  if ((asciicompat || unicode_p) &&
5708  (c == '"'|| c == '\\' ||
5709  (c == '#' &&
5710  p < pend &&
5712  (cc = rb_enc_codepoint(p,pend,enc),
5713  (cc == '$' || cc == '@' || cc == '{'))))) {
5714  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5715  str_buf_cat2(result, "\\");
 /* Restart the pending run at the character itself so it is copied after the backslash. */
5716  if (asciicompat || enc == resenc) {
5717  prev = p - n;
5718  continue;
5719  }
5720  }
 /* Control characters that have a one-letter backslash escape. */
5721  switch (c) {
5722  case '\n': cc = 'n'; break;
5723  case '\r': cc = 'r'; break;
5724  case '\t': cc = 't'; break;
5725  case '\f': cc = 'f'; break;
5726  case '\013': cc = 'v'; break;
5727  case '\010': cc = 'b'; break;
5728  case '\007': cc = 'a'; break;
5729  case 033: cc = 'e'; break;
5730  default: cc = 0; break;
5731  }
5732  if (cc) {
5733  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5734  buf[0] = '\\';
5735  buf[1] = (char)cc;
5736  str_buf_cat(result, buf, 2);
5737  prev = p;
5738  continue;
5739  }
 /* Characters printable in the result encoding, or printable ASCII, stay in the pending run. */
5740  if ((enc == resenc && rb_enc_isprint(c, enc)) ||
5741  (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
5742  continue;
5743  }
5744  else {
 /* Everything else is written as a numeric escape. */
5745  if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
5746  rb_str_buf_cat_escaped_char(result, c, unicode_p);
5747  prev = p;
5748  continue;
5749  }
5750  }
 /* Flush the remaining run and close the quote. */
5751  if (p > prev) str_buf_cat(result, prev, p - prev);
5752  str_buf_cat2(result, "\"");
5753 
5754  OBJ_INFECT_RAW(result, str);
5755  return result;
5756 }
5757 
5758 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
5759 
5760 /*
5761  * call-seq:
5762  * str.dump -> new_str
5763  *
5764  * Produces a version of +str+ with all non-printing characters replaced by
5765  * <code>\nnn</code> notation and all special characters escaped.
5766  *
5767  * "hello \n ''".dump #=> "\"hello \\n ''\""
5768  */
5769 
5770 VALUE
5772 {
 /*
  * Works in two passes over the bytes of str: the first pass computes the exact length of the
  * output, the second pass writes the escaped text into a buffer of that length.
  * NOTE(review): listing lines 5771 (function name and parameter list) and 5914 are absent
  * from this extract.
  */
5773  int encidx = rb_enc_get_index(str);
5774  rb_encoding *enc = rb_enc_from_index(encidx);
5775  long len;
5776  const char *p, *pend;
5777  char *q, *qend;
5778  VALUE result;
5779  int u8 = (encidx == rb_utf8_encindex());
5780  static const char nonascii_suffix[] = ".force_encoding(\"%s\")";
5781 
5782  len = 2; /* "" */
 /* For encodings that are not ASCII-compatible, reserve room for the suffix with the
  * encoding name substituted for "%s". */
5783  if (!rb_enc_asciicompat(enc)) {
5784  len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
5785  len += strlen(enc->name);
5786  }
5787 
 /* Pass 1: add up the output size (clen) of every input unit. */
5788  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
5789  while (p < pend) {
5790  int clen;
5791  unsigned char c = *p++;
5792 
5793  switch (c) {
5794  case '"': case '\\':
5795  case '\n': case '\r':
5796  case '\t': case '\f':
5797  case '\013': case '\010': case '\007': case '\033':
5798  clen = 2;
5799  break;
5800 
5801  case '#':
 /* '#' needs a backslash only when followed by '$', '@' or '{'. */
5802  clen = IS_EVSTR(p, pend) ? 2 : 1;
5803  break;
5804 
5805  default:
5806  if (ISPRINT(c)) {
5807  clen = 1;
5808  }
5809  else {
5810  if (u8 && c > 0x7F) { /* \u notation */
5811  int n = rb_enc_precise_mbclen(p-1, pend, enc);
5812  if (MBCLEN_CHARFOUND_P(n)) {
5813  unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
5814  if (cc <= 0xFFFF)
5815  clen = 6; /* \uXXXX */
5816  else if (cc <= 0xFFFFF)
5817  clen = 9; /* \u{XXXXX} */
5818  else
5819  clen = 10; /* \u{XXXXXX} */
 /* Skip the remaining bytes of this character (the first was consumed above). */
5820  p += MBCLEN_CHARFOUND_LEN(n)-1;
5821  break;
5822  }
5823  }
5824  clen = 4; /* \xNN */
5825  }
5826  break;
5827  }
5828 
 /* Guard the running total against overflowing a long. */
5829  if (clen > LONG_MAX - len) {
5830  rb_raise(rb_eRuntimeError, "string size too big");
5831  }
5832  len += clen;
5833  }
5834 
 /* Pass 2: write the output; qend allows one byte past len for the terminating NUL that
  * snprintf writes. */
5835  result = rb_str_new_with_class(str, 0, len);
5836  p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
5837  q = RSTRING_PTR(result); qend = q + len + 1;
5838 
5839  *q++ = '"';
5840  while (p < pend) {
5841  unsigned char c = *p++;
5842 
5843  if (c == '"' || c == '\\') {
5844  *q++ = '\\';
5845  *q++ = c;
5846  }
5847  else if (c == '#') {
5848  if (IS_EVSTR(p, pend)) *q++ = '\\';
5849  *q++ = '#';
5850  }
5851  else if (c == '\n') {
5852  *q++ = '\\';
5853  *q++ = 'n';
5854  }
5855  else if (c == '\r') {
5856  *q++ = '\\';
5857  *q++ = 'r';
5858  }
5859  else if (c == '\t') {
5860  *q++ = '\\';
5861  *q++ = 't';
5862  }
5863  else if (c == '\f') {
5864  *q++ = '\\';
5865  *q++ = 'f';
5866  }
5867  else if (c == '\013') {
5868  *q++ = '\\';
5869  *q++ = 'v';
5870  }
5871  else if (c == '\010') {
5872  *q++ = '\\';
5873  *q++ = 'b';
5874  }
5875  else if (c == '\007') {
5876  *q++ = '\\';
5877  *q++ = 'a';
5878  }
5879  else if (c == '\033') {
5880  *q++ = '\\';
5881  *q++ = 'e';
5882  }
5883  else if (ISPRINT(c)) {
5884  *q++ = c;
5885  }
5886  else {
5887  *q++ = '\\';
5888  if (u8) {
 /* n is the character length minus the byte already consumed; a single-byte
  * character gives 0 and therefore takes the \xNN path below. */
5889  int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
5890  if (MBCLEN_CHARFOUND_P(n)) {
5891  int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
5892  p += n;
5893  if (cc <= 0xFFFF)
5894  snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
5895  else
5896  snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
5897  q += strlen(q);
5898  continue;
5899  }
5900  }
 /* "xNN" is exactly three characters. */
5901  snprintf(q, qend-q, "x%02X", c);
5902  q += 3;
5903  }
5904  }
5905  *q++ = '"';
5906  *q = '\0';
 /* Append the .force_encoding("...") suffix and label the result as ASCII-8BIT. */
5907  if (!rb_enc_asciicompat(enc)) {
5908  snprintf(q, qend-q, nonascii_suffix, enc->name);
5909  encidx = rb_ascii8bit_encindex();
5910  }
5911  OBJ_INFECT_RAW(result, str);
5912  /* result from dump is ASCII */
5913  rb_enc_associate_index(result, encidx);
5915  return result;
5916 }
5917 
5918 
/*
 * Raises Encoding::CompatibilityError (rb_eEncCompatError) when enc is a dummy encoding;
 * otherwise does nothing.
 * NOTE(review): listing line 5920 (function name and parameter list) is absent from this extract.
 */
5919 static void
5921 {
5922  if (rb_enc_dummy_p(enc)) {
5923  rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
5924  rb_enc_name(enc));
5925  }
5926 }
5927 
/*
 * Validates the option symbols passed to a case-mapping method and returns the case-folding
 * flags to use. With no options the incoming flags are returned unchanged. Every invalid
 * combination raises ArgumentError.
 * NOTE(review): listing lines 5929 (function name and parameter list), 5936, 5939, 5945, 5948
 * and 5958-5959 are absent from this extract, so several branches below appear without their
 * statements; they presumably add the matching flag bits -- confirm against the full source.
 */
5928 static OnigCaseFoldType
5930 {
5931  if (argc==0)
5932  return flags;
5933  if (argc>2)
5934  rb_raise(rb_eArgError, "too many options");
 /* :turkic may be followed only by :lithuanian. */
5935  if (argv[0]==sym_turkic) {
5937  if (argc==2) {
5938  if (argv[1]==sym_lithuanian)
5940  else
5941  rb_raise(rb_eArgError, "invalid second option");
5942  }
5943  }
 /* :lithuanian may be followed only by :turkic. */
5944  else if (argv[0]==sym_lithuanian) {
5946  if (argc==2) {
5947  if (argv[1]==sym_turkic)
5949  else
5950  rb_raise(rb_eArgError, "invalid second option");
5951  }
5952  }
 /* Any other first option must be the only option. */
5953  else if (argc>1)
5954  rb_raise(rb_eArgError, "too many options");
5955  else if (argv[0]==sym_ascii)
5956  flags |= ONIGENC_CASE_ASCII_ONLY;
5957  else if (argv[0]==sym_fold) {
5960  else
5961  rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
5962  }
5963  else
5964  rb_raise(rb_eArgError, "invalid option");
5965  return flags;
5966 }
5967 
5968 /* 16 bytes should be long enough to absorb any kind of single character length increase; the constant below is set to 20 */
5969 #define CASE_MAPPING_ADDITIONAL_LENGTH 20
5970 #ifndef CASEMAP_DEBUG
5971 # define CASEMAP_DEBUG 0
5972 #endif
5973 
/*
 * One chunk of case-mapping output.
 * NOTE(review): listing line 5978 is absent from this extract; code elsewhere in this file
 * reads and writes a `next` member that links the chunks, so it is presumably declared there.
 */
5974 struct mapping_buffer;
5975 typedef struct mapping_buffer {
5976  size_t capa; /* number of bytes available in space */
5977  size_t used; /* number of bytes of space actually filled */
5979  OnigUChar space[1]; /* first byte of the data area; the allocation is sized so that capa bytes follow */
5980 } mapping_buffer;
5981 
/*
 * Returns a new string holding the case-mapped contents of source, produced by the encoding's
 * case_map function. Output is collected in a chain of heap buffers and joined at the end.
 * Raises ArgumentError when case_map reports invalid input.
 * NOTE(review): listing line 5983 (function name and parameter list) is absent from this
 * extract; the body uses source, flags (passed straight to case_map) and enc.
 */
5982 static VALUE
5984 {
5985  VALUE target;
5986 
5987  OnigUChar *source_current, *source_end;
5988  int target_length = 0;
5989  mapping_buffer pre_buffer, /* only next pointer used */
5990  *current_buffer = &pre_buffer;
5991  size_t buffer_count = 0;
5992  int buffer_length_or_invalid;
5993 
5994  if (RSTRING_LEN(source) == 0) return rb_str_dup(source);
5995 
5996  source_current = (OnigUChar*)RSTRING_PTR(source);
5997  source_end = (OnigUChar*)RSTRING_END(source);
5998 
 /* Each iteration allocates one more buffer and lets case_map consume as much of the
  * remaining input as fits; case_map advances source_current. */
5999  while (source_current < source_end) {
6000  /* increase multiplier using buffer count to converge quickly */
6001  size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
6002  if (CASEMAP_DEBUG) {
6003  fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
6004  }
 /* Header up to `space`, plus capa bytes of data. */
6005  current_buffer->next = xmalloc(offsetof(mapping_buffer, space) + capa);
6006  current_buffer = current_buffer->next;
6007  current_buffer->next = NULL;
6008  current_buffer->capa = capa;
6009  buffer_length_or_invalid = enc->case_map(flags,
6010  (const OnigUChar**)&source_current, source_end,
6011  current_buffer->space,
6012  current_buffer->space+current_buffer->capa,
6013  enc);
 /* A negative result signals invalid input: free the whole chain before raising. */
6014  if (buffer_length_or_invalid < 0) {
6015  mapping_buffer *previous_buffer;
6016 
6017  current_buffer = pre_buffer.next;
6018  while (current_buffer) {
6019  previous_buffer = current_buffer;
6020  current_buffer = current_buffer->next;
6021  xfree(previous_buffer);
6022  }
6023  rb_raise(rb_eArgError, "input string invalid");
6024  }
6025  target_length += current_buffer->used = buffer_length_or_invalid;
6026  }
6027  if (CASEMAP_DEBUG) {
6028  fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
6029  }
6030 
 /* A single buffer can be copied into the result directly. */
6031  if (buffer_count==1) {
6032  target = rb_str_new_with_class(source, (const char*)current_buffer->space, target_length);
6033  xfree(current_buffer);
6034  }
6035  else {
 /* Several buffers: allocate the result at full size, then append each chunk in order,
  * freeing it once copied. */
6036  char *target_current;
6037  mapping_buffer *previous_buffer;
6038 
6039  target = rb_str_new_with_class(source, 0, target_length);
6040  target_current = RSTRING_PTR(target);
6041  current_buffer=pre_buffer.next;
6042  while (current_buffer) {
6043  memcpy(target_current, current_buffer->space, current_buffer->used);
6044  target_current += current_buffer->used;
6045  previous_buffer = current_buffer;
6046  current_buffer = current_buffer->next;
6047  xfree(previous_buffer);
6048  }
6049  }
6050 
6051  /* TODO: check about string terminator character */
6052  OBJ_INFECT_RAW(target, source);
6053  str_enc_copy(target, source);
6054  /*ENC_CODERANGE_SET(mapped, cr);*/
6055 
6056  return target;
6057 }
6058 
/*
 * Case-maps source in place with onigenc_ascii_only_case_map: the string's own buffer is
 * passed as both input and output. Raises ArgumentError when the mapper reports invalid input.
 * NOTE(review): listing line 6060 (function name and parameter list) is absent from this
 * extract; the body uses source, flags and enc.
 */
6059 static void
6061 {
6062  OnigUChar *source_current, *source_end;
6063  long old_length = RSTRING_LEN(source);
6064  int length_or_invalid;
6065 
6066  if (old_length == 0) return;
6067 
6068  source_current = (OnigUChar*)RSTRING_PTR(source);
6069  source_end = (OnigUChar*)RSTRING_END(source);
6070 
6071  length_or_invalid = onigenc_ascii_only_case_map(flags,
6072  (const OnigUChar**)&source_current, source_end,
6073  source_current, source_end, enc);
6074  if (length_or_invalid < 0)
6075  rb_raise(rb_eArgError, "input string invalid");
 /* Debug builds only: the reported length must equal the original length. */
6076  if (CASEMAP_DEBUG && length_or_invalid != old_length) {
6077  fprintf(stderr, "problem with rb_str_ascii_casemap"
6078  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6079  rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
6080  "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
6081  }
6082 }
6083 
6084 /*
6085  * call-seq:
6086  * str.upcase! -> str or nil
6087  * str.upcase!([options]) -> str or nil
6088  *
6089  * Upcases the contents of <i>str</i>, returning <code>nil</code> if no changes
6090  * were made.
6091  *
6092  * See String#downcase for meaning of +options+ and use with different encodings.
6093  */
6094 
6095 static VALUE
6097 {
 /* NOTE(review): listing lines 6096 (function name and parameter list), 6099 (presumably the
  * declaration of `flags`), 6104 and 6106 (the rest of the condition below and its opening
  * brace) are absent from this extract. */
6098  rb_encoding *enc;
6100 
6101  flags = check_case_options(argc, argv, flags);
6102  str_modify_keep_cr(str);
6103  enc = STR_ENC_GET(str);
 /* Fast path: walk the bytes and upcase ASCII 'a'..'z' directly. */
6105  if (((flags&ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc)==1))
6107  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6108 
6109  while (s < send) {
6110  unsigned int c = *(unsigned char*)s;
6111 
6112  if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
6113  *s = 'A' + (c - 'a');
 /* Record that at least one byte changed. */
6114  flags |= ONIGENC_CASE_MODIFIED;
6115  }
6116  s++;
6117  }
6118  }
 /* ASCII-only mapping for the remaining encodings is done in place. */
6119  else if (flags&ONIGENC_CASE_ASCII_ONLY)
6120  rb_str_ascii_casemap(str, &flags, enc);
6121  else
 /* Full case mapping builds a new string whose contents then replace those of str. */
6122  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6123 
 /* nil signals that nothing was changed. */
6124  if (ONIGENC_CASE_MODIFIED&flags) return str;
6125  return Qnil;
6126 }
6127 
6128 
6129 /*
6130  * call-seq:
6131  * str.upcase -> new_str
6132  * str.upcase([options]) -> new_str
6133  *
6134  * Returns a copy of <i>str</i> with all lowercase letters replaced with their
6135  * uppercase counterparts.
6136  *
6137  * See String#downcase for meaning of +options+ and use with different encodings.
6138  *
6139  * "hEllO".upcase #=> "HELLO"
6140  */
6141 
6142 static VALUE
6144 {
 /* NOTE(review): listing line 6143 (function name and parameter list) is absent from this extract. */
 /* Upcase a duplicate in place; the nil-or-string result of the bang call is ignored, so the
  * duplicate is returned even when nothing changed. */
6145  str = rb_str_dup(str);
6146  rb_str_upcase_bang(argc, argv, str);
6147  return str;
6148 }
6149 
6150 /*
6151  * call-seq:
6152  * str.downcase! -> str or nil
6153  * str.downcase!([options]) -> str or nil
6154  *
6155  * Downcases the contents of <i>str</i>, returning <code>nil</code> if no
6156  * changes were made.
6157  *
6158  * See String#downcase for meaning of +options+ and use with different encodings.
6159  */
6160 
6161 static VALUE
6163 {
 /* NOTE(review): listing lines 6162 (function name and parameter list), 6165 (presumably the
  * declaration of `flags`), 6170 and 6172 (the rest of the condition below and its opening
  * brace) are absent from this extract. */
6164  rb_encoding *enc;
6166 
6167  flags = check_case_options(argc, argv, flags);
6168  str_modify_keep_cr(str);
6169  enc = STR_ENC_GET(str);
 /* Fast path: walk the bytes and downcase ASCII 'A'..'Z' directly. */
6171  if (((flags&ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc)==1))
6173  char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
6174 
6175  while (s < send) {
6176  unsigned int c = *(unsigned char*)s;
6177 
6178  if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
6179  *s = 'a' + (c - 'A');
 /* Record that at least one byte changed. */
6180  flags |= ONIGENC_CASE_MODIFIED;
6181  }
6182  s++;
6183  }
6184  }
 /* ASCII-only mapping for the remaining encodings is done in place. */
6185  else if (flags&ONIGENC_CASE_ASCII_ONLY)
6186  rb_str_ascii_casemap(str, &flags, enc);
6187  else
 /* Full case mapping builds a new string whose contents then replace those of str. */
6188  str_shared_replace(str, rb_str_casemap(str, &flags, enc));
6189 
 /* nil signals that nothing was changed. */
6190  if (ONIGENC_CASE_MODIFIED&flags) return str;
6191  return Qnil;
6192 }
6193 
6194 
6195 /*
6196  * call-seq:
6197  * str.downcase -> new_str
6198  * str.downcase([options]) -> new_str
6199  *
6200  * Returns a copy of <i>str</i> with all uppercase letters replaced with their
6201  * lowercase counterparts. Which letters exactly are replaced, and by which
6202  * other letters, depends on the presence or absence of options, and on the
6203  * +encoding+ of the string.
6204  *
6205  * The meaning of the +options+ is as follows:
6206  *
6207  * No option ::
6208  * Full Unicode case mapping, suitable for most languages
6209  * (see :turkic and :lithuanian options below for exceptions).
6210  * Context-dependent case mapping as described in Table 3-14 of the
6211  * Unicode standard is currently not supported.
6212  * :ascii ::
6213  * Only the ASCII region, i.e. the characters ``A'' to ``Z'' and
6214  * ``a'' to ``z'', are affected.
6215  * This option cannot be combined with any other option.
6216  * :turkic ::
6217  * Full Unicode case mapping, adapted for Turkic languages
6218  * (Turkish, Azerbaijani, ...). This means that upper case I is mapped to
6219  * lower case dotless i, and so on.
6220  * :lithuanian ::
6221  * Currently, just full Unicode case mapping. In the future, full Unicode
6222  * case mapping adapted for Lithuanian (keeping the dot on the lower case
6223  * i even if there is an accent on top).
6224  * :fold ::
6225  * Only available on +downcase+ and +downcase!+. Unicode case <b>folding</b>,
6226  * which is more far-reaching than Unicode case mapping.
6227  * This option currently cannot be combined with any other option
6228  * (i.e. there is currently no variant for Turkic languages).
6229  *
6230  * Please note that several assumptions that are valid for ASCII-only case
6231  * conversions do not hold for more general case conversions. For example,
6232  * the length of the result may not be the same as the length of the input
6233  * (neither in characters nor in bytes), some roundtrip assumptions
6234  * (e.g. str.downcase == str.upcase.downcase) may not apply, and Unicode
6235  * normalization (i.e. String#unicode_normalize) is not necessarily maintained
6236  * by case mapping operations.
6237  *
6238  * Non-ASCII case mapping/folding is currently supported for UTF-8,
6239  * UTF-16BE/LE, UTF-32BE/LE, and ISO-8859-1~16 Strings/Symbols.
6240  * This support will be extended to other encodings.
6241  *
6242  * "hEllO".downcase #=> "hello"
6243  */
6244 
6245 static VALUE
6247 {
    /* NOTE(review): listing line 6246 (the declarator with the function name
     * and parameter list) is absent from this extract; the body uses argc,
     * argv and str.  Restore it from the upstream file. */
6248  str = rb_str_dup(str); /* work on a duplicate so the receiver is not modified */
6249  rb_str_downcase_bang(argc, argv, str); /* return value (str or nil) is not used */
6250  return str; /* the copy is returned whether or not it changed */
6251 }
6252 
6253 
6254 /*
6255  * call-seq:
6256  * str.capitalize! -> str or nil
6257  * str.capitalize!([options]) -> str or nil
6258  *
6259  * Modifies <i>str</i> by converting the first character to uppercase and the
6260  * remainder to lowercase. Returns <code>nil</code> if no changes are made.
6261  *
6262  * See String#downcase for meaning of +options+ and use with different encodings.
6263  *
6264  * a = "hello"
6265  * a.capitalize! #=> "Hello"
6266  * a #=> "Hello"
6267  * a.capitalize! #=> nil
6268  */
6269 
6270 static VALUE
6272 {
    /*
     * In-place capitalize; the copying variant below calls this as
     * rb_str_capitalize_bang(argc, argv, str).  Returns str when
     * ONIGENC_CASE_MODIFIED ends up set in flags, Qnil otherwise (including
     * for an empty receiver).
     *
     * NOTE(review): the embedded listing numbers skip 6271, 6274 and 6279, so
     * the declarator line, the declaration of `flags` and one statement after
     * STR_ENC_GET appear to have been lost in extraction.  Restore them from
     * the upstream file before compiling.
     */
6273  rb_encoding *enc;
6275
6276  flags = check_case_options(argc, argv, flags);
6277  str_modify_keep_cr(str);
6278  enc = STR_ENC_GET(str);
6280  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; /* empty string: nothing to do */
6281  if (flags&ONIGENC_CASE_ASCII_ONLY)
6282  rb_str_ascii_casemap(str, &flags, enc);
6283  else
6284  str_shared_replace(str, rb_str_casemap(str, &flags, enc)); /* replace the contents with the case-mapped result */
6285
6286  if (ONIGENC_CASE_MODIFIED&flags) return str;
6287  return Qnil; /* nothing changed */
6288 }
6289 
6290 
6291 /*
6292  * call-seq:
6293  * str.capitalize -> new_str
6294  * str.capitalize([options]) -> new_str
6295  *
6296  * Returns a copy of <i>str</i> with the first character converted to uppercase
6297  * and the remainder to lowercase.
6298  *
6299  * See String#downcase for meaning of +options+ and use with different encodings.
6300  *
6301  * "hello".capitalize #=> "Hello"
6302  * "HELLO".capitalize #=> "Hello"
6303  * "123ABC".capitalize #=> "123abc"
6304  */
6305 
6306 static VALUE
6308 {
    /* NOTE(review): listing line 6307 (the declarator with the function name
     * and parameter list) is absent from this extract; the body uses argc,
     * argv and str.  Restore it from the upstream file. */
6309  str = rb_str_dup(str); /* work on a duplicate so the receiver is not modified */
6310  rb_str_capitalize_bang(argc, argv, str); /* return value (str or nil) is not used */
6311  return str; /* the copy is returned whether or not it changed */
6312 }
6313 
6314 
6315 /*
6316  * call-seq:
6317  * str.swapcase! -> str or nil
6318  * str.swapcase!([options]) -> str or nil
6319  *
6320  * Equivalent to <code>String#swapcase</code>, but modifies the receiver in
6321  * place, returning <i>str</i>, or <code>nil</code> if no changes were made.
6322  *
6323  * See String#downcase for meaning of +options+ and use with different encodings.
6324  */
6325 
6326 static VALUE
6328 {
    /*
     * In-place swapcase; the copying variant below calls this as
     * rb_str_swapcase_bang(argc, argv, str).  Returns str when
     * ONIGENC_CASE_MODIFIED ends up set in flags, Qnil otherwise.
     *
     * NOTE(review): the embedded listing numbers skip 6327, 6330 and 6335, so
     * the declarator line, the declaration of `flags` and one statement after
     * STR_ENC_GET appear to have been lost in extraction.  Restore them from
     * the upstream file before compiling.
     */
6329  rb_encoding *enc;
6331
6332  flags = check_case_options(argc, argv, flags);
6333  str_modify_keep_cr(str);
6334  enc = STR_ENC_GET(str);
6336  if (flags&ONIGENC_CASE_ASCII_ONLY)
6337  rb_str_ascii_casemap(str, &flags, enc);
6338  else
6339  str_shared_replace(str, rb_str_casemap(str, &flags, enc)); /* replace the contents with the case-mapped result */
6340
6341  if (ONIGENC_CASE_MODIFIED&flags) return str;
6342  return Qnil; /* nothing changed */
6343 }
6344 
6345 
6346 /*
6347  * call-seq:
6348  * str.swapcase -> new_str
6349  * str.swapcase([options]) -> new_str
6350  *
6351  * Returns a copy of <i>str</i> with uppercase alphabetic characters converted
6352  * to lowercase and lowercase characters converted to uppercase.
6353  *
6354  * See String#downcase for meaning of +options+ and use with different encodings.
6355  *
6356  * "Hello".swapcase #=> "hELLO"
6357  * "cYbEr_PuNk11".swapcase #=> "CyBeR_pUnK11"
6358  */
6359 
6360 static VALUE
6362 {
    /* NOTE(review): listing line 6361 (the declarator with the function name
     * and parameter list) is absent from this extract; the body uses argc,
     * argv and str.  Restore it from the upstream file. */
6363  str = rb_str_dup(str); /* work on a duplicate so the receiver is not modified */
6364  rb_str_swapcase_bang(argc, argv, str); /* return value (str or nil) is not used */
6365  return str; /* the copy is returned whether or not it changed */
6366 }
6367 
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification (e.g. "a-z"), advanced by
 * trnext().
 */
struct tr {
    int gen;                /* non-zero while a "c1-c2" range is being expanded */
    unsigned int now, max;  /* code point last produced / upper bound of the current range */
    char *p, *pend;         /* read position in the specification and its end */
};
6375 
6376 static unsigned int
6377 trnext(struct tr *t, rb_encoding *enc)
6378 {
6379  int n;
6380 
6381  for (;;) {
6382  if (!t->gen) {
6383 nextpart:
6384  if (t->p == t->pend) return -1;
6385  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
6386  t->p += n;
6387  }
6388  t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
6389  t->p += n;
6390  if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
6391  t->p += n;
6392  if (t->p < t->pend) {
6393  unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
6394  t->p += n;
6395  if (t->now > c) {
6396  if (t->now < 0x80 && c < 0x80) {
6398  "invalid range \"%c-%c\" in string transliteration",
6399  t->now, c);
6400  }
6401  else {
6402  rb_raise(rb_eArgError, "invalid range in string transliteration");
6403  }
6404  continue; /* not reached */
6405  }
6406  t->gen = 1;
6407  t->max = c;
6408  }
6409  }
6410  return t->now;
6411  }
6412  else {
6413  while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
6414  if (t->now == t->max) {
6415  t->gen = 0;
6416  goto nextpart;
6417  }
6418  }
6419  if (t->now < t->max) {
6420  return t->now;
6421  }
6422  else {
6423  t->gen = 0;
6424  return t->max;
6425  }
6426  }
6427  }
6428 }
6429 
6430 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
6431 
/*
 * Shared implementation of the tr family visible in this file: callers pass
 * sflag == 0 for tr/tr! and sflag == 1 for tr_s/tr_s!.
 *
 * Translates str in place, mapping the characters described by src to the
 * ones described by repl.  Returns str when something was modified and Qnil
 * otherwise (also for an empty receiver).  An empty repl delegates to
 * rb_str_delete_bang with src as the only argument.
 *
 * Code points below 256 are mapped through the trans[] array; larger ones
 * through a lazily created hash.
 *
 * NOTE(review): the embedded listing numbers skip 6583 and 6658, which leaves
 * the two `if (!STR_EMBED_P(str)) { }` bodies empty.  Presumably the lost
 * lines released the previous heap buffer before as.heap.ptr is overwritten;
 * confirm against the upstream file.
 */
6432 static VALUE
6433 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
6434 {
6435  const unsigned int errc = -1; /* sentinel: end of specification / "no mapping" */
6436  unsigned int trans[256]; /* mapping for code points below 256 */
6437  rb_encoding *enc, *e1, *e2;
6438  struct tr trsrc, trrepl;
6439  int cflag = 0; /* set when src starts with '^' (complement) */
6440  unsigned int c, c0, last = 0;
6441  int modify = 0, i, l;
6442  char *s, *send;
6443  VALUE hash = 0; /* mapping for code points >= 256, created on demand */
6444  int singlebyte = single_byte_optimizable(str);
6445  int termlen;
6446  int cr;
6447
6448 #define CHECK_IF_ASCII(c) \
6449  (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
6450  (cr = ENC_CODERANGE_VALID) : 0)
6451
6452  StringValue(src);
6453  StringValue(repl);
6454  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; /* empty receiver: nothing to translate */
6455  if (RSTRING_LEN(repl) == 0) {
6456  return rb_str_delete_bang(1, &src, str); /* empty replacement behaves like delete!(src) */
6457  }
6458
6459  cr = ENC_CODERANGE(str);
6460  e1 = rb_enc_check(str, src);
6461  e2 = rb_enc_check(str, repl);
6462  if (e1 == e2) {
6463  enc = e1;
6464  }
6465  else {
6466  enc = rb_enc_check(src, repl);
6467  }
6468  trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
6469  if (RSTRING_LEN(src) > 1 &&
6470  rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
6471  trsrc.p + l < trsrc.pend) {
6472  cflag = 1; /* leading '^' followed by more text: translate everything NOT listed */
6473  trsrc.p += l;
6474  }
6475  trrepl.p = RSTRING_PTR(repl);
6476  trrepl.pend = trrepl.p + RSTRING_LEN(repl);
6477  trsrc.gen = trrepl.gen = 0;
6478  trsrc.now = trrepl.now = 0;
6479  trsrc.max = trrepl.max = 0;
6480
6481  if (cflag) {
6482  for (i=0; i<256; i++) {
6483  trans[i] = 1; /* placeholder meaning "translate"; replaced by `last` below */
6484  }
6485  while ((c = trnext(&trsrc, enc)) != errc) {
6486  if (c < 256) {
6487  trans[c] = errc; /* listed characters are left alone */
6488  }
6489  else {
6490  if (!hash) hash = rb_hash_new();
6491  rb_hash_aset(hash, UINT2NUM(c), Qtrue);
6492  }
6493  }
6494  while ((c = trnext(&trrepl, enc)) != errc)
6495  /* retrieve last replacer */;
6496  last = trrepl.now; /* every unlisted character maps to this one */
6497  for (i=0; i<256; i++) {
6498  if (trans[i] != errc) {
6499  trans[i] = last;
6500  }
6501  }
6502  }
6503  else {
6504  unsigned int r;
6505
6506  for (i=0; i<256; i++) {
6507  trans[i] = errc; /* default: no mapping */
6508  }
6509  while ((c = trnext(&trsrc, enc)) != errc) {
6510  r = trnext(&trrepl, enc);
6511  if (r == errc) r = trrepl.now; /* repl exhausted: reuse the last value it produced */
6512  if (c < 256) {
6513  trans[c] = r;
6514  if (rb_enc_codelen(r, enc) != 1) singlebyte = 0; /* replacement needs more than one byte */
6515  }
6516  else {
6517  if (!hash) hash = rb_hash_new();
6518  rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
6519  }
6520  }
6521  }
6522
6523  if (cr == ENC_CODERANGE_VALID)
6524  cr = ENC_CODERANGE_7BIT; /* start at 7BIT; CHECK_IF_ASCII raises it back on non-ASCII output */
6525  str_modify_keep_cr(str);
6526  s = RSTRING_PTR(str); send = RSTRING_END(str);
6527  termlen = rb_enc_mbminlen(enc);
6528  if (sflag) { /* squeezing variant: build the result in a fresh buffer */
6529  int clen, tlen;
6530  long offset, max = RSTRING_LEN(str);
6531  unsigned int save = -1; /* previous replacement emitted, or -1 after an untranslated character */
6532  char *buf = ALLOC_N(char, max + termlen), *t = buf;
6533
6534  while (s < send) {
6535  int may_modify = 0;
6536
6537  c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
6538  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
6539
6540  s += clen;
6541  if (c < 256) {
6542  c = trans[c];
6543  }
6544  else if (hash) {
6545  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
6546  if (NIL_P(tmp)) {
6547  if (cflag) c = last;
6548  else c = errc;
6549  }
6550  else if (cflag) c = errc;
6551  else c = NUM2INT(tmp);
6552  }
6553  else {
6554  c = errc;
6555  }
6556  if (c != (unsigned int)-1) {
6557  if (save == c) { /* same replacement as the previous character: squeeze it away */
6558  CHECK_IF_ASCII(c);
6559  continue;
6560  }
6561  save = c;
6562  tlen = rb_enc_codelen(c, enc);
6563  modify = 1;
6564  }
6565  else {
6566  save = -1;
6567  c = c0; /* untranslated: copy the original character */
6568  if (enc != e1) may_modify = 1;
6569  }
6570  if ((offset = t - buf) + tlen > max) { /* grow the output buffer */
6571  max = offset + tlen + (send - s);
6572  REALLOC_N(buf, char, max + termlen);
6573  t = buf + offset;
6574  }
6575  rb_enc_mbcput(c, t, enc);
6576  if (may_modify && memcmp(s, t, tlen) != 0) { /* NOTE(review): s was already advanced by clen above, so this compares the bytes after the current character - confirm intended */
6577  modify = 1;
6578  }
6579  CHECK_IF_ASCII(c);
6580  t += tlen;
6581  }
6582  if (!STR_EMBED_P(str)) { /* NOTE(review): body (listing line 6583) missing from this extract */
6584  }
6585  TERM_FILL(t, termlen);
6586  RSTRING(str)->as.heap.ptr = buf; /* str takes over the new buffer */
6587  RSTRING(str)->as.heap.len = t - buf;
6588  STR_SET_NOEMBED(str);
6589  RSTRING(str)->as.heap.aux.capa = max;
6590  }
6591  else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) { /* one byte in, one byte out: rewrite in place */
6592  while (s < send) {
6593  c = (unsigned char)*s;
6594  if (trans[c] != errc) {
6595  if (!cflag) {
6596  c = trans[c];
6597  *s = c;
6598  modify = 1;
6599  }
6600  else {
6601  *s = last;
6602  modify = 1;
6603  }
6604  }
6605  CHECK_IF_ASCII(c);
6606  s++;
6607  }
6608  }
6609  else { /* general case: lengths may change, so build the result in a fresh buffer */
6610  int clen, tlen;
6611  long offset, max = (long)((send - s) * 1.2);
6612  char *buf = ALLOC_N(char, max + termlen), *t = buf;
6613
6614  while (s < send) {
6615  int may_modify = 0;
6616  c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
6617  tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
6618
6619  if (c < 256) {
6620  c = trans[c];
6621  }
6622  else if (hash) {
6623  VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
6624  if (NIL_P(tmp)) {
6625  if (cflag) c = last;
6626  else c = errc;
6627  }
6628  else if (cflag) c = errc;
6629  else c = NUM2INT(tmp);
6630  }
6631  else {
6632  c = cflag ? last : errc;
6633  }
6634  if (c != errc) {
6635  tlen = rb_enc_codelen(c, enc);
6636  modify = 1;
6637  }
6638  else {
6639  c = c0; /* untranslated: copy the original character */
6640  if (enc != e1) may_modify = 1;
6641  }
6642  if ((offset = t - buf) + tlen > max) { /* grow the output buffer */
6643  max = offset + tlen + (long)((send - s) * 1.2);
6644  REALLOC_N(buf, char, max + termlen);
6645  t = buf + offset;
6646  }
6647  if (s != t) {
6648  rb_enc_mbcput(c, t, enc);
6649  if (may_modify && memcmp(s, t, tlen) != 0) {
6650  modify = 1;
6651  }
6652  }
6653  CHECK_IF_ASCII(c);
6654  s += clen;
6655  t += tlen;
6656  }
6657  if (!STR_EMBED_P(str)) { /* NOTE(review): body (listing line 6658) missing from this extract */
6659  }
6660  TERM_FILL(t, termlen);
6661  RSTRING(str)->as.heap.ptr = buf; /* str takes over the new buffer */
6662  RSTRING(str)->as.heap.len = t - buf;
6663  STR_SET_NOEMBED(str);
6664  RSTRING(str)->as.heap.aux.capa = max;
6665  }
6666
6667  if (modify) {
6668  if (cr != ENC_CODERANGE_BROKEN)
6669  ENC_CODERANGE_SET(str, cr);
6670  rb_enc_associate(str, enc);
6671  return str;
6672  }
6673  return Qnil; /* nothing changed */
6674 }
6675 
6676 
6677 /*
6678  * call-seq:
6679  * str.tr!(from_str, to_str) -> str or nil
6680  *
6681  * Translates <i>str</i> in place, using the same rules as
6682  * <code>String#tr</code>. Returns <i>str</i>, or <code>nil</code> if no
6683  * changes were made.
6684  */
6685 
6686 static VALUE
6688 {
    /* NOTE(review): listing line 6687 (the declarator with the function name
     * and parameter list) is absent from this extract; the body uses str,
     * src and repl.  Restore it from the upstream file. */
6689  return tr_trans(str, src, repl, 0); /* sflag 0: translate without squeezing */
6690 }
6691 
6692 
6693 /*
6694  * call-seq:
6695  * str.tr(from_str, to_str) -> new_str
6696  *
6697  * Returns a copy of +str+ with the characters in +from_str+ replaced by the
6698  * corresponding characters in +to_str+. If +to_str+ is shorter than
6699  * +from_str+, it is padded with its last character in order to maintain the
6700  * correspondence.
6701  *
6702  * "hello".tr('el', 'ip') #=> "hippo"
6703  * "hello".tr('aeiou', '*') #=> "h*ll*"
6704  * "hello".tr('aeiou', 'AA*') #=> "hAll*"
6705  *
6706  * Both strings may use the <code>c1-c2</code> notation to denote ranges of
6707  * characters, and +from_str+ may start with a <code>^</code>, which denotes
6708  * all characters except those listed.
6709  *
6710  * "hello".tr('a-y', 'b-z') #=> "ifmmp"
6711  * "hello".tr('^aeiou', '*') #=> "*e**o"
6712  *
6713  * The backslash character <code>\</code> can be used to escape
6714  * <code>^</code> or <code>-</code> and is otherwise ignored unless it
6715  * appears at the end of a range or the end of the +from_str+ or +to_str+:
6716  *
6717  * "hello^world".tr("\\^aeiou", "*") #=> "h*ll**w*rld"
6718  * "hello-world".tr("a\\-eo", "*") #=> "h*ll**w*rld"
6719  *
6720  * "hello\r\nworld".tr("\r", "") #=> "hello\nworld"
6721  * "hello\r\nworld".tr("\\r", "") #=> "hello\r\nwold"
6722  * "hello\r\nworld".tr("\\\r", "") #=> "hello\nworld"
6723  *
6724  * "X['\\b']".tr("X\\", "") #=> "['b']"
6725  * "X['\\b']".tr("X-\\]", "") #=> "'b'"
6726  */
6727 
6728 static VALUE
6729 rb_str_tr(VALUE str, VALUE src, VALUE repl)
6730 {
6731  str = rb_str_dup(str);
6732  tr_trans(str, src, repl, 0);
6733  return str;
6734 }
6735 
6736 #define TR_TABLE_SIZE 257
6737 static void
6738 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
6739  VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
6740 {
6741  const unsigned int errc = -1;
6742  char buf[256];
6743  struct tr tr;
6744  unsigned int c;
6745  VALUE table = 0, ptable = 0;
6746  int i, l, cflag = 0;
6747 
6748  tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
6749  tr.gen = tr.now = tr.max = 0;
6750 
6751  if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
6752  cflag = 1;
6753  tr.p += l;
6754  }
6755  if (first) {
6756  for (i=0; i<256; i++) {
6757  stable[i] = 1;
6758  }
6759  stable[256] = cflag;
6760  }
6761  else if (stable[256] && !cflag) {
6762  stable[256] = 0;
6763  }
6764  for (i=0; i<256; i++) {
6765  buf[i] = cflag;
6766  }
6767 
6768  while ((c = trnext(&tr, enc)) != errc) {
6769  if (c < 256) {
6770  buf[c & 0xff] = !cflag;
6771  }
6772  else {
6773  VALUE key = UINT2NUM(c);
6774 
6775  if (!table && (first || *tablep || stable[256])) {
6776  if (cflag) {
6777  ptable = *ctablep;
6778  table = ptable ? ptable : rb_hash_new();
6779  *ctablep = table;
6780  }
6781  else {
6782  table = rb_hash_new();
6783  ptable = *tablep;
6784  *tablep = table;
6785  }
6786  }
6787  if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
6788  rb_hash_aset(table, key, Qtrue);
6789  }
6790  }
6791  }
6792  for (i=0; i<256; i++) {
6793  stable[i] = stable[i] && buf[i];
6794  }
6795  if (!table && !cflag) {
6796  *tablep = 0;
6797  }
6798 }
6799 
6800 
6801 static int
6802 tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
6803 {
6804  if (c < 256) {
6805  return table[c] != 0;
6806  }
6807  else {
6808  VALUE v = UINT2NUM(c);
6809 
6810  if (del) {
6811  if (!NIL_P(rb_hash_lookup(del, v)) &&
6812  (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
6813  return TRUE;
6814  }
6815  }
6816  else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
6817  return FALSE;
6818  }
6819  return table[256] ? TRUE : FALSE;
6820  }
6821 }
6822 
6823 /*
6824  * call-seq:
6825  * str.delete!([other_str]+) -> str or nil
6826  *
6827  * Performs a <code>delete</code> operation in place, returning <i>str</i>, or
6828  * <code>nil</code> if <i>str</i> was not modified.
6829  */
6830 
6831 static VALUE
6833 {
    /*
     * In-place delete: removes every character selected by the intersection
     * of the argv specifications and compacts the buffer.  Returns str when
     * at least one character was removed, Qnil otherwise (also for an empty
     * receiver).
     *
     * NOTE(review): the embedded listing numbers skip 6832 and 6842.  6832 is
     * the declarator; the forward declaration earlier in this file reads
     * `static VALUE rb_str_delete_bang(int,VALUE*,VALUE);` and the body uses
     * argc, argv and str.  6842 is a lost statement; as written, argc == 0
     * would leave enc == 0 for rb_enc_asciicompat(enc), so presumably it was
     * an argument-count check - confirm against the upstream file.
     */
6834  char squeez[TR_TABLE_SIZE];
6835  rb_encoding *enc = 0;
6836  char *s, *send, *t;
6837  VALUE del = 0, nodel = 0;
6838  int modify = 0;
6839  int i, ascompat, cr;
6840
6841  if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
6843  for (i=0; i<argc; i++) {
6844  VALUE s = argv[i];
6845
6846  StringValue(s);
6847  enc = rb_enc_check(str, s);
6848  tr_setup_table(s, squeez, i==0, &del, &nodel, enc); /* first argument initialises the tables, later ones narrow them */
6849  }
6850
6851  str_modify_keep_cr(str);
6852  ascompat = rb_enc_asciicompat(enc);
6853  s = t = RSTRING_PTR(str); /* s reads, t writes; both walk the same buffer */
6854  send = RSTRING_END(str);
6855  cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
6856  while (s < send) {
6857  unsigned int c;
6858  int clen;
6859
6860  if (ascompat && (c = *(unsigned char*)s) < 0x80) { /* ASCII byte: consult the byte table directly */
6861  if (squeez[c]) {
6862  modify = 1; /* dropped: do not advance t */
6863  }
6864  else {
6865  if (t != s) *t = c;
6866  t++;
6867  }
6868  s++;
6869  }
6870  else {
6871  c = rb_enc_codepoint_len(s, send, &clen, enc);
6872
6873  if (tr_find(c, squeez, del, nodel)) {
6874  modify = 1; /* dropped: do not advance t */
6875  }
6876  else {
6877  if (t != s) rb_enc_mbcput(c, t, enc);
6878  t += clen;
6879  if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID; /* a character from the non-ASCII path was kept */
6880  }
6881  s += clen;
6882  }
6883  }
6884  TERM_FILL(t, TERM_LEN(str));
6885  STR_SET_LEN(str, t - RSTRING_PTR(str));
6886  ENC_CODERANGE_SET(str, cr);
6887
6888  if (modify) return str;
6889  return Qnil; /* nothing was deleted */
6890 }
6891 
6892 
6893 /*
6894  * call-seq:
6895  * str.delete([other_str]+) -> new_str
6896  *
6897  * Returns a copy of <i>str</i> with all characters in the intersection of its
6898  * arguments deleted. Uses the same rules for building the set of characters as
6899  * <code>String#count</code>.
6900  *
6901  * "hello".delete "l","lo" #=> "heo"
6902  * "hello".delete "lo" #=> "he"
6903  * "hello".delete "aeiou", "^e" #=> "hell"
6904  * "hello".delete "ej-m" #=> "ho"
6905  */
6906 
6907 static VALUE
6909 {
    /* NOTE(review): listing line 6908 (the declarator with the function name
     * and parameter list) is absent from this extract; the body uses argc,
     * argv and str.  Restore it from the upstream file. */
6910  str = rb_str_dup(str); /* work on a duplicate so the receiver is not modified */
6911  rb_str_delete_bang(argc, argv, str); /* return value (str or nil) is not used */
6912  return str; /* the copy is returned whether or not it changed */
6913 }
6914 
6915 
6916 /*
6917  * call-seq:
6918  * str.squeeze!([other_str]*) -> str or nil
6919  *
6920  * Squeezes <i>str</i> in place, returning either <i>str</i>, or
6921  * <code>nil</code> if no changes were made.
6922  */
6923 
6924 static VALUE
6926 {
6927  char squeez[TR_TABLE_SIZE];
6928  rb_encoding *enc = 0;
6929  VALUE del = 0, nodel = 0;
6930  char *s, *send, *t;
6931  int i, modify = 0;
6932  int ascompat, singlebyte = single_byte_optimizable(str);
6933  unsigned int save;
6934 
6935  if (argc == 0) {
6936  enc = STR_ENC_GET(str);
6937  }
6938  else {
6939  for (i=0; i<argc; i++) {
6940  VALUE s = argv[i];
6941 
6942  StringValue(s);
6943  enc = rb_enc_check(str, s);
6944  if (singlebyte && !single_byte_optimizable(s))
6945  singlebyte = 0;
6946  tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
6947  }
6948  }
6949 
6950  str_modify_keep_cr(str);
6951  s = t = RSTRING_PTR(str);
6952  if (!s || RSTRING_LEN(str) == 0) return Qnil;
6953  send = RSTRING_END(str);
6954  save = -1;
6955  ascompat = rb_enc_asciicompat(enc);
6956 
6957  if (singlebyte) {
6958  while (s < send) {
6959  unsigned int c = *(unsigned char*)s++;
6960  if (c != save || (argc > 0 && !squeez[c])) {
6961  *t++ = save = c;
6962  }
6963  }
6964  }
6965  else {
6966  while (s < send) {
6967  unsigned int c;
6968  int clen;
6969 
6970  if (ascompat && (c = *(unsigned char*)s) < 0x80) {
6971  if (c != save || (argc > 0 && !squeez[c])) {
6972  *t++ = save = c;
6973  }
6974  s++;
6975  }
6976  else {
6977  c = rb_enc_codepoint_len(s, send, &clen, enc);
6978 
6979  if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
6980  if (t != s) rb_enc_mbcput(c, t, enc);
6981  save = c;
6982  t += clen;
6983  }
6984  s += clen;
6985  }
6986  }
6987  }
6988 
6989  TERM_FILL(t, TERM_LEN(str));
6990  if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
6991  STR_SET_LEN(str, t - RSTRING_PTR(str));
6992  modify = 1;
6993  }
6994 
6995  if (modify) return str;
6996  return Qnil;
6997 }
6998 
6999 
7000 /*
7001  * call-seq:
7002  * str.squeeze([other_str]*) -> new_str
7003  *
7004  * Builds a set of characters from the <i>other_str</i> parameter(s) using the
7005  * procedure described for <code>String#count</code>. Returns a new string
7006  * where runs of the same character that occur in this set are replaced by a
7007  * single character. If no arguments are given, all runs of identical
7008  * characters are replaced by a single character.
7009  *
7010  * "yellow moon".squeeze #=> "yelow mon"
7011  * " now is the".squeeze(" ") #=> " now is the"
7012  * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
7013  */
7014 
7015 static VALUE
7017 {
    /* NOTE(review): listing line 7016 (the declarator with the function name
     * and parameter list) is absent from this extract; the body uses argc,
     * argv and str.  Restore it from the upstream file. */
7018  str = rb_str_dup(str); /* work on a duplicate so the receiver is not modified */
7019  rb_str_squeeze_bang(argc, argv, str); /* return value (str or nil) is not used */
7020  return str; /* the copy is returned whether or not it changed */
7021 }
7022 
7023 
7024 /*
7025  * call-seq:
7026  * str.tr_s!(from_str, to_str) -> str or nil
7027  *
7028  * Performs <code>String#tr_s</code> processing on <i>str</i> in place,
7029  * returning <i>str</i>, or <code>nil</code> if no changes were made.
7030  */
7031 
7032 static VALUE
7034 {
    /* NOTE(review): listing line 7033 (the declarator with the function name
     * and parameter list) is absent from this extract; the body uses str,
     * src and repl.  Restore it from the upstream file. */
7035  return tr_trans(str, src, repl, 1); /* sflag 1: translate and squeeze */
7036 }
7037 
7038 
7039 /*
7040  * call-seq:
7041  * str.tr_s(from_str, to_str) -> new_str
7042  *
7043  * Processes a copy of <i>str</i> as described under <code>String#tr</code>,
7044  * then removes duplicate characters in regions that were affected by the
7045  * translation.
7046  *
7047  * "hello".tr_s('l', 'r') #=> "hero"
7048  * "hello".tr_s('el', '*') #=> "h*o"
7049  * "hello".tr_s('el', 'hx') #=> "hhxo"
7050  */
7051 
7052 static VALUE
7053 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
7054 {
7055  str = rb_str_dup(str);
7056  tr_trans(str, src, repl, 1);
7057  return str;
7058 }
7059 
7060 
7061 /*
7062  * call-seq:
7063  * str.count([other_str]+) -> integer
7064  *
7065  * Each +other_str+ parameter defines a set of characters to count. The
7066  * intersection of these sets defines the characters to count in +str+. Any
7067  * +other_str+ that starts with a caret <code>^</code> is negated. The
7068  * sequence <code>c1-c2</code> means all characters between c1 and c2. The
7069  * backslash character <code>\</code> can be used to escape <code>^</code> or
7070  * <code>-</code> and is otherwise ignored unless it appears at the end of a
7071  * sequence or the end of a +other_str+.
7072  *
7073  * a = "hello world"
7074  * a.count "lo" #=> 5
7075  * a.count "lo", "o" #=> 2
7076  * a.count "hello", "^l" #=> 4
7077  * a.count "ej-m" #=> 4
7078  *
7079  * "hello^world".count "\\^aeiou" #=> 4
7080  * "hello-world".count "a\\-eo" #=> 4
7081  *
7082  * c = "hello world\\r\\n"
7083  * c.count "\\" #=> 2
7084  * c.count "\\A" #=> 0
7085  * c.count "X-\\w" #=> 3
7086  */
7087 
7088 static VALUE
7090 {
    /*
     * Counts the characters of str selected by the intersection of the argv
     * specifications and returns the count as an Integer.
     *
     * NOTE(review): the embedded listing numbers skip 7089 and 7098.  7089 is
     * the declarator (the body uses argc, argv and str).  7098 is a lost
     * statement; as written argv[0] is read unconditionally, so presumably it
     * was an argument-count check - confirm against the upstream file.
     */
7091  char table[TR_TABLE_SIZE];
7092  rb_encoding *enc = 0;
7093  VALUE del = 0, nodel = 0, tstr;
7094  char *s, *send;
7095  int i;
7096  int ascompat;
7097
7099
7100  tstr = argv[0];
7101  StringValue(tstr);
7102  enc = rb_enc_check(str, tstr);
7103  if (argc == 1) { /* fast path: a single one-byte specification is counted by plain byte comparison */
7104  const char *ptstr;
7105  if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
7106  (ptstr = RSTRING_PTR(tstr),
7107  ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
7108  !is_broken_string(str)) {
7109  int n = 0;
7110  int clen;
7111  unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
7112
7113  s = RSTRING_PTR(str);
7114  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
7115  send = RSTRING_END(str);
7116  while (s < send) {
7117  if (*(unsigned char*)s++ == c) n++;
7118  }
7119  return INT2NUM(n);
7120  }
7121  }
7122
7123  tr_setup_table(tstr, table, TRUE, &del, &nodel, enc); /* first specification initialises the tables */
7124  for (i=1; i<argc; i++) { /* remaining specifications narrow the selection */
7125  tstr = argv[i];
7126  StringValue(tstr);
7127  enc = rb_enc_check(str, tstr);
7128  tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
7129  }
7130
7131  s = RSTRING_PTR(str);
7132  if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
7133  send = RSTRING_END(str);
7134  ascompat = rb_enc_asciicompat(enc);
7135  i = 0; /* i is reused as the running count from here on */
7136  while (s < send) {
7137  unsigned int c;
7138
7139  if (ascompat && (c = *(unsigned char*)s) < 0x80) { /* ASCII byte: consult the byte table directly */
7140  if (table[c]) {
7141  i++;
7142  }
7143  s++;
7144  }
7145  else {
7146  int clen;
7147  c = rb_enc_codepoint_len(s, send, &clen, enc);
7148  if (tr_find(c, table, del, nodel)) {
7149  i++;
7150  }
7151  s += clen;
7152  }
7153  }
7154
7155  return INT2NUM(i);
7156 }
7157 
7158 static VALUE
7160 {
7161  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
7162  val = rb_check_string_type(val);
7163  if (NIL_P(val)) return 0;
7164  }
7165  return val;
7166 }
7167 
/*
 * Byte-indexed whitespace table: non-zero for HT (9), LF (10), VT (11),
 * FF (12), CR (13) and space (32), zero for every other byte value.
 */
static const char isspacetable[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,  /* 0x00: \t \n \v \f \r */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x10 */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x20: space */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x30 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x40 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x50 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x60 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x70 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xa0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xb0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xc0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xd0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 0xe0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   /* 0xf0 */
};

/* The cast keeps the index in 0..255 even when plain char is signed. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
7188 
7189 /*
7190  * call-seq:
7191  * str.split(pattern=nil, [limit]) -> an_array
7192  *
7193  * Divides <i>str</i> into substrings based on a delimiter, returning an array
7194  * of these substrings.
7195  *
7196  * If <i>pattern</i> is a <code>String</code>, then its contents are used as
7197  * the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
7198  * space, <i>str</i> is split on whitespace, with leading whitespace and runs
7199  * of contiguous whitespace characters ignored.
7200  *
7201  * If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
7202  * pattern matches. Whenever the pattern matches a zero-length string,
7203  * <i>str</i> is split into individual characters. If <i>pattern</i> contains
7204  * groups, the respective matches will be returned in the array as well.
7205  *
7206  * If <i>pattern</i> is <code>nil</code>, the value of <code>$;</code> is used.
7207  * If <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
7208  * split on whitespace as if ' ' were specified.
7209  *
7210  * If the <i>limit</i> parameter is omitted, trailing null fields are
7211  * suppressed. If <i>limit</i> is a positive number, at most that number
7212  * of split substrings will be returned (captured groups will be returned
7213  * as well, but are not counted towards the limit).
7214  * If <i>limit</i> is <code>1</code>, the entire
7215  * string is returned as the only entry in an array. If negative, there is no
7216  * limit to the number of fields returned, and trailing null fields are not
7217  * suppressed.
7218  *
7219  * When the input +str+ is empty an empty Array is returned as the string is
7220  * considered to have no fields to split.
7221  *
7222  * " now's the time".split #=> ["now's", "the", "time"]
7223  * " now's the time".split(' ') #=> ["now's", "the", "time"]
7224  * " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
7225  * "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
7226  * "hello".split(//) #=> ["h", "e", "l", "l", "o"]
7227  * "hello".split(//, 3) #=> ["h", "e", "llo"]
7228  * "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
7229  *
7230  * "mellow yellow".split("ello") #=> ["m", "w y", "w"]
7231  * "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
7232  * "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
7233  * "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
7234  *
7235  * "1:2:3".split(/(:)()()/, 2) #=> ["1", ":", "", "", "2:3"]
7236  *
7237  * "".split(',', -1) #=> []
7238  */
7239 
7240 static VALUE
7242 {
7243  rb_encoding *enc;
7244  VALUE spat;
7245  VALUE limit;
7246  enum {awk, string, regexp} split_type;
7247  long beg, end, i = 0;
7248  int lim = 0;
7249  VALUE result, tmp;
7250 
7251  if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
7252  lim = NUM2INT(limit);
7253  if (lim <= 0) limit = Qnil;
7254  else if (lim == 1) {
7255  if (RSTRING_LEN(str) == 0)
7256  return rb_ary_new2(0);
7257  return rb_ary_new3(1, str);
7258  }
7259  i = 1;
7260  }
7261 
7262  enc = STR_ENC_GET(str);
7263  split_type = regexp;
7264  if (!NIL_P(spat)) {
7265  spat = get_pat_quoted(spat, 0);
7266  }
7267  else if (NIL_P(spat = rb_fs)) {
7268  split_type = awk;
7269  }
7270  else if (!(spat = rb_fs_check(spat))) {
7271  rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
7272  }
7273  if (split_type != awk) {
7274  if (BUILTIN_TYPE(spat) == T_STRING) {
7275  rb_encoding *enc2 = STR_ENC_GET(spat);
7276 
7277  mustnot_broken(spat);
7278  split_type = string;
7279  if (RSTRING_LEN(spat) == 0) {
7280  /* Special case - split into chars */
7281  spat = rb_reg_regcomp(spat);
7282  split_type = regexp;
7283  }
7284  else if (rb_enc_asciicompat(enc2) == 1) {
7285  if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' ') {
7286  split_type = awk;
7287  }
7288  }
7289  else {
7290  int l;
7291  if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
7292  RSTRING_LEN(spat) == l) {
7293  split_type = awk;
7294  }
7295  }
7296  }
7297  }
7298 
7299  result = rb_ary_new();
7300  beg = 0;
7301  if (split_type == awk) {
7302  char *ptr = RSTRING_PTR(str);
7303  char *eptr = RSTRING_END(str);
7304  char *bptr = ptr;
7305  int skip = 1;
7306  unsigned int c;
7307 
7308  end = beg;
7309  if (is_ascii_string(str)) {
7310  while (ptr < eptr) {
7311  c = (unsigned char)*ptr++;
7312  if (skip) {
7313  if (ascii_isspace(c)) {
7314  beg = ptr - bptr;
7315  }
7316  else {
7317  end = ptr - bptr;
7318  skip = 0;
7319  if (!NIL_P(limit) && lim <= i) break;
7320  }
7321  }
7322  else if (ascii_isspace(c)) {
7323  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
7324  skip = 1;
7325  beg = ptr - bptr;
7326  if (!NIL_P(limit)) ++i;
7327  }
7328  else {
7329  end = ptr - bptr;
7330  }
7331  }
7332  }
7333  else {
7334  while (ptr < eptr) {
7335  int n;
7336 
7337  c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
7338  ptr += n;
7339  if (skip) {
7340  if (rb_isspace(c)) {
7341  beg = ptr - bptr;
7342  }
7343  else {
7344  end = ptr - bptr;
7345  skip = 0;
7346  if (!NIL_P(limit) && lim <= i) break;
7347  }
7348  }
7349  else if (rb_isspace(c)) {
7350  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
7351  skip = 1;
7352  beg = ptr - bptr;
7353  if (!NIL_P(limit)) ++i;
7354  }
7355  else {
7356  end = ptr - bptr;
7357  }
7358  }
7359  }
7360  }
7361  else if (split_type == string) {
7362  char *ptr = RSTRING_PTR(str);
7363  char *temp = ptr;
7364  char *eptr = RSTRING_END(str);
7365  char *sptr = RSTRING_PTR(spat);
7366  long slen = RSTRING_LEN(spat);
7367 
7368  mustnot_broken(str);
7369  enc = rb_enc_check(str, spat);
7370  while (ptr < eptr &&
7371  (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
7372  /* Check we are at the start of a char */
7373  char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
7374  if (t != ptr + end) {
7375  ptr = t;
7376  continue;
7377  }
7378  rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
7379  ptr += end + slen;
7380  if (!NIL_P(limit) && lim <= ++i) break;
7381  }
7382  beg = ptr - temp;
7383  }
7384  else {
7385  char *ptr = RSTRING_PTR(str);
7386  long len = RSTRING_LEN(str);
7387  long start = beg;
7388  long idx;
7389  int last_null = 0;
7390  struct re_registers *regs;
7391 
7392  while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
7393  regs = RMATCH_REGS(rb_backref_get());
7394  if (start == end && BEG(0) == END(0)) {
7395  if (!ptr) {
7396  rb_ary_push(result, str_new_empty(str));
7397  break;
7398  }
7399  else if (last_null == 1) {
7400  rb_ary_push(result, rb_str_subseq(str, beg,
7401  rb_enc_fast_mbclen(ptr+beg,
7402  ptr+len,
7403  enc)));
7404  beg = start;
7405  }
7406  else {
7407  if (start == len)
7408  start++;
7409  else
7410  start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
7411  last_null = 1;
7412  continue;
7413  }
7414  }
7415  else {
7416  rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
7417  beg = start = END(0);
7418  }
7419  last_null = 0;
7420 
7421  for (idx=1; idx < regs->num_regs; idx++) {
7422  if (BEG(idx) == -1) continue;
7423  if (BEG(idx) == END(idx))
7424  tmp = str_new_empty(str);
7425  else
7426  tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
7427  rb_ary_push(result, tmp);
7428  }
7429  if (!NIL_P(limit) && lim <= ++i) break;
7430  }
7431  }
7432  if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
7433  if (RSTRING_LEN(str) == beg)
7434  tmp = str_new_empty(str);
7435  else
7436  tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
7437  rb_ary_push(result, tmp);
7438  }
7439  if (NIL_P(limit) && lim == 0) {
7440  long len;
7441  while ((len = RARRAY_LEN(result)) > 0 &&
7442  (tmp = RARRAY_AREF(result, len-1), RSTRING_LEN(tmp) == 0))
7443  rb_ary_pop(result);
7444  }
7445 
7446  return result;
7447 }
7448 
7449 VALUE
7450 rb_str_split(VALUE str, const char *sep0)
7451 {
7452  VALUE sep;
7453 
7454  StringValue(str);
7455  sep = rb_str_new_cstr(sep0);
7456  return rb_str_split_m(1, &sep, str);
7457 }
7458 
7459 static const char *
7460 chomp_newline(const char *p, const char *e, rb_encoding *enc)
7461 {
7462  const char *prev = rb_enc_prev_char(p, e, e, enc);
7463  if (rb_enc_is_newline(prev, e, enc)) {
7464  e = prev;
7465  prev = rb_enc_prev_char(p, e, e, enc);
7466  if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
7467  e = prev;
7468  }
7469  return e;
7470 }
7471 
7472 static VALUE
7473 rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
7474 {
7475  rb_encoding *enc;
7476  VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
7477  const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
7478  long pos, len, rslen;
7479  int paragraph_mode = 0;
7480  int rsnewline = 0;
7481 
7483 
7484  if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
7485  rs = rb_rs;
7486  if (!NIL_P(opts)) {
7487  static ID keywords[1];
7488  if (!keywords[0]) {
7489  keywords[0] = rb_intern_const("chomp");
7490  }
7491  rb_get_kwargs(opts, keywords, 0, 1, &chomp);
7492  chomp = (chomp != Qundef && RTEST(chomp));
7493  }
7494 
7495  if (rb_block_given_p()) {
7496  if (wantarray) {
7497 #if STRING_ENUMERATORS_WANTARRAY
7498  rb_warn("given block not used");
7499  ary = rb_ary_new();
7500 #else
7501  rb_warning("passing a block to String#lines is deprecated");
7502  wantarray = 0;
7503 #endif
7504  }
7505  }
7506  else {
7507  if (wantarray)
7508  ary = rb_ary_new();
7509  else
7510  return SIZED_ENUMERATOR(str, argc, argv, 0);
7511  }
7512 
7513  if (NIL_P(rs)) {
7514  if (wantarray) {
7515  rb_ary_push(ary, str);
7516  return ary;
7517  }
7518  else {
7519  rb_yield(str);
7520  return orig;
7521  }
7522  }
7523 
7524  str = rb_str_new_frozen(str);
7525  ptr = subptr = RSTRING_PTR(str);
7526  pend = RSTRING_END(str);
7527  len = RSTRING_LEN(str);
7528  StringValue(rs);
7529  rslen = RSTRING_LEN(rs);
7530 
7531  if (rs == rb_default_rs)
7532  enc = rb_enc_get(str);
7533  else
7534  enc = rb_enc_check(str, rs);
7535 
7536  if (rslen == 0) {
7537  rsptr = "\n\n";
7538  rslen = 2;
7539  paragraph_mode = 1;
7540  rsnewline = 1;
7541  }
7542  else {
7543  rsptr = RSTRING_PTR(rs);
7544  if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
7545  rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
7546  rsnewline = 1;
7547  }
7548  }
7549 
7550  if ((rs == rb_default_rs || paragraph_mode) && !rb_enc_asciicompat(enc)) {
7551  rs = rb_str_new(rsptr, rslen);
7552  rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
7553  rsptr = RSTRING_PTR(rs);
7554  rslen = RSTRING_LEN(rs);
7555  }
7556 
7557  while (subptr < pend) {
7558  pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
7559  if (pos < 0) break;
7560  hit = subptr + pos;
7561  adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
7562  if (hit != adjusted) {
7563  subptr = adjusted;
7564  continue;
7565  }
7566  subend = hit + rslen;
7567  if (paragraph_mode) {
7568  while (subend < pend) {
7569  int n;
7570  if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
7571  n = 0;
7572  if (!rb_enc_is_newline(subend + n, pend, enc)) break;
7573  subend += n;
7574  subend += rb_enc_mbclen(subend, pend, enc);
7575  }
7576  }
7577  hit = subend;
7578  if (chomp) {
7579  if (rsnewline) {
7580  subend = chomp_newline(subptr, subend, enc);
7581  }
7582  else {
7583  subend -= rslen;
7584  }
7585  }
7586  line = rb_str_subseq(str, subptr - ptr, subend - subptr);
7587  if (wantarray) {
7588  rb_ary_push(ary, line);
7589  }
7590  else {
7591  rb_yield(line);
7592  str_mod_check(str, ptr, len);
7593  }
7594  subptr = hit;
7595  }
7596 
7597  if (subptr != pend) {
7598  if (chomp && paragraph_mode) {
7599  pend = chomp_newline(subptr, pend, enc);
7600  }
7601  line = rb_str_subseq(str, subptr - ptr, pend - subptr);
7602  if (wantarray)
7603  rb_ary_push(ary, line);
7604  else
7605  rb_yield(line);
7606  RB_GC_GUARD(str);
7607  }
7608 
7609  if (wantarray)
7610  return ary;
7611  else
7612  return orig;
7613 }
7614 
7615 /*
7616  * call-seq:
7617  * str.each_line(separator=$/) {|substr| block } -> str
7618  * str.each_line(separator=$/) -> an_enumerator
7619  *
7620  * Splits <i>str</i> using the supplied parameter as the record
7621  * separator (<code>$/</code> by default), passing each substring in
7622  * turn to the supplied block. If a zero-length record separator is
7623  * supplied, the string is split into paragraphs delimited by
7624  * multiple successive newlines.
7625  *
7626  * If no block is given, an enumerator is returned instead.
7627  *
7628  * print "Example one\n"
7629  * "hello\nworld".each_line {|s| p s}
7630  * print "Example two\n"
7631  * "hello\nworld".each_line('l') {|s| p s}
7632  * print "Example three\n"
7633  * "hello\n\n\nworld".each_line('') {|s| p s}
7634  *
7635  * <em>produces:</em>
7636  *
7637  * Example one
7638  * "hello\n"
7639  * "world"
7640  * Example two
7641  * "hel"
7642  * "l"
7643  * "o\nworl"
7644  * "d"
7645  * Example three
7646  * "hello\n\n\n"
7647  * "world"
7648  */
7649 
7650 static VALUE
7652 {
7653  return rb_str_enumerate_lines(argc, argv, str, 0);
7654 }
7655 
7656 /*
7657  * call-seq:
7658  * str.lines(separator=$/) -> an_array
7659  *
7660  * Returns an array of lines in <i>str</i> split using the supplied
7661  * record separator (<code>$/</code> by default). This is a
7662  * shorthand for <code>str.each_line(separator).to_a</code>.
7663  *
7664  * If a block is given, which is a deprecated form, works the same as
7665  * <code>each_line</code>.
7666  */
7667 
7668 static VALUE
7670 {
7671  return rb_str_enumerate_lines(argc, argv, str, 1);
7672 }
7673 
7674 static VALUE
7676 {
7677  return LONG2FIX(RSTRING_LEN(str));
7678 }
7679 
7680 static VALUE
7681 rb_str_enumerate_bytes(VALUE str, int wantarray)
7682 {
7683  long i;
7685 
7686  if (rb_block_given_p()) {
7687  if (wantarray) {
7688 #if STRING_ENUMERATORS_WANTARRAY
7689  rb_warn("given block not used");
7690  ary = rb_ary_new();
7691 #else
7692  rb_warning("passing a block to String#bytes is deprecated");
7693  wantarray = 0;
7694 #endif
7695  }
7696  }
7697  else {
7698  if (wantarray)
7699  ary = rb_ary_new2(RSTRING_LEN(str));
7700  else
7701  return SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
7702  }
7703 
7704  for (i=0; i<RSTRING_LEN(str); i++) {
7705  if (wantarray)
7706  rb_ary_push(ary, INT2FIX(RSTRING_PTR(str)[i] & 0xff));
7707  else
7708  rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
7709  }
7710  if (wantarray)
7711  return ary;
7712  else
7713  return str;
7714 }
7715 
7716 /*
7717  * call-seq:
7718  * str.each_byte {|integer| block } -> str
7719  * str.each_byte -> an_enumerator
7720  *
7721  * Passes each byte in <i>str</i> to the given block, or returns an
7722  * enumerator if no block is given.
7723  *
7724  * "hello".each_byte {|c| print c, ' ' }
7725  *
7726  * <em>produces:</em>
7727  *
7728  * 104 101 108 108 111
7729  */
7730 
7731 static VALUE
7733 {
7734  return rb_str_enumerate_bytes(str, 0);
7735 }
7736 
7737 /*
7738  * call-seq:
7739  * str.bytes -> an_array
7740  *
7741  * Returns an array of bytes in <i>str</i>. This is a shorthand for
7742  * <code>str.each_byte.to_a</code>.
7743  *
7744  * If a block is given, which is a deprecated form, works the same as
7745  * <code>each_byte</code>.
7746  */
7747 
7748 static VALUE
7750 {
7751  return rb_str_enumerate_bytes(str, 1);
7752 }
7753 
7754 static VALUE
7756 {
7757  return rb_str_length(str);
7758 }
7759 
7760 static VALUE
7761 rb_str_enumerate_chars(VALUE str, int wantarray)
7762 {
7763  VALUE orig = str;
7764  VALUE substr;
7765  long i, len, n;
7766  const char *ptr;
7767  rb_encoding *enc;
7769 
7770  str = rb_str_new_frozen(str);
7771  ptr = RSTRING_PTR(str);
7772  len = RSTRING_LEN(str);
7773  enc = rb_enc_get(str);
7774 
7775  if (rb_block_given_p()) {
7776  if (wantarray) {
7777 #if STRING_ENUMERATORS_WANTARRAY
7778  rb_warn("given block not used");
7779  ary = rb_ary_new_capa(str_strlen(str, enc)); /* str's enc*/
7780 #else
7781  rb_warning("passing a block to String#chars is deprecated");
7782  wantarray = 0;
7783 #endif
7784  }
7785  }
7786  else {
7787  if (wantarray)
7788  ary = rb_ary_new_capa(str_strlen(str, enc)); /* str's enc*/
7789  else
7790  return SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
7791  }
7792 
7794  for (i = 0; i < len; i += n) {
7795  n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
7796  substr = rb_str_subseq(str, i, n);
7797  if (wantarray)
7798  rb_ary_push(ary, substr);
7799  else
7800  rb_yield(substr);
7801  }
7802  }
7803  else {
7804  for (i = 0; i < len; i += n) {
7805  n = rb_enc_mbclen(ptr + i, ptr + len, enc);
7806  substr = rb_str_subseq(str, i, n);
7807  if (wantarray)
7808  rb_ary_push(ary, substr);
7809  else
7810  rb_yield(substr);
7811  }
7812  }
7813  RB_GC_GUARD(str);
7814  if (wantarray)
7815  return ary;
7816  else
7817  return orig;
7818 }
7819 
7820 /*
7821  * call-seq:
7822  * str.each_char {|cstr| block } -> str
7823  * str.each_char -> an_enumerator
7824  *
7825  * Passes each character in <i>str</i> to the given block, or returns
7826  * an enumerator if no block is given.
7827  *
7828  * "hello".each_char {|c| print c, ' ' }
7829  *
7830  * <em>produces:</em>
7831  *
7832  * h e l l o
7833  */
7834 
7835 static VALUE
7837 {
7838  return rb_str_enumerate_chars(str, 0);
7839 }
7840 
7841 /*
7842  * call-seq:
7843  * str.chars -> an_array
7844  *
7845  * Returns an array of characters in <i>str</i>. This is a shorthand
7846  * for <code>str.each_char.to_a</code>.
7847  *
7848  * If a block is given, which is a deprecated form, works the same as
7849  * <code>each_char</code>.
7850  */
7851 
7852 static VALUE
7854 {
7855  return rb_str_enumerate_chars(str, 1);
7856 }
7857 
7858 
7859 static VALUE
7861 {
7862  VALUE orig = str;
7863  int n;
7864  unsigned int c;
7865  const char *ptr, *end;
7866  rb_encoding *enc;
7868 
7869  if (single_byte_optimizable(str))
7870  return rb_str_enumerate_bytes(str, wantarray);
7871 
7872  str = rb_str_new_frozen(str);
7873  ptr = RSTRING_PTR(str);
7874  end = RSTRING_END(str);
7875  enc = STR_ENC_GET(str);
7876 
7877  if (rb_block_given_p()) {
7878  if (wantarray) {
7879 #if STRING_ENUMERATORS_WANTARRAY
7880  rb_warn("given block not used");
7881  ary = rb_ary_new_capa(str_strlen(str, enc)); /* str's enc*/
7882 #else
7883  rb_warning("passing a block to String#codepoints is deprecated");
7884  wantarray = 0;
7885 #endif
7886  }
7887  }
7888  else {
7889  if (wantarray)
7890  ary = rb_ary_new_capa(str_strlen(str, enc)); /* str's enc*/
7891  else
7892  return SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
7893  }
7894 
7895  while (ptr < end) {
7896  c = rb_enc_codepoint_len(ptr, end, &n, enc);
7897  if (wantarray)
7898  rb_ary_push(ary, UINT2NUM(c));
7899  else
7900  rb_yield(UINT2NUM(c));
7901  ptr += n;
7902  }
7903  RB_GC_GUARD(str);
7904  if (wantarray)
7905  return ary;
7906  else
7907  return orig;
7908 }
7909 
7910 /*
7911  * call-seq:
7912  * str.each_codepoint {|integer| block } -> str
7913  * str.each_codepoint -> an_enumerator
7914  *
7915  * Passes the <code>Integer</code> ordinal of each character in <i>str</i>,
7916  * also known as a <i>codepoint</i> when applied to Unicode strings to the
7917  * given block.
7918  *
7919  * If no block is given, an enumerator is returned instead.
7920  *
7921  * "hello\u0639".each_codepoint {|c| print c, ' ' }
7922  *
7923  * <em>produces:</em>
7924  *
7925  * 104 101 108 108 111 1593
7926  */
7927 
7928 static VALUE
7930 {
7931  return rb_str_enumerate_codepoints(str, 0);
7932 }
7933 
7934 /*
7935  * call-seq:
7936  * str.codepoints -> an_array
7937  *
7938  * Returns an array of the <code>Integer</code> ordinals of the
7939  * characters in <i>str</i>. This is a shorthand for
7940  * <code>str.each_codepoint.to_a</code>.
7941  *
7942  * If a block is given, which is a deprecated form, works the same as
7943  * <code>each_codepoint</code>.
7944  */
7945 
7946 static VALUE
7948 {
7949  return rb_str_enumerate_codepoints(str, 1);
7950 }
7951 
7952 
7953 static long
7955 {
7956  rb_encoding *enc = STR_ENC_GET(str);
7957  const char *p, *p2, *beg, *end;
7958 
7959  beg = RSTRING_PTR(str);
7960  end = beg + RSTRING_LEN(str);
7961  if (beg > end) return 0;
7962  p = rb_enc_prev_char(beg, end, end, enc);
7963  if (!p) return 0;
7964  if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
7965  p2 = rb_enc_prev_char(beg, p, end, enc);
7966  if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
7967  }
7968  return p - beg;
7969 }
7970 
7971 /*
7972  * call-seq:
7973  * str.chop! -> str or nil
7974  *
7975  * Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
7976  * or <code>nil</code> if <i>str</i> is the empty string. See also
7977  * <code>String#chomp!</code>.
7978  */
7979 
7980 static VALUE
7982 {
7983  str_modify_keep_cr(str);
7984  if (RSTRING_LEN(str) > 0) {
7985  long len;
7986  len = chopped_length(str);
7987  STR_SET_LEN(str, len);
7988  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
7989  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
7990  ENC_CODERANGE_CLEAR(str);
7991  }
7992  return str;
7993  }
7994  return Qnil;
7995 }
7996 
7997 
7998 /*
7999  * call-seq:
8000  * str.chop -> new_str
8001  *
8002  * Returns a new <code>String</code> with the last character removed. If the
8003  * string ends with <code>\r\n</code>, both characters are removed. Applying
8004  * <code>chop</code> to an empty string returns an empty
8005  * string. <code>String#chomp</code> is often a safer alternative, as it leaves
8006  * the string unchanged if it doesn't end in a record separator.
8007  *
8008  * "string\r\n".chop #=> "string"
8009  * "string\n\r".chop #=> "string\n"
8010  * "string\n".chop #=> "string"
8011  * "string".chop #=> "strin"
8012  * "x".chop.chop #=> ""
8013  */
8014 
8015 static VALUE
8017 {
8018  return rb_str_subseq(str, 0, chopped_length(str));
8019 }
8020 
8021 
8022 static long
8024 {
8025  rb_encoding *enc;
8026  int newline;
8027  char *pp, *e, *rsptr;
8028  long rslen;
8029  char *const p = RSTRING_PTR(str);
8030  long len = RSTRING_LEN(str);
8031 
8032  if (len == 0) return 0;
8033  e = p + len;
8034  if (rs == rb_default_rs) {
8035  smart_chomp:
8036  enc = rb_enc_get(str);
8037  if (rb_enc_mbminlen(enc) > 1) {
8038  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
8039  if (rb_enc_is_newline(pp, e, enc)) {
8040  e = pp;
8041  }
8042  pp = e - rb_enc_mbminlen(enc);
8043  if (pp >= p) {
8044  pp = rb_enc_left_char_head(p, pp, e, enc);
8045  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
8046  e = pp;
8047  }
8048  }
8049  }
8050  else {
8051  switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
8052  case '\n':
8053  if (--e > p && *(e-1) == '\r') {
8054  --e;
8055  }
8056  break;
8057  case '\r':
8058  --e;
8059  break;
8060  }
8061  }
8062  return e - p;
8063  }
8064 
8065  enc = rb_enc_get(str);
8066  RSTRING_GETMEM(rs, rsptr, rslen);
8067  if (rslen == 0) {
8068  if (rb_enc_mbminlen(enc) > 1) {
8069  while (e > p) {
8070  pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
8071  if (!rb_enc_is_newline(pp, e, enc)) break;
8072  e = pp;
8073  pp -= rb_enc_mbminlen(enc);
8074  if (pp >= p) {
8075  pp = rb_enc_left_char_head(p, pp, e, enc);
8076  if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
8077  e = pp;
8078  }
8079  }
8080  }
8081  }
8082  else {
8083  while (e > p && *(e-1) == '\n') {
8084  --e;
8085  if (e > p && *(e-1) == '\r')
8086  --e;
8087  }
8088  }
8089  return e - p;
8090  }
8091  if (rslen > len) return len;
8092 
8093  enc = rb_enc_get(rs);
8094  newline = rsptr[rslen-1];
8095  if (rslen == rb_enc_mbminlen(enc)) {
8096  if (rslen == 1) {
8097  if (newline == '\n')
8098  goto smart_chomp;
8099  }
8100  else {
8101  if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
8102  goto smart_chomp;
8103  }
8104  }
8105 
8106  enc = rb_enc_check(str, rs);
8107  if (is_broken_string(rs)) {
8108  return len;
8109  }
8110  pp = e - rslen;
8111  if (p[len-1] == newline &&
8112  (rslen <= 1 ||
8113  memcmp(rsptr, pp, rslen) == 0)) {
8114  if (rb_enc_left_char_head(p, pp, e, enc) == pp)
8115  return len - rslen;
8116  RB_GC_GUARD(rs);
8117  }
8118  return len;
8119 }
8120 
8121 static VALUE
8122 chomp_rs(int argc, const VALUE *argv)
8123 {
8124  rb_check_arity(argc, 0, 1);
8125  if (argc > 0) {
8126  VALUE rs = argv[0];
8127  if (!NIL_P(rs)) StringValue(rs);
8128  return rs;
8129  }
8130  else {
8131  return rb_rs;
8132  }
8133 }
8134 
8135 VALUE
8137 {
8138  long olen = RSTRING_LEN(str);
8139  long len = chompped_length(str, rs);
8140  if (len >= olen) return Qnil;
8141  STR_SET_LEN(str, len);
8142  TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
8143  if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
8144  ENC_CODERANGE_CLEAR(str);
8145  }
8146  return str;
8147 }
8148 
8149 /*
8150  * call-seq:
8151  * str.chomp!(separator=$/) -> str or nil
8152  *
8153  * Modifies <i>str</i> in place as described for <code>String#chomp</code>,
8154  * returning <i>str</i>, or <code>nil</code> if no modifications were made.
8155  */
8156 
8157 static VALUE
8159 {
8160  VALUE rs;
8161  str_modify_keep_cr(str);
8162  if (RSTRING_LEN(str) == 0) return Qnil;
8163  rs = chomp_rs(argc, argv);
8164  if (NIL_P(rs)) return Qnil;
8165  return rb_str_chomp_string(str, rs);
8166 }
8167 
8168 
8169 /*
8170  * call-seq:
8171  * str.chomp(separator=$/) -> new_str
8172  *
8173  * Returns a new <code>String</code> with the given record separator removed
8174  * from the end of <i>str</i> (if present). If <code>$/</code> has not been
8175  * changed from the default Ruby record separator, then <code>chomp</code> also
8176  * removes carriage return characters (that is it will remove <code>\n</code>,
8177  * <code>\r</code>, and <code>\r\n</code>). If <code>$/</code> is an empty string,
8178  * it will remove all trailing newlines from the string.
8179  *
8180  * "hello".chomp #=> "hello"
8181  * "hello\n".chomp #=> "hello"
8182  * "hello\r\n".chomp #=> "hello"
8183  * "hello\n\r".chomp #=> "hello\n"
8184  * "hello\r".chomp #=> "hello"
8185  * "hello \n there".chomp #=> "hello \n there"
8186  * "hello".chomp("llo") #=> "he"
8187  * "hello\r\n\r\n".chomp('') #=> "hello"
8188  * "hello\r\n\r\r\n".chomp('') #=> "hello\r\n\r"
8189  */
8190 
8191 static VALUE
8193 {
8194  VALUE rs = chomp_rs(argc, argv);
8195  if (NIL_P(rs)) return rb_str_dup(str);
8196  return rb_str_subseq(str, 0, chompped_length(str, rs));
8197 }
8198 
8199 static long
8200 lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
8201 {
8202  const char *const start = s;
8203 
8204  if (!s || s >= e) return 0;
8205 
8206  /* remove spaces at head */
8207  if (single_byte_optimizable(str)) {
8208  while (s < e && ascii_isspace(*s)) s++;
8209  }
8210  else {
8211  while (s < e) {
8212  int n;
8213  unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
8214 
8215  if (!rb_isspace(cc)) break;
8216  s += n;
8217  }
8218  }
8219  return s - start;
8220 }
8221 
8222 /*
8223  * call-seq:
8224  * str.lstrip! -> self or nil
8225  *
8226  * Removes leading whitespace from <i>str</i>, returning <code>nil</code> if no
8227  * change was made. See also <code>String#rstrip!</code> and
8228  * <code>String#strip!</code>.
8229  *
8230  * Refer to <code>strip</code> for the definition of whitespace.
8231  *
8232  * " hello ".lstrip! #=> "hello "
8233  * "hello ".lstrip! #=> nil
8234  * "hello".lstrip! #=> nil
8235  */
8236 
8237 static VALUE
8239 {
8240  rb_encoding *enc;
8241  char *start, *s;
8242  long olen, loffset;
8243 
8244  str_modify_keep_cr(str);
8245  enc = STR_ENC_GET(str);
8246  RSTRING_GETMEM(str, start, olen);
8247  loffset = lstrip_offset(str, start, start+olen, enc);
8248  if (loffset > 0) {
8249  long len = olen-loffset;
8250  s = start + loffset;
8251  memmove(start, s, len);
8252  STR_SET_LEN(str, len);
8253 #if !SHARABLE_MIDDLE_SUBSTRING
8254  TERM_FILL(start+len, rb_enc_mbminlen(enc));
8255 #endif
8256  return str;
8257  }
8258  return Qnil;
8259 }
8260 
8261 
8262 /*
8263  * call-seq:
8264  * str.lstrip -> new_str
8265  *
8266  * Returns a copy of <i>str</i> with leading whitespace removed. See also
8267  * <code>String#rstrip</code> and <code>String#strip</code>.
8268  *
8269  * Refer to <code>strip</code> for the definition of whitespace.
8270  *
8271  * " hello ".lstrip #=> "hello "
8272  * "hello".lstrip #=> "hello"
8273  */
8274 
8275 static VALUE
8277 {
8278  char *start;
8279  long len, loffset;
8280  RSTRING_GETMEM(str, start, len);
8281  loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
8282  if (loffset <= 0) return rb_str_dup(str);
8283  return rb_str_subseq(str, loffset, len - loffset);
8284 }
8285 
8286 static long
8287 rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
8288 {
8289  const char *t;
8290 
8292  if (!s || s >= e) return 0;
8293  t = e;
8294 
8295  /* remove trailing spaces or '\0's */
8296  if (single_byte_optimizable(str)) {
8297  unsigned char c;
8298  while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
8299  }
8300  else {
8301  char *tp;
8302 
8303  while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
8304  unsigned int c = rb_enc_codepoint(tp, e, enc);
8305  if (c && !rb_isspace(c)) break;
8306  t = tp;
8307  }
8308  }
8309  return e - t;
8310 }
8311 
8312 /*
8313  * call-seq:
8314  * str.rstrip! -> self or nil
8315  *
8316  * Removes trailing whitespace from <i>str</i>, returning <code>nil</code> if
8317  * no change was made. See also <code>String#lstrip!</code> and
8318  * <code>String#strip!</code>.
8319  *
8320  * Refer to <code>strip</code> for the definition of whitespace.
8321  *
8322  * " hello ".rstrip! #=> " hello"
8323  * " hello".rstrip! #=> nil
8324  * "hello".rstrip! #=> nil
8325  */
8326 
8327 static VALUE
8329 {
8330  rb_encoding *enc;
8331  char *start;
8332  long olen, roffset;
8333 
8334  str_modify_keep_cr(str);
8335  enc = STR_ENC_GET(str);
8336  RSTRING_GETMEM(str, start, olen);
8337  roffset = rstrip_offset(str, start, start+olen, enc);
8338  if (roffset > 0) {
8339  long len = olen - roffset;
8340 
8341  STR_SET_LEN(str, len);
8342 #if !SHARABLE_MIDDLE_SUBSTRING
8343  TERM_FILL(start+len, rb_enc_mbminlen(enc));
8344 #endif
8345  return str;
8346  }
8347  return Qnil;
8348 }
8349 
8350 
8351 /*
8352  * call-seq:
8353  * str.rstrip -> new_str
8354  *
8355  * Returns a copy of <i>str</i> with trailing whitespace removed. See also
8356  * <code>String#lstrip</code> and <code>String#strip</code>.
8357  *
8358  * Refer to <code>strip</code> for the definition of whitespace.
8359  *
8360  * " hello ".rstrip #=> " hello"
8361  * "hello".rstrip #=> "hello"
8362  */
8363 
8364 static VALUE
8366 {
8367  rb_encoding *enc;
8368  char *start;
8369  long olen, roffset;
8370 
8371  enc = STR_ENC_GET(str);
8372  RSTRING_GETMEM(str, start, olen);
8373  roffset = rstrip_offset(str, start, start+olen, enc);
8374 
8375  if (roffset <= 0) return rb_str_dup(str);
8376  return rb_str_subseq(str, 0, olen-roffset);
8377 }
8378 
8379 
8380 /*
8381  * call-seq:
8382  * str.strip! -> str or nil
8383  *
8384  * Removes leading and trailing whitespace from <i>str</i>. Returns
8385  * <code>nil</code> if <i>str</i> was not altered.
8386  *
8387  * Refer to <code>strip</code> for the definition of whitespace.
8388  */
8389 
8390 static VALUE
8392 {
8393  char *start;
8394  long olen, loffset, roffset;
8395  rb_encoding *enc;
8396 
8397  str_modify_keep_cr(str);
8398  enc = STR_ENC_GET(str);
8399  RSTRING_GETMEM(str, start, olen);
8400  loffset = lstrip_offset(str, start, start+olen, enc);
8401  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
8402 
8403  if (loffset > 0 || roffset > 0) {
8404  long len = olen-roffset;
8405  if (loffset > 0) {
8406  len -= loffset;
8407  memmove(start, start + loffset, len);
8408  }
8409  STR_SET_LEN(str, len);
8410 #if !SHARABLE_MIDDLE_SUBSTRING
8411  TERM_FILL(start+len, rb_enc_mbminlen(enc));
8412 #endif
8413  return str;
8414  }
8415  return Qnil;
8416 }
8417 
8418 
8419 /*
8420  * call-seq:
8421  * str.strip -> new_str
8422  *
8423  * Returns a copy of <i>str</i> with leading and trailing whitespace removed.
8424  *
8425  * Whitespace is defined as any of the following characters:
8426  * null, horizontal tab, line feed, vertical tab, form feed, carriage return, space.
8427  *
8428  * " hello ".strip #=> "hello"
8429  * "\tgoodbye\r\n".strip #=> "goodbye"
8430  * "\x00\t\n\v\f\r ".strip #=> ""
8431  */
8432 
8433 static VALUE
8435 {
8436  char *start;
8437  long olen, loffset, roffset;
8438  rb_encoding *enc = STR_ENC_GET(str);
8439 
8440  RSTRING_GETMEM(str, start, olen);
8441  loffset = lstrip_offset(str, start, start+olen, enc);
8442  roffset = rstrip_offset(str, start+loffset, start+olen, enc);
8443 
8444  if (loffset <= 0 && roffset <= 0) return rb_str_dup(str);
8445  return rb_str_subseq(str, loffset, olen-loffset-roffset);
8446 }
8447 
8448 static VALUE
8449 scan_once(VALUE str, VALUE pat, long *start)
8450 {
8451  VALUE result, match;
8452  struct re_registers *regs;
8453  int i;
8454 
8455  if (rb_pat_search(pat, str, *start, 1) >= 0) {
8456  match = rb_backref_get();
8457  regs = RMATCH_REGS(match);
8458  if (BEG(0) == END(0)) {
8459  rb_encoding *enc = STR_ENC_GET(str);
8460  /*
8461  * Always consume at least one character of the input string
8462  */
8463  if (RSTRING_LEN(str) > END(0))
8464  *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
8465  RSTRING_END(str), enc);
8466  else
8467  *start = END(0)+1;
8468  }
8469  else {
8470  *start = END(0);
8471  }
8472  if (regs->num_regs == 1) {
8473  return rb_reg_nth_match(0, match);
8474  }
8475  result = rb_ary_new2(regs->num_regs);
8476  for (i=1; i < regs->num_regs; i++) {
8477  rb_ary_push(result, rb_reg_nth_match(i, match));
8478  }
8479 
8480  return result;
8481  }
8482  return Qnil;
8483 }
8484 
8485 
8486 /*
8487  * call-seq:
8488  * str.scan(pattern) -> array
8489  * str.scan(pattern) {|match, ...| block } -> str
8490  *
8491  * Both forms iterate through <i>str</i>, matching the pattern (which may be a
8492  * <code>Regexp</code> or a <code>String</code>). For each match, a result is
8493  * generated and either added to the result array or passed to the block. If
8494  * the pattern contains no groups, each individual result consists of the
8495  * matched string, <code>$&</code>. If the pattern contains groups, each
8496  * individual result is itself an array containing one entry per group.
8497  *
8498  * a = "cruel world"
8499  * a.scan(/\w+/) #=> ["cruel", "world"]
8500  * a.scan(/.../) #=> ["cru", "el ", "wor"]
8501  * a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
8502  * a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
8503  *
8504  * And the block form:
8505  *
8506  * a.scan(/\w+/) {|w| print "<<#{w}>> " }
8507  * print "\n"
8508  * a.scan(/(.)(.)/) {|x,y| print y, x }
8509  * print "\n"
8510  *
8511  * <em>produces:</em>
8512  *
8513  * <<cruel>> <<world>>
8514  * rceu lowlr
8515  */
8516 
8517 static VALUE
8519 {
8520  VALUE result;
8521  long start = 0;
8522  long last = -1, prev = 0;
8523  char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
8524 
8525  pat = get_pat_quoted(pat, 1);
8526  mustnot_broken(str);
8527  if (!rb_block_given_p()) {
8528  VALUE ary = rb_ary_new();
8529 
8530  while (!NIL_P(result = scan_once(str, pat, &start))) {
8531  last = prev;
8532  prev = start;
8533  rb_ary_push(ary, result);
8534  }
8535  if (last >= 0) rb_pat_search(pat, str, last, 1);
8536  return ary;
8537  }
8538 
8539  while (!NIL_P(result = scan_once(str, pat, &start))) {
8540  last = prev;
8541  prev = start;
8542  rb_yield(result);
8543  str_mod_check(str, p, len);
8544  }
8545  if (last >= 0) rb_pat_search(pat, str, last, 1);
8546  return str;
8547 }
8548 
8549 
8550 /*
8551  * call-seq:
8552  * str.hex -> integer
8553  *
8554  * Treats leading characters from <i>str</i> as a string of hexadecimal digits
8555  * (with an optional sign and an optional <code>0x</code>) and returns the
8556  * corresponding number. Zero is returned on error.
8557  *
8558  * "0x0a".hex #=> 10
8559  * "-1234".hex #=> -4660
8560  * "0".hex #=> 0
8561  * "wombat".hex #=> 0
8562  */
8563 
8564 static VALUE
8566 {
8567  return rb_str_to_inum(str, 16, FALSE);
8568 }
8569 
8570 
8571 /*
8572  * call-seq:
8573  * str.oct -> integer
8574  *
8575  * Treats leading characters of <i>str</i> as a string of octal digits (with an
8576  * optional sign) and returns the corresponding number. Returns 0 if the
8577  * conversion fails.
8578  *
8579  * "123".oct #=> 83
8580  * "-377".oct #=> -255
8581  * "bad".oct #=> 0
8582  * "0377bad".oct #=> 255
8583  *
8584  * If +str+ starts with <code>0</code>, radix indicators are honored.
8585  * See Kernel#Integer.
8586  */
8587 
8588 static VALUE
8590 {
8591  return rb_str_to_inum(str, -8, FALSE);
8592 }
8593 
8594 
8595 /*
8596  * call-seq:
8597  * str.crypt(salt_str) -> new_str
8598  *
8599  * Applies a one-way cryptographic hash to <i>str</i> by invoking the
8600  * standard library function <code>crypt(3)</code> with the given
8601  * salt string. While the format and the result are system and
8602  * implementation dependent, using a salt matching the regular
8603  * expression <code>\A[a-zA-Z0-9./]{2}</code> should be valid and
8604  * safe on any platform, in which only the first two characters are
8605  * significant.
8606  *
8607  * This method is for use in system specific scripts, so if you want
8608  * a cross-platform hash function consider using Digest or OpenSSL
8609  * instead.
8610  */
8611 
8612 static VALUE
8614 {
8615 #ifdef HAVE_CRYPT_R
8616  struct crypt_data data;
8617 #else
8618  extern char *crypt(const char *, const char *);
8619 #endif
8620  VALUE result;
8621  const char *s, *saltp;
8622  char *res;
8623 #ifdef BROKEN_CRYPT
8624  char salt_8bit_clean[3];
8625 #endif
8626 
8627  StringValue(salt);
8628  mustnot_wchar(str);
8629  mustnot_wchar(salt);
8630  if (RSTRING_LEN(salt) < 2) {
8631  short_salt:
8632  rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
8633  }
8634 
8635  s = StringValueCStr(str);
8636  saltp = RSTRING_PTR(salt);
8637  if (!saltp[0] || !saltp[1]) goto short_salt;
8638 #ifdef BROKEN_CRYPT
8639  if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
8640  salt_8bit_clean[0] = saltp[0] & 0x7f;
8641  salt_8bit_clean[1] = saltp[1] & 0x7f;
8642  salt_8bit_clean[2] = '\0';
8643  saltp = salt_8bit_clean;
8644  }
8645 #endif
8646 #ifdef HAVE_CRYPT_R
8647 # ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
8648  data.initialized = 0;
8649 # endif
8650  res = crypt_r(s, saltp, &data);
8651 #else
8652  res = crypt(s, saltp);
8653 #endif
8654  if (!res) {
8655  rb_sys_fail("crypt");
8656  }
8657  result = rb_str_new_cstr(res);
8658  FL_SET_RAW(result, OBJ_TAINTED_RAW(str) | OBJ_TAINTED_RAW(salt));
8659  return result;
8660 }
8661 
8662 
8663 /*
8664  * call-seq:
8665  * str.ord -> integer
8666  *
8667  * Returns the <code>Integer</code> ordinal of a one-character string.
8668  *
8669  * "a".ord #=> 97
8670  */
8671 
8672 VALUE
8674 {
8675  unsigned int c;
8676 
8678  return UINT2NUM(c);
8679 }
8680 /*
8681  * call-seq:
8682  * str.sum(n=16) -> integer
8683  *
8684  * Returns a basic <em>n</em>-bit checksum of the characters in <i>str</i>,
8685  * where <em>n</em> is the optional <code>Integer</code> parameter, defaulting
8686  * to 16. The result is simply the sum of the binary value of each byte in
8687  * <i>str</i> modulo <code>2**n</code> (that is, masked with <code>2**n - 1</code>).
8688  * This is not a particularly good checksum.
8689  */
8690 
8691 static VALUE
8693 {
8694  VALUE vbits;
8695  int bits;
8696  char *ptr, *p, *pend;
8697  long len;
8698  VALUE sum = INT2FIX(0);
8699  unsigned long sum0 = 0;
8700 
8701  if (argc == 0) {
8702  bits = 16;
8703  }
8704  else {
8705  rb_scan_args(argc, argv, "01", &vbits);
8706  bits = NUM2INT(vbits);
8707  if (bits < 0)
8708  bits = 0;
8709  }
8710  ptr = p = RSTRING_PTR(str);
8711  len = RSTRING_LEN(str);
8712  pend = p + len;
8713 
8714  while (p < pend) {
8715  if (FIXNUM_MAX - UCHAR_MAX < sum0) {
8716  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
8717  str_mod_check(str, ptr, len);
8718  sum0 = 0;
8719  }
8720  sum0 += (unsigned char)*p;
8721  p++;
8722  }
8723 
8724  if (bits == 0) {
8725  if (sum0) {
8726  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
8727  }
8728  }
8729  else {
8730  if (sum == INT2FIX(0)) {
8731  if (bits < (int)sizeof(long)*CHAR_BIT) {
8732  sum0 &= (((unsigned long)1)<<bits)-1;
8733  }
8734  sum = LONG2FIX(sum0);
8735  }
8736  else {
8737  VALUE mod;
8738 
8739  if (sum0) {
8740  sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
8741  }
8742 
8743  mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
8744  mod = rb_funcall(mod, '-', 1, INT2FIX(1));
8745  sum = rb_funcall(sum, '&', 1, mod);
8746  }
8747  }
8748  return sum;
8749 }
8750 
/*
 * Shared worker behind the three justify wrappers, which call it with
 * jflag 'l', 'r' and 'c'.
 *
 * argv[0] is the target width in characters; the optional argv[1] is the
 * padding string (a single space when omitted).  jflag selects where the
 * padding goes: 'l' puts all of it on the right, 'r' puts all of it on the
 * left, and any other value gives the left side n/2 pad characters and the
 * right side the remainder.
 *
 * Returns rb_str_dup(str) when width is negative or not larger than the
 * length of str; otherwise a newly built string.
 * Raises ArgumentError for an empty padding string ("zero width padding")
 * and when the resulting size would overflow a long ("argument too big").
 */
8751 static VALUE
8752 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
8753 {
8754  rb_encoding *enc;
8755  VALUE w;
      /* flen: pad length from RSTRING_LEN; fclen: pad length from str_strlen */
8756  long width, len, flen = 1, fclen = 1;
8757  VALUE res;
8758  char *p;
8759  const char *f = " ";
      /* llen/rlen: pad amount still to emit on each side, counted in the
       * same unit as fclen; llen2/rlen2: size of the trailing partial copy
       * of the pad, as returned by str_offset */
8760  long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
8761  VALUE pad;
8762  int singlebyte = 1, cr;
8763  int termlen;
8764
8765  rb_scan_args(argc, argv, "11", &w, &pad);
8766  enc = STR_ENC_GET(str);
8767  termlen = rb_enc_mbminlen(enc);
8768  width = NUM2LONG(w);
8769  if (argc == 2) {
8770  StringValue(pad);
      /* from here on enc is the encoding negotiated between str and pad */
8771  enc = rb_enc_check(str, pad);
8772  f = RSTRING_PTR(pad);
8773  flen = RSTRING_LEN(pad);
8774  fclen = str_strlen(pad, enc); /* rb_enc_check */
8775  singlebyte = single_byte_optimizable(pad);
8776  if (flen == 0 || fclen == 0) {
8777  rb_raise(rb_eArgError, "zero width padding");
8778  }
8779  }
8780  len = str_strlen(str, enc); /* rb_enc_check */
      /* nothing to pad: hand back a duplicate of the receiver */
8781  if (width < 0 || len >= width) return rb_str_dup(str);
      /* n is the total amount of padding required */
8782  n = width - len;
8783  llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
8784  rlen = n - llen;
8785  cr = ENC_CODERANGE(str);
      /* a multi-byte pad may end with a partial repetition on either side;
       * look up how much of the pad that partial copy covers */
8786  if (flen > 1) {
8787  llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
8788  rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
8789  }
8790  size = RSTRING_LEN(str);
      /* compute (whole pad copies * flen) + partial copies + size, checking
       * each step against LONG_MAX before it is performed; len is reused
       * here as the running total */
8791  if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
8792  (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
8793  (len += llen2 + rlen2) >= LONG_MAX - size) {
8794  rb_raise(rb_eArgError, "argument too big");
8795  }
8796  len += size;
8797  res = str_new0(rb_obj_class(str), 0, len, termlen);
8798  p = RSTRING_PTR(res);
      /* left padding: a one-byte pad is written with memset, otherwise whole
       * copies of the pad followed by the partial copy */
8799  if (flen <= 1) {
8800  memset(p, *f, llen);
8801  p += llen;
8802  }
8803  else {
8804  while (llen >= fclen) {
8805  memcpy(p,f,flen);
8806  p += flen;
8807  llen -= fclen;
8808  }
8809  if (llen > 0) {
8810  memcpy(p, f, llen2);
8811  p += llen2;
8812  }
8813  }
      /* the receiver's own content */
8814  memcpy(p, RSTRING_PTR(str), size);
8815  p += size;
      /* right padding, same scheme as the left side */
8816  if (flen <= 1) {
8817  memset(p, *f, rlen);
8818  p += rlen;
8819  }
8820  else {
8821  while (rlen >= fclen) {
8822  memcpy(p,f,flen);
8823  p += flen;
8824  rlen -= fclen;
8825  }
8826  if (rlen > 0) {
8827  memcpy(p, f, rlen2);
8828  p += rlen2;
8829  }
8830  }
8831  TERM_FILL(p, termlen);
      /* the final length is wherever the write cursor ended up */
8832  STR_SET_LEN(res, p-RSTRING_PTR(res));
8833  OBJ_INFECT_RAW(res, str);
8834  if (!NIL_P(pad)) OBJ_INFECT_RAW(res, pad);
8835  rb_enc_associate(res, enc);
      /* combine the code ranges of str and pad; a broken result is left unset */
8836  if (argc == 2)
8837  cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
8838  if (cr != ENC_CODERANGE_BROKEN)
8839  ENC_CODERANGE_SET(res, cr);
8840
      /* f points into pad's buffer when a pad was given, so pad is guarded
       * until this point */
8841  RB_GC_GUARD(pad);
8842  return res;
8843 }
8844 
8845 
8846 /*
8847  * call-seq:
8848  * str.ljust(integer, padstr=' ') -> new_str
8849  *
8850  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
8851  * <code>String</code> of length <i>integer</i> with <i>str</i> left justified
8852  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
8853  *
8854  * "hello".ljust(4) #=> "hello"
8855  * "hello".ljust(20) #=> "hello "
8856  * "hello".ljust(20, '1234') #=> "hello123412341234123"
8857  */
8858 
8859 static VALUE
8861 {
8862  return rb_str_justify(argc, argv, str, 'l');
8863 }
8864 
8865 
8866 /*
8867  * call-seq:
8868  * str.rjust(integer, padstr=' ') -> new_str
8869  *
8870  * If <i>integer</i> is greater than the length of <i>str</i>, returns a new
8871  * <code>String</code> of length <i>integer</i> with <i>str</i> right justified
8872  * and padded with <i>padstr</i>; otherwise, returns <i>str</i>.
8873  *
8874  * "hello".rjust(4) #=> "hello"
8875  * "hello".rjust(20) #=> " hello"
8876  * "hello".rjust(20, '1234') #=> "123412341234123hello"
8877  */
8878 
8879 static VALUE
8881 {
8882  return rb_str_justify(argc, argv, str, 'r');
8883 }
8884 
8885 
8886 /*
8887  * call-seq:
8888  * str.center(width, padstr=' ') -> new_str
8889  *
8890  * Centers +str+ in +width+. If +width+ is greater than the length of +str+,
8891  * returns a new String of length +width+ with +str+ centered and padded with
8892  * +padstr+; otherwise, returns +str+.
8893  *
8894  * "hello".center(4) #=> "hello"
8895  * "hello".center(20) #=> " hello "
8896  * "hello".center(20, '123') #=> "1231231hello12312312"
8897  */
8898 
8899 static VALUE
8901 {
8902  return rb_str_justify(argc, argv, str, 'c');
8903 }
8904 
8905 /*
8906  * call-seq:
8907  * str.partition(sep) -> [head, sep, tail]
8908  * str.partition(regexp) -> [head, match, tail]
8909  *
8910  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string
8911  * and returns the part before it, the match, and the part
8912  * after it.
8913  * If it is not found, returns <i>str</i> and two empty strings.
8914  *
8915  * "hello".partition("l") #=> ["he", "l", "lo"]
8916  * "hello".partition("x") #=> ["hello", "", ""]
8917  * "hello".partition(/.l/) #=> ["h", "el", "lo"]
8918  */
8919 
8920 static VALUE
8922 {
8923  long pos;
8924 
8925  sep = get_pat_quoted(sep, 0);
8926  if (RB_TYPE_P(sep, T_REGEXP)) {
8927  pos = rb_reg_search(sep, str, 0, 0);
8928  if (pos < 0) {
8929  failed:
8930  return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
8931  }
8932  sep = rb_str_subpat(str, sep, INT2FIX(0));
8933  if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
8934  }
8935  else {
8936  pos = rb_str_index(str, sep, 0);
8937  if (pos < 0) goto failed;
8938  }
8939  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
8940  sep,
8941  rb_str_subseq(str, pos+RSTRING_LEN(sep),
8942  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
8943 }
8944 
8945 /*
8946  * call-seq:
8947  * str.rpartition(sep) -> [head, sep, tail]
8948  * str.rpartition(regexp) -> [head, match, tail]
8949  *
8950  * Searches <i>sep</i> or pattern (<i>regexp</i>) in the string from the end
8951  * of the string, and returns the part before it, the match, and the part
8952  * after it.
8953  * If it is not found, returns two empty strings and <i>str</i>.
8954  *
8955  * "hello".rpartition("l") #=> ["hel", "l", "o"]
8956  * "hello".rpartition("x") #=> ["", "", "hello"]
8957  * "hello".rpartition(/.l/) #=> ["he", "ll", "o"]
8958  */
8959 
8960 static VALUE
8962 {
8963  long pos = RSTRING_LEN(str);
8964  int regex = FALSE;
8965 
8966  if (RB_TYPE_P(sep, T_REGEXP)) {
8967  pos = rb_reg_search(sep, str, pos, 1);
8968  regex = TRUE;
8969  }
8970  else {
8971  VALUE tmp;
8972 
8973  tmp = rb_check_string_type(sep);
8974  if (NIL_P(tmp)) {
8975  rb_raise(rb_eTypeError, "type mismatch: %s given",
8976  rb_obj_classname(sep));
8977  }
8978  sep = tmp;
8979  pos = rb_str_sublen(str, pos);
8980  pos = rb_str_rindex(str, sep, pos);
8981  }
8982  if (pos < 0) {
8983  return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
8984  }
8985  if (regex) {
8986  sep = rb_reg_nth_match(0, rb_backref_get());
8987  }
8988  else {
8989  pos = rb_str_offset(str, pos);
8990  }
8991  return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
8992  sep,
8993  rb_str_subseq(str, pos+RSTRING_LEN(sep),
8994  RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
8995 }
8996 
8997 /*
8998  * call-seq:
8999  * str.start_with?([prefixes]+) -> true or false
9000  *
9001  * Returns true if +str+ starts with one of the +prefixes+ given.
9002  *
9003  * "hello".start_with?("hell") #=> true
9004  *
9005  * # returns true if one of the prefixes matches.
9006  * "hello".start_with?("heaven", "hell") #=> true
9007  * "hello".start_with?("heaven", "paradise") #=> false
9008  */
9009 
9010 static VALUE
9012 {
9013  int i;
9014 
9015  for (i=0; i<argc; i++) {
9016  VALUE tmp = argv[i];
9017  StringValue(tmp);
9018  rb_enc_check(str, tmp);
9019  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
9020  if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
9021  return Qtrue;
9022  }
9023  return Qfalse;
9024 }
9025 
9026 /*
9027  * call-seq:
9028  * str.end_with?([suffixes]+) -> true or false
9029  *
9030  * Returns true if +str+ ends with one of the +suffixes+ given.
9031  *
9032  * "hello".end_with?("ello") #=> true
9033  *
9034  * # returns true if one of the +suffixes+ matches.
9035  * "hello".end_with?("heaven", "ello") #=> true
9036  * "hello".end_with?("heaven", "paradise") #=> false
9037  */
9038 
9039 static VALUE
9041 {
9042  int i;
9043  char *p, *s, *e;
9044  rb_encoding *enc;
9045 
9046  for (i=0; i<argc; i++) {
9047  VALUE tmp = argv[i];
9048  StringValue(tmp);
9049  enc = rb_enc_check(str, tmp);
9050  if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
9051  p = RSTRING_PTR(str);
9052  e = p + RSTRING_LEN(str);
9053  s = e - RSTRING_LEN(tmp);
9054  if (rb_enc_left_char_head(p, s, e, enc) != s)
9055  continue;
9056  if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
9057  return Qtrue;
9058  }
9059  return Qfalse;
9060 }
9061 
9062 void
9064 {
9065  if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
9066  rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
9067  }
9068  *var = val;
9069 }
9070 
9071 static void
9073 {
9074  val = rb_fs_check(val);
9075  if (!val) {
9077  "value of %"PRIsVALUE" must be String or Regexp",
9078  rb_id2str(id));
9079  }
9080  *var = val;
9081 }
9082 
9083 
9084 /*
9085  * call-seq:
9086  * str.force_encoding(encoding) -> str
9087  *
9088  * Changes the encoding to +encoding+ and returns self.
9089  */
9090 
9091 static VALUE
9093 {
9094  str_modifiable(str);
9095  rb_enc_associate(str, rb_to_encoding(enc));
9096  ENC_CODERANGE_CLEAR(str);
9097  return str;
9098 }
9099 
9100 /*
9101  * call-seq:
9102  * str.b -> str
9103  *
9104  * Returns a copied string whose encoding is ASCII-8BIT.
9105  */
9106 
9107 static VALUE
9109 {
9110  VALUE str2 = str_alloc(rb_cString);
9111  str_replace_shared_without_enc(str2, str);
9112  OBJ_INFECT_RAW(str2, str);
9113  ENC_CODERANGE_CLEAR(str2);
9114  return str2;
9115 }
9116 
9117 /*
9118  * call-seq:
9119  * str.valid_encoding? -> true or false
9120  *
9121  * Returns true for a string which is encoded correctly.
9122  *
9123  * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? #=> true
9124  * "\xc2".force_encoding("UTF-8").valid_encoding? #=> false
9125  * "\x80".force_encoding("UTF-8").valid_encoding? #=> false
9126  */
9127 
9128 static VALUE
9130 {
9131  int cr = rb_enc_str_coderange(str);
9132 
9133  return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
9134 }
9135 
9136 /*
9137  * call-seq:
9138  * str.ascii_only? -> true or false
9139  *
9140  * Returns true for a string which has only ASCII characters.
9141  *
9142  * "abc".force_encoding("UTF-8").ascii_only? #=> true
9143  * "abc\u{6666}".force_encoding("UTF-8").ascii_only? #=> false
9144  */
9145 
9146 static VALUE
9148 {
9149  int cr = rb_enc_str_coderange(str);
9150 
9151  return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
9152 }
9153 
9168 VALUE
9169 rb_str_ellipsize(VALUE str, long len)
9170 {
9171  static const char ellipsis[] = "...";
9172  const long ellipsislen = sizeof(ellipsis) - 1;
9173  rb_encoding *const enc = rb_enc_get(str);
9174  const long blen = RSTRING_LEN(str);
9175  const char *const p = RSTRING_PTR(str), *e = p + blen;
9176  VALUE estr, ret = 0;
9177 
9178  if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
9179  if (len * rb_enc_mbminlen(enc) >= blen ||
9180  (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
9181  ret = str;
9182  }
9183  else if (len <= ellipsislen ||
9184  !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
9185  if (rb_enc_asciicompat(enc)) {
9186  ret = rb_str_new_with_class(str, ellipsis, len);
9187  rb_enc_associate(ret, enc);
9188  }
9189  else {
9190  estr = rb_usascii_str_new(ellipsis, len);
9191  ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
9192  }
9193  }
9194  else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
9195  rb_str_cat(ret, ellipsis, ellipsislen);
9196  }
9197  else {
9198  estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
9199  rb_enc_from_encoding(enc), 0, Qnil);
9200  rb_str_append(ret, estr);
9201  }
9202  return ret;
9203 }
9204 
9205 static VALUE
9207 {
9208  int cr;
9209  str = StringValue(str);
9210  cr = rb_enc_str_coderange(str);
9211  if (cr == ENC_CODERANGE_BROKEN) {
9212  rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
9213  }
9214  else {
9215  rb_encoding *e = STR_ENC_GET(str);
9216  if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
9217  rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
9218  rb_enc_name(enc), rb_enc_name(e));
9219  }
9220  }
9221  return str;
9222 }
9223 
9224 static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
9225 
9231 VALUE
9233 {
9234  rb_encoding *enc = STR_ENC_GET(str);
9235  return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
9236 }
9237 
9238 VALUE
9240 {
9241  int cr = ENC_CODERANGE_UNKNOWN;
9242  if (enc == STR_ENC_GET(str)) {
9243  /* cached coderange makes sense only when enc equals the
9244  * actual encoding of str */
9245  cr = ENC_CODERANGE(str);
9246  }
9247  return enc_str_scrub(enc, str, repl, cr);
9248 }
9249 
/*
 * Builds a copy of str in which every byte sequence that is invalid in enc
 * is replaced.
 *
 * enc  - encoding to validate against
 * str  - string to examine
 * repl - replacement string, or nil; when nil and a block is given the block
 *        is called with each invalid chunk, otherwise a built-in default
 *        replacement is used
 * cr   - code range already known for str under enc
 *
 * Returns Qnil when no replacement was necessary (cr is already clean, enc
 * is a dummy encoding, or the scan reaches the end without finding invalid
 * bytes); otherwise returns the newly built string.
 * Raises ArgumentError when both a block and repl are supplied; further
 * errors can come from str_compat_and_valid, which is applied to every
 * replacement.
 */
9250 static VALUE
9251 enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
9252 {
9253  int encidx;
9254  VALUE buf = Qnil;
9255  const char *rep;
      /* replen doubles as a mode flag: 0 means "ask the block" (see below) */
9256  long replen = -1;
9257  int tainted = 0;
9258
9259  if (rb_block_given_p()) {
9260  if (!NIL_P(repl))
9261  rb_raise(rb_eArgError, "both of block and replacement given");
9262  replen = 0;
9263  }
9264
9265  if (ENC_CODERANGE_CLEAN_P(cr))
9266  return Qnil;
9267
9268  if (!NIL_P(repl)) {
9269  repl = str_compat_and_valid(repl, enc);
9270  tainted = OBJ_TAINTED_RAW(repl);
9271  }
9272
9273  if (rb_enc_dummy_p(enc)) {
9274  return Qnil;
9275  }
9276  encidx = rb_enc_to_index(enc);
9277
      /* points rep/replen at a static copy of the literal, without its NUL */
9278 #define DEFAULT_REPLACE_CHAR(str) do { \
9279  static const char replace[sizeof(str)-1] = str; \
9280  rep = replace; replen = (int)sizeof(replace); \
9281  } while (0)
9282
9283  if (rb_enc_asciicompat(enc)) {
      /* p: scan cursor, e: end of data, p1: start of the pending run of
       * bytes that has not been copied into buf yet */
9284  const char *p = RSTRING_PTR(str);
9285  const char *e = RSTRING_END(str);
9286  const char *p1 = p;
9287  int rep7bit_p;
      /* pick the replacement: block (rep == NULL), explicit repl, U+FFFD
       * encoded as UTF-8, or "?" for every other encoding */
9288  if (!replen) {
9289  rep = NULL;
9290  rep7bit_p = FALSE;
9291  }
9292  else if (!NIL_P(repl)) {
9293  rep = RSTRING_PTR(repl);
9294  replen = RSTRING_LEN(repl);
9295  rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
9296  }
9297  else if (encidx == rb_utf8_encindex()) {
9298  DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
9299  rep7bit_p = FALSE;
9300  }
9301  else {
9302  DEFAULT_REPLACE_CHAR("?");
9303  rep7bit_p = TRUE;
9304  }
      /* start from 7BIT and upgrade to VALID when something wider is kept */
9305  cr = ENC_CODERANGE_7BIT;
9306
      /* jump to the first position search_nonascii reports, if any */
9307  p = search_nonascii(p, e);
9308  if (!p) {
9309  p = e;
9310  }
9311  while (p < e) {
9312  int ret = rb_enc_precise_mbclen(p, e, enc);
9313  if (MBCLEN_NEEDMORE_P(ret)) {
      /* truncated character at the end; handled after the loop */
9314  break;
9315  }
9316  else if (MBCLEN_CHARFOUND_P(ret)) {
9317  cr = ENC_CODERANGE_VALID;
9318  p += MBCLEN_CHARFOUND_LEN(ret);
9319  }
9320  else if (MBCLEN_INVALID_P(ret)) {
9321  /*
9322  * p1~p: valid ascii/multibyte chars
9323  * p ~e: invalid bytes + unknown bytes
9324  */
9325  long clen = rb_enc_mbmaxlen(enc);
9326  if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
9327  if (p > p1) {
9328  rb_str_buf_cat(buf, p1, p - p1);
9329  }
9330
      /* decide how many bytes (clen) form the invalid chunk: at most the
       * remaining input, shrunk until a prefix is reported as NEEDMORE */
9331  if (e - p < clen) clen = e - p;
9332  if (clen <= 2) {
9333  clen = 1;
9334  }
9335  else {
9336  const char *q = p;
9337  clen--;
9338  for (; clen > 1; clen--) {
9339  ret = rb_enc_precise_mbclen(q, q + clen, enc);
9340  if (MBCLEN_NEEDMORE_P(ret)) break;
9341  if (MBCLEN_INVALID_P(ret)) continue;
9342  UNREACHABLE;
9343  }
9344  }
9345  if (rep) {
9346  rb_str_buf_cat(buf, rep, replen);
9347  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
9348  }
9349  else {
      /* block mode: the block receives the invalid chunk and its result,
       * after validation, is appended instead */
9350  repl = rb_yield(rb_enc_str_new(p, clen, enc));
9351  repl = str_compat_and_valid(repl, enc);
9352  tainted |= OBJ_TAINTED_RAW(repl);
9353  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
9354  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
9355  cr = ENC_CODERANGE_VALID;
9356  }
9357  p += clen;
9358  p1 = p;
      /* skip ahead again; with nothing left to inspect the loop ends */
9359  p = search_nonascii(p, e);
9360  if (!p) {
9361  p = e;
9362  break;
9363  }
9364  }
9365  else {
9366  UNREACHABLE;
9367  }
9368  }
9369  if (NIL_P(buf)) {
9370  if (p == e) {
      /* nothing was replaced: record the code range found on str itself */
9371  ENC_CODERANGE_SET(str, cr);
9372  return Qnil;
9373  }
9374  buf = rb_str_buf_new(RSTRING_LEN(str));
9375  }
      /* flush the pending valid run, then replace a truncated tail if any */
9376  if (p1 < p) {
9377  rb_str_buf_cat(buf, p1, p - p1);
9378  }
9379  if (p < e) {
9380  if (rep) {
9381  rb_str_buf_cat(buf, rep, replen);
9382  if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
9383  }
9384  else {
9385  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
9386  repl = str_compat_and_valid(repl, enc);
9387  tainted |= OBJ_TAINTED_RAW(repl);
9388  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
9389  if (ENC_CODERANGE(repl) == ENC_CODERANGE_VALID)
9390  cr = ENC_CODERANGE_VALID;
9391  }
9392  }
9393  }
9394  else {
9395  /* ASCII incompatible */
9396  const char *p = RSTRING_PTR(str);
9397  const char *e = RSTRING_END(str);
9398  const char *p1 = p;
      /* chunk sizes below are handled in multiples of mbminlen */
9399  long mbminlen = rb_enc_mbminlen(enc);
      /* replacement selection as above, with a U+FFFD byte pattern for each
       * of the UTF-16/UTF-32 variants and "?" as the fallback */
9400  if (!replen) {
9401  rep = NULL;
9402  }
9403  else if (!NIL_P(repl)) {
9404  rep = RSTRING_PTR(repl);
9405  replen = RSTRING_LEN(repl);
9406  }
9407  else if (encidx == ENCINDEX_UTF_16BE) {
9408  DEFAULT_REPLACE_CHAR("\xFF\xFD");
9409  }
9410  else if (encidx == ENCINDEX_UTF_16LE) {
9411  DEFAULT_REPLACE_CHAR("\xFD\xFF");
9412  }
9413  else if (encidx == ENCINDEX_UTF_32BE) {
9414  DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
9415  }
9416  else if (encidx == ENCINDEX_UTF_32LE) {
9417  DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
9418  }
9419  else {
9420  DEFAULT_REPLACE_CHAR("?");
9421  }
9422
9423  while (p < e) {
9424  int ret = rb_enc_precise_mbclen(p, e, enc);
9425  if (MBCLEN_NEEDMORE_P(ret)) {
      /* truncated character at the end; handled after the loop */
9426  break;
9427  }
9428  else if (MBCLEN_CHARFOUND_P(ret)) {
9429  p += MBCLEN_CHARFOUND_LEN(ret);
9430  }
9431  else if (MBCLEN_INVALID_P(ret)) {
9432  const char *q = p;
9433  long clen = rb_enc_mbmaxlen(enc);
9434  if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
9435  if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
9436
      /* same chunk-size search as the ASCII-compatible branch, stepping by
       * mbminlen instead of one byte */
9437  if (e - p < clen) clen = e - p;
9438  if (clen <= mbminlen * 2) {
9439  clen = mbminlen;
9440  }
9441  else {
9442  clen -= mbminlen;
9443  for (; clen > mbminlen; clen-=mbminlen) {
9444  ret = rb_enc_precise_mbclen(q, q + clen, enc);
9445  if (MBCLEN_NEEDMORE_P(ret)) break;
9446  if (MBCLEN_INVALID_P(ret)) continue;
9447  UNREACHABLE;
9448  }
9449  }
9450  if (rep) {
9451  rb_str_buf_cat(buf, rep, replen);
9452  }
9453  else {
9454  repl = rb_yield(rb_enc_str_new(p, clen, enc));
9455  repl = str_compat_and_valid(repl, enc);
9456  tainted |= OBJ_TAINTED_RAW(repl);
9457  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
9458  }
9459  p += clen;
9460  p1 = p;
9461  }
9462  else {
9463  UNREACHABLE;
9464  }
9465  }
9466  if (NIL_P(buf)) {
9467  if (p == e) {
      /* NOTE(review): the listing jumps from 9467 to 9469 here, so one
       * source line is absent from this extract.  The ASCII-compatible
       * branch records the code range on str at the matching spot; confirm
       * against the upstream file. */
9469  return Qnil;
9470  }
9471  buf = rb_str_buf_new(RSTRING_LEN(str));
9472  }
      /* flush the pending valid run, then replace a truncated tail if any */
9473  if (p1 < p) {
9474  rb_str_buf_cat(buf, p1, p - p1);
9475  }
9476  if (p < e) {
9477  if (rep) {
9478  rb_str_buf_cat(buf, rep, replen);
9479  }
9480  else {
9481  repl = rb_yield(rb_enc_str_new(p, e-p, enc));
9482  repl = str_compat_and_valid(repl, enc);
9483  tainted |= OBJ_TAINTED_RAW(repl);
9484  rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
9485  }
9486  }
9487  cr = ENC_CODERANGE_VALID;
9488  }
      /* the result carries the taint of str and of every replacement used,
       * plus the encoding and the code range worked out above */
9489  FL_SET_RAW(buf, tainted|OBJ_TAINTED_RAW(str));
9490  ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
9491  return buf;
9492 }
9493 
9494 /*
9495  * call-seq:
9496  * str.scrub -> new_str
9497  * str.scrub(repl) -> new_str
9498  * str.scrub{|bytes|} -> new_str
9499  *
9500  * If the string contains invalid byte sequences, returns a new string with the invalid
9501  * bytes replaced by the given replacement; otherwise returns a copy of the string.
9502  * If block is given, replace invalid bytes with returned value of the block.
9503  *
9504  * "abc\u3042\x81".scrub #=> "abc\u3042\uFFFD"
9505  * "abc\u3042\x81".scrub("*") #=> "abc\u3042*"
9506  * "abc\u3042\xE3\x80".scrub{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
9507  */
9508 static VALUE
9510 {
9511  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
9512  VALUE new = rb_str_scrub(str, repl);
9513  return NIL_P(new) ? rb_str_dup(str): new;
9514 }
9515 
9516 /*
9517  * call-seq:
9518  * str.scrub! -> str
9519  * str.scrub!(repl) -> str
9520  * str.scrub!{|bytes|} -> str
9521  *
9522  * If the string contains invalid byte sequences, replaces the invalid bytes in place with
9523  * the given replacement. Returns self in either case.
9524  * If block is given, replace invalid bytes with returned value of the block.
9525  *
9526  * "abc\u3042\x81".scrub! #=> "abc\u3042\uFFFD"
9527  * "abc\u3042\x81".scrub!("*") #=> "abc\u3042*"
9528  * "abc\u3042\xE3\x80".scrub!{|bytes| '<'+bytes.unpack('H*')[0]+'>' } #=> "abc\u3042<e380>"
9529  */
9530 static VALUE
9532 {
9533  VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
9534  VALUE new = rb_str_scrub(str, repl);
9535  if (!NIL_P(new)) rb_str_replace(str, new);
9536  return str;
9537 }
9538 
9539 /**********************************************************************
9540  * Document-class: Symbol
9541  *
9542  * <code>Symbol</code> objects represent names and some strings
9543  * inside the Ruby
9544  * interpreter. They are generated using the <code>:name</code> and
9545  * <code>:"string"</code> literals
9546  * syntax, and by the various <code>to_sym</code> methods. The same
9547  * <code>Symbol</code> object will be created for a given name or string
9548  * for the duration of a program's execution, regardless of the context
9549  * or meaning of that name. Thus if <code>Fred</code> is a constant in
9550  * one context, a method in another, and a class in a third, the
9551  * <code>Symbol</code> <code>:Fred</code> will be the same object in
9552  * all three contexts.
9553  *
9554  * module One
9555  * class Fred
9556  * end
9557  * $f1 = :Fred
9558  * end
9559  * module Two
9560  * Fred = 1
9561  * $f2 = :Fred
9562  * end
9563  * def Fred()
9564  * end
9565  * $f3 = :Fred
9566  * $f1.object_id #=> 2514190
9567  * $f2.object_id #=> 2514190
9568  * $f3.object_id #=> 2514190
9569  *
9570  */
9571 
9572 
9573 /*
9574  * call-seq:
9575  * sym == obj -> true or false
9576  *
9577  * Equality---If <i>sym</i> and <i>obj</i> are exactly the same
9578  * symbol, returns <code>true</code>.
9579  */
9580 
9581 #define sym_equal rb_obj_equal
9582 
9583 static int
9584 sym_printable(const char *s, const char *send, rb_encoding *enc)
9585 {
9586  while (s < send) {
9587  int n;
9588  int c = rb_enc_precise_mbclen(s, send, enc);
9589 
9590  if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
9591  n = MBCLEN_CHARFOUND_LEN(c);
9592  c = rb_enc_mbc_to_codepoint(s, send, enc);
9593  if (!rb_enc_isprint(c, enc)) return FALSE;
9594  s += n;
9595  }
9596  return TRUE;
9597 }
9598 
9599 int
9601 {
9602  rb_encoding *enc;
9603  const char *ptr;
9604  long len;
9606 
9607  if (resenc == NULL) resenc = rb_default_external_encoding();
9608  enc = STR_ENC_GET(sym);
9609  ptr = RSTRING_PTR(sym);
9610  len = RSTRING_LEN(sym);
9611  if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
9612  !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
9613  return FALSE;
9614  }
9615  return TRUE;
9616 }
9617 
9618 VALUE
9620 {
9621  rb_encoding *enc;
9622  const char *ptr;
9623  long len;
9624  rb_encoding *resenc;
9625 
9626  Check_Type(str, T_STRING);
9627  resenc = rb_default_internal_encoding();
9628  if (resenc == NULL) resenc = rb_default_external_encoding();
9629  enc = STR_ENC_GET(str);
9630  ptr = RSTRING_PTR(str);
9631  len = RSTRING_LEN(str);
9632  if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
9633  !sym_printable(ptr, ptr + len, enc)) {
9634  return rb_str_inspect(str);
9635  }
9636  return str;
9637 }
9638 
9639 VALUE
9641 {
9642  return rb_str_quote_unprintable(rb_id2str(id));
9643 }
9644 
9645 /*
9646  * call-seq:
9647  * sym.inspect -> string
9648  *
9649  * Returns the representation of <i>sym</i> as a symbol literal.
9650  *
9651  * :fred.inspect #=> ":fred"
9652  */
9653 
9654 static VALUE
9656 {
9657  VALUE str = rb_sym2str(sym);
9658  const char *ptr;
9659  long len;
9660  char *dest;
9661 
9662  if (!rb_str_symname_p(str)) {
9663  str = rb_str_inspect(str);
9664  len = RSTRING_LEN(str);
9665  rb_str_resize(str, len + 1);
9666  dest = RSTRING_PTR(str);
9667  memmove(dest + 1, dest, len);
9668  }
9669  else {
9670  rb_encoding *enc = STR_ENC_GET(str);
9671  RSTRING_GETMEM(str, ptr, len);
9672  str = rb_enc_str_new(0, len + 1, enc);
9673  dest = RSTRING_PTR(str);
9674  memcpy(dest + 1, ptr, len);
9675  }
9676  dest[0] = ':';
9677  return str;
9678 }
9679 
9680 
9681 /*
9682  * call-seq:
9683  * sym.id2name -> string
9684  * sym.to_s -> string
9685  *
9686  * Returns the name or string corresponding to <i>sym</i>.
9687  *
9688  * :fred.id2name #=> "fred"
9689  */
9690 
9691 
9692 VALUE
9694 {
9695  return str_new_shared(rb_cString, rb_sym2str(sym));
9696 }
9697 
9698 
9699 /*
9700  * call-seq:
9701  * sym.to_sym -> sym
9702  * sym.intern -> sym
9703  *
9704  * In general, <code>to_sym</code> returns the <code>Symbol</code> corresponding
9705  * to an object. As <i>sym</i> is already a symbol, <code>self</code> is returned
9706  * in this case.
9707  */
9708 
9709 static VALUE
9711 {
9712  return sym;
9713 }
9714 
9715 VALUE
9716 rb_sym_proc_call(ID mid, int argc, const VALUE *argv, VALUE passed_proc)
9717 {
9718  VALUE obj;
9719 
9720  if (argc < 1) {
9721  rb_raise(rb_eArgError, "no receiver given");
9722  }
9723  obj = argv[0];
9724  return rb_funcall_with_block(obj, mid, argc - 1, argv + 1, passed_proc);
9725 }
9726 
9727 #if 0
9728 /*
9729  * call-seq:
9730  * sym.to_proc
9731  *
9732  * Returns a _Proc_ object which responds to the given method by _sym_.
9733  *
9734  * (1..3).collect(&:to_s) #=> ["1", "2", "3"]
9735  */
9736 
9737 VALUE
9739 {
9740 }
9741 #endif
9742 
9743 /*
9744  * call-seq:
9745  *
9746  * sym.succ
9747  *
9748  * Same as <code>sym.to_s.succ.intern</code>.
9749  */
9750 
9751 static VALUE
9753 {
9754  return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
9755 }
9756 
9757 /*
9758  * call-seq:
9759  *
9760  * symbol <=> other_symbol -> -1, 0, +1 or nil
9761  *
9762  * Compares +symbol+ with +other_symbol+ after calling #to_s on each of the
9763  * symbols. Returns -1, 0, +1 or nil depending on whether +symbol+ is less
9764  * than, equal to, or greater than +other_symbol+.
9765  *
9766  * +nil+ is returned if the two values are incomparable.
9767  *
9768  * See String#<=> for more information.
9769  */
9770 
9771 static VALUE
9772 sym_cmp(VALUE sym, VALUE other)
9773 {
9774  if (!SYMBOL_P(other)) {
9775  return Qnil;
9776  }
9777  return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
9778 }
9779 
9780 /*
9781  * call-seq:
9782  *
9783  * sym.casecmp(other) -> -1, 0, +1 or nil
9784  *
9785  * Case-insensitive version of <code>Symbol#<=></code>.
9786  * Currently, case-insensitivity only works on characters A-Z/a-z,
9787  * not all of Unicode. This is different from <code>casecmp?</code>.
9788  */
9789 
9790 static VALUE
9792 {
9793  if (!SYMBOL_P(other)) {
9794  return Qnil;
9795  }
9796  return rb_str_casecmp(rb_sym2str(sym), rb_sym2str(other));
9797 }
9798 
9799 /*
9800  * call-seq:
9801  *
9802  * sym.casecmp?(other) -> true, false, or nil
9803  *
9804  * Returns true if sym and other are equal after Unicode case folding,
9805  * false if they are not equal, and nil if other is not a symbol.
9806  */
9807 
9808 static VALUE
9810 {
9811  if (!SYMBOL_P(other)) {
9812  return Qnil;
9813  }
9814  return rb_str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
9815 }
9816 
9817 /*
9818  * call-seq:
9819  * sym =~ obj -> integer or nil
9820  *
9821  * Returns <code>sym.to_s =~ obj</code>.
9822  */
9823 
9824 static VALUE
9825 sym_match(VALUE sym, VALUE other)
9826 {
9827  return rb_str_match(rb_sym2str(sym), other);
9828 }
9829 
9830 /*
9831  * call-seq:
9832  * sym.match(obj) -> MatchData or nil
9833  *
9834  * Returns <code>sym.to_s.match(obj)</code>.
9835  */
9836 
9837 static VALUE
9839 {
9840  return rb_str_match_m(argc, argv, rb_sym2str(sym));
9841 }
9842 
9843 /*
9844  * call-seq:
9845  * sym.match?(obj) -> true or false
9846  *
9847  * Returns <code>sym.to_s.match?(obj)</code>.
9848  */
9849 
9850 static VALUE
9852 {
9853  return rb_str_match_m_p(argc, argv, sym);
9854 }
9855 
9856 /*
9857  * call-seq:
9858  * sym[idx] -> char
9859  * sym[b, n] -> string
9860  * sym.slice(idx) -> char
9861  * sym.slice(b, n) -> string
9862  *
9863  * Returns <code>sym.to_s[]</code>.
9864  */
9865 
9866 static VALUE
9868 {
9869  return rb_str_aref_m(argc, argv, rb_sym2str(sym));
9870 }
9871 
9872 /*
9873  * call-seq:
9874  * sym.length -> integer
9875  * sym.size -> integer
9876  *
9877  * Same as <code>sym.to_s.length</code>.
9878  */
9879 
9880 static VALUE
9882 {
9883  return rb_str_length(rb_sym2str(sym));
9884 }
9885 
9886 /*
9887  * call-seq:
9888  * sym.empty? -> true or false
9889  *
9890  * Returns that _sym_ is :"" or not.
9891  */
9892 
9893 static VALUE
9895 {
9896  return rb_str_empty(rb_sym2str(sym));
9897 }
9898 
9899 /*
9900  * call-seq:
9901  * sym.upcase [options] -> symbol
9902  *
9903  * Same as <code>sym.to_s.upcase.intern</code>.
9904  */
9905 
9906 static VALUE
9908 {
9909  return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
9910 }
9911 
9912 /*
9913  * call-seq:
9914  * sym.downcase [options] -> symbol
9915  *
9916  * Same as <code>sym.to_s.downcase.intern</code>.
9917  */
9918 
9919 static VALUE
9921 {
9922  return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
9923 }
9924 
9925 /*
9926  * call-seq:
9927  * sym.capitalize [options] -> symbol
9928  *
9929  * Same as <code>sym.to_s.capitalize.intern</code>.
9930  */
9931 
9932 static VALUE
9934 {
9935  return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
9936 }
9937 
9938 /*
9939  * call-seq:
9940  * sym.swapcase [options] -> symbol
9941  *
9942  * Same as <code>sym.to_s.swapcase.intern</code>.
9943  */
9944 
9945 static VALUE
9947 {
9948  return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
9949 }
9950 
9951 /*
9952  * call-seq:
9953  * sym.encoding -> encoding
9954  *
9955  * Returns the Encoding object that represents the encoding of _sym_.
9956  */
9957 
9958 static VALUE
9960 {
9961  return rb_obj_encoding(rb_sym2str(sym));
9962 }
9963 
9964 static VALUE
9966 {
9967  if (!RB_TYPE_P(name, T_STRING)) {
9968  VALUE tmp = rb_check_string_type(name);
9969  if (NIL_P(tmp)) {
9970  rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
9971  name);
9972  }
9973  name = tmp;
9974  }
9975  return name;
9976 }
9977 
9978 ID
9980 {
9981  if (SYMBOL_P(name)) {
9982  return SYM2ID(name);
9983  }
9984  name = string_for_symbol(name);
9985  return rb_intern_str(name);
9986 }
9987 
9988 VALUE
9990 {
9991  if (SYMBOL_P(name)) {
9992  return name;
9993  }
9994  name = string_for_symbol(name);
9995  return rb_str_intern(name);
9996 }
9997 
9998 /*
9999  * A <code>String</code> object holds and manipulates an arbitrary sequence of
10000  * bytes, typically representing characters. String objects may be created
10001  * using <code>String::new</code> or as literals.
10002  *
10003  * Because of aliasing issues, users of strings should be aware of the methods
10004  * that modify the contents of a <code>String</code> object. Typically,
10005  * methods with names ending in ``!'' modify their receiver, while those
10006  * without a ``!'' return a new <code>String</code>. However, there are
10007  * exceptions, such as <code>String#[]=</code>.
10008  *
10009  */
10010 
10011 void
10013 {
10014 #undef rb_intern
10015 #define rb_intern(str) rb_intern_const(str)
10016 
10017  rb_cString = rb_define_class("String", rb_cObject);
10021  rb_define_method(rb_cString, "initialize", rb_str_init, -1);
10022  rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
10026  rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
10028  rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
10029  rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
10035  rb_define_method(rb_cString, "insert", rb_str_insert, 2);
10036  rb_define_method(rb_cString, "length", rb_str_length, 0);
10038  rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
10039  rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
10047  rb_define_method(rb_cString, "upto", rb_str_upto, -1);
10050  rb_define_method(rb_cString, "replace", rb_str_replace, 1);
10053  rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
10054  rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
10055  rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
10056  rb_define_method(rb_cString, "scrub", str_scrub, -1);
10057  rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
10058  rb_define_method(rb_cString, "freeze", rb_str_freeze, 0);
10061 
10062  rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
10065  rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
10066  rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
10068 
10069  sym_ascii = ID2SYM(rb_intern("ascii"));
10070  sym_turkic = ID2SYM(rb_intern("turkic"));
10071  sym_lithuanian = ID2SYM(rb_intern("lithuanian"));
10072  sym_fold = ID2SYM(rb_intern("fold"));
10073 
10074  rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
10075  rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
10076  rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
10077  rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
10078 
10081  rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
10083 
10087  rb_define_method(rb_cString, "lines", rb_str_lines, -1);
10090  rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
10091  rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
10097  rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
10098  rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
10100 
10101  rb_define_method(rb_cString, "include?", rb_str_include, 1);
10102  rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
10103  rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
10104 
10106 
10107  rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
10108  rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
10109  rb_define_method(rb_cString, "center", rb_str_center, -1);
10110 
10111  rb_define_method(rb_cString, "sub", rb_str_sub, -1);
10112  rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
10114  rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
10116  rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
10117  rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
10118 
10126 
10129  rb_define_method(rb_cString, "delete", rb_str_delete, -1);
10130  rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
10131  rb_define_method(rb_cString, "count", rb_str_count, -1);
10132 
10135  rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
10137 
10138  rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
10139  rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
10140  rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
10141  rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
10142 
10143  rb_define_method(rb_cString, "sum", rb_str_sum, -1);
10144 
10145  rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
10147 
10148  rb_define_method(rb_cString, "partition", rb_str_partition, 1);
10149  rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
10150 
10151  rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
10152  rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
10154  rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
10156 
10157  rb_fs = Qnil;
10160 
10161  rb_cSymbol = rb_define_class("Symbol", rb_cObject);
10165  rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0); /* in symbol.c */
10166 
10169  rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
10171  rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
10172  rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
10173  rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
10174  rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0);
10175  rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
10176  rb_define_method(rb_cSymbol, "next", sym_succ, 0);
10177 
10178  rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
10179  rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
10180  rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
10182 
10183  rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
10184  rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
10185  rb_define_method(rb_cSymbol, "length", sym_length, 0);
10186  rb_define_method(rb_cSymbol, "size", sym_length, 0);
10187  rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
10188  rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
10189  rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
10190 
10191  rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
10192  rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
10193  rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
10194  rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
10195 
10196  rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
10197 
10200 }
static int str_independent(VALUE str)
Definition: string.c:1942
static VALUE rb_str_casecmp_p(VALUE str1, VALUE str2)
Definition: string.c:3265
static VALUE str_uminus(VALUE str)
Definition: string.c:2504
static VALUE rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
Definition: string.c:6096
#define RBASIC_CLEAR_CLASS(obj)
Definition: internal.h:1312
int rb_enc_str_asciionly_p(VALUE str)
Definition: string.c:640
VALUE rb_utf8_str_new(const char *ptr, long len)
Definition: string.c:750
union RString::@125 as
static long chopped_length(VALUE str)
Definition: string.c:7954
VALUE rb_str_resize(VALUE str, long len)
Definition: string.c:2562
#define CASEMAP_DEBUG
Definition: string.c:5971
#define ENCINDEX_US_ASCII
Definition: encindex.h:44
#define ISDIGIT(c)
Definition: ruby.h:2129
VALUE rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
Definition: string.c:1007
static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str)
Definition: string.c:1074
Definition: string.c:6370
int rb_enc_codelen(int c, rb_encoding *enc)
Definition: encoding.c:1077
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:773
static VALUE rb_str_bytesize(VALUE str)
Definition: string.c:1765
static void str_make_independent(VALUE str)
Definition: string.c:186
Definition: st.h:99
int rb_reg_backref_number(VALUE match, VALUE backref)
Definition: re.c:1146
#define BARE_STRING_P(str)
Definition: string.c:258
static VALUE str_buf_cat(VALUE str, const char *ptr, long len)
Definition: string.c:2614
#define ONIGENC_CODE_TO_MBCLEN(enc, code)
Definition: onigmo.h:367
#define is_broken_string(str)
Definition: internal.h:1491
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:185
static long rb_str_rindex(VALUE str, VALUE sub, long pos)
Definition: string.c:3462
static OnigCaseFoldType check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
Definition: string.c:5929
rb_encoding * rb_enc_check(VALUE str1, VALUE str2)
Definition: encoding.c:879
#define FL_EXIVAR
Definition: ruby.h:1222
VALUE rb_ary_pop(VALUE ary)
Definition: array.c:949
rb_econv_result_t
Definition: encoding.h:291
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:186
#define RESIZE_CAPA(str, capacity)
Definition: string.c:128
#define RARRAY_LEN(a)
Definition: ruby.h:1026
void rb_bug(const char *fmt,...)
Definition: error.c:482
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:202
VALUE rb_ary_new_capa(long capa)
Definition: array.c:487
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Definition: string.c:835
void rb_enc_copy(VALUE obj1, VALUE obj2)
Definition: encoding.c:978
#define FALSE
Definition: nkf.h:174
#define RSTRING(obj)
Definition: ruby.h:1208
#define rb_intern(str)
size_t strlen(const char *)
#define INT2NUM(x)
Definition: ruby.h:1538
static VALUE str_upto_each(VALUE beg, VALUE end, int excl, int(*each)(VALUE, VALUE), VALUE)
Definition: string.c:4065
VALUE rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
Definition: string.c:9239
#define CHECK_IF_ASCII(c)
void rb_backref_set(VALUE)
Definition: vm.c:1213
Definition: st.h:79
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Definition: encoding.h:109
RUBY_FUNC_EXPORTED VALUE rb_str_locktmp_ensure(VALUE str, VALUE(*func)(VALUE), VALUE arg)
Definition: string.c:2538
Definition: st.h:99
static int sym_printable(const char *s, const char *send, rb_encoding *enc)
Definition: string.c:9584
VALUE rb_str_equal(VALUE str1, VALUE str2)
Definition: string.c:3105
VALUE rb_str_new_static(const char *ptr, long len)
Definition: string.c:829
#define NUM2INT(x)
Definition: ruby.h:684
static rb_encoding * get_actual_encoding(const int encidx, VALUE str)
Definition: string.c:197
static int max(int a, int b)
Definition: strftime.c:142
VALUE rb_locale_str_new_cstr(const char *ptr)
Definition: string.c:1038
#define ENCINDEX_UTF_16LE
Definition: encindex.h:46
VALUE rb_sym_to_s(VALUE sym)
Definition: string.c:9693
#define ascii_isspace(c)
Definition: string.c:7187
static int coderange_scan(const char *p, long len, rb_encoding *enc)
Definition: string.c:497
static unsigned int hash(str, len) register const char *str
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
Definition: string.c:998
void rb_undef_alloc_func(VALUE)
Definition: vm_method.c:681
void rb_define_singleton_method(VALUE obj, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a singleton method for obj.
Definition: class.c:1716
VALUE rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
Definition: string.c:367
#define sym_equal
Definition: string.c:9581
static VALUE rb_str_to_f(VALUE str)
Definition: string.c:5515
VALUE rb_str_new_frozen(VALUE orig)
Definition: string.c:1123
static VALUE rb_str_oct(VALUE str)
Definition: string.c:8589
static VALUE str_compat_and_valid(VALUE str, rb_encoding *enc)
Definition: string.c:9206
st_index_t rb_str_hash(VALUE str)
Definition: string.c:2985
#define FL_TAINT
Definition: ruby.h:1220
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
Definition: string.c:2771
static long rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
Definition: string.c:3286
#define CLASS_OF(v)
Definition: ruby.h:453
static VALUE rb_str_scan(VALUE str, VALUE pat)
Definition: string.c:8518
VALUE rb_locale_str_new(const char *ptr, long len)
Definition: string.c:1032
static VALUE rb_str_gsub(int argc, VALUE *argv, VALUE str)
Definition: string.c:5079
static int ntz_intptr(uintptr_t x)
Definition: internal.h:315
static VALUE rb_str_match(VALUE x, VALUE y)
Definition: string.c:3588
#define st_foreach
Definition: regint.h:186
static VALUE rb_str_capitalize(int argc, VALUE *argv, VALUE str)
Definition: string.c:6307
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Definition: class.c:1858
Definition: id.h:82
#define FIXNUM_MAX
Definition: ruby.h:228
#define Qtrue
Definition: ruby.h:437
VALUE rb_reg_check_preprocess(VALUE)
Definition: re.c:2603
void rb_str_set_len(VALUE str, long len)
Definition: string.c:2545
#define ASSUME(x)
Definition: ruby.h:42
static void rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
Definition: string.c:583
#define is_ascii_string(str)
Definition: internal.h:1490
unsigned char * USTR
Definition: string.c:6368
static unsigned int trnext(struct tr *t, rb_encoding *enc)
Definition: string.c:6377
#define ONIGERR_INVALID_CODE_POINT_VALUE
Definition: onigmo.h:689
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:106
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:847
#define rb_id2str(id)
Definition: vm_backtrace.c:29
Definition: id.h:85
Definition: st.h:99
static VALUE rb_str_b(VALUE str)
Definition: string.c:9108
char * pend
Definition: string.c:6373
VALUE rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:380
void Init_String(void)
Definition: string.c:10012
static VALUE rb_str_clear(VALUE str)
Definition: string.c:5118
#define OBJ_FREEZE_RAW(x)
Definition: ruby.h:1307
void rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
Definition: string.c:2103
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:246
int rb_enc_dummy_p(rb_encoding *enc)
Definition: encoding.c:132
#define ENC_CODERANGE_CLEAR(obj)
Definition: encoding.h:107
void rb_econv_close(rb_econv_t *ec)
Definition: transcode.c:1698
#define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, s, end)
Definition: onigmo.h:334
static VALUE sym_match_m(int argc, VALUE *argv, VALUE sym)
Definition: string.c:9838
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:117
VALUE rb_eTypeError
Definition: error.c:762
#define rb_check_arity
Definition: intern.h:303
static VALUE str_gsub(int argc, VALUE *argv, VALUE str, int bang)
Definition: string.c:4886
#define UNREACHABLE
Definition: ruby.h:46
VALUE rb_reg_match(VALUE, VALUE)
Definition: re.c:3074
long rb_memsearch(const void *, long, const void *, long, rb_encoding *)
Definition: re.c:252
static VALUE rb_str_succ_bang(VALUE str)
Definition: string.c:3996
static VALUE rb_str_enumerate_bytes(VALUE str, int wantarray)
Definition: string.c:7681
static VALUE rb_str_each_line(int argc, VALUE *argv, VALUE str)
Definition: string.c:7651
rb_encoding * rb_default_internal_encoding(void)
Definition: encoding.c:1510
VALUE rb_ary_push(VALUE ary, VALUE item)
Definition: array.c:905
#define ENCINDEX_ASCII
Definition: encindex.h:42
SSL_METHOD *(* func)(void)
Definition: ossl_ssl.c:54
VALUE rb_reg_regsub(VALUE, VALUE, struct re_registers *, VALUE)
Definition: re.c:3670
#define SYM2ID(x)
Definition: ruby.h:384
RUBY_EXTERN char * crypt(const char *, const char *)
st_index_t rb_memhash(const void *ptr, long len)
Definition: random.c:1502
int rb_usascii_encindex(void)
Definition: encoding.c:1344
VALUE rb_str_split(VALUE str, const char *sep0)
Definition: string.c:7450
long capa
Definition: ruby.h:968
struct RBasic basic
Definition: ruby.h:962
static VALUE rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
Definition: string.c:7675
rb_encoding * rb_enc_compatible(VALUE str1, VALUE str2)
Definition: encoding.c:962
static VALUE sym_turkic
Definition: string.c:194
static VALUE rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
Definition: string.c:5028
VALUE rb_str_export_to_enc(VALUE str, rb_encoding *enc)
Definition: string.c:1068
#define ONIGENC_CTYPE_ALPHA
Definition: onigmo.h:295
static int fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t arg, int existing)
Definition: string.c:261
void ruby_sized_xfree(void *x, size_t size)
Definition: gc.c:8009
#define ENCINDEX_UTF_32
Definition: encindex.h:50
VALUE rb_funcall(VALUE, ID, int,...)
Calls a method.
Definition: vm_eval.c:821
static VALUE rb_str_codepoints(VALUE str)
Definition: string.c:7947
#define str_buf_cat2(str, ptr)
Definition: string.c:2661
#define ENCINDEX_UTF_16BE
Definition: encindex.h:45
static VALUE rb_str_rstrip(VALUE str)
Definition: string.c:8365
VALUE rb_filesystem_str_new(const char *ptr, long len)
Definition: string.c:1044
VALUE rb_str_export(VALUE str)
Definition: string.c:1056
static VALUE rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
Definition: string.c:8752
#define RGENGC_WB_PROTECTED_STRING
Definition: ruby.h:780
unsigned int OnigCaseFoldType
Definition: onigmo.h:95
static VALUE rb_str_include(VALUE str, VALUE arg)
Definition: string.c:5448
static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Definition: string.c:935
static void rb_str_check_dummy_enc(rb_encoding *enc)
Definition: string.c:5920
#define RBASIC_SET_CLASS(obj, cls)
Definition: internal.h:1314
VALUE rb_backref_get(void)
Definition: vm.c:1207
VALUE rb_str_freeze(VALUE str)
Definition: string.c:2467
#define ENCODING_GET_INLINED(obj)
Definition: encoding.h:57
long rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
Definition: string.c:1646
#define STR_IS_SHARED_M
Definition: string.c:83
unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len_p, rb_encoding *enc)
Definition: encoding.c:1056
static VALUE str_new0(VALUE klass, const char *ptr, long len, int termlen)
Definition: string.c:702
#define Check_Type(v, t)
Definition: ruby.h:562
long rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
Definition: string.c:1655
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:2207
char * p
Definition: string.c:6373
static VALUE str_replace(VALUE str, VALUE str2)
Definition: string.c:1381
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Definition: ruby.h:991
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:854
VALUE rb_funcall_with_block(VALUE, ID, int, const VALUE *, VALUE)
Definition: vm_eval.c:880
static VALUE rb_str_to_i(int argc, VALUE *argv, VALUE str)
Definition: string.c:5482
int rb_objspace_garbage_object_p(VALUE obj)
Definition: gc.c:3014
char * rb_string_value_ptr(volatile VALUE *ptr)
Definition: string.c:2052
VALUE rb_convert_type(VALUE, int, const char *, const char *)
Definition: object.c:2630
static VALUE rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
Definition: string.c:4616
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Definition: string.c:841
static void mustnot_wchar(VALUE str)
Definition: string.c:241
#define RB_GC_GUARD(v)
Definition: ruby.h:552
VALUE rb_str_concat_literals(size_t num, const VALUE *strary)
Definition: string.c:2825
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
static VALUE rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
Definition: string.c:3670
int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1008
static rb_encoding * get_encoding(VALUE str)
Definition: string.c:227
static VALUE rb_str_empty(VALUE str)
Definition: string.c:1782
static VALUE rb_str_chars(VALUE str)
Definition: string.c:7853
static VALUE rb_str_reverse_bang(VALUE str)
Definition: string.c:5409
#define DATA_PTR(dta)
Definition: ruby.h:1113
#define ENC_CODERANGE_MASK
Definition: encoding.h:98
void rb_include_module(VALUE klass, VALUE module)
Definition: class.c:864
static VALUE rb_str_center(int argc, VALUE *argv, VALUE str)
Definition: string.c:8900
VALUE rb_hash_lookup(VALUE hash, VALUE key)
Definition: hash.c:867
VALUE rb_sym_proc_call(ID mid, int argc, const VALUE *argv, VALUE passed_proc)
Definition: string.c:9716
st_data_t st_index_t
Definition: st.h:50
VALUE rb_range_beg_len(VALUE, long *, long *, long, int)
Definition: range.c:1019
static VALUE sym_upcase(int argc, VALUE *argv, VALUE sym)
Definition: string.c:9907
#define st_delete
Definition: regint.h:182
#define DEFAULT_REPLACE_CHAR(str)
double rb_str_to_dbl(VALUE, int)
Definition: object.c:2869
static VALUE sym_ascii
Definition: string.c:194
RUBY_FUNC_EXPORTED size_t rb_str_memsize(VALUE str)
Definition: string.c:1294
int st_update(st_table *table, st_data_t key, st_update_callback_func *func, st_data_t arg)
Definition: st.c:1371
static VALUE rb_str_subpat(VALUE str, VALUE re, VALUE backref)
Definition: string.c:4200
static int include_range_i(VALUE str, VALUE arg)
Definition: string.c:4147
VALUE rb_str_new(const char *ptr, long len)
Definition: string.c:736
static VALUE rb_str_aset_m(int argc, VALUE *argv, VALUE str)
Definition: string.c:4548
#define rb_enc_mbmaxlen(enc)
Definition: encoding.h:175
static const char * str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
Definition: string.c:2068
#define assert(x)
Definition: dlmalloc.c:1176
ID rb_id_encoding(void)
Definition: encoding.c:753
unsigned int last
Definition: nkf.c:4311
static VALUE rb_str_format_m(VALUE str, VALUE arg)
Definition: string.c:1903
#define STR_SET_NOEMBED(str)
Definition: string.c:88
void rb_gc_force_recycle(VALUE obj)
Definition: gc.c:6102
static long str_strlen(VALUE str, rb_encoding *enc)
Definition: string.c:1713
#define FIXNUM_P(f)
Definition: ruby.h:365
static VALUE rb_str_chomp(int argc, VALUE *argv, VALUE str)
Definition: string.c:8192
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1320
VALUE rb_str_export_locale(VALUE str)
Definition: string.c:1062
#define BEG(no)
Definition: string.c:22
static VALUE sym_length(VALUE sym)
Definition: string.c:9881
VALUE rb_str_new_shared(VALUE str)
Definition: string.c:1114
void rb_undef_method(VALUE klass, const char *name)
Definition: class.c:1533
static VALUE rb_tainted_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:862
#define CHAR_ESC_LEN
Definition: string.c:5552
static VALUE empty_str_alloc(VALUE klass)
Definition: string.c:695
#define ENCINDEX_UTF_8
Definition: encindex.h:43
static VALUE rb_str_hash_m(VALUE str)
Definition: string.c:3016
static void mustnot_broken(VALUE str)
Definition: string.c:233
static int fstring_cmp(VALUE a, VALUE b)
Definition: string.c:406
VALUE rb_cString
Definition: string.c:65
static VALUE rb_str_aset(VALUE str, VALUE indx, VALUE val)
Definition: string.c:4481
#define ONIGENC_CASE_MODIFIED
Definition: onigmo.h:119
#define ENC_CODERANGE_7BIT
Definition: encoding.h:100
VALUE rb_eRangeError
Definition: error.c:766
const char * rb_obj_classname(VALUE)
Definition: variable.c:458
int rb_enc_symname_p(const char *, rb_encoding *)
Definition: symbol.c:196
VALUE rb_enc_sprintf(rb_encoding *enc, const char *format,...)
Definition: sprintf.c:1421
#define rb_ary_new2
Definition: intern.h:90
RUBY_EXTERN void * memmove(void *, const void *, size_t)
Definition: memmove.c:7
#define ONIGENC_CASE_FOLD_LITHUANIAN
Definition: onigmo.h:124
static long lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
Definition: string.c:8200
#define ONIGENC_CODE_TO_MBC_MAXLEN
Definition: onigmo.h:289
#define sym(x)
Definition: date_core.c:3721
static VALUE rb_str_insert(VALUE str, VALUE idx, VALUE str2)
Definition: string.c:4581
VALUE rb_str_append(VALUE str, VALUE str2)
Definition: string.c:2818
char * crypt_r(const char *key, const char *setting, struct crypt_data *data)
Definition: crypt.c:396
VALUE rb_str_buf_cat(VALUE, const char *, long)
RUBY_SYMBOL_EXPORT_BEGIN typedef unsigned long st_data_t
Definition: st.h:22
static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
Definition: string.c:1949
VALUE rb_fstring_enc_cstr(const char *ptr, rb_encoding *enc)
Definition: string.c:393
#define NEWOBJ_OF(obj, type, klass, flags)
Definition: ruby.h:754
static VALUE register_fstring(VALUE str)
Definition: string.c:332
#define ISALPHA(c)
Definition: ruby.h:2128
void rb_exc_raise(VALUE mesg)
Definition: eval.c:620
static VALUE sym_inspect(VALUE sym)
Definition: string.c:9655
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Definition: string.c:2995
#define OBJ_INFECT_RAW(x, s)
Definition: ruby.h:1303
static VALUE rb_str_partition(VALUE str, VALUE sep)
Definition: string.c:8921
static VALUE rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
Definition: string.c:5983
static long str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
Definition: string.c:2258
#define STR_EMBEDDABLE_P(len, termlen)
Definition: string.c:174
#define rb_enc_isctype(c, t, enc)
Definition: encoding.h:223
#define RBASIC_SET_CLASS_RAW(obj, cls)
Definition: internal.h:1313
static VALUE rb_str_ljust(int argc, VALUE *argv, VALUE str)
Definition: string.c:8860
#define RB_TYPE_P(obj, type)
Definition: ruby.h:527
static void str_shared_replace(VALUE str, VALUE str2)
Definition: string.c:1320
int rb_enc_str_coderange(VALUE str)
Definition: string.c:620
static VALUE sym_match_m_p(int argc, VALUE *argv, VALUE sym)
Definition: string.c:9851
static int fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
Definition: string.c:399
#define MEMZERO(p, type, n)
Definition: ruby.h:1660
Definition: ruby.h:961
VALUE rb_str_plus(VALUE str1, VALUE str2)
Definition: string.c:1800
static VALUE rb_str_setbyte(VALUE str, VALUE index, VALUE value)
Definition: string.c:5173
rb_encoding * rb_default_external_encoding(void)
Definition: encoding.c:1425
int rb_enc_to_index(rb_encoding *enc)
Definition: encoding.c:126
static VALUE str_new_frozen(VALUE klass, VALUE orig)
Definition: string.c:1174
#define FL_TEST(x, f)
Definition: ruby.h:1284
static long chompped_length(VALUE str, VALUE rs)
Definition: string.c:8023
#define STR_NOFREE
Definition: string.c:85
VALUE rb_mComparable
Definition: compar.c:15
static VALUE sym_fold
Definition: string.c:194
static void rb_str_ascii_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
Definition: string.c:6060
neighbor_char
Definition: string.c:3678
static VALUE rb_str_strip(VALUE str)
Definition: string.c:8434
#define rb_intern_str(string)
Definition: generator.h:16
unsigned int now
Definition: string.c:6372
#define ALLOC_N(type, n)
Definition: ruby.h:1587
int rb_block_given_p(void)
Definition: eval.c:797
static int all_digits_p(const char *s, long len)
Definition: string.c:4004
static VALUE chomp_rs(int argc, const VALUE *argv)
Definition: string.c:8122
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Definition: hash.c:1576
static VALUE rb_str_split_m(int argc, VALUE *argv, VALUE str)
Definition: string.c:7241
#define val
static int single_byte_optimizable(VALUE str)
Definition: string.c:418
int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1002
static void rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
Definition: string.c:4359
RUBY_EXTERN VALUE rb_cObject
Definition: ruby.h:1872
#define TERM_LEN(str)
Definition: string.c:119
VALUE rb_eRuntimeError
Definition: error.c:761
static VALUE sym_to_sym(VALUE sym)
Definition: string.c:9710
#define rb_enc_isascii(c, enc)
Definition: encoding.h:224
unsigned char u8
Definition: many2.c:13
VALUE rb_str_to_inum(VALUE str, int base, int badcheck)
Definition: bignum.c:4205
static VALUE str_new_shared(VALUE klass, VALUE str)
Definition: string.c:1108
#define MBCLEN_NEEDMORE_P(ret)
Definition: encoding.h:188
VALUE rb_str_length(VALUE str)
Definition: string.c:1749
#define RSTRING_END(str)
Definition: ruby.h:986
static VALUE rb_str_rpartition(VALUE str, VALUE sep)
Definition: string.c:8961
VALUE rb_str_cat_cstr(VALUE str, const char *ptr)
Definition: string.c:2674
static VALUE rb_str_crypt(VALUE str, VALUE salt)
Definition: string.c:8613
VALUE rb_str_cat2(VALUE, const char *)
static VALUE rb_str_cmp_m(VALUE str1, VALUE str2)
Definition: string.c:3159
static int str_dependent_p(VALUE str)
Definition: string.c:1931
int rb_str_symname_p(VALUE sym)
Definition: string.c:9600
#define OBJ_FROZEN_RAW(x)
Definition: ruby.h:1305
static VALUE rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
Definition: string.c:2967
VALUE rb_ary_new(void)
Definition: array.c:493
static void rb_check_lockedtmp(VALUE str)
Definition: string.c:1916
VALUE rb_str_new_cstr(const char *ptr)
Definition: string.c:770
static void str_modify_keep_cr(VALUE str)
Definition: string.c:2011
VALUE rb_str_buf_cat2(VALUE, const char *)
#define dp(v)
Definition: vm_debug.h:21
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Definition: transcode.c:2575
int rb_ascii8bit_encindex(void)
Definition: encoding.c:1314
#define UINT2NUM(x)
Definition: ruby.h:1539
#define STR_BUF_MIN_SIZE
Definition: string.c:1244
#define STR_SET_EMBED(str)
Definition: string.c:92
static VALUE rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
Definition: string.c:7053
#define snprintf
Definition: subst.h:6
static void rb_fs_setter(VALUE val, ID id, VALUE *var)
Definition: string.c:9072
#define NIL_P(v)
Definition: ruby.h:451
#define ISASCII(c)
Definition: ruby.h:2121
OnigUChar space[1]
Definition: string.c:5979
static VALUE rb_str_delete(int argc, VALUE *argv, VALUE str)
Definition: string.c:6908
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition: class.c:646
#define rb_enc_step_back(s, p, e, n, enc)
Definition: encoding.h:218
#define ENC_CODERANGE_CLEAN_P(cr)
Definition: encoding.h:103
static VALUE rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
Definition: string.c:6162
void rb_enc_set_index(VALUE obj, int idx)
Definition: encoding.c:818
static VALUE rb_str_enumerate_chars(VALUE str, int wantarray)
Definition: string.c:7761
static VALUE rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, int wantarray)
Definition: string.c:7473
static VALUE rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
Definition: string.c:7033
VALUE rb_str_concat(VALUE str1, VALUE str2)
Definition: string.c:2890
static VALUE rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
Definition: string.c:6271
static VALUE str_byte_substr(VALUE str, long beg, long len, int empty)
Definition: string.c:5225
static VALUE get_pat(VALUE)
Definition: string.c:4636
register int hval
Definition: zonetab.h:82
#define TOUPPER(c)
Definition: ruby.h:2132
#define offsetof(p_type, field)
Definition: addrinfo.h:186
static size_t str_capacity(VALUE str, const int termlen)
Definition: string.c:660
static VALUE rb_fs_check(VALUE val)
Definition: string.c:7159
#define END(no)
Definition: string.c:23
#define OBJ_FROZEN(x)
Definition: ruby.h:1306
st_table * rb_vm_fstring_table(void)
Definition: vm.c:3196
RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen,(str))
Definition: string.c:2514
#define STR_ENC_GET(str)
Definition: string.c:163
static VALUE rb_str_strip_bang(VALUE str)
Definition: string.c:8391
#define TYPE(x)
Definition: ruby.h:521
int argc
Definition: ruby.c:183
VALUE rb_str_scrub(VALUE str, VALUE repl)
Definition: string.c:9232
unsigned char OnigUChar
Definition: onigmo.h:79
char ary[RSTRING_EMBED_LEN_MAX+1]
Definition: ruby.h:972
#define Qfalse
Definition: ruby.h:436
long rb_str_offset(VALUE str, long pos)
Definition: string.c:2266
#define ENCINDEX_UTF_16
Definition: encindex.h:49
#define STR_SET_EMBED_LEN(str, n)
Definition: string.c:93
#define ALLOCA_N(type, n)
Definition: ruby.h:1593
static long rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
Definition: string.c:4688
#define range(low, item, hi)
Definition: date_strftime.c:21
#define ENC_CODERANGE_UNKNOWN
Definition: encoding.h:99
#define LONG_MAX
Definition: ruby.h:189
static const char * chomp_newline(const char *p, const char *e, rb_encoding *enc)
Definition: string.c:7460
static VALUE rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
Definition: string.c:6687
#define rb_enc_isprint(c, enc)
Definition: encoding.h:230
#define RUBY_FUNC_EXPORTED
Definition: defines.h:263
#define MEMCPY(p1, p2, type, n)
Definition: ruby.h:1661
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:102
static int rb_isspace(int c)
Definition: ruby.h:2112
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:826
VALUE rb_eEncCompatError
Definition: error.c:769
#define rb_str_index(str, sub, offset)
Definition: string.c:3283
VALUE rb_obj_alloc(VALUE)
Definition: object.c:1845
#define rb_enc_codepoint(p, e, enc)
Definition: encoding.h:201
#define OBJ_FREEZE(x)
Definition: ruby.h:1308
void rb_str_update(VALUE str, long beg, long len, VALUE val)
Definition: string.c:4396
#define rb_enc_mbminlen(enc)
Definition: encoding.h:174
unsigned int max
Definition: string.c:6372
#define STR_SHARED_P(s)
Definition: internal.h:1489
static VALUE rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
Definition: string.c:7755
VALUE rb_eIndexError
Definition: error.c:764
static VALUE rb_str_rjust(int argc, VALUE *argv, VALUE str)
Definition: string.c:8880
VALUE rb_utf8_str_new_cstr(const char *ptr)
Definition: string.c:785
#define ENC_CODERANGE_VALID
Definition: encoding.h:101
#define numberof(array)
Definition: etc.c:616
#define RUBY_DTRACE_CREATE_HOOK(name, arg)
Definition: internal.h:1753
long rb_str_sublen(VALUE str, long pos)
Definition: string.c:2313
RUBY_FUNC_EXPORTED VALUE rb_fstring(VALUE str)
Definition: string.c:305
VALUE rb_str_times(VALUE str, VALUE times)
Definition: string.c:1842
static VALUE sym_lithuanian
Definition: string.c:194
VALUE rb_str_tmp_frozen_acquire(VALUE orig)
Definition: string.c:1135
struct mapping_buffer mapping_buffer
int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
Definition: encoding.c:1032
static VALUE sym_cmp(VALUE sym, VALUE other)
Definition: string.c:9772
void rb_str_modify_expand(VALUE str, long expand)
Definition: string.c:1988
#define sub(x, y)
Definition: date_strftime.c:24
#define FL_ABLE(x)
Definition: ruby.h:1282
static VALUE str_eql(const VALUE str1, const VALUE str2)
Definition: string.c:3077
#define RSTRING_LEN(str)
Definition: ruby.h:978
static VALUE sym_encoding(VALUE sym)
Definition: string.c:9959
VALUE rb_yield(VALUE)
Definition: vm_eval.c:1020
VALUE rb_obj_as_string(VALUE obj)
Definition: string.c:1364
#define RARRAY_CONST_PTR(a)
Definition: ruby.h:1028
VALUE rb_str_subseq(VALUE str, long beg, long len)
Definition: string.c:2324
#define REALLOC_N(var, type, n)
Definition: ruby.h:1591
char * rb_string_value_cstr(volatile VALUE *ptr)
Definition: string.c:2132
static VALUE sym_downcase(int argc, VALUE *argv, VALUE sym)
Definition: string.c:9920
#define RUBY_MAX_CHAR_LEN
Definition: string.c:82
static void must_not_null(const char *ptr)
Definition: string.c:680
long rb_reg_search0(VALUE, VALUE, long, int, int)
Definition: re.c:1497
#define TRUE
Definition: nkf.h:175
static VALUE rb_str_byteslice(int argc, VALUE *argv, VALUE str)
Definition: string.c:5329
static long str_rindex(VALUE str, VALUE sub, const char *s, long pos, rb_encoding *enc)
Definition: string.c:3438
int(* case_map)(OnigCaseFoldType *flagP, const OnigUChar **pp, const OnigUChar *end, OnigUChar *to, OnigUChar *to_end, const struct OnigEncodingTypeST *enc)
Definition: onigmo.h:177
static VALUE setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
Definition: string.c:352
static VALUE str_succ(VALUE str)
Definition: string.c:3899
VALUE rb_str_format(int, const VALUE *, VALUE)
Definition: sprintf.c:461
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:1020
int rb_enc_unicode_p(rb_encoding *enc)
Definition: encoding.c:525
static VALUE string_for_symbol(VALUE name)
Definition: string.c:9965
#define STR_TMPLOCK
Definition: string.c:84
#define rb_enc_name(enc)
Definition: encoding.h:171
#define rb_strlen_lit(str)
Definition: intern.h:867
#define ONIGENC_CASE_UPCASE
Definition: onigmo.h:113
static VALUE rb_str_tr(VALUE str, VALUE src, VALUE repl)
Definition: string.c:6729
static VALUE rb_str_chop_bang(VALUE str)
Definition: string.c:7981
static VALUE str_new_empty(VALUE str)
Definition: string.c:1236
VALUE rb_hash_new(void)
Definition: hash.c:441
void ruby_xfree(void *x)
Definition: gc.c:8017
#define ENCODING_MASK
Definition: encoding.h:38
static unsigned int nlz_intptr(uintptr_t x)
Definition: internal.h:246
VALUE rb_str_escape(VALUE str)
Definition: string.c:5588
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Definition: class.c:1919
static VALUE rb_str_enumerate_codepoints(VALUE str, int wantarray)
Definition: string.c:7860
static VALUE rb_str_squeeze(int argc, VALUE *argv, VALUE str)
Definition: string.c:7016
long rb_reg_search(VALUE, VALUE, long, int)
Definition: re.c:1586
static VALUE str_duplicate(VALUE klass, VALUE str)
Definition: string.c:1404
VALUE rb_check_hash_type(VALUE hash)
Definition: hash.c:736
#define ONIGENC_CASE_FOLD
Definition: onigmo.h:120
int rb_str_cmp(VALUE str1, VALUE str2)
Definition: string.c:3050
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:4309
#define PRIsVALUE
Definition: ruby.h:135
size_t capa
Definition: string.c:5976
unsigned long ID
Definition: ruby.h:86
VALUE rb_str_buf_new_cstr(const char *ptr)
Definition: string.c:1263
rb_encoding * rb_usascii_encoding(void)
Definition: encoding.c:1335
static VALUE rb_str_aref_m(int argc, VALUE *argv, VALUE str)
Definition: string.c:4314
#define STRING(string, length)
Definition: yaml_private.h:128
static VALUE str_scrub_bang(int argc, VALUE *argv, VALUE str)
Definition: string.c:9531
#define Qnil
Definition: ruby.h:438
unsigned int uintptr_t
Definition: win32.h:106
static VALUE rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
Definition: string.c:6925
VALUE rb_sym_to_proc(VALUE sym)
Definition: proc.c:1203
#define FL_TEST_RAW(x, f)
Definition: ruby.h:1283
const char * name
Definition: onigmo.h:162
static VALUE str_substr(VALUE str, long beg, long len, int empty)
Definition: string.c:2441
#define BUILTIN_TYPE(x)
Definition: ruby.h:518
size_t used
Definition: string.c:5977
#define OBJ_TAINT(x)
Definition: ruby.h:1300
unsigned long VALUE
Definition: ruby.h:85
static enum neighbor_char enc_pred_char(char *p, long len, rb_encoding *enc)
Definition: string.c:3737
VALUE shared
Definition: ruby.h:969
VALUE rb_cSymbol
Definition: string.c:66
static int tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
Definition: string.c:6802
rb_encoding * rb_locale_encoding(void)
Definition: encoding.c:1370
static VALUE result
Definition: nkf.c:40
VALUE rb_str_replace(VALUE str, VALUE str2)
Definition: string.c:5097
static VALUE rb_str_lstrip_bang(VALUE str)
Definition: string.c:8238
#define rb_enc_is_newline(p, end, enc)
Definition: encoding.h:221
static VALUE str_new(VALUE klass, const char *ptr, long len)
Definition: string.c:730
static VALUE str_alloc(VALUE klass)
Definition: string.c:688
#define ONIGENC_CASE_DOWNCASE
Definition: onigmo.h:114
#define RBASIC(obj)
Definition: ruby.h:1204
#define STR_FAKESTR
Definition: string.c:86
static VALUE rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
Definition: string.c:4724
#define ENC_CODERANGE_AND(a, b)
Definition: encoding.h:108
static VALUE rb_str_is_ascii_only_p(VALUE str)
Definition: string.c:9147
int rb_utf8_encindex(void)
Definition: encoding.c:1329
void rb_str_shared_replace(VALUE str, VALUE str2)
Definition: string.c:1314
#define ENCODING_SET_INLINED(obj, i)
Definition: encoding.h:55
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:992
#define rb_ary_new3
Definition: intern.h:91
VALUE rb_check_funcall(VALUE, ID, int, const VALUE *)
Definition: vm_eval.c:439
#define TERM_FILL(ptr, termlen)
Definition: string.c:120
#define ONIGENC_CASE_ASCII_ONLY
Definition: onigmo.h:125
#define ONIGENC_CASE_TITLECASE
Definition: onigmo.h:115
#define rb_enc_asciicompat(enc)
Definition: encoding.h:239
static VALUE rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
Definition: string.c:8158
VALUE rb_ensure(VALUE(*b_proc)(ANYARGS), VALUE data1, VALUE(*e_proc)(ANYARGS), VALUE data2)
Definition: eval.c:923
VALUE flags
Definition: ruby.h:855
VALUE rb_str_buf_cat_ascii(VALUE str, const char *ptr)
Definition: string.c:2778
int memcmp(const void *s1, const void *s2, size_t len)
Definition: memcmp.c:7
VALUE rb_str_quote_unprintable(VALUE str)
Definition: string.c:9619
static VALUE sym_casecmp(VALUE sym, VALUE other)
Definition: string.c:9791
long rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
Definition: string.c:530
static int zero_filled(const char *s, int n)
Definition: string.c:2059
static char * str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
Definition: string.c:2188
#define RARRAY_LENINT(ary)
Definition: ruby.h:1027
RUBY_EXTERN VALUE rb_rs
Definition: intern.h:536
#define ST2FIX(h)
Definition: ruby.h:1579
static VALUE rb_str_getbyte(VALUE str, VALUE index)
Definition: string.c:5154
static void rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
Definition: string.c:613
void rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
Definition: string.c:1148
void rb_sys_fail(const char *mesg)
Definition: error.c:2326
static VALUE rb_str_chr(VALUE str)
Definition: string.c:5142
#define ENCODING_IS_ASCII8BIT(obj)
Definition: encoding.h:59
static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
Definition: string.c:9251
#define MAYBE_UNUSED
Definition: ffi_common.h:32
static const char * search_nonascii(const char *p, const char *e)
Definition: string.c:438
static VALUE str_scrub(int argc, VALUE *argv, VALUE str)
Definition: string.c:9509
static void str_modifiable(VALUE str)
Definition: string.c:1924
VALUE rb_cEncodingConverter
Definition: transcode.c:23
static VALUE rb_str_bytes(VALUE str)
Definition: string.c:7749
static VALUE rb_str_index_m(int argc, VALUE *argv, VALUE str)
Definition: string.c:3348
#define CHAR_BIT
Definition: ruby.h:196
VALUE rb_str_to_str(VALUE str)
Definition: string.c:1305
VALUE rb_str_chomp_string(VALUE str, VALUE rs)
Definition: string.c:8136
void rb_define_hooked_variable(const char *, VALUE *, VALUE(*)(ANYARGS), void(*)(ANYARGS))
Definition: variable.c:616
static VALUE rb_str_match_m(int argc, VALUE *argv, VALUE str)
Definition: string.c:3638
#define FL_UNSET(x, f)
Definition: ruby.h:1292
static void str_mod_check(VALUE s, const char *p, long len)
Definition: string.c:652
static int rb_popcount_intptr(uintptr_t x)
Definition: internal.h:285
VALUE rb_string_value(volatile VALUE *ptr)
Definition: string.c:2041
static VALUE rb_str_lines(int argc, VALUE *argv, VALUE str)
Definition: string.c:7669
VALUE rb_tainted_str_new_cstr(const char *ptr)
Definition: string.c:871
#define rb_funcallv
Definition: console.c:21
#define LONG2NUM(x)
Definition: ruby.h:1573
static const char isspacetable[256]
Definition: string.c:7168
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:1995
register unsigned int len
Definition: zonetab.h:51
static VALUE scan_once(VALUE str, VALUE pat, long *start)
Definition: string.c:8449
static VALUE rb_str_sub(int argc, VALUE *argv, VALUE str)
Definition: string.c:4878
#define StringValueCStr(v)
Definition: ruby.h:571
VALUE rb_usascii_str_new(const char *ptr, long len)
Definition: string.c:742
VALUE rb_str_buf_append(VALUE str, VALUE str2)
Definition: string.c:2802
static VALUE rb_str_s_try_convert(VALUE dummy, VALUE str)
Definition: string.c:2182
#define RMATCH_REGS(obj)
Definition: re.h:52
RUBY_EXTERN VALUE rb_default_rs
Definition: intern.h:537
static VALUE sym_succ(VALUE sym)
Definition: string.c:9752
void rb_str_free(VALUE str)
Definition: string.c:1281
static VALUE sym_swapcase(int argc, VALUE *argv, VALUE sym)
Definition: string.c:9946
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Definition: string.c:1050
static VALUE rb_str_end_with(int argc, VALUE *argv, VALUE str)
Definition: string.c:9040
struct mapping_buffer * next
Definition: string.c:5978
#define RSTRING_PTR(str)
Definition: ruby.h:982
#define rb_enc_right_char_head(s, p, e, enc)
Definition: encoding.h:217
static void str_enc_copy(VALUE str1, VALUE str2)
Definition: string.c:577
#define ENCODING_GET(obj)
Definition: encoding.h:58
VALUE rb_equal(VALUE, VALUE)
Definition: object.c:86
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:860
#define STR_SET_SHARED(str, shared_str)
Definition: string.c:151
#define STR_HEAP_PTR(str)
Definition: string.c:160
int size
Definition: encoding.c:57
static VALUE rb_str_hex(VALUE str)
Definition: string.c:8565
char * rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
Definition: string.c:2238
static char * str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
Definition: string.c:2244
#define f
static VALUE rb_str_reverse(VALUE str)
Definition: string.c:5350
#define INT2FIX(i)
Definition: ruby.h:232
#define UNLIMITED_ARGUMENTS
Definition: intern.h:44
char * rb_str_subpos(VALUE str, long beg, long *lenp)
Definition: string.c:2348
VALUE rb_str_unlocktmp(VALUE str)
Definition: string.c:2528
VALUE rb_tainted_str_new(const char *ptr, long len)
Definition: string.c:853
#define CASE_MAPPING_ADDITIONAL_LENGTH
Definition: string.c:5969
#define MBCLEN_INVALID_P(ret)
Definition: encoding.h:187
static VALUE rb_str_valid_encoding_p(VALUE str)
Definition: string.c:9129
#define RARRAY_AREF(a, i)
Definition: ruby.h:1040
static VALUE rb_str_each_byte(VALUE str)
Definition: string.c:7732
static VALUE rb_str_chop(VALUE str)
Definition: string.c:8016
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Definition: transcode.c:1442
VALUE rb_check_convert_type(VALUE, int, const char *, const char *)
Definition: object.c:2643
static VALUE get_pat_quoted(VALUE pat, int check)
Definition: string.c:4661
static long rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
Definition: string.c:8287
static VALUE rb_str_count(int argc, VALUE *argv, VALUE str)
Definition: string.c:7089
#define STR_SET_LEN(str, n)
Definition: string.c:99
static VALUE rb_str_eql(VALUE str1, VALUE str2)
Definition: string.c:3125
static void rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
Definition: string.c:4443
#define xmalloc
Definition: defines.h:183
static VALUE rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
Definition: string.c:6327
#define RBASIC_CLASS(obj)
Definition: ruby.h:878
#define ONIGENC_MBCLEN_CHARFOUND_LEN(r)
Definition: onigmo.h:347
static long enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
Definition: string.c:1574
static VALUE rb_str_lstrip(VALUE str)
Definition: string.c:8276
#define RESIZE_CAPA_TERM(str, capacity, termlen)
Definition: string.c:132
int num_regs
Definition: onigmo.h:716
#define lesser(a, b)
Definition: string.c:3022
VALUE rb_check_array_type(VALUE ary)
Definition: array.c:635
#define FL_SET_RAW(x, f)
Definition: ruby.h:1289
VALUE rb_hash_aref(VALUE hash, VALUE key)
Definition: hash.c:845
static enum neighbor_char enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
Definition: string.c:3799
static VALUE sym_match(VALUE sym, VALUE other)
Definition: string.c:9825
#define UNALIGNED_WORD_ACCESS
Definition: defines.h:345
#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE
Definition: onigmo.h:691
#define FL_WB_PROTECTED
Definition: ruby.h:1216
#define ENC_CODERANGE(obj)
Definition: encoding.h:104
static VALUE rb_str_upto(int argc, VALUE *argv, VALUE beg)
Definition: string.c:4055
VALUE rb_to_symbol(VALUE name)
Definition: string.c:9989
VALUE rb_str_cat(VALUE str, const char *ptr, long len)
Definition: string.c:2664
VALUE rb_any_to_s(VALUE)
Definition: object.c:500
long rb_str_strlen(VALUE str)
Definition: string.c:1735
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Definition: string.c:884
#define LONG2FIX(i)
Definition: ruby.h:234
static VALUE tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
Definition: string.c:6433
#define RTEST(v)
Definition: ruby.h:450
int rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
Definition: string.c:5555
#define T_STRING
Definition: ruby.h:496
#define ONIGENC_CASE_FOLD_TURKISH_AZERI
Definition: onigmo.h:122
VALUE rb_str_locktmp(VALUE)
#define PRIuSIZE
Definition: ruby.h:177
static VALUE rb_str_swapcase(int argc, VALUE *argv, VALUE str)
Definition: string.c:6361
#define OBJ_INFECT(x, s)
Definition: ruby.h:1304
static Bigint * diff(Bigint *a, Bigint *b)
Definition: util.c:1507
const struct st_hash_type rb_fstring_hash_type
Definition: string.c:253
VALUE rb_str_drop_bytes(VALUE str, long len)
Definition: string.c:4331
size_t rb_str_capacity(VALUE str)
Definition: string.c:674
VALUE rb_fstring_new(const char *ptr, long len)
Definition: string.c:373
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Definition: string.c:793
rb_encoding * rb_filesystem_encoding(void)
Definition: encoding.c:1385
static VALUE rb_str_init(int argc, VALUE *argv, VALUE str)
Definition: string.c:1466
rb_encoding * rb_enc_get_from_index(int index)
Definition: encoding.c:628
void rb_str_setter(VALUE val, ID id, VALUE *var)
Definition: string.c:9063
static VALUE rb_str_rstrip_bang(VALUE str)
Definition: string.c:8328
#define STR_SHARED
Definition: internal.h:1487
VALUE rb_str_tmp_new(long len)
Definition: string.c:1275
static VALUE rb_str_each_char(VALUE str)
Definition: string.c:7836
VALUE rb_fs
Definition: string.c:435
#define ISPRINT(c)
Definition: ruby.h:2122
#define rb_enc_left_char_head(s, p, e, enc)
Definition: encoding.h:216
static VALUE str_replace_shared(VALUE str2, VALUE str)
Definition: string.c:1100
VALUE rb_reg_match_p(VALUE re, VALUE str, long pos)
Definition: re.c:3234
static VALUE rb_str_upcase(int argc, VALUE *argv, VALUE str)
Definition: string.c:6143
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Definition: string.c:992
#define UNLIKELY(x)
Definition: ffi_common.h:126
void rb_backref_set_string(VALUE string, long pos, long len)
Definition: re.c:1308
#define rb_str_splice(str, beg, len, val)
Definition: string.c:4440
#define SHARABLE_SUBSTRING_P(beg, len, end)
Definition: string.c:169
#define RETURN_ENUMERATOR(obj, argc, argv)
Definition: intern.h:240
#define FL_UNSET_RAW(x, f)
Definition: ruby.h:1291
static VALUE rb_str_start_with(int argc, VALUE *argv, VALUE str)
Definition: string.c:9011
VALUE rb_fstring_cstr(const char *ptr)
Definition: string.c:387
VALUE rb_str_substr(VALUE str, long beg, long len)
Definition: string.c:2435
static void str_discard(VALUE str)
Definition: string.c:2021
void rb_must_asciicompat(VALUE str)
Definition: string.c:2032
const char * name
Definition: nkf.c:208
#define FL_SET(x, f)
Definition: ruby.h:1290
#define ID2SYM(x)
Definition: ruby.h:383
VALUE rb_sym_all_symbols(void)
Definition: symbol.c:814
int gen
Definition: string.c:6371
VALUE rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len, rb_encoding *from, int ecflags, VALUE ecopts)
Definition: string.c:914
static VALUE sym_empty(VALUE sym)
Definition: string.c:9894
static VALUE rb_str_to_s(VALUE str)
Definition: string.c:5532
static VALUE sym_casecmp_p(VALUE sym, VALUE other)
Definition: string.c:9809
static VALUE str_byte_aref(VALUE str, VALUE indx)
Definition: string.c:5281
#define FL_FREEZE
Definition: ruby.h:1223
VALUE rb_external_str_new(const char *ptr, long len)
Definition: string.c:1020
Definition: id.h:93
VALUE rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
Definition: string.c:4156
VALUE rb_str_succ(VALUE orig)
Definition: string.c:3889
rb_encoding * rb_ascii8bit_encoding(void)
Definition: encoding.c:1305
void rb_warning(const char *fmt,...)
Definition: error.c:250
#define RSTRING_LENINT(str)
Definition: ruby.h:990
ONIG_EXTERN int onigenc_ascii_only_case_map(OnigCaseFoldType *flagP, const OnigUChar **pp, const OnigUChar *end, OnigUChar *to, OnigUChar *to_end, const struct OnigEncodingTypeST *enc)
Definition: regenc.c:955
rb_encoding * rb_enc_check_str(VALUE str1, VALUE str2)
Definition: encoding.c:868
#define rb_check_frozen(obj)
Definition: intern.h:276
#define CONST_ID(var, str)
Definition: ruby.h:1743
#define SIZED_ENUMERATOR(obj, argc, argv, size_fn)
Definition: intern.h:233
static VALUE rb_str_sum(int argc, VALUE *argv, VALUE str)
Definition: string.c:8692
VALUE rb_str_intern(VALUE)
Definition: symbol.c:661
VALUE rb_str_inspect(VALUE str)
Definition: string.c:5664
static void tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first, VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
Definition: string.c:6738
#define rb_intern_const(str)
Definition: ruby.h:1756
VALUE rb_obj_freeze(VALUE)
Definition: object.c:1111
#define memcpy(d, s, n)
Definition: ffi_common.h:55
#define RSTRING_EMBED_LEN(str)
Definition: ruby.h:975
static char * str_fill_term(VALUE str, char *s, long len, int termlen)
Definition: string.c:2079
#define SPECIAL_CONST_P(x)
Definition: ruby.h:1249
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2884
void void xfree(void *)
VALUE rb_str_buf_new(long capa)
Definition: string.c:1247
int rb_num_to_uint(VALUE val, unsigned int *ret)
Definition: numeric.c:242
static VALUE str_uplus(VALUE str)
Definition: string.c:2484
static VALUE rb_str_casecmp(VALUE str1, VALUE str2)
Definition: string.c:3193
int rb_str_comparable(VALUE str1, VALUE str2)
Definition: string.c:3025
#define rb_enc_mbcput(c, buf, enc)
Definition: encoding.h:211
static int str_upto_i(VALUE str, VALUE arg)
Definition: string.c:4016
#define SYMBOL_P(x)
Definition: ruby.h:382
#define mod(x, y)
Definition: date_strftime.c:28
VALUE rb_str_ord(VALUE s)
Definition: string.c:8673
#define RB_INTEGER_TYPE_P(obj)
Definition: ruby_missing.h:20
#define rb_str_dup_frozen
static VALUE sym_capitalize(int argc, VALUE *argv, VALUE sym)
Definition: string.c:9933
static VALUE sym_aref(int argc, VALUE *argv, VALUE sym)
Definition: string.c:9867
#define ONIGENC_CTYPE_DIGIT
Definition: onigmo.h:298
#define NULL
Definition: _sdbm.c:102
#define OBJ_TAINTED_RAW(x)
Definition: ruby.h:1297
#define FIX2LONG(x)
Definition: ruby.h:363
#define Qundef
Definition: ruby.h:439
VALUE rb_invcmp(VALUE x, VALUE y)
Definition: compar.c:46
VALUE rb_str_resurrect(VALUE str)
Definition: string.c:1442
static VALUE rb_str_aref(VALUE str, VALUE indx)
Definition: string.c:4211
VALUE rb_check_string_type(VALUE str)
Definition: string.c:2164
static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str)
Definition: string.c:6246
VALUE rb_usascii_str_new_cstr(const char *ptr)
Definition: string.c:777
VALUE rb_id_quote_unprintable(ID id)
Definition: string.c:9640
VALUE rb_reg_regcomp(VALUE)
Definition: re.c:2867
static int match(VALUE str, VALUE pat, VALUE hash, int(*cb)(VALUE, VALUE))
Definition: date_parse.c:280
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Definition: class.c:1515
#define ENCINDEX_UTF_32LE
Definition: encindex.h:48
static VALUE rb_str_delete_bang(int, VALUE *, VALUE)
Definition: string.c:6832
static VALUE rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
Definition: string.c:2872
void rb_warn(const char *fmt,...)
Definition: error.c:221
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition: string.c:9169
#define bp()
Definition: vm_debug.h:25
#define ENCINDEX_UTF_32BE
Definition: encindex.h:47
VALUE rb_eArgError
Definition: error.c:763
#define rb_enc_prev_char(s, p, e, enc)
Definition: encoding.h:214
static VALUE rb_str_force_encoding(VALUE str, VALUE enc)
Definition: string.c:9092
#define T_REGEXP
Definition: ruby.h:497
#define STR_HEAP_SIZE(str)
Definition: string.c:161
#define IS_EVSTR(p, e)
Definition: string.c:5758
VALUE rb_str_dump(VALUE str)
Definition: string.c:5771
#define NUM2LONG(x)
Definition: ruby.h:648
#define STR_NOEMBED
Definition: internal.h:1486
static VALUE rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
Definition: string.c:3513
#define TR_TABLE_SIZE
Definition: string.c:6736
#define RB_OBJ_WRITE(a, slot, b)
Definition: ruby.h:1437
VALUE rb_reg_nth_match(int, VALUE)
Definition: re.c:1610
static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex)
Definition: string.c:803
#define rb_enc_code_to_mbclen(c, enc)
Definition: encoding.h:208
static VALUE rb_str_each_codepoint(VALUE str)
Definition: string.c:7929
void rb_str_modify(VALUE str)
Definition: string.c:1980
#define STR_EMBED_P(str)
Definition: internal.h:1488
#define ONIGENC_MBCLEN_CHARFOUND_P(r)
Definition: onigmo.h:346
char ** argv
Definition: ruby.c:184
ID rb_to_id(VALUE name)
Definition: string.c:9979
char * ptr
Definition: ruby.h:966
#define DBL2NUM(dbl)
Definition: ruby.h:941
#define StringValue(v)
Definition: ruby.h:569
#define L(x)
Definition: asm.h:125
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Definition: string.c:758
static enum neighbor_char enc_succ_char(char *p, long len, rb_encoding *enc)
Definition: string.c:3685
VALUE rb_external_str_new_cstr(const char *ptr)
Definition: string.c:1026
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:616
#define rb_sym2str(sym)
Definition: console.c:107
VALUE rb_obj_class(VALUE)
Definition: object.c:229
VALUE rb_str_dup(VALUE str)
Definition: string.c:1436
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Definition: string.c:1230
char * rb_str_fill_terminator(VALUE str, const int newminlen)
Definition: string.c:2156