/* Listing banner: Ruby 3.2.5p208 (2024-07-26 revision 31d0f1a2e7dbfb60731d1f05b868e1d578cda493) -- string.c */
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "gc.h"
27#include "id.h"
28#include "internal.h"
29#include "internal/array.h"
30#include "internal/compar.h"
31#include "internal/compilers.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
42#include "probes.h"
43#include "ruby/encoding.h"
44#include "ruby/re.h"
45#include "ruby/util.h"
46#include "ruby_assert.h"
47#include "vm_sync.h"
48
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
51# include <crypt.h>
52# endif
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
56#endif
57
58#define BEG(no) (regs->beg[(no)])
59#define END(no) (regs->end[(no)])
60
61#undef rb_str_new
62#undef rb_usascii_str_new
63#undef rb_utf8_str_new
64#undef rb_enc_str_new
65#undef rb_str_new_cstr
66#undef rb_usascii_str_new_cstr
67#undef rb_utf8_str_new_cstr
68#undef rb_enc_str_new_cstr
69#undef rb_external_str_new_cstr
70#undef rb_locale_str_new_cstr
71#undef rb_str_dup_frozen
72#undef rb_str_buf_new_cstr
73#undef rb_str_buf_cat
74#undef rb_str_buf_cat2
75#undef rb_str_cat2
76#undef rb_str_cat_cstr
77#undef rb_fstring_cstr
78
81
82/* FLAGS of RString
83 *
84 * 1: RSTRING_NOEMBED
85 * 2: STR_SHARED (== ELTS_SHARED)
86 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
87 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
88 * other strings that rely on this string's buffer)
89 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
90 * early, specific to rb_str_tmp_frozen_{acquire,release})
91 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
92 * such as read(2). Any modification and realloc is prohibited)
93 *
94 * 8-9: ENC_CODERANGE (2 bits)
95 * 10-16: ENCODING (7 bits == 128)
96 * 17: RSTRING_FSTR
97 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
98 * used for a string object based on C string literal)
99 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
100 * object header is temporarily allocated on C stack)
101 */
102
103#define RUBY_MAX_CHAR_LEN 16
104#define STR_SHARED_ROOT FL_USER5
105#define STR_BORROWED FL_USER6
106#define STR_TMPLOCK FL_USER7
107#define STR_NOFREE FL_USER18
108#define STR_FAKESTR FL_USER19
109
/*
 * STR_SET_NOEMBED: flag +str+ as a heap (non-embedded) string.  With
 * USE_RVARGC the sharing-related flags are cleared as well; otherwise the
 * embedded-length bits in the flags word are zeroed.
 */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    if (!USE_RVARGC) {\
        STR_SET_EMBED_LEN((str), 0);\
    }\
    else {\
        FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
    }\
} while (0)
/* STR_SET_EMBED: clear the NOEMBED and NOFREE flags of +str+. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
/*
 * STR_SET_EMBED_LEN: store the length of an embedded string.
 * With USE_RVARGC the length lives in as.embed.len (and must be smaller
 * than the embed capacity); otherwise it is packed into the flags word.
 */
#if USE_RVARGC
# define STR_SET_EMBED_LEN(str, n) do { \
    assert(str_embed_capa(str) > (n));\
    RSTRING(str)->as.embed.len = (n);\
} while (0)
#else
# define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
#endif
132
/* STR_SET_LEN: set the length of +str+, whichever representation it uses. */
#define STR_SET_LEN(str, n) do { \
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len = (n);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
} while (0)
141
/* STR_DEC_LEN: shrink the length of +str+ by one. */
#define STR_DEC_LEN(str) do {\
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len--;\
    }\
    else {\
        const long dec_len = RSTRING_LEN(str) - 1;\
        STR_SET_EMBED_LEN((str), dec_len);\
    }\
} while (0)
152
153static inline bool
154str_enc_fastpath(VALUE str)
155{
156 // The overwhelming majority of strings are in one of these 3 encodings.
157 switch (ENCODING_GET_INLINED(str)) {
158 case ENCINDEX_ASCII_8BIT:
159 case ENCINDEX_UTF_8:
160 case ENCINDEX_US_ASCII:
161 return true;
162 default:
163 return false;
164 }
165}
166
/* TERM_LEN: terminator width of +str+: 1 on the fast path, otherwise the
 * minimum character length of its encoding. */
#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
/* TERM_FILL: write a zero terminator at +ptr+; one byte is always written,
 * the full +termlen+ bytes when termlen > 1. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    if (UNLIKELY(term_fill_len > 1)) {\
        memset(term_fill_ptr, 0, term_fill_len);\
    }\
    else {\
        *term_fill_ptr = '\0';\
    }\
} while (0)
175
/* RESIZE_CAPA: resize the buffer of +str+ to +capacity+ bytes plus the
 * terminator width derived from its encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
/*
 * RESIZE_CAPA_TERM: resize the buffer of +str+ to +capacity+ + +termlen+.
 *  - embedded: if the embed area is too small, copy the current bytes to a
 *    freshly allocated heap buffer and switch to the heap representation;
 *  - heap: realloc in place (the string must not be STR_SHARED).
 * The macro arguments are parenthesized in every arithmetic use so that
 * expression arguments (e.g. `a ? b : c`) expand correctly.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
199
/*
 * STR_SET_SHARED: make +str+ reference +shared_str+ as its shared root.
 * Does nothing for STR_FAKESTR strings.  Asserts that the pointer of +str+
 * lies within the buffer of +shared_str+, records the root with a write
 * barrier, and flags both objects; a root with class 0 is additionally
 * flagged STR_BORROWED.
 */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) { /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
        } \
    } \
} while (0)
211
212#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
213#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
214/* TODO: include the terminator size in capa. */
215
216#define STR_ENC_GET(str) get_encoding(str)
217
218#if !defined SHARABLE_MIDDLE_SUBSTRING
219# define SHARABLE_MIDDLE_SUBSTRING 0
220#endif
221#if !SHARABLE_MIDDLE_SUBSTRING
222#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
223#else
224#define SHARABLE_SUBSTRING_P(beg, len, end) 1
225#endif
226
227
228static inline long
229str_embed_capa(VALUE str)
230{
231#if USE_RVARGC
232 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
233#else
234 return RSTRING_EMBED_LEN_MAX + 1;
235#endif
236}
237
238bool
239rb_str_reembeddable_p(VALUE str)
240{
241 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
242}
243
244static inline size_t
245rb_str_embed_size(long capa)
246{
247 return offsetof(struct RString, as.embed.ary) + capa;
248}
249
250size_t
251rb_str_size_as_embedded(VALUE str)
252{
253 size_t real_size;
254#if USE_RVARGC
255 if (STR_EMBED_P(str)) {
256 real_size = rb_str_embed_size(RSTRING(str)->as.embed.len) + TERM_LEN(str);
257 }
258 /* if the string is not currently embedded, but it can be embedded, how
259 * much space would it require */
260 else if (rb_str_reembeddable_p(str)) {
261 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
262 }
263 else {
264#endif
265 real_size = sizeof(struct RString);
266#if USE_RVARGC
267 }
268#endif
269 return real_size;
270}
271
272static inline bool
273STR_EMBEDDABLE_P(long len, long termlen)
274{
275#if USE_RVARGC
276 return rb_gc_size_allocatable_p(rb_str_embed_size(len + termlen));
277#else
278 return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
279#endif
280}
281
282static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
283static VALUE str_new_frozen(VALUE klass, VALUE orig);
284static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
285static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
286static VALUE str_new(VALUE klass, const char *ptr, long len);
287static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
288static inline void str_modifiable(VALUE str);
289static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
290
291static inline void
292str_make_independent(VALUE str)
293{
294 long len = RSTRING_LEN(str);
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str), len, 0L, termlen);
297}
298
299static inline int str_dependent_p(VALUE str);
300
301void
302rb_str_make_independent(VALUE str)
303{
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
306 }
307}
308
309void
310rb_str_make_embedded(VALUE str)
311{
312 RUBY_ASSERT(rb_str_reembeddable_p(str));
313 RUBY_ASSERT(!STR_EMBED_P(str));
314
315 char *buf = RSTRING(str)->as.heap.ptr;
316 long len = RSTRING(str)->as.heap.len;
317
318 STR_SET_EMBED(str);
319 STR_SET_EMBED_LEN(str, len);
320
321 if (len > 0) {
322 memcpy(RSTRING_PTR(str), buf, len);
323 ruby_xfree(buf);
324 }
325
326 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
327}
328
329void
330rb_str_update_shared_ary(VALUE str, VALUE old_root, VALUE new_root)
331{
332 // if the root location hasn't changed, we don't need to update
333 if (new_root == old_root) {
334 return;
335 }
336
337 // if the root string isn't embedded, we don't need to touch the ponter.
338 // it already points to the shame shared buffer
339 if (!STR_EMBED_P(new_root)) {
340 return;
341 }
342
343 size_t offset = (size_t)((uintptr_t)RSTRING(str)->as.heap.ptr - (uintptr_t)RSTRING(old_root)->as.embed.ary);
344
345 RUBY_ASSERT(RSTRING(str)->as.heap.ptr >= RSTRING(old_root)->as.embed.ary);
346 RSTRING(str)->as.heap.ptr = RSTRING(new_root)->as.embed.ary + offset;
347}
348
/* Print a diagnostic to stderr saying that +func+ is about to return NULL. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, and look at the passed string.\n",
            func);
}
358
359/* symbols for [up|down|swap]case/capitalize options */
360static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
361
362static rb_encoding *
363get_encoding(VALUE str)
364{
365 return rb_enc_from_index(ENCODING_GET(str));
366}
367
368static void
369mustnot_broken(VALUE str)
370{
371 if (is_broken_string(str)) {
372 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
373 }
374}
375
376static void
377mustnot_wchar(VALUE str)
378{
379 rb_encoding *enc = STR_ENC_GET(str);
380 if (rb_enc_mbminlen(enc) > 1) {
381 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
382 }
383}
384
385static int fstring_cmp(VALUE a, VALUE b);
386
387static VALUE register_fstring(VALUE str, bool copy);
388
389const struct st_hash_type rb_fstring_hash_type = {
390 fstring_cmp,
392};
393
394#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
395
397 VALUE fstr;
398 bool copy;
399};
400
401static int
402fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
403{
404
405 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
406 VALUE str = (VALUE)*key;
407
408 if (existing) {
409 /* because of lazy sweep, str may be unmarked already and swept
410 * at next time */
411
412 if (rb_objspace_garbage_object_p(str)) {
413 arg->fstr = Qundef;
414 return ST_DELETE;
415 }
416
417 arg->fstr = str;
418 return ST_STOP;
419 }
420 else {
421 if (FL_TEST_RAW(str, STR_FAKESTR)) {
422 if (arg->copy) {
423 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
424 rb_enc_copy(new_str, str);
425 str = new_str;
426 }
427 else {
428 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
429 RSTRING(str)->as.heap.len,
430 ENCODING_GET(str));
431 }
432 OBJ_FREEZE_RAW(str);
433 }
434 else {
435 if (!OBJ_FROZEN(str))
436 str = str_new_frozen(rb_cString, str);
437 if (STR_SHARED_P(str)) { /* str should not be shared */
438 /* shared substring */
439 str_make_independent(str);
440 assert(OBJ_FROZEN(str));
441 }
442 if (!BARE_STRING_P(str)) {
443 str = str_new_frozen(rb_cString, str);
444 }
445 }
446 RBASIC(str)->flags |= RSTRING_FSTR;
447
448 *key = *value = arg->fstr = str;
449 return ST_CONTINUE;
450 }
451}
452
453RUBY_FUNC_EXPORTED
454VALUE
455rb_fstring(VALUE str)
456{
457 VALUE fstr;
458 int bare;
459
460 Check_Type(str, T_STRING);
461
462 if (FL_TEST(str, RSTRING_FSTR))
463 return str;
464
465 bare = BARE_STRING_P(str);
466 if (!bare) {
467 if (STR_EMBED_P(str)) {
468 OBJ_FREEZE_RAW(str);
469 return str;
470 }
471 if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
472 assert(OBJ_FROZEN(str));
473 return str;
474 }
475 }
476
477 if (!OBJ_FROZEN(str))
478 rb_str_resize(str, RSTRING_LEN(str));
479
480 fstr = register_fstring(str, FALSE);
481
482 if (!bare) {
483 str_replace_shared_without_enc(str, fstr);
484 OBJ_FREEZE_RAW(str);
485 return str;
486 }
487 return fstr;
488}
489
490static VALUE
491register_fstring(VALUE str, bool copy)
492{
493 struct fstr_update_arg args;
494 args.copy = copy;
495
496 RB_VM_LOCK_ENTER();
497 {
498 st_table *frozen_strings = rb_vm_fstring_table();
499 do {
500 args.fstr = str;
501 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
502 } while (UNDEF_P(args.fstr));
503 }
504 RB_VM_LOCK_LEAVE();
505
506 assert(OBJ_FROZEN(args.fstr));
507 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
508 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
509 assert(RBASIC_CLASS(args.fstr) == rb_cString);
510 return args.fstr;
511}
512
513static VALUE
514setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
515{
516 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
517 /* SHARED to be allocated by the callback */
518
519 if (!name) {
520 RUBY_ASSERT_ALWAYS(len == 0);
521 name = "";
522 }
523
524 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
525
526 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
527 fake_str->as.heap.len = len;
528 fake_str->as.heap.ptr = (char *)name;
529 fake_str->as.heap.aux.capa = len;
530 return (VALUE)fake_str;
531}
532
533/*
534 * set up a fake string which refers a static string literal.
535 */
536VALUE
537rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
538{
539 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
540}
541
542/*
543 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
544 * shared string which refers a static string literal. `ptr` must
545 * point a constant string.
546 */
547MJIT_FUNC_EXPORTED VALUE
548rb_fstring_new(const char *ptr, long len)
549{
550 struct RString fake_str;
551 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
552}
553
554VALUE
555rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
556{
557 struct RString fake_str;
558 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
559}
560
561VALUE
562rb_fstring_cstr(const char *ptr)
563{
564 return rb_fstring_new(ptr, strlen(ptr));
565}
566
567static int
568fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
569{
570 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
571 return ST_CONTINUE;
572}
573
574static int
575fstring_cmp(VALUE a, VALUE b)
576{
577 long alen, blen;
578 const char *aptr, *bptr;
579 RSTRING_GETMEM(a, aptr, alen);
580 RSTRING_GETMEM(b, bptr, blen);
581 return (alen != blen ||
582 ENCODING_GET(a) != ENCODING_GET(b) ||
583 memcmp(aptr, bptr, alen) != 0);
584}
585
586static inline int
587single_byte_optimizable(VALUE str)
588{
589 rb_encoding *enc;
590
591 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
593 return 1;
594
595 enc = STR_ENC_GET(str);
596 if (rb_enc_mbmaxlen(enc) == 1)
597 return 1;
598
599 /* Conservative. Possibly single byte.
600 * "\xa1" in Shift_JIS for example. */
601 return 0;
602}
603
605
606static inline const char *
607search_nonascii(const char *p, const char *e)
608{
609 const uintptr_t *s, *t;
610
611#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
612# if SIZEOF_UINTPTR_T == 8
613# define NONASCII_MASK UINT64_C(0x8080808080808080)
614# elif SIZEOF_UINTPTR_T == 4
615# define NONASCII_MASK UINT32_C(0x80808080)
616# else
617# error "don't know what to do."
618# endif
619#else
620# if SIZEOF_UINTPTR_T == 8
621# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
622# elif SIZEOF_UINTPTR_T == 4
623# define NONASCII_MASK 0x80808080UL /* or...? */
624# else
625# error "don't know what to do."
626# endif
627#endif
628
629 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
630#if !UNALIGNED_WORD_ACCESS
631 if ((uintptr_t)p % SIZEOF_VOIDP) {
632 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
633 p += l;
634 switch (l) {
635 default: UNREACHABLE;
636#if SIZEOF_VOIDP > 4
637 case 7: if (p[-7]&0x80) return p-7;
638 case 6: if (p[-6]&0x80) return p-6;
639 case 5: if (p[-5]&0x80) return p-5;
640 case 4: if (p[-4]&0x80) return p-4;
641#endif
642 case 3: if (p[-3]&0x80) return p-3;
643 case 2: if (p[-2]&0x80) return p-2;
644 case 1: if (p[-1]&0x80) return p-1;
645 case 0: break;
646 }
647 }
648#endif
649#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
650#define aligned_ptr(value) \
651 __builtin_assume_aligned((value), sizeof(uintptr_t))
652#else
653#define aligned_ptr(value) (uintptr_t *)(value)
654#endif
655 s = aligned_ptr(p);
656 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
657#undef aligned_ptr
658 for (;s < t; s++) {
659 if (*s & NONASCII_MASK) {
660#ifdef WORDS_BIGENDIAN
661 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
662#else
663 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
664#endif
665 }
666 }
667 p = (const char *)s;
668 }
669
670 switch (e - p) {
671 default: UNREACHABLE;
672#if SIZEOF_VOIDP > 4
673 case 7: if (e[-7]&0x80) return e-7;
674 case 6: if (e[-6]&0x80) return e-6;
675 case 5: if (e[-5]&0x80) return e-5;
676 case 4: if (e[-4]&0x80) return e-4;
677#endif
678 case 3: if (e[-3]&0x80) return e-3;
679 case 2: if (e[-2]&0x80) return e-2;
680 case 1: if (e[-1]&0x80) return e-1;
681 case 0: return NULL;
682 }
683}
684
685static int
686coderange_scan(const char *p, long len, rb_encoding *enc)
687{
688 const char *e = p + len;
689
690 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
691 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
692 p = search_nonascii(p, e);
694 }
695
696 if (rb_enc_asciicompat(enc)) {
697 p = search_nonascii(p, e);
698 if (!p) return ENC_CODERANGE_7BIT;
699 for (;;) {
700 int ret = rb_enc_precise_mbclen(p, e, enc);
702 p += MBCLEN_CHARFOUND_LEN(ret);
703 if (p == e) break;
704 p = search_nonascii(p, e);
705 if (!p) break;
706 }
707 }
708 else {
709 while (p < e) {
710 int ret = rb_enc_precise_mbclen(p, e, enc);
712 p += MBCLEN_CHARFOUND_LEN(ret);
713 }
714 }
715 return ENC_CODERANGE_VALID;
716}
717
718long
719rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
720{
721 const char *p = s;
722
723 if (*cr == ENC_CODERANGE_BROKEN)
724 return e - s;
725
726 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
727 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
728 if (*cr == ENC_CODERANGE_VALID) return e - s;
729 p = search_nonascii(p, e);
731 return e - s;
732 }
733 else if (rb_enc_asciicompat(enc)) {
734 p = search_nonascii(p, e);
735 if (!p) {
736 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
737 return e - s;
738 }
739 for (;;) {
740 int ret = rb_enc_precise_mbclen(p, e, enc);
741 if (!MBCLEN_CHARFOUND_P(ret)) {
743 return p - s;
744 }
745 p += MBCLEN_CHARFOUND_LEN(ret);
746 if (p == e) break;
747 p = search_nonascii(p, e);
748 if (!p) break;
749 }
750 }
751 else {
752 while (p < e) {
753 int ret = rb_enc_precise_mbclen(p, e, enc);
754 if (!MBCLEN_CHARFOUND_P(ret)) {
756 return p - s;
757 }
758 p += MBCLEN_CHARFOUND_LEN(ret);
759 }
760 }
762 return e - s;
763}
764
765static inline void
766str_enc_copy(VALUE str1, VALUE str2)
767{
768 rb_enc_set_index(str1, ENCODING_GET(str2));
769}
770
771static void
772rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
773{
774 /* this function is designed for copying encoding and coderange
775 * from src to new string "dest" which is made from the part of src.
776 */
777 str_enc_copy(dest, src);
778 if (RSTRING_LEN(dest) == 0) {
779 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
781 else
783 return;
784 }
785 switch (ENC_CODERANGE(src)) {
788 break;
790 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
791 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
793 else
795 break;
796 default:
797 break;
798 }
799}
800
801static void
802rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
803{
804 str_enc_copy(dest, src);
806}
807
808static int
809enc_coderange_scan(VALUE str, rb_encoding *enc)
810{
811 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
812}
813
814int
815rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
816{
817 return enc_coderange_scan(str, enc);
818}
819
820int
822{
823 int cr = ENC_CODERANGE(str);
824
825 if (cr == ENC_CODERANGE_UNKNOWN) {
826 cr = enc_coderange_scan(str, get_encoding(str));
827 ENC_CODERANGE_SET(str, cr);
828 }
829 return cr;
830}
831
832int
834{
835 rb_encoding *enc = STR_ENC_GET(str);
836
837 if (!rb_enc_asciicompat(enc))
838 return FALSE;
839 else if (is_ascii_string(str))
840 return TRUE;
841 return FALSE;
842}
843
844static inline void
845str_mod_check(VALUE s, const char *p, long len)
846{
847 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
848 rb_raise(rb_eRuntimeError, "string modified");
849 }
850}
851
852static size_t
853str_capacity(VALUE str, const int termlen)
854{
855 if (STR_EMBED_P(str)) {
856#if USE_RVARGC
857 return str_embed_capa(str) - termlen;
858#else
859 return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
860#endif
861 }
862 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
863 return RSTRING(str)->as.heap.len;
864 }
865 else {
866 return RSTRING(str)->as.heap.aux.capa;
867 }
868}
869
870size_t
872{
873 return str_capacity(str, TERM_LEN(str));
874}
875
876static inline void
877must_not_null(const char *ptr)
878{
879 if (!ptr) {
880 rb_raise(rb_eArgError, "NULL pointer given");
881 }
882}
883
884static inline VALUE
885str_alloc_embed(VALUE klass, size_t capa)
886{
887 size_t size = rb_str_embed_size(capa);
888 assert(size > 0);
889 assert(rb_gc_size_allocatable_p(size));
890#if !USE_RVARGC
891 assert(size <= sizeof(struct RString));
892#endif
893
894 RVARGC_NEWOBJ_OF(str, struct RString, klass,
896
897 return (VALUE)str;
898}
899
900static inline VALUE
901str_alloc_heap(VALUE klass)
902{
903 RVARGC_NEWOBJ_OF(str, struct RString, klass,
904 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString));
905
906 return (VALUE)str;
907}
908
909static inline VALUE
910empty_str_alloc(VALUE klass)
911{
912 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
913 VALUE str = str_alloc_embed(klass, 0);
914 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
915 return str;
916}
917
918static VALUE
919str_new0(VALUE klass, const char *ptr, long len, int termlen)
920{
921 VALUE str;
922
923 if (len < 0) {
924 rb_raise(rb_eArgError, "negative string size (or size too big)");
925 }
926
927 RUBY_DTRACE_CREATE_HOOK(STRING, len);
928
929 if (STR_EMBEDDABLE_P(len, termlen)) {
930 str = str_alloc_embed(klass, len + termlen);
931 if (len == 0) {
933 }
934 }
935 else {
936 str = str_alloc_heap(klass);
937 RSTRING(str)->as.heap.aux.capa = len;
938 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
939 * integer overflow. If we can STATIC_ASSERT that, the following
940 * mul_add_mul can be reverted to a simple ALLOC_N. */
941 RSTRING(str)->as.heap.ptr =
942 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
943 }
944 if (ptr) {
945 memcpy(RSTRING_PTR(str), ptr, len);
946 }
947 STR_SET_LEN(str, len);
948 TERM_FILL(RSTRING_PTR(str) + len, termlen);
949 return str;
950}
951
952static VALUE
953str_new(VALUE klass, const char *ptr, long len)
954{
955 return str_new0(klass, ptr, len, 1);
956}
957
958VALUE
959rb_str_new(const char *ptr, long len)
960{
961 return str_new(rb_cString, ptr, len);
962}
963
964VALUE
965rb_usascii_str_new(const char *ptr, long len)
966{
967 VALUE str = rb_str_new(ptr, len);
968 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
969 return str;
970}
971
972VALUE
973rb_utf8_str_new(const char *ptr, long len)
974{
975 VALUE str = str_new(rb_cString, ptr, len);
976 rb_enc_associate_index(str, rb_utf8_encindex());
977 return str;
978}
979
980VALUE
981rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
982{
983 VALUE str;
984
985 if (!enc) return rb_str_new(ptr, len);
986
987 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
988 rb_enc_associate(str, enc);
989 return str;
990}
991
992VALUE
994{
995 must_not_null(ptr);
996 /* rb_str_new_cstr() can take pointer from non-malloc-generated
997 * memory regions, and that cannot be detected by the MSAN. Just
998 * trust the programmer that the argument passed here is a sane C
999 * string. */
1000 __msan_unpoison_string(ptr);
1001 return rb_str_new(ptr, strlen(ptr));
1002}
1003
1004VALUE
1006{
1007 VALUE str = rb_str_new_cstr(ptr);
1008 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
1009 return str;
1010}
1011
1012VALUE
1014{
1015 VALUE str = rb_str_new_cstr(ptr);
1016 rb_enc_associate_index(str, rb_utf8_encindex());
1017 return str;
1018}
1019
1020VALUE
1022{
1023 must_not_null(ptr);
1024 if (rb_enc_mbminlen(enc) != 1) {
1025 rb_raise(rb_eArgError, "wchar encoding given");
1026 }
1027 return rb_enc_str_new(ptr, strlen(ptr), enc);
1028}
1029
1030static VALUE
1031str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1032{
1033 VALUE str;
1034
1035 if (len < 0) {
1036 rb_raise(rb_eArgError, "negative string size (or size too big)");
1037 }
1038
1039 if (!ptr) {
1040 rb_encoding *enc = rb_enc_get_from_index(encindex);
1041 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1042 }
1043 else {
1044 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1045 str = str_alloc_heap(klass);
1046 RSTRING(str)->as.heap.len = len;
1047 RSTRING(str)->as.heap.ptr = (char *)ptr;
1048 RSTRING(str)->as.heap.aux.capa = len;
1049 RBASIC(str)->flags |= STR_NOFREE;
1050 }
1051 rb_enc_associate_index(str, encindex);
1052 return str;
1053}
1054
1055VALUE
1056rb_str_new_static(const char *ptr, long len)
1057{
1058 return str_new_static(rb_cString, ptr, len, 0);
1059}
1060
1061VALUE
1063{
1064 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1065}
1066
1067VALUE
1069{
1070 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1071}
1072
1073VALUE
1075{
1076 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1077}
1078
1079static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1080 rb_encoding *from, rb_encoding *to,
1081 int ecflags, VALUE ecopts);
1082
1083static inline bool
1084is_enc_ascii_string(VALUE str, rb_encoding *enc)
1085{
1086 int encidx = rb_enc_to_index(enc);
1087 if (rb_enc_get_index(str) == encidx)
1088 return is_ascii_string(str);
1089 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1090}
1091
1092VALUE
1093rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1094{
1095 long len;
1096 const char *ptr;
1097 VALUE newstr;
1098
1099 if (!to) return str;
1100 if (!from) from = rb_enc_get(str);
1101 if (from == to) return str;
1102 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1103 rb_is_ascii8bit_enc(to)) {
1104 if (STR_ENC_GET(str) != to) {
1105 str = rb_str_dup(str);
1106 rb_enc_associate(str, to);
1107 }
1108 return str;
1109 }
1110
1111 RSTRING_GETMEM(str, ptr, len);
1112 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1113 from, to, ecflags, ecopts);
1114 if (NIL_P(newstr)) {
1115 /* some error, return original */
1116 return str;
1117 }
1118 return newstr;
1119}
1120
1121VALUE
1122rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1123 rb_encoding *from, int ecflags, VALUE ecopts)
1124{
1125 long olen;
1126
1127 olen = RSTRING_LEN(newstr);
1128 if (ofs < -olen || olen < ofs)
1129 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1130 if (ofs < 0) ofs += olen;
1131 if (!from) {
1132 STR_SET_LEN(newstr, ofs);
1133 return rb_str_cat(newstr, ptr, len);
1134 }
1135
1136 rb_str_modify(newstr);
1137 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1138 rb_enc_get(newstr),
1139 ecflags, ecopts);
1140}
1141
1142VALUE
1143rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1144{
1145 STR_SET_LEN(str, 0);
1146 rb_enc_associate(str, enc);
1147 rb_str_cat(str, ptr, len);
1148 return str;
1149}
1150
1151static VALUE
1152str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1153 rb_encoding *from, rb_encoding *to,
1154 int ecflags, VALUE ecopts)
1155{
1156 rb_econv_t *ec;
1158 long olen;
1159 VALUE econv_wrapper;
1160 const unsigned char *start, *sp;
1161 unsigned char *dest, *dp;
1162 size_t converted_output = (size_t)ofs;
1163
1164 olen = rb_str_capacity(newstr);
1165
1166 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1167 RBASIC_CLEAR_CLASS(econv_wrapper);
1168 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1169 if (!ec) return Qnil;
1170 DATA_PTR(econv_wrapper) = ec;
1171
1172 sp = (unsigned char*)ptr;
1173 start = sp;
1174 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1175 (dp = dest + converted_output),
1176 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1178 /* destination buffer short */
1179 size_t converted_input = sp - start;
1180 size_t rest = len - converted_input;
1181 converted_output = dp - dest;
1182 rb_str_set_len(newstr, converted_output);
1183 if (converted_input && converted_output &&
1184 rest < (LONG_MAX / converted_output)) {
1185 rest = (rest * converted_output) / converted_input;
1186 }
1187 else {
1188 rest = olen;
1189 }
1190 olen += rest < 2 ? 2 : rest;
1191 rb_str_resize(newstr, olen);
1192 }
1193 DATA_PTR(econv_wrapper) = 0;
1194 rb_econv_close(ec);
1195 switch (ret) {
1196 case econv_finished:
1197 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1198 rb_str_set_len(newstr, len);
1199 rb_enc_associate(newstr, to);
1200 return newstr;
1201
1202 default:
1203 return Qnil;
1204 }
1205}
1206
1207VALUE
1209{
1210 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1211}
1212
/*
 * Builds a String from the len bytes at ptr, which the caller labels as
 * eenc.  The bytes are converted to the default internal encoding only
 * when one is set, differs from eenc, and the data is not plain ASCII;
 * every other case returns the bytes unconverted.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_external_str_new_with_enc -- confirm.
 */
VALUE
{
    rb_encoding *ienc;
    VALUE str;
    const int eidx = rb_enc_to_index(eenc);

    /* No buffer: let rb_enc_str_new() deal with the NULL pointer. */
    if (!ptr) {
        return rb_enc_str_new(ptr, len, eenc);
    }

    /* ASCII-8BIT case, no conversion */
    /* Also taken when the data is labelled US-ASCII but contains
     * non-ASCII bytes. */
    if ((eidx == rb_ascii8bit_encindex()) ||
        (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
        return rb_str_new(ptr, len);
    }
    /* no default_internal or same encoding, no conversion */
    ienc = rb_default_internal_encoding();
    if (!ienc || eenc == ienc) {
        return rb_enc_str_new(ptr, len, eenc);
    }
    /* ASCII compatible, and ASCII only string, no conversion in
     * default_internal */
    /* NOTE(review): the ASCII-8BIT test here looks unreachable, since that
     * index already returned above. */
    if ((eidx == rb_ascii8bit_encindex()) ||
        (eidx == rb_usascii_encindex()) ||
        (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
        return rb_enc_str_new(ptr, len, ienc);
    }
    /* convert from the given encoding to default_internal */
    str = rb_enc_str_new(NULL, 0, ienc);
    /* when the conversion failed for some reason, just ignore the
     * default_internal and result in the given encoding as-is. */
    if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
        rb_str_initialize(str, ptr, len, eenc);
    }
    return str;
}
1250
1251VALUE
1252rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1253{
1254 int eidx = rb_enc_to_index(eenc);
1255 if (eidx == rb_usascii_encindex() &&
1256 !is_ascii_string(str)) {
1257 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1258 return str;
1259 }
1260 rb_enc_associate_index(str, eidx);
1261 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1262}
1263
1264VALUE
1265rb_external_str_new(const char *ptr, long len)
1266{
1267 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1268}
1269
/*
 * Like rb_external_str_new(), but the length is taken with strlen(), so
 * ptr must be a NUL-terminated C string.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_external_str_new_cstr -- confirm.
 */
VALUE
{
    return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
}
1275
1276VALUE
1277rb_locale_str_new(const char *ptr, long len)
1278{
1279 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1280}
1281
/*
 * Locale-encoding constructor for a NUL-terminated C string: the length
 * is taken with strlen(ptr).
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_locale_str_new_cstr -- confirm.
 */
VALUE
{
    return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
}
1287
/*
 * Creates a String from ptr/len, treating the bytes as being in the
 * filesystem encoding.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_filesystem_str_new -- confirm.
 */
VALUE
{
    return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
}
1293
/*
 * Filesystem-encoding constructor for a NUL-terminated C string: the
 * length is taken with strlen(ptr).
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_filesystem_str_new_cstr -- confirm.
 */
VALUE
{
    return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
}
1299
/*
 * Exports str to the default external encoding via rb_str_export_to_enc().
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_export -- confirm.
 */
VALUE
{
    return rb_str_export_to_enc(str, rb_default_external_encoding());
}
1305
/*
 * Exports str to the locale encoding via rb_str_export_to_enc().
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_export_locale -- confirm.
 */
VALUE
{
    return rb_str_export_to_enc(str, rb_locale_encoding());
}
1311
/*
 * Converts str from its own encoding (STR_ENC_GET(str)) to enc using
 * rb_str_conv_enc().
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_export_to_enc -- confirm.
 */
VALUE
{
    return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
}
1317
/*
 * Makes str2 hold the same content as str, without copying encoding or
 * coderange information.  Returns str2.
 *
 * When the content plus terminator fits in str2's embedded storage the
 * bytes are copied there.  Otherwise str2 is pointed at the buffer of a
 * frozen root string and flagged as sharing it.
 */
static VALUE
str_replace_shared_without_enc(VALUE str2, VALUE str)
{
    const int termlen = TERM_LEN(str);
    char *ptr;
    long len;

    RSTRING_GETMEM(str, ptr, len);
    if (str_embed_capa(str2) >= len + termlen) {
        /* Small enough: copy into str2's embedded area. */
        char *ptr2 = RSTRING(str2)->as.embed.ary;
        STR_SET_EMBED(str2);
        memcpy(ptr2, RSTRING_PTR(str), len);
        STR_SET_EMBED_LEN(str2, len);
        TERM_FILL(ptr2+len, termlen);
    }
    else {
        VALUE root;
        if (STR_SHARED_P(str)) {
            /* str is already a view: reuse its root, keep str's ptr/len. */
            root = RSTRING(str)->as.heap.aux.shared;
            RSTRING_GETMEM(str, ptr, len);
        }
        else {
            /* Obtain a frozen string for str and use its ptr/len. */
            root = rb_str_new_frozen(str);
            RSTRING_GETMEM(root, ptr, len);
        }
        assert(OBJ_FROZEN(root));
        /* Release str2's own heap buffer, if it has one that it owns
         * (not embedded, not shared, not STR_NOFREE). */
        if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
            if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
                rb_fatal("about to free a possible shared root");
            }
            char *ptr2 = STR_HEAP_PTR(str2);
            /* Do not free the buffer str2 is about to point at. */
            if (ptr2 != ptr) {
                ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
            }
        }
        FL_SET(str2, STR_NOEMBED);
        RSTRING(str2)->as.heap.len = len;
        RSTRING(str2)->as.heap.ptr = ptr;
        STR_SET_SHARED(str2, root);
    }
    return str2;
}
1360
1361static VALUE
1362str_replace_shared(VALUE str2, VALUE str)
1363{
1364 str_replace_shared_without_enc(str2, str);
1365 rb_enc_cr_str_exact_copy(str2, str);
1366 return str2;
1367}
1368
1369static VALUE
1370str_new_shared(VALUE klass, VALUE str)
1371{
1372 return str_replace_shared(str_alloc_heap(klass), str);
1373}
1374
/*
 * Returns a new string of str's class built by str_new_shared().
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_new_shared -- confirm.
 */
VALUE
{
    return str_new_shared(rb_obj_class(str), str);
}
1380
/*
 * Returns orig itself when it is already frozen; otherwise returns a
 * frozen string of the same class produced by str_new_frozen().
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_new_frozen -- confirm.
 */
VALUE
{
    if (OBJ_FROZEN(orig)) return orig;
    return str_new_frozen(rb_obj_class(orig), orig);
}
1387
1388static VALUE
1389rb_str_new_frozen_String(VALUE orig)
1390{
1391 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1392 return str_new_frozen(rb_cString, orig);
1393}
1394
1395VALUE
1396rb_str_tmp_frozen_acquire(VALUE orig)
1397{
1398 if (OBJ_FROZEN_RAW(orig)) return orig;
1399 return str_new_frozen_buffer(0, orig, FALSE);
1400}
1401
/*
 * Counterpart of rb_str_tmp_frozen_acquire().  Does nothing unless tmp is
 * a classless string (RBASIC_CLASS(tmp) == 0).  When orig is still sharing
 * tmp's buffer, tmp is orig's root and tmp has not been marked
 * STR_BORROWED, the sharing is undone: orig takes back capacity and the
 * STR_NOFREE bit, and tmp is turned into an empty embedded string.
 */
void
rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
{
    /* tmp has a class: it is not a temporary made by acquire. */
    if (RBASIC_CLASS(tmp) != 0)
        return;

    if (STR_EMBED_P(tmp)) {
        /* Embedded temporaries own no shared buffer; nothing to undo. */
        assert(OBJ_FROZEN_RAW(tmp));
    }
    else if (FL_TEST_RAW(orig, STR_SHARED) &&
            !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
        VALUE shared = RSTRING(orig)->as.heap.aux.shared;

        if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
            assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
            assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);

            /* Unshare orig since the root (tmp) only has this one child. */
            FL_UNSET_RAW(orig, STR_SHARED);
            RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
            RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
            assert(OBJ_FROZEN_RAW(tmp));

            /* Make tmp embedded and empty so it is safe for sweeping. */
            STR_SET_EMBED(tmp);
            STR_SET_EMBED_LEN(tmp, 0);
        }
    }
}
1431
1432static VALUE
1433str_new_frozen(VALUE klass, VALUE orig)
1434{
1435 return str_new_frozen_buffer(klass, orig, TRUE);
1436}
1437
/*
 * Turns the heap string orig (neither embedded nor shared) into a child
 * of a newly allocated string of class klass.  The new string takes over
 * orig's pointer, length and capacity, and any STR_NOFREE bit moves from
 * orig to it.  orig is then marked as sharing the new string.
 * Returns the new string.
 */
static VALUE
heap_str_make_shared(VALUE klass, VALUE orig)
{
    assert(!STR_EMBED_P(orig));
    assert(!STR_SHARED_P(orig));

    VALUE str = str_alloc_heap(klass);
    RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
    RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
    RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
    /* Move the STR_NOFREE bit from orig to the new string. */
    RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
    RBASIC(orig)->flags &= ~STR_NOFREE;
    STR_SET_SHARED(orig, str);
    /* Classless (temporary) roots start out not borrowed. */
    if (klass == 0)
        FL_UNSET_RAW(str, STR_BORROWED);
    return str;
}
1455
/*
 * Produces a frozen string of class klass with orig's content.
 *
 *   - embedded or embeddable content is copied into a new string;
 *   - if orig already shares a root, either that root is returned as-is
 *     (when it covers exactly the same bytes and has the same class and
 *     encoding) or a new shared view on it is created;
 *   - otherwise orig's buffer is handed to a new root through
 *     heap_str_make_shared().
 *
 * When copy_encoding is false the terminator length is taken as 1 and the
 * encoding/coderange are not copied.
 */
static VALUE
str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
{
    VALUE str;

    long len = RSTRING_LEN(orig);
    int termlen = copy_encoding ? TERM_LEN(orig) : 1;

    if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
        str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
        assert(STR_EMBED_P(str));
    }
    else {
        if (FL_TEST_RAW(orig, STR_SHARED)) {
            VALUE shared = RSTRING(orig)->as.heap.aux.shared;
            /* ofs: bytes of the root before orig's view;
             * rest: bytes of the root after it. */
            long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
            long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
            assert(ofs >= 0);
            assert(rest >= 0);
            assert(ofs + rest <= RSTRING_LEN(shared));
#if !USE_RVARGC
            assert(!STR_EMBED_P(shared));
#endif
            assert(OBJ_FROZEN(shared));

            if ((ofs > 0) || (rest > 0) ||
                (klass != RBASIC(shared)->klass) ||
                ENCODING_GET(shared) != ENCODING_GET(orig)) {
                /* The root cannot stand in for orig: make a new view on
                 * it and narrow it to orig's range. */
                str = str_new_shared(klass, shared);
                assert(!STR_EMBED_P(str));
                RSTRING(str)->as.heap.ptr += ofs;
                RSTRING(str)->as.heap.len -= ofs + rest;
            }
            else {
                /* The root itself is returned; a classless root is marked
                 * borrowed.  This path skips the encoding copy and freeze
                 * at the bottom (the root is asserted frozen above). */
                if (RBASIC_CLASS(shared) == 0)
                    FL_SET_RAW(shared, STR_BORROWED);
                return shared;
            }
        }
        else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
            str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
            STR_SET_EMBED(str);
            memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
            STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
            TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
        }
        else {
            str = heap_str_make_shared(klass, orig);
        }
    }

    if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
    OBJ_FREEZE(str);
    return str;
}
1511
1512VALUE
1513rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1514{
1515 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1516}
1517
1518static VALUE
1519str_new_empty_String(VALUE str)
1520{
1521 VALUE v = rb_str_new(0, 0);
1522 rb_enc_copy(v, str);
1523 return v;
1524}
1525
/*
 * Lower bound applied to requested buffer capacities in this file.
 * In non-RVARGC builds it is checked at compile time to be larger than
 * RSTRING_EMBED_LEN_MAX.
 */
#define STR_BUF_MIN_SIZE 63
#if !USE_RVARGC
STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
#endif
1530
/*
 * Allocates an empty String with room for capa bytes.  The string is
 * embedded when capa plus a one-byte terminator is embeddable; otherwise
 * a heap buffer of capa + 1 bytes is allocated and its first byte set to
 * NUL.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_buf_new -- confirm.
 */
VALUE
{
    if (STR_EMBEDDABLE_P(capa, 1)) {
        return str_alloc_embed(rb_cString, capa + 1);
    }

    VALUE str = str_alloc_heap(rb_cString);

#if !USE_RVARGC
    /* Non-RVARGC builds never allocate less than STR_BUF_MIN_SIZE. */
    if (capa < STR_BUF_MIN_SIZE) {
        capa = STR_BUF_MIN_SIZE;
    }
#endif
    RSTRING(str)->as.heap.aux.capa = capa;
    RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
    RSTRING(str)->as.heap.ptr[0] = '\0';

    return str;
}
1551
/*
 * Creates a buffer string sized by strlen(ptr) with rb_str_buf_new() and
 * appends the C string ptr to it.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_buf_new_cstr -- confirm.
 */
VALUE
{
    VALUE str;
    long len = strlen(ptr);

    str = rb_str_buf_new(len);
    rb_str_buf_cat(str, ptr, len);

    return str;
}
1563
/*
 * Creates a string of length len through str_new() with class 0 and no
 * source pointer.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_tmp_new -- confirm.
 */
VALUE
{
    return str_new(0, 0, len);
}
1569
/*
 * Releases the resources held by str: removes it from the VM fstring
 * table if it is flagged RSTRING_FSTR, and frees its heap buffer when it
 * owns one (not embedded, not STR_SHARED, not STR_NOFREE).
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_free -- confirm.
 */
void
{
    if (FL_TEST(str, RSTRING_FSTR)) {
        st_data_t fstr = (st_data_t)str;

        /* The fstring table is only touched while holding the VM lock. */
        RB_VM_LOCK_ENTER();
        {
            st_delete(rb_vm_fstring_table(), &fstr, NULL);
            RB_DEBUG_COUNTER_INC(obj_str_fstr);
        }
        RB_VM_LOCK_LEAVE();
    }

    if (STR_EMBED_P(str)) {
        RB_DEBUG_COUNTER_INC(obj_str_embed);
    }
    else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
        /* Buffer belongs to someone else: only update debug counters. */
        /* NOTE(review): both lines bump obj_str_shared; the STR_NOFREE
         * case may have been meant to use a separate counter -- confirm. */
        (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
        (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
    }
    else {
        RB_DEBUG_COUNTER_INC(obj_str_ptr);
        ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
    }
}
1596
1597RUBY_FUNC_EXPORTED size_t
1598rb_str_memsize(VALUE str)
1599{
1600 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1601 return STR_HEAP_SIZE(str);
1602 }
1603 else {
1604 return 0;
1605 }
1606}
1607
/*
 * Converts str to a T_STRING object by calling its to_str method through
 * rb_convert_type_with_id().
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_to_str -- confirm.
 */
VALUE
{
    return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
}
1613
1614static inline void str_discard(VALUE str);
1615static void str_shared_replace(VALUE str, VALUE str2);
1616
/*
 * Public entry point for str_shared_replace(); a no-op when str and str2
 * are the same object.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_shared_replace -- confirm.
 */
void
{
    if (str != str2) str_shared_replace(str, str2);
}
1622
/*
 * Moves the content of str2 into str and leaves str2 as an empty embedded
 * string.  str's previous buffer is discarded first; encoding and
 * coderange are taken from str2.  str and str2 must be distinct objects.
 */
static void
str_shared_replace(VALUE str, VALUE str2)
{
    rb_encoding *enc;
    int cr;
    int termlen;

    RUBY_ASSERT(str2 != str);
    /* Capture str2's encoding info before anything is modified. */
    enc = STR_ENC_GET(str2);
    cr = ENC_CODERANGE(str2);
    str_discard(str);
    termlen = rb_enc_mbminlen(enc);

    if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
        /* Fits in str's embedded area: copy content plus terminator. */
        STR_SET_EMBED(str);
        memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
        STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
        rb_enc_associate(str, enc);
        ENC_CODERANGE_SET(str, cr);
    }
    else {
#if USE_RVARGC
        if (STR_EMBED_P(str2)) {
            /* str2 is embedded but too large for str's slot: move its
             * bytes to a heap buffer that str can take over below. */
            assert(!FL_TEST(str2, STR_SHARED));
            long len = RSTRING(str2)->as.embed.len;
            assert(len + termlen <= str_embed_capa(str2));

            char *new_ptr = ALLOC_N(char, len + termlen);
            memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
            RSTRING(str2)->as.heap.ptr = new_ptr;
            RSTRING(str2)->as.heap.len = len;
            RSTRING(str2)->as.heap.aux.capa = len;
            STR_SET_NOEMBED(str2);
        }
#endif

        STR_SET_NOEMBED(str);
        FL_UNSET(str, STR_SHARED);
        RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
        RSTRING(str)->as.heap.len = RSTRING_LEN(str2);

        if (FL_TEST(str2, STR_SHARED)) {
            /* str2 was a view on a root: str becomes a view on it too. */
            VALUE shared = RSTRING(str2)->as.heap.aux.shared;
            STR_SET_SHARED(str, shared);
        }
        else {
            RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
        }

        /* abandon str2 */
        STR_SET_EMBED(str2);
        RSTRING_PTR(str2)[0] = 0;
        STR_SET_EMBED_LEN(str2, 0);
        rb_enc_associate(str, enc);
        ENC_CODERANGE_SET(str, cr);
    }
}
1680
/*
 * Returns obj itself when it is already a T_STRING; otherwise calls
 * obj.to_s and passes the result through rb_obj_as_string_result().
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_obj_as_string -- confirm.
 */
VALUE
{
    VALUE str;

    if (RB_TYPE_P(obj, T_STRING)) {
        return obj;
    }
    str = rb_funcall(obj, idTo_s, 0);
    return rb_obj_as_string_result(str, obj);
}
1692
1693MJIT_FUNC_EXPORTED VALUE
1694rb_obj_as_string_result(VALUE str, VALUE obj)
1695{
1696 if (!RB_TYPE_P(str, T_STRING))
1697 return rb_any_to_s(obj);
1698 return str;
1699}
1700
1701static VALUE
1702str_replace(VALUE str, VALUE str2)
1703{
1704 long len;
1705
1706 len = RSTRING_LEN(str2);
1707 if (STR_SHARED_P(str2)) {
1708 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1709 assert(OBJ_FROZEN(shared));
1710 STR_SET_NOEMBED(str);
1711 RSTRING(str)->as.heap.len = len;
1712 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1713 STR_SET_SHARED(str, shared);
1714 rb_enc_cr_str_exact_copy(str, str2);
1715 }
1716 else {
1717 str_replace_shared(str, str2);
1718 }
1719
1720 return str;
1721}
1722
/*
 * Allocates an embedded string object of class klass on behalf of
 * execution context ec, sized by rb_str_embed_size(capa).
 * NOTE(review): the argument list of RB_RVARGC_EC_NEWOBJ_OF is cut off in
 * this view (its continuation line is not visible) -- confirm.
 */
static inline VALUE
ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
{
    size_t size = rb_str_embed_size(capa);
    assert(size > 0);
    assert(rb_gc_size_allocatable_p(size));
#if !USE_RVARGC
    assert(size <= sizeof(struct RString));
#endif

    RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,

    return (VALUE)str;
}
1738
/*
 * Allocates a non-embedded (STR_NOEMBED) string object of class klass on
 * behalf of execution context ec.  The object has the size of
 * struct RString; FL_WB_PROTECTED is set when RGENGC_WB_PROTECTED_STRING
 * is enabled.
 */
static inline VALUE
ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
{
    /* The macro declares the local `str` that is returned below. */
    RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
                           T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString));

    return (VALUE)str;
}
1747
/*
 * Fills the freshly allocated object dup so that it duplicates str.
 * Embedded content is copied.  Heap content is shared through a frozen
 * root, which is created from str when str is neither shared nor frozen.
 * The FL_FREEZE bit is never propagated to dup.  Returns dup.
 */
static inline VALUE
str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
{
    /* NOTE(review): one line of this mask expression is not visible in
     * this view (between #endif and FL_FREEZE) -- confirm. */
    const VALUE flag_mask =
#if !USE_RVARGC
        RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
#endif
        FL_FREEZE
        ;
    VALUE flags = FL_TEST_RAW(str, flag_mask);
    int encidx = 0;
    if (STR_EMBED_P(str)) {
        long len = RSTRING_EMBED_LEN(str);

        assert(STR_EMBED_P(dup));
        assert(str_embed_capa(dup) >= len + 1);
        STR_SET_EMBED_LEN(dup, len);
        MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
    }
    else {
        VALUE root = str;
        if (FL_TEST_RAW(str, STR_SHARED)) {
            root = RSTRING(str)->as.heap.aux.shared;
        }
        else if (UNLIKELY(!(flags & FL_FREEZE))) {
            /* Unfrozen owner: obtain a frozen string to act as the root
             * and re-read the flags from it. */
            root = str = str_new_frozen(klass, str);
            flags = FL_TEST_RAW(str, flag_mask);
        }
        assert(!STR_SHARED_P(root));
        assert(RB_OBJ_FROZEN_RAW(root));
        if (0) {}
#if !USE_RVARGC
        else if (STR_EMBED_P(root)) {
            MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
                   char, RSTRING_EMBED_LEN_MAX + 1);
            FL_UNSET(dup, STR_NOEMBED);
        }
#endif
        else {
            /* Point dup at str's bytes and register root as its owner. */
            RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
            RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
            FL_SET(root, STR_SHARED_ROOT);
            RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
            flags |= RSTRING_NOEMBED | STR_SHARED;
        }
    }

    /* An encoding index too large for the inline flag bits is looked up
     * from str and associated with dup explicitly. */
    if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
        encidx = rb_enc_get_index(str);
        flags &= ~ENCODING_MASK;
    }
    FL_SET_RAW(dup, flags & ~FL_FREEZE);
    if (encidx) rb_enc_associate_index(dup, encidx);
    return dup;
}
1804
1805static inline VALUE
1806ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1807{
1808 VALUE dup;
1809 if (FL_TEST(str, STR_NOEMBED)) {
1810 dup = ec_str_alloc_heap(ec, klass);
1811 }
1812 else {
1813 dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1814 }
1815
1816 return str_duplicate_setup(klass, str, dup);
1817}
1818
1819static inline VALUE
1820str_duplicate(VALUE klass, VALUE str)
1821{
1822 VALUE dup;
1823 if (FL_TEST(str, STR_NOEMBED)) {
1824 dup = str_alloc_heap(klass);
1825 }
1826 else {
1827 dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1828 }
1829
1830 return str_duplicate_setup(klass, str, dup);
1831}
1832
/*
 * Duplicates str, keeping its class.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_dup -- confirm.
 */
VALUE
{
    return str_duplicate(rb_obj_class(str), str);
}
1838
/*
 * Duplicates str as a plain String (rb_cString), firing the DTrace
 * string-creation hook with str's byte length first.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_resurrect -- confirm.
 */
VALUE
{
    RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
    return str_duplicate(rb_cString, str);
}
1845
1846VALUE
1847rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1848{
1849 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1850 return ec_str_duplicate(ec, rb_cString, str);
1851}
1852
1853/*
1854 *
1855 * call-seq:
1856 * String.new(string = '', **opts) -> new_string
1857 *
1858 * :include: doc/string/new.rdoc
1859 *
1860 */
1861
static VALUE
rb_str_init(int argc, VALUE *argv, VALUE str)
{
    /*
     * Accepts an optional source string plus the keywords `encoding` and
     * `capacity`.  With `capacity`, the receiver is always converted to a
     * heap string of at least STR_BUF_MIN_SIZE bytes; without it, the
     * receiver is simply replaced by the source string (if given).
     */
    static ID keyword_ids[2];
    VALUE orig, opt, venc, vcapa;
    VALUE kwargs[2];
    rb_encoding *enc = 0;
    int n;

    /* Resolve the keyword IDs once, on first call. */
    if (!keyword_ids[0]) {
        keyword_ids[0] = rb_id_encoding();
        CONST_ID(keyword_ids[1], "capacity");
    }

    n = rb_scan_args(argc, argv, "01:", &orig, &opt);
    if (!NIL_P(opt)) {
        rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
        venc = kwargs[0];
        vcapa = kwargs[1];
        if (!UNDEF_P(venc) && !NIL_P(venc)) {
            enc = rb_to_encoding(venc);
        }
        if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
            long capa = NUM2LONG(vcapa);
            long len = 0;
            int termlen = enc ? rb_enc_mbminlen(enc) : 1;

            if (capa < STR_BUF_MIN_SIZE) {
                capa = STR_BUF_MIN_SIZE;
            }
            if (n == 1) {
                StringValue(orig);
                len = RSTRING_LEN(orig);
                /* Capacity never ends up smaller than the source. */
                if (capa < len) {
                    capa = len;
                }
                /* Source is the receiver itself: skip the copy below. */
                if (orig == str) n = 0;
            }
            str_modifiable(str);
            if (STR_EMBED_P(str)) { /* make noembed always */
                char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
#if USE_RVARGC
                assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
                memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
#else
                memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
#endif
                RSTRING(str)->as.heap.ptr = new_ptr;
            }
            else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
                /* Buffer is not ours to resize: copy into a private one,
                 * truncating to the new size if needed. */
                const size_t size = (size_t)capa + termlen;
                const char *const old_ptr = RSTRING_PTR(str);
                const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
                char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
                memcpy(new_ptr, old_ptr, osize < size ? osize : size);
                FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
                RSTRING(str)->as.heap.ptr = new_ptr;
            }
            else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
                /* Owned heap buffer of a different size: resize in place. */
                SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
                        (size_t)capa + termlen, STR_HEAP_SIZE(str));
            }
            RSTRING(str)->as.heap.len = len;
            TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
            if (n == 1) {
                memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
                rb_enc_cr_str_exact_copy(str, orig);
            }
            FL_SET(str, STR_NOEMBED);
            RSTRING(str)->as.heap.aux.capa = capa;
        }
        else if (n == 1) {
            rb_str_replace(str, orig);
        }
        /* An explicit encoding: keyword is applied last. */
        if (enc) {
            rb_enc_associate(str, enc);
        }
    }
    else if (n == 1) {
        rb_str_replace(str, orig);
    }
    return str;
}
1946
#ifdef NONASCII_MASK
/* True when byte c is not a UTF-8 continuation byte (10xxxxxx). */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
 *
 * This function calculates whether a byte is leading or not for all bytes
 * in the argument word by concurrently using the above logic, and then
 * adds up the number of leading bytes in the word.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t d = *s;

    /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
    d = (d>>6) | (~d>>7);
    d &= NONASCII_MASK >> 7;

    /* Gather all bytes. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(d);
#else
    /* Fold the per-byte 0/1 flags together by repeated shift-and-add;
     * the total ends up in the low bits of d. */
    d += (d>>8);
    d += (d>>16);
# if SIZEOF_VOIDP == 8
    d += (d>>32);
# endif
    return (d&0xF);
#endif
}
#endif
1986
/*
 * Counts the characters in the byte range [p, e) under encoding enc,
 * using the coderange hint cr to choose a strategy:
 *
 *   - fixed-width encodings: byte length divided by the character width,
 *     with a trailing partial character counted as one;
 *   - valid UTF-8 (when NONASCII_MASK is defined): count lead bytes,
 *     a machine word at a time over the aligned middle part;
 *   - other ASCII-compatible encodings: skip ASCII runs with
 *     search_nonascii() and step over multibyte characters one by one;
 *   - anything else: step with rb_enc_mbclen().
 */
static inline long
enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
{
    long c;
    const char *q;

    if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        long diff = (long)(e - p);
        return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
    }
#ifdef NONASCII_MASK
    else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
        uintptr_t len = 0;
        /* Word-wise counting is used only for ranges longer than two
         * machine words. */
        if ((int)sizeof(uintptr_t) * 2 < e - p) {
            const uintptr_t *s, *t;
            const uintptr_t lowbits = sizeof(uintptr_t) - 1;
            /* s: p rounded up to word alignment; t: e rounded down. */
            s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
            t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
            /* Unaligned head, byte by byte. */
            while (p < (const char *)s) {
                if (is_utf8_lead_byte(*p)) len++;
                p++;
            }
            /* Aligned middle, word by word. */
            while (s < t) {
                len += count_utf8_lead_bytes_with_word(s);
                s++;
            }
            p = (const char *)s;
        }
        /* Remaining tail (or the whole range when it was short). */
        while (p < e) {
            if (is_utf8_lead_byte(*p)) len++;
            p++;
        }
        return (long)len;
    }
#endif
    else if (rb_enc_asciicompat(enc)) {
        c = 0;
        if (ENC_CODERANGE_CLEAN_P(cr)) {
            /* Coderange is clean: the fast length function is used. */
            while (p < e) {
                if (ISASCII(*p)) {
                    q = search_nonascii(p, e);
                    if (!q)
                        return c + (e - p);
                    c += q - p;
                    p = q;
                }
                p += rb_enc_fast_mbclen(p, e, enc);
                c++;
            }
        }
        else {
            while (p < e) {
                if (ISASCII(*p)) {
                    q = search_nonascii(p, e);
                    if (!q)
                        return c + (e - p);
                    c += q - p;
                    p = q;
                }
                p += rb_enc_mbclen(p, e, enc);
                c++;
            }
        }
        return c;
    }

    for (c=0; p<e; c++) {
        p += rb_enc_mbclen(p, e, enc);
    }
    return c;
}
2058
2059long
2060rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2061{
2062 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2063}
2064
/*
 * Counts the characters in [p, e) under enc and reports a coderange
 * through *cr.
 * The incoming value of *cr is ignored: it is reset to 0 on entry and
 * only written by this function.
 */
long
rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
{
    long c;
    const char *q;
    int ret;

    *cr = 0;
    if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
        /* Fixed-width encoding: a trailing partial character counts as
         * one.  *cr is left at 0 on this path. */
        long diff = (long)(e - p);
        return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
    }
    else if (rb_enc_asciicompat(enc)) {
        c = 0;
        while (p < e) {
            if (ISASCII(*p)) {
                /* Skip a whole run of ASCII bytes at once. */
                q = search_nonascii(p, e);
                if (!q) {
                    if (!*cr) *cr = ENC_CODERANGE_7BIT;
                    return c + (e - p);
                }
                c += q - p;
                p = q;
            }
            ret = rb_enc_precise_mbclen(p, e, enc);
            if (MBCLEN_CHARFOUND_P(ret)) {
                *cr |= ENC_CODERANGE_VALID;
                p += MBCLEN_CHARFOUND_LEN(ret);
            }
            else {
                /* No complete character here: advance by one byte. */
                p++;
            }
            c++;
        }
        if (!*cr) *cr = ENC_CODERANGE_7BIT;
        return c;
    }

    for (c=0; p<e; c++) {
        ret = rb_enc_precise_mbclen(p, e, enc);
        if (MBCLEN_CHARFOUND_P(ret)) {
            *cr |= ENC_CODERANGE_VALID;
            p += MBCLEN_CHARFOUND_LEN(ret);
        }
        else {
            /* Advance by the minimum character width, clamped to e. */
            if (p + rb_enc_mbminlen(enc) <= e)
                p += rb_enc_mbminlen(enc);
            else
                p = e;
        }
    }
    if (!*cr) *cr = ENC_CODERANGE_7BIT;
    return c;
}
2124
2125/* enc must be str's enc or rb_enc_check(str, str2) */
2126static long
2127str_strlen(VALUE str, rb_encoding *enc)
2128{
2129 const char *p, *e;
2130 int cr;
2131
2132 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2133 if (!enc) enc = STR_ENC_GET(str);
2134 p = RSTRING_PTR(str);
2135 e = RSTRING_END(str);
2136 cr = ENC_CODERANGE(str);
2137
2138 if (cr == ENC_CODERANGE_UNKNOWN) {
2139 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2140 if (cr) ENC_CODERANGE_SET(str, cr);
2141 return n;
2142 }
2143 else {
2144 return enc_strlen(p, e, enc, cr);
2145 }
2146}
2147
/*
 * Character length of str as a C long, using str's own encoding.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_strlen -- confirm.
 */
long
{
    return str_strlen(str, NULL);
}
2153
2154/*
2155 * call-seq:
2156 * length -> integer
2157 *
2158 * :include: doc/string/length.rdoc
2159 *
2160 */
2161
VALUE
{
    /* Character length of str (own encoding) boxed as a Ruby Integer.
     * NOTE(review): the declarator line (name/parameters) is absent from
     * this view; presumably rb_str_length -- confirm. */
    return LONG2NUM(str_strlen(str, NULL));
}
2167
2168/*
2169 * call-seq:
2170 * bytesize -> integer
2171 *
2172 * :include: doc/string/bytesize.rdoc
2173 *
2174 */
2175
2176static VALUE
2177rb_str_bytesize(VALUE str)
2178{
2179 return LONG2NUM(RSTRING_LEN(str));
2180}
2181
2182/*
2183 * call-seq:
2184 * empty? -> true or false
2185 *
2186 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2187 *
2188 * "hello".empty? # => false
2189 * " ".empty? # => false
2190 * "".empty? # => true
2191 *
2192 */
2193
2194static VALUE
2195rb_str_empty(VALUE str)
2196{
2197 return RBOOL(RSTRING_LEN(str) == 0);
2198}
2199
2200/*
2201 * call-seq:
2202 * string + other_string -> new_string
2203 *
2204 * Returns a new \String containing +other_string+ concatenated to +self+:
2205 *
2206 * "Hello from " + self.to_s # => "Hello from main"
2207 *
2208 */
2209
VALUE
{
    /*
     * Concatenates str1 and str2 into a new String.  str2 is converted
     * with StringValue(); the result encoding comes from
     * rb_enc_check_str().  Raises ArgumentError when the combined length
     * would overflow a long.
     * NOTE(review): the declarator line (name/parameters) is absent from
     * this view; presumably rb_str_plus -- confirm.
     */
    VALUE str3;
    rb_encoding *enc;
    char *ptr1, *ptr2, *ptr3;
    long len1, len2;
    int termlen;

    StringValue(str2);
    enc = rb_enc_check_str(str1, str2);
    RSTRING_GETMEM(str1, ptr1, len1);
    RSTRING_GETMEM(str2, ptr2, len2);
    termlen = rb_enc_mbminlen(enc);
    if (len1 > LONG_MAX - len2) {
        rb_raise(rb_eArgError, "string size too big");
    }
    str3 = str_new0(rb_cString, 0, len1+len2, termlen);
    ptr3 = RSTRING_PTR(str3);
    memcpy(ptr3, ptr1, len1);
    memcpy(ptr3+len1, ptr2, len2);
    TERM_FILL(&ptr3[len1+len2], termlen);

    /* NOTE(review): the remaining argument(s) of this call are not
     * visible in this view -- confirm. */
    ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
    /* Keep both operands alive while their raw pointers are in use. */
    RB_GC_GUARD(str1);
    RB_GC_GUARD(str2);
    return str3;
}
2239
2240/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2241MJIT_FUNC_EXPORTED VALUE
2242rb_str_opt_plus(VALUE str1, VALUE str2)
2243{
2244 assert(RBASIC_CLASS(str1) == rb_cString);
2245 assert(RBASIC_CLASS(str2) == rb_cString);
2246 long len1, len2;
2247 MAYBE_UNUSED(char) *ptr1, *ptr2;
2248 RSTRING_GETMEM(str1, ptr1, len1);
2249 RSTRING_GETMEM(str2, ptr2, len2);
2250 int enc1 = rb_enc_get_index(str1);
2251 int enc2 = rb_enc_get_index(str2);
2252
2253 if (enc1 < 0) {
2254 return Qundef;
2255 }
2256 else if (enc2 < 0) {
2257 return Qundef;
2258 }
2259 else if (enc1 != enc2) {
2260 return Qundef;
2261 }
2262 else if (len1 > LONG_MAX - len2) {
2263 return Qundef;
2264 }
2265 else {
2266 return rb_str_plus(str1, str2);
2267 }
2268
2269}
2270
2271/*
2272 * call-seq:
2273 * string * integer -> new_string
2274 *
2275 * Returns a new \String containing +integer+ copies of +self+:
2276 *
2277 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2278 * "Ho! " * 0 # => ""
2279 *
2280 */
2281
VALUE
{
    /*
     * Builds a new String made of `times` copies of str.
     * NOTE(review): the declarator line (name/parameters) is absent from
     * this view; presumably rb_str_times -- confirm.
     */
    VALUE str2;
    long n, len;
    char *ptr2;
    int termlen;

    /* Fast paths for the Fixnums 1 and 0. */
    if (times == INT2FIX(1)) {
        return str_duplicate(rb_cString, str);
    }
    if (times == INT2FIX(0)) {
        str2 = str_alloc_embed(rb_cString, 0);
        rb_enc_copy(str2, str);
        return str2;
    }
    len = NUM2LONG(times);
    if (len < 0) {
        rb_raise(rb_eArgError, "negative argument");
    }
    /* Repeating a single NUL byte: a zero-filled buffer is enough. */
    if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
        if (STR_EMBEDDABLE_P(len, 1)) {
            str2 = str_alloc_embed(rb_cString, len + 1);
            memset(RSTRING_PTR(str2), 0, len + 1);
        }
        else {
            str2 = str_alloc_heap(rb_cString);
            RSTRING(str2)->as.heap.aux.capa = len;
            RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
        }
        STR_SET_LEN(str2, len);
        rb_enc_copy(str2, str);
        return str2;
    }
    /* Reject products that would overflow a long. */
    if (len && LONG_MAX/len < RSTRING_LEN(str)) {
        rb_raise(rb_eArgError, "argument too big");
    }

    /* From here on len is the byte length of the result. */
    len *= RSTRING_LEN(str);
    termlen = TERM_LEN(str);
    str2 = str_new0(rb_cString, 0, len, termlen);
    ptr2 = RSTRING_PTR(str2);
    if (len) {
        /* Copy one instance, then double the filled prefix while it fits
         * in half the result, and finish with one copy of the rest. */
        n = RSTRING_LEN(str);
        memcpy(ptr2, RSTRING_PTR(str), n);
        while (n <= len/2) {
            memcpy(ptr2 + n, ptr2, n);
            n *= 2;
        }
        memcpy(ptr2 + n, ptr2, len-n);
    }
    STR_SET_LEN(str2, len);
    TERM_FILL(&ptr2[len], termlen);
    rb_enc_cr_str_copy_for_substr(str2, str);

    return str2;
}
2339
2340/*
2341 * call-seq:
2342 * string % object -> new_string
2343 *
2344 * Returns the result of formatting +object+ into the format specification +self+
2345 * (see Kernel#sprintf for formatting details):
2346 *
2347 * "%05d" % 123 # => "00123"
2348 *
2349 * If +self+ contains multiple substitutions, +object+ must be
2350 * an \Array or \Hash containing the values to be substituted:
2351 *
2352 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2353 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2354 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2355 *
2356 */
2357
2358static VALUE
2359rb_str_format_m(VALUE str, VALUE arg)
2360{
2361 VALUE tmp = rb_check_array_type(arg);
2362
2363 if (!NIL_P(tmp)) {
2364 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2365 }
2366 return rb_str_format(1, &arg, str);
2367}
2368
2369static inline void
2370rb_check_lockedtmp(VALUE str)
2371{
2372 if (FL_TEST(str, STR_TMPLOCK)) {
2373 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2374 }
2375}
2376
/*
 * Ensures str may be modified: raises if it is temporarily locked, then
 * if it is frozen.  The lock check runs first.
 */
static inline void
str_modifiable(VALUE str)
{
    rb_check_lockedtmp(str);
    rb_check_frozen(str);
}
2383
2384static inline int
2385str_dependent_p(VALUE str)
2386{
2387 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2388 return 0;
2389 }
2390 else {
2391 return 1;
2392 }
2393}
2394
2395static inline int
2396str_independent(VALUE str)
2397{
2398 str_modifiable(str);
2399 return !str_dependent_p(str);
2400}
2401
/*
 * Gives str a private buffer with room for len + expand bytes plus a
 * termlen-byte terminator, keeping the first len bytes of its content.
 * A non-embedded string whose new size fits in its embedded area is
 * converted to embedded form instead.
 */
static void
str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
{
    char *ptr;
    char *oldptr;
    long capa = len + expand;

    /* Only possible with a negative expand: truncate to the capacity. */
    if (len > capa) len = capa;

    if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
        /* Read the heap pointer before switching to embedded layout. */
        ptr = RSTRING(str)->as.heap.ptr;
        STR_SET_EMBED(str);
        memcpy(RSTRING(str)->as.embed.ary, ptr, len);
        TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
        STR_SET_EMBED_LEN(str, len);
        return;
    }

    ptr = ALLOC_N(char, (size_t)capa + termlen);
    oldptr = RSTRING_PTR(str);
    if (oldptr) {
        memcpy(ptr, oldptr, len);
    }
    /* Free the old buffer only if str owned it (heap, not shared, not
     * STR_NOFREE). */
    if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
        xfree(oldptr);
    }
    STR_SET_NOEMBED(str);
    FL_UNSET(str, STR_SHARED|STR_NOFREE);
    TERM_FILL(ptr + len, termlen);
    RSTRING(str)->as.heap.ptr = ptr;
    RSTRING(str)->as.heap.len = len;
    RSTRING(str)->as.heap.aux.capa = capa;
}
2435
/*
 * Prepares str for in-place modification: checks that it is modifiable
 * and, if it depends on a buffer it does not own, makes it independent.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_modify -- confirm.
 */
void
{
    if (!str_independent(str))
        str_make_independent(str);
}
2443
/*
 * Prepares str for modification and makes room for `expand` additional
 * bytes.  Raises ArgumentError for a negative expand or when
 * expand >= LONG_MAX - len.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_str_modify_expand -- confirm.
 */
void
{
    int termlen = TERM_LEN(str);
    long len = RSTRING_LEN(str);

    if (expand < 0) {
        rb_raise(rb_eArgError, "negative expanding string size");
    }
    if (expand >= LONG_MAX - len) {
        rb_raise(rb_eArgError, "string size too big");
    }

    if (!str_independent(str)) {
        /* Dependent string: copy into a private, already enlarged buffer. */
        str_make_independent_expand(str, len, expand, termlen);
    }
    else if (expand > 0) {
        RESIZE_CAPA_TERM(str, len + expand, termlen);
    }
}
2465
/* As rb_str_modify(), but don't clear coderange */
/* Checks that str is modifiable and makes it independent when needed. */
static void
str_modify_keep_cr(VALUE str)
{
    if (!str_independent(str))
        str_make_independent(str);
        /* Force re-scan later */
}
2476
2477static inline void
2478str_discard(VALUE str)
2479{
2480 str_modifiable(str);
2481 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2482 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2483 RSTRING(str)->as.heap.ptr = 0;
2484 RSTRING(str)->as.heap.len = 0;
2485 }
2486}
2487
/*
 * Raises TypeError when str has no encoding, and EncCompatError when its
 * encoding is not ASCII-compatible.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_must_asciicompat -- confirm.
 */
void
{
    rb_encoding *enc = rb_enc_get(str);
    if (!enc) {
        rb_raise(rb_eTypeError, "not encoding capable object");
    }
    if (!rb_enc_asciicompat(enc)) {
        rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
    }
}
2499
/*
 * Ensures *ptr refers to a T_STRING.  A non-string value is converted
 * with rb_str_to_str() and the result is stored back through ptr.
 * Returns the string.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_string_value -- confirm.
 */
VALUE
{
    VALUE s = *ptr;
    if (!RB_TYPE_P(s, T_STRING)) {
        s = rb_str_to_str(s);
        *ptr = s;
    }
    return s;
}
2510
/*
 * Converts *ptr to a String via rb_string_value() and returns the
 * string's data pointer.
 * NOTE(review): the declarator line (name/parameters) is absent from this
 * view; presumably rb_string_value_ptr -- confirm.
 */
char *
{
    VALUE str = rb_string_value(ptr);
    return RSTRING_PTR(str);
}
2517
/*
 * Returns 1 when the first n bytes at s are all zero, 0 as soon as a
 * non-zero byte is seen.  n <= 0 yields 1 without reading s.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2526
2527static const char *
2528str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2529{
2530 const char *e = s + len;
2531
2532 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2533 if (zero_filled(s, minlen)) return s;
2534 }
2535 return 0;
2536}
2537
2538static char *
2539str_fill_term(VALUE str, char *s, long len, int termlen)
2540{
2541 /* This function assumes that (capa + termlen) bytes of memory
2542 * is allocated, like many other functions in this file.
2543 */
2544 if (str_dependent_p(str)) {
2545 if (!zero_filled(s + len, termlen))
2546 str_make_independent_expand(str, len, 0L, termlen);
2547 }
2548 else {
2549 TERM_FILL(s + len, termlen);
2550 return s;
2551 }
2552 return RSTRING_PTR(str);
2553}
2554
2555void
2556rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2557{
2558 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2559 long len = RSTRING_LEN(str);
2560
2561 assert(capa >= len);
2562 if (capa - len < termlen) {
2563 rb_check_lockedtmp(str);
2564 str_make_independent_expand(str, len, 0L, termlen);
2565 }
2566 else if (str_dependent_p(str)) {
2567 if (termlen > oldtermlen)
2568 str_make_independent_expand(str, len, 0L, termlen);
2569 }
2570 else {
2571 if (!STR_EMBED_P(str)) {
2572 /* modify capa instead of realloc */
2573 assert(!FL_TEST((str), STR_SHARED));
2574 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2575 }
2576 if (termlen > oldtermlen) {
2577 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2578 }
2579 }
2580
2581 return;
2582}
2583
2584static char *
2585str_null_check(VALUE str, int *w)
2586{
2587 char *s = RSTRING_PTR(str);
2588 long len = RSTRING_LEN(str);
2589 rb_encoding *enc = rb_enc_get(str);
2590 const int minlen = rb_enc_mbminlen(enc);
2591
2592 if (minlen > 1) {
2593 *w = 1;
2594 if (str_null_char(s, len, minlen, enc)) {
2595 return NULL;
2596 }
2597 return str_fill_term(str, s, len, minlen);
2598 }
2599 *w = 0;
2600 if (!s || memchr(s, 0, len)) {
2601 return NULL;
2602 }
2603 if (s[len]) {
2604 s = str_fill_term(str, s, len, minlen);
2605 }
2606 return s;
2607}
2608
2609char *
2610rb_str_to_cstr(VALUE str)
2611{
2612 int w;
2613 return str_null_check(str, &w);
2614}
2615
2616char *
2618{
2619 VALUE str = rb_string_value(ptr);
2620 int w;
2621 char *s = str_null_check(str, &w);
2622 if (!s) {
2623 if (w) {
2624 rb_raise(rb_eArgError, "string contains null char");
2625 }
2626 rb_raise(rb_eArgError, "string contains null byte");
2627 }
2628 return s;
2629}
2630
2631char *
2632rb_str_fill_terminator(VALUE str, const int newminlen)
2633{
2634 char *s = RSTRING_PTR(str);
2635 long len = RSTRING_LEN(str);
2636 return str_fill_term(str, s, len, newminlen);
2637}
2638
2639VALUE
2641{
2642 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2643 return str;
2644}
2645
2646/*
2647 * call-seq:
2648 * String.try_convert(object) -> object, new_string, or nil
2649 *
2650 * If +object+ is a \String object, returns +object+.
2651 *
2652 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2653 * calls <tt>object.to_str</tt> and returns the result.
2654 *
2655 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2656 *
2657 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2658 */
2659static VALUE
2660rb_str_s_try_convert(VALUE dummy, VALUE str)
2661{
2662 return rb_check_string_type(str);
2663}
2664
2665static char*
2666str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2667{
2668 long nth = *nthp;
2669 if (rb_enc_mbmaxlen(enc) == 1) {
2670 p += nth;
2671 }
2672 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2673 p += nth * rb_enc_mbmaxlen(enc);
2674 }
2675 else if (rb_enc_asciicompat(enc)) {
2676 const char *p2, *e2;
2677 int n;
2678
2679 while (p < e && 0 < nth) {
2680 e2 = p + nth;
2681 if (e < e2) {
2682 *nthp = nth;
2683 return (char *)e;
2684 }
2685 if (ISASCII(*p)) {
2686 p2 = search_nonascii(p, e2);
2687 if (!p2) {
2688 nth -= e2 - p;
2689 *nthp = nth;
2690 return (char *)e2;
2691 }
2692 nth -= p2 - p;
2693 p = p2;
2694 }
2695 n = rb_enc_mbclen(p, e, enc);
2696 p += n;
2697 nth--;
2698 }
2699 *nthp = nth;
2700 if (nth != 0) {
2701 return (char *)e;
2702 }
2703 return (char *)p;
2704 }
2705 else {
2706 while (p < e && nth--) {
2707 p += rb_enc_mbclen(p, e, enc);
2708 }
2709 }
2710 if (p > e) p = e;
2711 *nthp = nth;
2712 return (char*)p;
2713}
2714
2715char*
2716rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2717{
2718 return str_nth_len(p, e, &nth, enc);
2719}
2720
2721static char*
2722str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2723{
2724 if (singlebyte)
2725 p += nth;
2726 else {
2727 p = str_nth_len(p, e, &nth, enc);
2728 }
2729 if (!p) return 0;
2730 if (p > e) p = e;
2731 return (char *)p;
2732}
2733
2734/* char offset to byte offset */
2735static long
2736str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2737{
2738 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2739 if (!pp) return e - p;
2740 return pp - p;
2741}
2742
2743long
2744rb_str_offset(VALUE str, long pos)
2745{
2746 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2747 STR_ENC_GET(str), single_byte_optimizable(str));
2748}
2749
2750#ifdef NONASCII_MASK
2751static char *
2752str_utf8_nth(const char *p, const char *e, long *nthp)
2753{
2754 long nth = *nthp;
2755 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2756 const uintptr_t *s, *t;
2757 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2758 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2759 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2760 while (p < (const char *)s) {
2761 if (is_utf8_lead_byte(*p)) nth--;
2762 p++;
2763 }
2764 do {
2765 nth -= count_utf8_lead_bytes_with_word(s);
2766 s++;
2767 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2768 p = (char *)s;
2769 }
2770 while (p < e) {
2771 if (is_utf8_lead_byte(*p)) {
2772 if (nth == 0) break;
2773 nth--;
2774 }
2775 p++;
2776 }
2777 *nthp = nth;
2778 return (char *)p;
2779}
2780
/*
 * Byte distance from +p+ to the position str_utf8_nth() reports for the
 * +nth+ character.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *target = str_utf8_nth(p, e, &nth);

    return target - p;
}
2787#endif
2788
2789/* byte offset to char offset */
2790long
2791rb_str_sublen(VALUE str, long pos)
2792{
2793 if (single_byte_optimizable(str) || pos < 0)
2794 return pos;
2795 else {
2796 char *p = RSTRING_PTR(str);
2797 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2798 }
2799}
2800
2801static VALUE
2802str_subseq(VALUE str, long beg, long len)
2803{
2804 VALUE str2;
2805
2806 const long rstring_embed_capa_max = ((sizeof(struct RString) - offsetof(struct RString, as.embed.ary)) / sizeof(char)) - 1;
2807
2808 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str)) ||
2809 len <= rstring_embed_capa_max) {
2810 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
2811 RB_GC_GUARD(str);
2812 }
2813 else {
2814 str2 = str_new_shared(rb_cString, str);
2815 ENC_CODERANGE_CLEAR(str2);
2816 RSTRING(str2)->as.heap.ptr += beg;
2817 if (RSTRING(str2)->as.heap.len > len) {
2818 RSTRING(str2)->as.heap.len = len;
2819 }
2820 }
2821
2822 return str2;
2823}
2824
2825VALUE
2826rb_str_subseq(VALUE str, long beg, long len)
2827{
2828 VALUE str2 = str_subseq(str, beg, len);
2829 rb_enc_cr_str_copy_for_substr(str2, str);
2830 return str2;
2831}
2832
2833char *
2834rb_str_subpos(VALUE str, long beg, long *lenp)
2835{
2836 long len = *lenp;
2837 long slen = -1L;
2838 long blen = RSTRING_LEN(str);
2839 rb_encoding *enc = STR_ENC_GET(str);
2840 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2841
2842 if (len < 0) return 0;
2843 if (!blen) {
2844 len = 0;
2845 }
2846 if (single_byte_optimizable(str)) {
2847 if (beg > blen) return 0;
2848 if (beg < 0) {
2849 beg += blen;
2850 if (beg < 0) return 0;
2851 }
2852 if (len > blen - beg)
2853 len = blen - beg;
2854 if (len < 0) return 0;
2855 p = s + beg;
2856 goto end;
2857 }
2858 if (beg < 0) {
2859 if (len > -beg) len = -beg;
2860 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2861 beg = -beg;
2862 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2863 p = e;
2864 if (!p) return 0;
2865 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2866 if (!p) return 0;
2867 len = e - p;
2868 goto end;
2869 }
2870 else {
2871 slen = str_strlen(str, enc);
2872 beg += slen;
2873 if (beg < 0) return 0;
2874 p = s + beg;
2875 if (len == 0) goto end;
2876 }
2877 }
2878 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2879 return 0;
2880 }
2881 if (len == 0) {
2882 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2883 p = s + beg;
2884 }
2885#ifdef NONASCII_MASK
2886 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2887 enc == rb_utf8_encoding()) {
2888 p = str_utf8_nth(s, e, &beg);
2889 if (beg > 0) return 0;
2890 len = str_utf8_offset(p, e, len);
2891 }
2892#endif
2893 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2894 int char_sz = rb_enc_mbmaxlen(enc);
2895
2896 p = s + beg * char_sz;
2897 if (p > e) {
2898 return 0;
2899 }
2900 else if (len * char_sz > e - p)
2901 len = e - p;
2902 else
2903 len *= char_sz;
2904 }
2905 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2906 if (beg > 0) return 0;
2907 len = 0;
2908 }
2909 else {
2910 len = str_offset(p, e, len, enc, 0);
2911 }
2912 end:
2913 *lenp = len;
2914 RB_GC_GUARD(str);
2915 return p;
2916}
2917
2918static VALUE str_substr(VALUE str, long beg, long len, int empty);
2919
2920VALUE
2921rb_str_substr(VALUE str, long beg, long len)
2922{
2923 return str_substr(str, beg, len, TRUE);
2924}
2925
2926static VALUE
2927str_substr(VALUE str, long beg, long len, int empty)
2928{
2929 char *p = rb_str_subpos(str, beg, &len);
2930
2931 if (!p) return Qnil;
2932 if (!len && !empty) return Qnil;
2933
2934 beg = p - RSTRING_PTR(str);
2935
2936 VALUE str2 = str_subseq(str, beg, len);
2937 rb_enc_cr_str_copy_for_substr(str2, str);
2938 return str2;
2939}
2940
2941VALUE
2943{
2944 if (OBJ_FROZEN(str)) return str;
2945 rb_str_resize(str, RSTRING_LEN(str));
2946 return rb_obj_freeze(str);
2947}
2948
2949
2950/*
2951 * call-seq:
2952 * +string -> new_string or self
2953 *
2954 * Returns +self+ if +self+ is not frozen.
2955 *
2956 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
2957 */
2958static VALUE
2959str_uplus(VALUE str)
2960{
2961 if (OBJ_FROZEN(str)) {
2962 return rb_str_dup(str);
2963 }
2964 else {
2965 return str;
2966 }
2967}
2968
2969/*
2970 * call-seq:
2971 * -string -> frozen_string
2972 *
2973 * Returns a frozen, possibly pre-existing copy of the string.
2974 *
2975 * The returned \String will be deduplicated as long as it does not have
2976 * any instance variables set on it and is not a String subclass.
2977 *
2978 * String#dedup is an alias for String#-@.
2979 */
2980static VALUE
2981str_uminus(VALUE str)
2982{
2983 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
2984 str = rb_str_dup(str);
2985 }
2986 return rb_fstring(str);
2987}
2988
2989RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
2990#define rb_str_dup_frozen rb_str_new_frozen
2991
2992VALUE
2994{
2995 if (FL_TEST(str, STR_TMPLOCK)) {
2996 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
2997 }
2998 FL_SET(str, STR_TMPLOCK);
2999 return str;
3000}
3001
3002VALUE
3004{
3005 if (!FL_TEST(str, STR_TMPLOCK)) {
3006 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3007 }
3008 FL_UNSET(str, STR_TMPLOCK);
3009 return str;
3010}
3011
3012RUBY_FUNC_EXPORTED VALUE
3013rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3014{
3015 rb_str_locktmp(str);
3016 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3017}
3018
3019void
3021{
3022 long capa;
3023 const int termlen = TERM_LEN(str);
3024
3025 str_modifiable(str);
3026 if (STR_SHARED_P(str)) {
3027 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3028 }
3029 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3030 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3031 }
3032
3033 int cr = ENC_CODERANGE(str);
3034 if (cr == ENC_CODERANGE_UNKNOWN) {
3035 /* Leave unknown. */
3036 }
3037 else if (len > RSTRING_LEN(str)) {
3038 if (ENC_CODERANGE_CLEAN_P(cr)) {
3039 /* Update the coderange regarding the extended part. */
3040 const char *const prev_end = RSTRING_END(str);
3041 const char *const new_end = RSTRING_PTR(str) + len;
3042 rb_encoding *enc = rb_enc_get(str);
3043 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3044 ENC_CODERANGE_SET(str, cr);
3045 }
3046 else if (cr == ENC_CODERANGE_BROKEN) {
3047 /* May be valid now, by appended part. */
3049 }
3050 }
3051 else if (len < RSTRING_LEN(str)) {
3052 if (cr != ENC_CODERANGE_7BIT) {
3053 /* ASCII-only string is keeping after truncated. Valid
3054 * and broken may be invalid or valid, leave unknown. */
3056 }
3057 }
3058
3059 STR_SET_LEN(str, len);
3060 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3061}
3062
3063VALUE
3065{
3066 if (len < 0) {
3067 rb_raise(rb_eArgError, "negative string size (or size too big)");
3068 }
3069
3070 int independent = str_independent(str);
3071 long slen = RSTRING_LEN(str);
3072
3073 if (slen > len && ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3075 }
3076
3077 {
3078 long capa;
3079 const int termlen = TERM_LEN(str);
3080 if (STR_EMBED_P(str)) {
3081 if (len == slen) return str;
3082 if (str_embed_capa(str) >= len + termlen) {
3083 STR_SET_EMBED_LEN(str, len);
3084 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3085 return str;
3086 }
3087 str_make_independent_expand(str, slen, len - slen, termlen);
3088 }
3089 else if (str_embed_capa(str) >= len + termlen) {
3090 char *ptr = STR_HEAP_PTR(str);
3091 STR_SET_EMBED(str);
3092 if (slen > len) slen = len;
3093 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3094 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3095 STR_SET_EMBED_LEN(str, len);
3096 if (independent) ruby_xfree(ptr);
3097 return str;
3098 }
3099 else if (!independent) {
3100 if (len == slen) return str;
3101 str_make_independent_expand(str, slen, len - slen, termlen);
3102 }
3103 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3104 (capa - len) > (len < 1024 ? len : 1024)) {
3105 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3106 (size_t)len + termlen, STR_HEAP_SIZE(str));
3107 RSTRING(str)->as.heap.aux.capa = len;
3108 }
3109 else if (len == slen) return str;
3110 RSTRING(str)->as.heap.len = len;
3111 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3112 }
3113 return str;
3114}
3115
3116static VALUE
3117str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3118{
3119 if (keep_cr) {
3120 str_modify_keep_cr(str);
3121 }
3122 else {
3123 rb_str_modify(str);
3124 }
3125 if (len == 0) return 0;
3126
3127 long capa, total, olen, off = -1;
3128 char *sptr;
3129 const int termlen = TERM_LEN(str);
3130#if !USE_RVARGC
3131 assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3132#endif
3133
3134 RSTRING_GETMEM(str, sptr, olen);
3135 if (ptr >= sptr && ptr <= sptr + olen) {
3136 off = ptr - sptr;
3137 }
3138
3139 if (STR_EMBED_P(str)) {
3140 capa = str_embed_capa(str) - termlen;
3141 sptr = RSTRING(str)->as.embed.ary;
3142 olen = RSTRING_EMBED_LEN(str);
3143 }
3144 else {
3145 capa = RSTRING(str)->as.heap.aux.capa;
3146 sptr = RSTRING(str)->as.heap.ptr;
3147 olen = RSTRING(str)->as.heap.len;
3148 }
3149 if (olen > LONG_MAX - len) {
3150 rb_raise(rb_eArgError, "string sizes too big");
3151 }
3152 total = olen + len;
3153 if (capa < total) {
3154 if (total >= LONG_MAX / 2) {
3155 capa = total;
3156 }
3157 while (total > capa) {
3158 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3159 }
3160 RESIZE_CAPA_TERM(str, capa, termlen);
3161 sptr = RSTRING_PTR(str);
3162 }
3163 if (off != -1) {
3164 ptr = sptr + off;
3165 }
3166 memcpy(sptr + olen, ptr, len);
3167 STR_SET_LEN(str, total);
3168 TERM_FILL(sptr + total, termlen); /* sentinel */
3169
3170 return str;
3171}
3172
/* File-local shorthands for str_buf_cat4() with keep_cr == false;
 * str_buf_cat2() takes a string literal and derives the length from it. */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3175
3176VALUE
3177rb_str_cat(VALUE str, const char *ptr, long len)
3178{
3179 if (len == 0) return str;
3180 if (len < 0) {
3181 rb_raise(rb_eArgError, "negative string size (or size too big)");
3182 }
3183 return str_buf_cat(str, ptr, len);
3184}
3185
3186VALUE
3187rb_str_cat_cstr(VALUE str, const char *ptr)
3188{
3189 must_not_null(ptr);
3190 return rb_str_buf_cat(str, ptr, strlen(ptr));
3191}
3192
3193RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3194RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3195RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3196
3197static VALUE
3198rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3199 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3200{
3201 int str_encindex = ENCODING_GET(str);
3202 int res_encindex;
3203 int str_cr, res_cr;
3204 rb_encoding *str_enc, *ptr_enc;
3205
3206 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3207
3208 if (str_encindex == ptr_encindex) {
3209 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3210 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3211 }
3212 }
3213 else {
3214 str_enc = rb_enc_from_index(str_encindex);
3215 ptr_enc = rb_enc_from_index(ptr_encindex);
3216 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3217 if (len == 0)
3218 return str;
3219 if (RSTRING_LEN(str) == 0) {
3220 rb_str_buf_cat(str, ptr, len);
3221 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3222 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3223 return str;
3224 }
3225 goto incompatible;
3226 }
3227 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3228 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3229 }
3230 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3231 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3232 str_cr = rb_enc_str_coderange(str);
3233 }
3234 }
3235 }
3236 if (ptr_cr_ret)
3237 *ptr_cr_ret = ptr_cr;
3238
3239 if (str_encindex != ptr_encindex &&
3240 str_cr != ENC_CODERANGE_7BIT &&
3241 ptr_cr != ENC_CODERANGE_7BIT) {
3242 str_enc = rb_enc_from_index(str_encindex);
3243 ptr_enc = rb_enc_from_index(ptr_encindex);
3244 goto incompatible;
3245 }
3246
3247 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3248 res_encindex = str_encindex;
3249 res_cr = ENC_CODERANGE_UNKNOWN;
3250 }
3251 else if (str_cr == ENC_CODERANGE_7BIT) {
3252 if (ptr_cr == ENC_CODERANGE_7BIT) {
3253 res_encindex = str_encindex;
3254 res_cr = ENC_CODERANGE_7BIT;
3255 }
3256 else {
3257 res_encindex = ptr_encindex;
3258 res_cr = ptr_cr;
3259 }
3260 }
3261 else if (str_cr == ENC_CODERANGE_VALID) {
3262 res_encindex = str_encindex;
3263 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3264 res_cr = str_cr;
3265 else
3266 res_cr = ptr_cr;
3267 }
3268 else { /* str_cr == ENC_CODERANGE_BROKEN */
3269 res_encindex = str_encindex;
3270 res_cr = str_cr;
3271 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3272 }
3273
3274 if (len < 0) {
3275 rb_raise(rb_eArgError, "negative string size (or size too big)");
3276 }
3277 str_buf_cat(str, ptr, len);
3278 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3279 return str;
3280
3281 incompatible:
3282 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3283 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3285}
3286
3287VALUE
3288rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3289{
3290 return rb_enc_cr_str_buf_cat(str, ptr, len,
3291 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3292}
3293
3294VALUE
3296{
3297 /* ptr must reference NUL terminated ASCII string. */
3298 int encindex = ENCODING_GET(str);
3299 rb_encoding *enc = rb_enc_from_index(encindex);
3300 if (rb_enc_asciicompat(enc)) {
3301 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3302 encindex, ENC_CODERANGE_7BIT, 0);
3303 }
3304 else {
3305 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3306 while (*ptr) {
3307 unsigned int c = (unsigned char)*ptr;
3308 int len = rb_enc_codelen(c, enc);
3309 rb_enc_mbcput(c, buf, enc);
3310 rb_enc_cr_str_buf_cat(str, buf, len,
3311 encindex, ENC_CODERANGE_VALID, 0);
3312 ptr++;
3313 }
3314 return str;
3315 }
3316}
3317
3318VALUE
3320{
3321 int str2_cr = rb_enc_str_coderange(str2);
3322
3323 if (str_enc_fastpath(str)) {
3324 switch (str2_cr) {
3325 case ENC_CODERANGE_7BIT:
3326 // If RHS is 7bit we can do simple concatenation
3327 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3328 RB_GC_GUARD(str2);
3329 return str;
3331 // If RHS is valid, we can do simple concatenation if encodings are the same
3332 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3333 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3334 int str_cr = ENC_CODERANGE(str);
3335 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3336 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3337 }
3338 RB_GC_GUARD(str2);
3339 return str;
3340 }
3341 }
3342 }
3343
3344 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3345 ENCODING_GET(str2), str2_cr, &str2_cr);
3346
3347 ENC_CODERANGE_SET(str2, str2_cr);
3348
3349 return str;
3350}
3351
3352VALUE
3354{
3355 StringValue(str2);
3356 return rb_str_buf_append(str, str2);
3357}
3358
3359#define MIN_PRE_ALLOC_SIZE 48
3360
3361MJIT_FUNC_EXPORTED VALUE
3362rb_str_concat_literals(size_t num, const VALUE *strary)
3363{
3364 VALUE str;
3365 size_t i, s;
3366 long len = 1;
3367
3368 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3369 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3370
3371 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3372 if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3373 str = rb_str_resurrect(strary[0]);
3374 s = 1;
3375 }
3376 else {
3377 str = rb_str_buf_new(len);
3378 rb_enc_copy(str, strary[0]);
3379 s = 0;
3380 }
3381
3382 for (i = s; i < num; ++i) {
3383 const VALUE v = strary[i];
3384 int encidx = ENCODING_GET(v);
3385
3386 rb_str_buf_append(str, v);
3387 if (encidx != ENCINDEX_US_ASCII) {
3388 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3389 rb_enc_set_index(str, encidx);
3390 }
3391 }
3392 return str;
3393}
3394
3395/*
3396 * call-seq:
3397 * concat(*objects) -> string
3398 *
3399 * Concatenates each object in +objects+ to +self+ and returns +self+:
3400 *
3401 * s = 'foo'
3402 * s.concat('bar', 'baz') # => "foobarbaz"
3403 * s # => "foobarbaz"
3404 *
3405 * For each given object +object+ that is an \Integer,
3406 * the value is considered a codepoint and converted to a character before concatenation:
3407 *
3408 * s = 'foo'
3409 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3410 *
3411 * Related: String#<<, which takes a single argument.
3412 */
3413static VALUE
3414rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3415{
3416 str_modifiable(str);
3417
3418 if (argc == 1) {
3419 return rb_str_concat(str, argv[0]);
3420 }
3421 else if (argc > 1) {
3422 int i;
3423 VALUE arg_str = rb_str_tmp_new(0);
3424 rb_enc_copy(arg_str, str);
3425 for (i = 0; i < argc; i++) {
3426 rb_str_concat(arg_str, argv[i]);
3427 }
3428 rb_str_buf_append(str, arg_str);
3429 }
3430
3431 return str;
3432}
3433
3434/*
3435 * call-seq:
3436 * string << object -> string
3437 *
3438 * Concatenates +object+ to +self+ and returns +self+:
3439 *
3440 * s = 'foo'
3441 * s << 'bar' # => "foobar"
3442 * s # => "foobar"
3443 *
3444 * If +object+ is an \Integer,
3445 * the value is considered a codepoint and converted to a character before concatenation:
3446 *
3447 * s = 'foo'
3448 * s << 33 # => "foo!"
3449 *
3450 * Related: String#concat, which takes multiple arguments.
3451 */
3452VALUE
3454{
3455 unsigned int code;
3456 rb_encoding *enc = STR_ENC_GET(str1);
3457 int encidx;
3458
3459 if (RB_INTEGER_TYPE_P(str2)) {
3460 if (rb_num_to_uint(str2, &code) == 0) {
3461 }
3462 else if (FIXNUM_P(str2)) {
3463 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3464 }
3465 else {
3466 rb_raise(rb_eRangeError, "bignum out of char range");
3467 }
3468 }
3469 else {
3470 return rb_str_append(str1, str2);
3471 }
3472
3473 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3474 if (encidx >= 0) {
3475 char buf[1];
3476 buf[0] = (char)code;
3477 rb_str_cat(str1, buf, 1);
3478 if (encidx != rb_enc_to_index(enc)) {
3479 rb_enc_associate_index(str1, encidx);
3481 }
3482 }
3483 else {
3484 long pos = RSTRING_LEN(str1);
3485 int cr = ENC_CODERANGE(str1);
3486 int len;
3487 char *buf;
3488
3489 switch (len = rb_enc_codelen(code, enc)) {
3490 case ONIGERR_INVALID_CODE_POINT_VALUE:
3491 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3492 break;
3493 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3494 case 0:
3495 rb_raise(rb_eRangeError, "%u out of char range", code);
3496 break;
3497 }
3498 buf = ALLOCA_N(char, len + 1);
3499 rb_enc_mbcput(code, buf, enc);
3500 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3501 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3502 }
3503 rb_str_resize(str1, pos+len);
3504 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3505 if (cr == ENC_CODERANGE_7BIT && code > 127) {
3507 }
3508 else if (cr == ENC_CODERANGE_BROKEN) {
3510 }
3511 ENC_CODERANGE_SET(str1, cr);
3512 }
3513 return str1;
3514}
3515
3516int
3517rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3518{
3519 int encidx = rb_enc_to_index(enc);
3520
3521 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3522 /* US-ASCII automatically extended to ASCII-8BIT */
3523 if (code > 0xFF) {
3524 rb_raise(rb_eRangeError, "%u out of char range", code);
3525 }
3526 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3527 return ENCINDEX_ASCII_8BIT;
3528 }
3529 return encidx;
3530 }
3531 else {
3532 return -1;
3533 }
3534}
3535
3536/*
3537 * call-seq:
3538 * prepend(*other_strings) -> string
3539 *
3540 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3541 *
3542 * s = 'foo'
3543 * s.prepend('bar', 'baz') # => "barbazfoo"
3544 * s # => "barbazfoo"
3545 *
3546 * Related: String#concat.
3547 */
3548
3549static VALUE
3550rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3551{
3552 str_modifiable(str);
3553
3554 if (argc == 1) {
3555 rb_str_update(str, 0L, 0L, argv[0]);
3556 }
3557 else if (argc > 1) {
3558 int i;
3559 VALUE arg_str = rb_str_tmp_new(0);
3560 rb_enc_copy(arg_str, str);
3561 for (i = 0; i < argc; i++) {
3562 rb_str_append(arg_str, argv[i]);
3563 }
3564 rb_str_update(str, 0L, 0L, arg_str);
3565 }
3566
3567 return str;
3568}
3569
3570st_index_t
3572{
3573 int e = ENCODING_GET(str);
3574 if (e && is_ascii_string(str)) {
3575 e = 0;
3576 }
3577 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3578}
3579
3580int
3582{
3583 long len1, len2;
3584 const char *ptr1, *ptr2;
3585 RSTRING_GETMEM(str1, ptr1, len1);
3586 RSTRING_GETMEM(str2, ptr2, len2);
3587 return (len1 != len2 ||
3588 !rb_str_comparable(str1, str2) ||
3589 memcmp(ptr1, ptr2, len1) != 0);
3590}
3591
3592/*
3593 * call-seq:
3594 * hash -> integer
3595 *
3596 * Returns the integer hash value for +self+.
3597 * The value is based on the length, content and encoding of +self+.
3598 *
3599 * Related: Object#hash.
3600 */
3601
3602static VALUE
3603rb_str_hash_m(VALUE str)
3604{
3605 st_index_t hval = rb_str_hash(str);
3606 return ST2FIX(hval);
3607}
3608
/* Smaller of two values; each argument may be evaluated twice. */
#define lesser(a, b) (((a) > (b)) ? (b) : (a))
3610
3611int
3613{
3614 int idx1, idx2;
3615 int rc1, rc2;
3616
3617 if (RSTRING_LEN(str1) == 0) return TRUE;
3618 if (RSTRING_LEN(str2) == 0) return TRUE;
3619 idx1 = ENCODING_GET(str1);
3620 idx2 = ENCODING_GET(str2);
3621 if (idx1 == idx2) return TRUE;
3622 rc1 = rb_enc_str_coderange(str1);
3623 rc2 = rb_enc_str_coderange(str2);
3624 if (rc1 == ENC_CODERANGE_7BIT) {
3625 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3626 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3627 return TRUE;
3628 }
3629 if (rc2 == ENC_CODERANGE_7BIT) {
3630 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3631 return TRUE;
3632 }
3633 return FALSE;
3634}
3635
3636int
3638{
3639 long len1, len2;
3640 const char *ptr1, *ptr2;
3641 int retval;
3642
3643 if (str1 == str2) return 0;
3644 RSTRING_GETMEM(str1, ptr1, len1);
3645 RSTRING_GETMEM(str2, ptr2, len2);
3646 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3647 if (len1 == len2) {
3648 if (!rb_str_comparable(str1, str2)) {
3649 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3650 return 1;
3651 return -1;
3652 }
3653 return 0;
3654 }
3655 if (len1 > len2) return 1;
3656 return -1;
3657 }
3658 if (retval > 0) return 1;
3659 return -1;
3660}
3661
3662/*
3663 * call-seq:
3664 * string == object -> true or false
3665 * string === object -> true or false
3666 *
3667 * Returns +true+ if +object+ has the same length and content
3668 * as +self+; +false+ otherwise:
3669 *
3670 * s = 'foo'
3671 * s == 'foo' # => true
3672 * s == 'food' # => false
3673 * s == 'FOO' # => false
3674 *
3675 * Returns +false+ if the two strings' encodings are not compatible:
3676 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3677 *
3678 * If +object+ is not an instance of \String but responds to +to_str+, then the
3679 * two strings are compared using <code>object.==</code>.
3680 */
3681
3682VALUE
3684{
3685 if (str1 == str2) return Qtrue;
3686 if (!RB_TYPE_P(str2, T_STRING)) {
3687 if (!rb_respond_to(str2, idTo_str)) {
3688 return Qfalse;
3689 }
3690 return rb_equal(str2, str1);
3691 }
3692 return rb_str_eql_internal(str1, str2);
3693}
3694
3695/*
3696 * call-seq:
3697 * eql?(object) -> true or false
3698 *
3699 * Returns +true+ if +object+ has the same length and content
3700 * as +self+; +false+ otherwise:
3701 *
3702 * s = 'foo'
3703 * s.eql?('foo') # => true
3704 * s.eql?('food') # => false
3705 * s.eql?('FOO') # => false
3706 *
3707 * Returns +false+ if the two strings' encodings are not compatible:
3708 *
3709 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3710 *
3711 */
3712
3713MJIT_FUNC_EXPORTED VALUE
3714rb_str_eql(VALUE str1, VALUE str2)
3715{
3716 if (str1 == str2) return Qtrue;
3717 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3718 return rb_str_eql_internal(str1, str2);
3719}
3720
3721/*
3722 * call-seq:
3723 * string <=> other_string -> -1, 0, 1, or nil
3724 *
3725 * Compares +self+ and +other_string+, returning:
3726 *
3727 * - -1 if +other_string+ is larger.
3728 * - 0 if the two are equal.
3729 * - 1 if +other_string+ is smaller.
3730 * - +nil+ if the two are incomparable.
3731 *
3732 * Examples:
3733 *
3734 * 'foo' <=> 'foo' # => 0
3735 * 'foo' <=> 'food' # => -1
3736 * 'food' <=> 'foo' # => 1
3737 * 'FOO' <=> 'foo' # => -1
3738 * 'foo' <=> 'FOO' # => 1
3739 * 'foo' <=> 1 # => nil
3740 *
3741 */
3742
3743static VALUE
3744rb_str_cmp_m(VALUE str1, VALUE str2)
3745{
3746 int result;
3747 VALUE s = rb_check_string_type(str2);
3748 if (NIL_P(s)) {
3749 return rb_invcmp(str1, str2);
3750 }
3751 result = rb_str_cmp(str1, s);
3752 return INT2FIX(result);
3753}
3754
3755static VALUE str_casecmp(VALUE str1, VALUE str2);
3756static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3757
3758/*
3759 * call-seq:
3760 * casecmp(other_string) -> -1, 0, 1, or nil
3761 *
3762 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3763 *
3764 * - -1 if <tt>other_string.downcase</tt> is larger.
3765 * - 0 if the two are equal.
3766 * - 1 if <tt>other_string.downcase</tt> is smaller.
3767 * - +nil+ if the two are incomparable.
3768 *
3769 * Examples:
3770 *
3771 * 'foo'.casecmp('foo') # => 0
3772 * 'foo'.casecmp('food') # => -1
3773 * 'food'.casecmp('foo') # => 1
3774 * 'FOO'.casecmp('foo') # => 0
3775 * 'foo'.casecmp('FOO') # => 0
3776 * 'foo'.casecmp(1) # => nil
3777 *
3778 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3779 *
3780 * Related: String#casecmp?.
3781 *
3782 */
3783
3784static VALUE
3785rb_str_casecmp(VALUE str1, VALUE str2)
3786{
3787 VALUE s = rb_check_string_type(str2);
3788 if (NIL_P(s)) {
3789 return Qnil;
3790 }
3791 return str_casecmp(str1, s);
3792}
3793
3794static VALUE
3795str_casecmp(VALUE str1, VALUE str2)
3796{
3797 long len;
3798 rb_encoding *enc;
3799 const char *p1, *p1end, *p2, *p2end;
3800
3801 enc = rb_enc_compatible(str1, str2);
3802 if (!enc) {
3803 return Qnil;
3804 }
3805
3806 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3807 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3808 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3809 while (p1 < p1end && p2 < p2end) {
3810 if (*p1 != *p2) {
3811 unsigned int c1 = TOLOWER(*p1 & 0xff);
3812 unsigned int c2 = TOLOWER(*p2 & 0xff);
3813 if (c1 != c2)
3814 return INT2FIX(c1 < c2 ? -1 : 1);
3815 }
3816 p1++;
3817 p2++;
3818 }
3819 }
3820 else {
3821 while (p1 < p1end && p2 < p2end) {
3822 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3823 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3824
3825 if (0 <= c1 && 0 <= c2) {
3826 c1 = TOLOWER(c1);
3827 c2 = TOLOWER(c2);
3828 if (c1 != c2)
3829 return INT2FIX(c1 < c2 ? -1 : 1);
3830 }
3831 else {
3832 int r;
3833 l1 = rb_enc_mbclen(p1, p1end, enc);
3834 l2 = rb_enc_mbclen(p2, p2end, enc);
3835 len = l1 < l2 ? l1 : l2;
3836 r = memcmp(p1, p2, len);
3837 if (r != 0)
3838 return INT2FIX(r < 0 ? -1 : 1);
3839 if (l1 != l2)
3840 return INT2FIX(l1 < l2 ? -1 : 1);
3841 }
3842 p1 += l1;
3843 p2 += l2;
3844 }
3845 }
3846 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3847 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3848 return INT2FIX(-1);
3849}
3850
3851/*
3852 * call-seq:
3853 * casecmp?(other_string) -> true, false, or nil
3854 *
3855 * Returns +true+ if +self+ and +other_string+ are equal after
3856 * Unicode case folding, otherwise +false+:
3857 *
3858 * 'foo'.casecmp?('foo') # => true
3859 * 'foo'.casecmp?('food') # => false
3860 * 'food'.casecmp?('foo') # => false
3861 * 'FOO'.casecmp?('foo') # => true
3862 * 'foo'.casecmp?('FOO') # => true
3863 *
3864 * Returns +nil+ if the two values are incomparable:
3865 *
3866 * 'foo'.casecmp?(1) # => nil
3867 *
3868 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3869 *
3870 * Related: String#casecmp.
3871 *
3872 */
3873
3874static VALUE
3875rb_str_casecmp_p(VALUE str1, VALUE str2)
3876{
3877 VALUE s = rb_check_string_type(str2);
3878 if (NIL_P(s)) {
3879 return Qnil;
3880 }
3881 return str_casecmp_p(str1, s);
3882}
3883
3884static VALUE
3885str_casecmp_p(VALUE str1, VALUE str2)
3886{
3887 rb_encoding *enc;
3888 VALUE folded_str1, folded_str2;
3889 VALUE fold_opt = sym_fold;
3890
3891 enc = rb_enc_compatible(str1, str2);
3892 if (!enc) {
3893 return Qnil;
3894 }
3895
3896 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3897 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3898
3899 return rb_str_eql(folded_str1, folded_str2);
3900}
3901
3902static long
3903strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3904 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3905{
3906 const char *search_start = str_ptr;
3907 long pos, search_len = str_len - offset;
3908
3909 for (;;) {
3910 const char *t;
3911 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3912 if (pos < 0) return pos;
3913 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3914 if (t == search_start + pos) break;
3915 search_len -= t - search_start;
3916 if (search_len <= 0) return -1;
3917 offset += t - search_start;
3918 search_start = t;
3919 }
3920 return pos + offset;
3921}
3922
3923#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3924
3925static long
3926rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3927{
3928 const char *str_ptr, *str_ptr_end, *sub_ptr;
3929 long str_len, sub_len;
3930 rb_encoding *enc;
3931
3932 enc = rb_enc_check(str, sub);
3933 if (is_broken_string(sub)) return -1;
3934
3935 str_ptr = RSTRING_PTR(str);
3936 str_ptr_end = RSTRING_END(str);
3937 str_len = RSTRING_LEN(str);
3938 sub_ptr = RSTRING_PTR(sub);
3939 sub_len = RSTRING_LEN(sub);
3940
3941 if (str_len < sub_len) return -1;
3942
3943 if (offset != 0) {
3944 long str_len_char, sub_len_char;
3945 int single_byte = single_byte_optimizable(str);
3946 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3947 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3948 if (offset < 0) {
3949 offset += str_len_char;
3950 if (offset < 0) return -1;
3951 }
3952 if (str_len_char - offset < sub_len_char) return -1;
3953 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3954 str_ptr += offset;
3955 }
3956 if (sub_len == 0) return offset;
3957
3958 /* need proceed one character at a time */
3959 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3960}
3961
3962
3963/*
3964 * call-seq:
3965 * index(substring, offset = 0) -> integer or nil
3966 * index(regexp, offset = 0) -> integer or nil
3967 *
3968 * :include: doc/string/index.rdoc
3969 *
3970 */
3971
3972static VALUE
3973rb_str_index_m(int argc, VALUE *argv, VALUE str)
3974{
3975 VALUE sub;
3976 VALUE initpos;
3977 long pos;
3978
3979 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3980 pos = NUM2LONG(initpos);
3981 }
3982 else {
3983 pos = 0;
3984 }
3985 if (pos < 0) {
3986 pos += str_strlen(str, NULL);
3987 if (pos < 0) {
3988 if (RB_TYPE_P(sub, T_REGEXP)) {
3990 }
3991 return Qnil;
3992 }
3993 }
3994
3995 if (RB_TYPE_P(sub, T_REGEXP)) {
3996 if (pos > str_strlen(str, NULL)) {
3998 return Qnil;
3999 }
4000 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4001 rb_enc_check(str, sub), single_byte_optimizable(str));
4002
4003 if (rb_reg_search(sub, str, pos, 0) < 0) {
4004 return Qnil;
4005 }
4006 else {
4007 VALUE match = rb_backref_get();
4008 struct re_registers *regs = RMATCH_REGS(match);
4009 pos = rb_str_sublen(str, BEG(0));
4010 return LONG2NUM(pos);
4011 }
4012 }
4013 else {
4014 StringValue(sub);
4015 pos = rb_str_index(str, sub, pos);
4016 pos = rb_str_sublen(str, pos);
4017 }
4018
4019 if (pos == -1) return Qnil;
4020 return LONG2NUM(pos);
4021}
4022
4023/* whether given pos is valid character boundary or not
4024 * Note that in this function, "character" means a code point
4025 * (Unicode scalar value), not a grapheme cluster.
4026 */
4027static bool
4028str_check_byte_pos(VALUE str, long pos)
4029{
4030 const char *s = RSTRING_PTR(str);
4031 const char *e = RSTRING_END(str);
4032 const char *p = s + pos;
4033 const char *pp = rb_enc_left_char_head(s, p, e, rb_enc_get(str));
4034 return p == pp;
4035}
4036
4037/*
4038 * call-seq:
4039 * byteindex(substring, offset = 0) -> integer or nil
4040 * byteindex(regexp, offset = 0) -> integer or nil
4041 *
4042 * Returns the \Integer byte-based index of the first occurrence of the given +substring+,
4043 * or +nil+ if none found:
4044 *
4045 * 'foo'.byteindex('f') # => 0
4046 * 'foo'.byteindex('o') # => 1
4047 * 'foo'.byteindex('oo') # => 1
4048 * 'foo'.byteindex('ooo') # => nil
4049 *
4050 * Returns the \Integer byte-based index of the first match for the given \Regexp +regexp+,
4051 * or +nil+ if none found:
4052 *
4053 * 'foo'.byteindex(/f/) # => 0
4054 * 'foo'.byteindex(/o/) # => 1
4055 * 'foo'.byteindex(/oo/) # => 1
4056 * 'foo'.byteindex(/ooo/) # => nil
4057 *
4058 * \Integer argument +offset+, if given, specifies the byte-based position in the
4059 * string to begin the search:
4060 *
4061 * 'foo'.byteindex('o', 1) # => 1
4062 * 'foo'.byteindex('o', 2) # => 2
4063 * 'foo'.byteindex('o', 3) # => nil
4064 *
4065 * If +offset+ is negative, counts backward from the end of +self+:
4066 *
4067 * 'foo'.byteindex('o', -1) # => 2
4068 * 'foo'.byteindex('o', -2) # => 1
4069 * 'foo'.byteindex('o', -3) # => 1
4070 * 'foo'.byteindex('o', -4) # => nil
4071 *
4072 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4073 * raised.
4074 *
4075 * Related: String#index, String#byterindex.
4076 */
4077
4078static VALUE
4079rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4080{
4081 VALUE sub;
4082 VALUE initpos;
4083 long pos;
4084
4085 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4086 long slen = RSTRING_LEN(str);
4087 pos = NUM2LONG(initpos);
4088 if (pos < 0) {
4089 pos += slen;
4090 }
4091 if (pos < 0 || pos > slen) {
4092 if (RB_TYPE_P(sub, T_REGEXP)) {
4094 }
4095 return Qnil;
4096 }
4097 }
4098 else {
4099 pos = 0;
4100 }
4101
4102 if (!str_check_byte_pos(str, pos)) {
4104 "offset %ld does not land on character boundary", pos);
4105 }
4106
4107 if (RB_TYPE_P(sub, T_REGEXP)) {
4108 if (rb_reg_search(sub, str, pos, 0) < 0) {
4109 return Qnil;
4110 }
4111 else {
4112 VALUE match = rb_backref_get();
4113 struct re_registers *regs = RMATCH_REGS(match);
4114 pos = BEG(0);
4115 return LONG2NUM(pos);
4116 }
4117 }
4118 else {
4119 StringValue(sub);
4120 pos = rb_strseq_index(str, sub, pos, 1);
4121 }
4122
4123 if (pos == -1) return Qnil;
4124 return LONG2NUM(pos);
4125}
4126
4127#ifdef HAVE_MEMRCHR
4128static long
4129str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4130{
4131 char *hit, *adjusted;
4132 int c;
4133 long slen, searchlen;
4134 char *sbeg, *e, *t;
4135
4136 sbeg = RSTRING_PTR(str);
4137 slen = RSTRING_LEN(sub);
4138 if (slen == 0) return s - sbeg;
4139 e = RSTRING_END(str);
4140 t = RSTRING_PTR(sub);
4141 c = *t & 0xff;
4142 searchlen = s - sbeg + 1;
4143
4144 do {
4145 hit = memrchr(sbeg, c, searchlen);
4146 if (!hit) break;
4147 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4148 if (hit != adjusted) {
4149 searchlen = adjusted - sbeg;
4150 continue;
4151 }
4152 if (memcmp(hit, t, slen) == 0)
4153 return hit - sbeg;
4154 searchlen = adjusted - sbeg;
4155 } while (searchlen > 0);
4156
4157 return -1;
4158}
4159#else
4160static long
4161str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4162{
4163 long slen;
4164 char *sbeg, *e, *t;
4165
4166 sbeg = RSTRING_PTR(str);
4167 e = RSTRING_END(str);
4168 t = RSTRING_PTR(sub);
4169 slen = RSTRING_LEN(sub);
4170
4171 while (s) {
4172 if (memcmp(s, t, slen) == 0) {
4173 return s - sbeg;
4174 }
4175 if (s <= sbeg) break;
4176 s = rb_enc_prev_char(sbeg, s, e, enc);
4177 }
4178
4179 return -1;
4180}
4181#endif
4182
4183static long
4184rb_str_rindex(VALUE str, VALUE sub, long pos)
4185{
4186 long len, slen;
4187 char *sbeg, *s;
4188 rb_encoding *enc;
4189 int singlebyte;
4190
4191 enc = rb_enc_check(str, sub);
4192 if (is_broken_string(sub)) return -1;
4193 singlebyte = single_byte_optimizable(str);
4194 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4195 slen = str_strlen(sub, enc); /* rb_enc_check */
4196
4197 /* substring longer than string */
4198 if (len < slen) return -1;
4199 if (len - pos < slen) pos = len - slen;
4200 if (len == 0) return pos;
4201
4202 sbeg = RSTRING_PTR(str);
4203
4204 if (pos == 0) {
4205 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4206 return 0;
4207 else
4208 return -1;
4209 }
4210
4211 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4212 return rb_str_sublen(str, str_rindex(str, sub, s, enc));
4213}
4214
4215/*
4216 * call-seq:
4217 * rindex(substring, offset = self.length) -> integer or nil
4218 * rindex(regexp, offset = self.length) -> integer or nil
4219 *
4220 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4221 * or +nil+ if none found:
4222 *
4223 * 'foo'.rindex('f') # => 0
4224 * 'foo'.rindex('o') # => 2
4225 * 'foo'.rindex('oo') # => 1
4226 * 'foo'.rindex('ooo') # => nil
4227 *
4228 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4229 * or +nil+ if none found:
4230 *
4231 * 'foo'.rindex(/f/) # => 0
4232 * 'foo'.rindex(/o/) # => 2
4233 * 'foo'.rindex(/oo/) # => 1
4234 * 'foo'.rindex(/ooo/) # => nil
4235 *
4236 * The _last_ match means starting at the possible last position, not
4237 * the last of longest matches.
4238 *
4239 * 'foo'.rindex(/o+/) # => 2
4240 * $~ #=> #<MatchData "o">
4241 *
4242 * To get the last longest match, needs to combine with negative
4243 * lookbehind.
4244 *
4245 * 'foo'.rindex(/(?<!o)o+/) # => 1
4246 * $~ #=> #<MatchData "oo">
4247 *
4248 * Or String#index with negative lookforward.
4249 *
4250 * 'foo'.index(/o+(?!.*o)/) # => 1
4251 * $~ #=> #<MatchData "oo">
4252 *
4253 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4254 * string to _end_ the search:
4255 *
4256 * 'foo'.rindex('o', 0) # => nil
4257 * 'foo'.rindex('o', 1) # => 1
4258 * 'foo'.rindex('o', 2) # => 2
4259 * 'foo'.rindex('o', 3) # => 2
4260 *
4261 * If +offset+ is a negative \Integer, the maximum starting position in the
4262 * string to _end_ the search is the sum of the string's length and +offset+:
4263 *
4264 * 'foo'.rindex('o', -1) # => 2
4265 * 'foo'.rindex('o', -2) # => 1
4266 * 'foo'.rindex('o', -3) # => nil
4267 * 'foo'.rindex('o', -4) # => nil
4268 *
4269 * Related: String#index.
4270 */
4271
4272static VALUE
4273rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4274{
4275 VALUE sub;
4276 VALUE vpos;
4277 rb_encoding *enc = STR_ENC_GET(str);
4278 long pos, len = str_strlen(str, enc); /* str's enc */
4279
4280 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4281 pos = NUM2LONG(vpos);
4282 if (pos < 0) {
4283 pos += len;
4284 if (pos < 0) {
4285 if (RB_TYPE_P(sub, T_REGEXP)) {
4287 }
4288 return Qnil;
4289 }
4290 }
4291 if (pos > len) pos = len;
4292 }
4293 else {
4294 pos = len;
4295 }
4296
4297 if (RB_TYPE_P(sub, T_REGEXP)) {
4298 /* enc = rb_get_check(str, sub); */
4299 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4300 enc, single_byte_optimizable(str));
4301
4302 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4303 VALUE match = rb_backref_get();
4304 struct re_registers *regs = RMATCH_REGS(match);
4305 pos = rb_str_sublen(str, BEG(0));
4306 return LONG2NUM(pos);
4307 }
4308 }
4309 else {
4310 StringValue(sub);
4311 pos = rb_str_rindex(str, sub, pos);
4312 if (pos >= 0) return LONG2NUM(pos);
4313 }
4314 return Qnil;
4315}
4316
4317static long
4318rb_str_byterindex(VALUE str, VALUE sub, long pos)
4319{
4320 long len, slen;
4321 char *sbeg, *s;
4322 rb_encoding *enc;
4323
4324 enc = rb_enc_check(str, sub);
4325 if (is_broken_string(sub)) return -1;
4326 len = RSTRING_LEN(str);
4327 slen = RSTRING_LEN(sub);
4328
4329 /* substring longer than string */
4330 if (len < slen) return -1;
4331 if (len - pos < slen) pos = len - slen;
4332 if (len == 0) return pos;
4333
4334 sbeg = RSTRING_PTR(str);
4335
4336 if (pos == 0) {
4337 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4338 return 0;
4339 else
4340 return -1;
4341 }
4342
4343 s = sbeg + pos;
4344 return str_rindex(str, sub, s, enc);
4345}
4346
4347
4348/*
4349 * call-seq:
4350 * byterindex(substring, offset = self.bytesize) -> integer or nil
4351 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4352 *
4353 * Returns the \Integer byte-based index of the _last_ occurrence of the given +substring+,
4354 * or +nil+ if none found:
4355 *
4356 * 'foo'.byterindex('f') # => 0
4357 * 'foo'.byterindex('o') # => 2
4358 * 'foo'.byterindex('oo') # => 1
4359 * 'foo'.byterindex('ooo') # => nil
4360 *
4361 * Returns the \Integer byte-based index of the _last_ match for the given \Regexp +regexp+,
4362 * or +nil+ if none found:
4363 *
4364 * 'foo'.byterindex(/f/) # => 0
4365 * 'foo'.byterindex(/o/) # => 2
4366 * 'foo'.byterindex(/oo/) # => 1
4367 * 'foo'.byterindex(/ooo/) # => nil
4368 *
4369 * The _last_ match means starting at the possible last position, not
4370 * the last of longest matches.
4371 *
4372 * 'foo'.byterindex(/o+/) # => 2
4373 * $~ #=> #<MatchData "o">
4374 *
4375 * To get the last longest match, needs to combine with negative
4376 * lookbehind.
4377 *
4378 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4379 * $~ #=> #<MatchData "oo">
4380 *
4381 * Or String#byteindex with negative lookforward.
4382 *
4383 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4384 * $~ #=> #<MatchData "oo">
4385 *
4386 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4387 * string to _end_ the search:
4388 *
4389 * 'foo'.byterindex('o', 0) # => nil
4390 * 'foo'.byterindex('o', 1) # => 1
4391 * 'foo'.byterindex('o', 2) # => 2
4392 * 'foo'.byterindex('o', 3) # => 2
4393 *
4394 * If +offset+ is a negative \Integer, the maximum starting position in the
4395 * string to _end_ the search is the sum of the string's length and +offset+:
4396 *
4397 * 'foo'.byterindex('o', -1) # => 2
4398 * 'foo'.byterindex('o', -2) # => 1
4399 * 'foo'.byterindex('o', -3) # => nil
4400 * 'foo'.byterindex('o', -4) # => nil
4401 *
4402 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4403 * raised.
4404 *
4405 * Related: String#byteindex.
4406 */
4407
4408static VALUE
4409rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4410{
4411 VALUE sub;
4412 VALUE vpos;
4413 long pos, len = RSTRING_LEN(str);
4414
4415 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4416 pos = NUM2LONG(vpos);
4417 if (pos < 0) {
4418 pos += len;
4419 if (pos < 0) {
4420 if (RB_TYPE_P(sub, T_REGEXP)) {
4422 }
4423 return Qnil;
4424 }
4425 }
4426 if (pos > len) pos = len;
4427 }
4428 else {
4429 pos = len;
4430 }
4431
4432 if (!str_check_byte_pos(str, pos)) {
4434 "offset %ld does not land on character boundary", pos);
4435 }
4436
4437 if (RB_TYPE_P(sub, T_REGEXP)) {
4438 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4439 VALUE match = rb_backref_get();
4440 struct re_registers *regs = RMATCH_REGS(match);
4441 pos = BEG(0);
4442 return LONG2NUM(pos);
4443 }
4444 }
4445 else {
4446 StringValue(sub);
4447 pos = rb_str_byterindex(str, sub, pos);
4448 if (pos >= 0) return LONG2NUM(pos);
4449 }
4450 return Qnil;
4451}
4452
4453/*
4454 * call-seq:
4455 * string =~ regexp -> integer or nil
4456 * string =~ object -> integer or nil
4457 *
4458 * Returns the \Integer index of the first substring that matches
4459 * the given +regexp+, or +nil+ if no match found:
4460 *
4461 * 'foo' =~ /f/ # => 0
4462 * 'foo' =~ /o/ # => 1
4463 * 'foo' =~ /x/ # => nil
4464 *
4465 * Note: also updates Regexp@Special+global+variables.
4466 *
4467 * If the given +object+ is not a \Regexp, returns the value
4468 * returned by <tt>object =~ self</tt>.
4469 *
4470 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4471 * (see Regexp#=~):
4472 *
4473 * number= nil
4474 * "no. 9" =~ /(?<number>\d+)/
4475 * number # => nil (not assigned)
4476 * /(?<number>\d+)/ =~ "no. 9"
4477 * number #=> "9"
4478 *
4479 */
4480
4481static VALUE
4482rb_str_match(VALUE x, VALUE y)
4483{
4484 switch (OBJ_BUILTIN_TYPE(y)) {
4485 case T_STRING:
4486 rb_raise(rb_eTypeError, "type mismatch: String given");
4487
4488 case T_REGEXP:
4489 return rb_reg_match(y, x);
4490
4491 default:
4492 return rb_funcall(y, idEqTilde, 1, x);
4493 }
4494}
4495
4496
4497static VALUE get_pat(VALUE);
4498
4499
4500/*
4501 * call-seq:
4502 * match(pattern, offset = 0) -> matchdata or nil
4503 * match(pattern, offset = 0) {|matchdata| ... } -> object
4504 *
4505 * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4506 *
4507 * Note: also updates Regexp@Special+global+variables.
4508 *
4509 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4510 * regexp = Regexp.new(pattern)
4511 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4512 * (see Regexp#match):
 *      matchdata = regexp.match(self)
4514 *
4515 * With no block given, returns the computed +matchdata+:
4516 *
4517 * 'foo'.match('f') # => #<MatchData "f">
4518 * 'foo'.match('o') # => #<MatchData "o">
4519 * 'foo'.match('x') # => nil
4520 *
4521 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4522 *
4523 * 'foo'.match('f', 1) # => nil
4524 * 'foo'.match('o', 1) # => #<MatchData "o">
4525 *
4526 * With a block given, calls the block with the computed +matchdata+
4527 * and returns the block's return value:
4528 *
4529 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4530 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4531 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4532 *
4533 */
4534
4535static VALUE
4536rb_str_match_m(int argc, VALUE *argv, VALUE str)
4537{
4538 VALUE re, result;
4539 if (argc < 1)
4540 rb_check_arity(argc, 1, 2);
4541 re = argv[0];
4542 argv[0] = str;
4543 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4544 if (!NIL_P(result) && rb_block_given_p()) {
4545 return rb_yield(result);
4546 }
4547 return result;
4548}
4549
4550/*
4551 * call-seq:
4552 * match?(pattern, offset = 0) -> true or false
4553 *
4554 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4555 *
4556 * Note: does not update Regexp@Special+global+variables.
4557 *
4558 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4559 * regexp = Regexp.new(pattern)
4560 *
 *  Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4562 * +false+ otherwise:
4563 *
4564 * 'foo'.match?(/o/) # => true
4565 * 'foo'.match?('o') # => true
4566 * 'foo'.match?(/x/) # => false
4567 *
4568 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4569 * 'foo'.match?('f', 1) # => false
4570 * 'foo'.match?('o', 1) # => true
4571 *
4572 */
4573
4574static VALUE
4575rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4576{
4577 VALUE re;
4578 rb_check_arity(argc, 1, 2);
4579 re = get_pat(argv[0]);
4580 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4581}
4582
/* Outcome of stepping a single character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* no valid neighbouring character was produced */
    NEIGHBOR_FOUND = 1,     /* the character was replaced by its neighbour */
    NEIGHBOR_WRAPPED = 2    /* the step ran off the end of the range */
};
4588
4589static enum neighbor_char
4590enc_succ_char(char *p, long len, rb_encoding *enc)
4591{
4592 long i;
4593 int l;
4594
4595 if (rb_enc_mbminlen(enc) > 1) {
4596 /* wchar, trivial case */
4597 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4598 if (!MBCLEN_CHARFOUND_P(r)) {
4599 return NEIGHBOR_NOT_CHAR;
4600 }
4601 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4602 l = rb_enc_code_to_mbclen(c, enc);
4603 if (!l) return NEIGHBOR_NOT_CHAR;
4604 if (l != len) return NEIGHBOR_WRAPPED;
4605 rb_enc_mbcput(c, p, enc);
4606 r = rb_enc_precise_mbclen(p, p + len, enc);
4607 if (!MBCLEN_CHARFOUND_P(r)) {
4608 return NEIGHBOR_NOT_CHAR;
4609 }
4610 return NEIGHBOR_FOUND;
4611 }
4612 while (1) {
4613 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4614 p[i] = '\0';
4615 if (i < 0)
4616 return NEIGHBOR_WRAPPED;
4617 ++((unsigned char*)p)[i];
4618 l = rb_enc_precise_mbclen(p, p+len, enc);
4619 if (MBCLEN_CHARFOUND_P(l)) {
4620 l = MBCLEN_CHARFOUND_LEN(l);
4621 if (l == len) {
4622 return NEIGHBOR_FOUND;
4623 }
4624 else {
4625 memset(p+l, 0xff, len-l);
4626 }
4627 }
4628 if (MBCLEN_INVALID_P(l) && i < len-1) {
4629 long len2;
4630 int l2;
4631 for (len2 = len-1; 0 < len2; len2--) {
4632 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4633 if (!MBCLEN_INVALID_P(l2))
4634 break;
4635 }
4636 memset(p+len2+1, 0xff, len-(len2+1));
4637 }
4638 }
4639}
4640
4641static enum neighbor_char
4642enc_pred_char(char *p, long len, rb_encoding *enc)
4643{
4644 long i;
4645 int l;
4646 if (rb_enc_mbminlen(enc) > 1) {
4647 /* wchar, trivial case */
4648 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4649 if (!MBCLEN_CHARFOUND_P(r)) {
4650 return NEIGHBOR_NOT_CHAR;
4651 }
4652 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4653 if (!c) return NEIGHBOR_NOT_CHAR;
4654 --c;
4655 l = rb_enc_code_to_mbclen(c, enc);
4656 if (!l) return NEIGHBOR_NOT_CHAR;
4657 if (l != len) return NEIGHBOR_WRAPPED;
4658 rb_enc_mbcput(c, p, enc);
4659 r = rb_enc_precise_mbclen(p, p + len, enc);
4660 if (!MBCLEN_CHARFOUND_P(r)) {
4661 return NEIGHBOR_NOT_CHAR;
4662 }
4663 return NEIGHBOR_FOUND;
4664 }
4665 while (1) {
4666 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4667 p[i] = '\xff';
4668 if (i < 0)
4669 return NEIGHBOR_WRAPPED;
4670 --((unsigned char*)p)[i];
4671 l = rb_enc_precise_mbclen(p, p+len, enc);
4672 if (MBCLEN_CHARFOUND_P(l)) {
4673 l = MBCLEN_CHARFOUND_LEN(l);
4674 if (l == len) {
4675 return NEIGHBOR_FOUND;
4676 }
4677 else {
4678 memset(p+l, 0, len-l);
4679 }
4680 }
4681 if (MBCLEN_INVALID_P(l) && i < len-1) {
4682 long len2;
4683 int l2;
4684 for (len2 = len-1; 0 < len2; len2--) {
4685 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4686 if (!MBCLEN_INVALID_P(l2))
4687 break;
4688 }
4689 memset(p+len2+1, 0, len-(len2+1));
4690 }
4691 }
4692}
4693
4694/*
4695 overwrite +p+ by succeeding letter in +enc+ and returns
4696 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4697 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4698 assuming each ranges are successive, and mbclen
4699 never change in each ranges.
4700 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4701 character.
4702 */
4703static enum neighbor_char
4704enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4705{
4706 enum neighbor_char ret;
4707 unsigned int c;
4708 int ctype;
4709 int range;
4710 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4711
4712 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4713 int try;
4714 const int max_gaps = 1;
4715
4716 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4717 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4718 ctype = ONIGENC_CTYPE_DIGIT;
4719 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4720 ctype = ONIGENC_CTYPE_ALPHA;
4721 else
4722 return NEIGHBOR_NOT_CHAR;
4723
4724 MEMCPY(save, p, char, len);
4725 for (try = 0; try <= max_gaps; ++try) {
4726 ret = enc_succ_char(p, len, enc);
4727 if (ret == NEIGHBOR_FOUND) {
4728 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4729 if (rb_enc_isctype(c, ctype, enc))
4730 return NEIGHBOR_FOUND;
4731 }
4732 }
4733 MEMCPY(p, save, char, len);
4734 range = 1;
4735 while (1) {
4736 MEMCPY(save, p, char, len);
4737 ret = enc_pred_char(p, len, enc);
4738 if (ret == NEIGHBOR_FOUND) {
4739 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4740 if (!rb_enc_isctype(c, ctype, enc)) {
4741 MEMCPY(p, save, char, len);
4742 break;
4743 }
4744 }
4745 else {
4746 MEMCPY(p, save, char, len);
4747 break;
4748 }
4749 range++;
4750 }
4751 if (range == 1) {
4752 return NEIGHBOR_NOT_CHAR;
4753 }
4754
4755 if (ctype != ONIGENC_CTYPE_DIGIT) {
4756 MEMCPY(carry, p, char, len);
4757 return NEIGHBOR_WRAPPED;
4758 }
4759
4760 MEMCPY(carry, p, char, len);
4761 enc_succ_char(carry, len, enc);
4762 return NEIGHBOR_WRAPPED;
4763}
4764
4765
4766static VALUE str_succ(VALUE str);
4767
4768/*
4769 * call-seq:
4770 * succ -> new_str
4771 *
4772 * Returns the successor to +self+. The successor is calculated by
4773 * incrementing characters.
4774 *
 *  The first character to be incremented is the rightmost alphanumeric,
 *  or, if no alphanumerics, the rightmost character:
4777 *
4778 * 'THX1138'.succ # => "THX1139"
4779 * '<<koala>>'.succ # => "<<koalb>>"
4780 * '***'.succ # => '**+'
4781 *
4782 * The successor to a digit is another digit, "carrying" to the next-left
4783 * character for a "rollover" from 9 to 0, and prepending another digit
4784 * if necessary:
4785 *
4786 * '00'.succ # => "01"
4787 * '09'.succ # => "10"
4788 * '99'.succ # => "100"
4789 *
4790 * The successor to a letter is another letter of the same case,
4791 * carrying to the next-left character for a rollover,
4792 * and prepending another same-case letter if necessary:
4793 *
4794 * 'aa'.succ # => "ab"
4795 * 'az'.succ # => "ba"
4796 * 'zz'.succ # => "aaa"
4797 * 'AA'.succ # => "AB"
4798 * 'AZ'.succ # => "BA"
4799 * 'ZZ'.succ # => "AAA"
4800 *
4801 * The successor to a non-alphanumeric character is the next character
4802 * in the underlying character set's collating sequence,
4803 * carrying to the next-left character for a rollover,
4804 * and prepending another character if necessary:
4805 *
4806 * s = 0.chr * 3
4807 * s # => "\x00\x00\x00"
4808 * s.succ # => "\x00\x00\x01"
4809 * s = 255.chr * 3
4810 * s # => "\xFF\xFF\xFF"
4811 * s.succ # => "\x01\x00\x00\x00"
4812 *
4813 * Carrying can occur between and among mixtures of alphanumeric characters:
4814 *
4815 * s = 'zz99zz99'
4816 * s.succ # => "aaa00aa00"
4817 * s = '99zz99zz'
4818 * s.succ # => "100aa00aa"
4819 *
4820 * The successor to an empty \String is a new empty \String:
4821 *
4822 * ''.succ # => ""
4823 *
4824 * String#next is an alias for String#succ.
4825 */
4826
4827VALUE
4829{
4830 VALUE str;
4831 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4832 rb_enc_cr_str_copy_for_substr(str, orig);
4833 return str_succ(str);
4834}
4835
4836static VALUE
4837str_succ(VALUE str)
4838{
4839 rb_encoding *enc;
4840 char *sbeg, *s, *e, *last_alnum = 0;
4841 int found_alnum = 0;
4842 long l, slen;
4843 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4844 long carry_pos = 0, carry_len = 1;
4845 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4846
4847 slen = RSTRING_LEN(str);
4848 if (slen == 0) return str;
4849
4850 enc = STR_ENC_GET(str);
4851 sbeg = RSTRING_PTR(str);
4852 s = e = sbeg + slen;
4853
4854 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4855 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4856 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4857 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4858 break;
4859 }
4860 }
4861 l = rb_enc_precise_mbclen(s, e, enc);
4862 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4863 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4864 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4865 switch (neighbor) {
4866 case NEIGHBOR_NOT_CHAR:
4867 continue;
4868 case NEIGHBOR_FOUND:
4869 return str;
4870 case NEIGHBOR_WRAPPED:
4871 last_alnum = s;
4872 break;
4873 }
4874 found_alnum = 1;
4875 carry_pos = s - sbeg;
4876 carry_len = l;
4877 }
4878 if (!found_alnum) { /* str contains no alnum */
4879 s = e;
4880 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4881 enum neighbor_char neighbor;
4882 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4883 l = rb_enc_precise_mbclen(s, e, enc);
4884 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4885 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4886 MEMCPY(tmp, s, char, l);
4887 neighbor = enc_succ_char(tmp, l, enc);
4888 switch (neighbor) {
4889 case NEIGHBOR_FOUND:
4890 MEMCPY(s, tmp, char, l);
4891 return str;
4892 break;
4893 case NEIGHBOR_WRAPPED:
4894 MEMCPY(s, tmp, char, l);
4895 break;
4896 case NEIGHBOR_NOT_CHAR:
4897 break;
4898 }
4899 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4900 /* wrapped to \0...\0. search next valid char. */
4901 enc_succ_char(s, l, enc);
4902 }
4903 if (!rb_enc_asciicompat(enc)) {
4904 MEMCPY(carry, s, char, l);
4905 carry_len = l;
4906 }
4907 carry_pos = s - sbeg;
4908 }
4910 }
4911 RESIZE_CAPA(str, slen + carry_len);
4912 sbeg = RSTRING_PTR(str);
4913 s = sbeg + carry_pos;
4914 memmove(s + carry_len, s, slen - carry_pos);
4915 memmove(s, carry, carry_len);
4916 slen += carry_len;
4917 STR_SET_LEN(str, slen);
4918 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4920 return str;
4921}
4922
4923
4924/*
4925 * call-seq:
4926 * succ! -> self
4927 *
4928 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4929 *
4930 * String#next! is an alias for String#succ!.
4931 */
4932
4933static VALUE
4934rb_str_succ_bang(VALUE str)
4935{
4936 rb_str_modify(str);
4937 str_succ(str);
4938 return str;
4939}
4940
/*
 * Returns 1 when each of the first +len+ bytes at +s+ satisfies ISDIGIT
 * (including the case where +len+ is not positive), 0 otherwise.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
4950
4951static int
4952str_upto_i(VALUE str, VALUE arg)
4953{
4954 rb_yield(str);
4955 return 0;
4956}
4957
4958/*
4959 * call-seq:
4960 * upto(other_string, exclusive = false) {|string| ... } -> self
4961 * upto(other_string, exclusive = false) -> new_enumerator
4962 *
4963 * With a block given, calls the block with each \String value
4964 * returned by successive calls to String#succ;
4965 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4966 * the sequence terminates when value +other_string+ is reached;
4967 * returns +self+:
4968 *
4969 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4970 * Output:
4971 *
4972 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4973 *
4974 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4975 *
4976 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4977 *
4978 * Output:
4979 *
4980 * a8 a9 b0 b1 b2 b3 b4 b5
4981 *
4982 * If +other_string+ would not be reached, does not call the block:
4983 *
4984 * '25'.upto('5') {|s| fail s }
4985 * 'aa'.upto('a') {|s| fail s }
4986 *
4987 * With no block given, returns a new \Enumerator:
4988 *
4989 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4990 *
4991 */
4992
4993static VALUE
4994rb_str_upto(int argc, VALUE *argv, VALUE beg)
4995{
4996 VALUE end, exclusive;
4997
4998 rb_scan_args(argc, argv, "11", &end, &exclusive);
4999 RETURN_ENUMERATOR(beg, argc, argv);
5000 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5001}
5002
5003VALUE
5004rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5005{
5006 VALUE current, after_end;
5007 ID succ;
5008 int n, ascii;
5009 rb_encoding *enc;
5010
5011 CONST_ID(succ, "succ");
5012 StringValue(end);
5013 enc = rb_enc_check(beg, end);
5014 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5015 /* single character */
5016 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5017 char c = RSTRING_PTR(beg)[0];
5018 char e = RSTRING_PTR(end)[0];
5019
5020 if (c > e || (excl && c == e)) return beg;
5021 for (;;) {
5022 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
5023 if (!excl && c == e) break;
5024 c++;
5025 if (excl && c == e) break;
5026 }
5027 return beg;
5028 }
5029 /* both edges are all digits */
5030 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5031 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5032 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5033 VALUE b, e;
5034 int width;
5035
5036 width = RSTRING_LENINT(beg);
5037 b = rb_str_to_inum(beg, 10, FALSE);
5038 e = rb_str_to_inum(end, 10, FALSE);
5039 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5040 long bi = FIX2LONG(b);
5041 long ei = FIX2LONG(e);
5042 rb_encoding *usascii = rb_usascii_encoding();
5043
5044 while (bi <= ei) {
5045 if (excl && bi == ei) break;
5046 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5047 bi++;
5048 }
5049 }
5050 else {
5051 ID op = excl ? '<' : idLE;
5052 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5053
5054 args[0] = INT2FIX(width);
5055 while (rb_funcall(b, op, 1, e)) {
5056 args[1] = b;
5057 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5058 b = rb_funcallv(b, succ, 0, 0);
5059 }
5060 }
5061 return beg;
5062 }
5063 /* normal case */
5064 n = rb_str_cmp(beg, end);
5065 if (n > 0 || (excl && n == 0)) return beg;
5066
5067 after_end = rb_funcallv(end, succ, 0, 0);
5068 current = str_duplicate(rb_cString, beg);
5069 while (!rb_str_equal(current, after_end)) {
5070 VALUE next = Qnil;
5071 if (excl || !rb_str_equal(current, end))
5072 next = rb_funcallv(current, succ, 0, 0);
5073 if ((*each)(current, arg)) break;
5074 if (NIL_P(next)) break;
5075 current = next;
5076 StringValue(current);
5077 if (excl && rb_str_equal(current, end)) break;
5078 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5079 break;
5080 }
5081
5082 return beg;
5083}
5084
5085VALUE
5086rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5087{
5088 VALUE current;
5089 ID succ;
5090
5091 CONST_ID(succ, "succ");
5092 /* both edges are all digits */
5093 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5094 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5095 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5096 int width = RSTRING_LENINT(beg);
5097 b = rb_str_to_inum(beg, 10, FALSE);
5098 if (FIXNUM_P(b)) {
5099 long bi = FIX2LONG(b);
5100 rb_encoding *usascii = rb_usascii_encoding();
5101
5102 while (FIXABLE(bi)) {
5103 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5104 bi++;
5105 }
5106 b = LONG2NUM(bi);
5107 }
5108 args[0] = INT2FIX(width);
5109 while (1) {
5110 args[1] = b;
5111 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5112 b = rb_funcallv(b, succ, 0, 0);
5113 }
5114 }
5115 /* normal case */
5116 current = str_duplicate(rb_cString, beg);
5117 while (1) {
5118 VALUE next = rb_funcallv(current, succ, 0, 0);
5119 if ((*each)(current, arg)) break;
5120 current = next;
5121 StringValue(current);
5122 if (RSTRING_LEN(current) == 0)
5123 break;
5124 }
5125
5126 return beg;
5127}
5128
5129static int
5130include_range_i(VALUE str, VALUE arg)
5131{
5132 VALUE *argp = (VALUE *)arg;
5133 if (!rb_equal(str, *argp)) return 0;
5134 *argp = Qnil;
5135 return 1;
5136}
5137
5138VALUE
5139rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5140{
5141 beg = rb_str_new_frozen(beg);
5142 StringValue(end);
5143 end = rb_str_new_frozen(end);
5144 if (NIL_P(val)) return Qfalse;
5145 val = rb_check_string_type(val);
5146 if (NIL_P(val)) return Qfalse;
5147 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5148 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5149 rb_enc_asciicompat(STR_ENC_GET(val))) {
5150 const char *bp = RSTRING_PTR(beg);
5151 const char *ep = RSTRING_PTR(end);
5152 const char *vp = RSTRING_PTR(val);
5153 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5154 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5155 return Qfalse;
5156 else {
5157 char b = *bp;
5158 char e = *ep;
5159 char v = *vp;
5160
5161 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5162 if (b <= v && v < e) return Qtrue;
5163 return RBOOL(!RTEST(exclusive) && v == e);
5164 }
5165 }
5166 }
5167#if 0
5168 /* both edges are all digits */
5169 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5170 all_digits_p(bp, RSTRING_LEN(beg)) &&
5171 all_digits_p(ep, RSTRING_LEN(end))) {
5172 /* TODO */
5173 }
5174#endif
5175 }
5176 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5177
5178 return RBOOL(NIL_P(val));
5179}
5180
5181static VALUE
5182rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5183{
5184 if (rb_reg_search(re, str, 0, 0) >= 0) {
5185 VALUE match = rb_backref_get();
5186 int nth = rb_reg_backref_number(match, backref);
5187 return rb_reg_nth_match(nth, match);
5188 }
5189 return Qnil;
5190}
5191
5192static VALUE
5193rb_str_aref(VALUE str, VALUE indx)
5194{
5195 long idx;
5196
5197 if (FIXNUM_P(indx)) {
5198 idx = FIX2LONG(indx);
5199 }
5200 else if (RB_TYPE_P(indx, T_REGEXP)) {
5201 return rb_str_subpat(str, indx, INT2FIX(0));
5202 }
5203 else if (RB_TYPE_P(indx, T_STRING)) {
5204 if (rb_str_index(str, indx, 0) != -1)
5205 return str_duplicate(rb_cString, indx);
5206 return Qnil;
5207 }
5208 else {
5209 /* check if indx is Range */
5210 long beg, len = str_strlen(str, NULL);
5211 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5212 case Qfalse:
5213 break;
5214 case Qnil:
5215 return Qnil;
5216 default:
5217 return rb_str_substr(str, beg, len);
5218 }
5219 idx = NUM2LONG(indx);
5220 }
5221
5222 return str_substr(str, idx, 1, FALSE);
5223}
5224
5225
5226/*
5227 * call-seq:
5228 * string[index] -> new_string or nil
5229 * string[start, length] -> new_string or nil
5230 * string[range] -> new_string or nil
5231 * string[regexp, capture = 0] -> new_string or nil
5232 * string[substring] -> new_string or nil
5233 *
5234 * Returns the substring of +self+ specified by the arguments.
5235 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5236 *
5237 *
5238 */
5239
5240static VALUE
5241rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5242{
5243 if (argc == 2) {
5244 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5245 return rb_str_subpat(str, argv[0], argv[1]);
5246 }
5247 else {
5248 long beg = NUM2LONG(argv[0]);
5249 long len = NUM2LONG(argv[1]);
5250 return rb_str_substr(str, beg, len);
5251 }
5252 }
5253 rb_check_arity(argc, 1, 2);
5254 return rb_str_aref(str, argv[0]);
5255}
5256
/*
 * Removes +len+ leading bytes from +str+ in place and returns +str+.
 *
 * NOTE(review): the line carrying the function name and parameter list is
 * absent from this listing (the embedded numbering jumps 5257 -> 5259).
 * Presumably this is rb_str_drop_bytes(VALUE str, long len), going by the
 * rb_str_drop_bytes(str, len) calls elsewhere in the file -- confirm
 * against the upstream source.
 */
5257VALUE
5259{
5260 char *ptr = RSTRING_PTR(str);
5261 long olen = RSTRING_LEN(str), nlen;
5262
5263 str_modifiable(str);
    /* never drop more than the string currently holds */
5264 if (len > olen) len = olen;
5265 nlen = olen - len;
    /* the remaining bytes plus the terminator fit the embed capacity */
5266 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5267 char *oldptr = ptr;
    /* capture the storage flags before STR_SET_EMBED rewrites them */
5268 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5269 STR_SET_EMBED(str);
5270 STR_SET_EMBED_LEN(str, nlen);
5271 ptr = RSTRING(str)->as.embed.ary;
    /* memmove: when str was already embedded, source and destination overlap */
5272 memmove(ptr, oldptr + len, nlen);
    /* release the old buffer only when NOEMBED was the sole flag set (neither SHARED nor NOFREE) */
5273 if (fl == STR_NOEMBED) xfree(oldptr);
5274 }
5275 else {
    /*
     * Too large to embed.  When str is not already shared,
     * heap_str_make_shared() is called and its result frozen; afterwards
     * the heap pointer is advanced past the dropped bytes (no copy).
     */
5276 if (!STR_SHARED_P(str)) {
5277 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5278 rb_enc_cr_str_exact_copy(shared, str);
5279 OBJ_FREEZE(shared);
5280 }
5281 ptr = RSTRING(str)->as.heap.ptr += len;
5282 RSTRING(str)->as.heap.len = nlen;
5283 }
    /* write a zero byte right after the remaining content */
5284 ptr[nlen] = 0;
    /* NOTE(review): numbering jumps 5284 -> 5286; one line appears to be missing here. */
5286 return str;
5287}
5288
/*
 * Replaces +len+ bytes of +str+ starting at byte offset +beg+ with the
 * bytes of +val+.  +beg+ and +len+ are applied directly to the string's
 * buffer pointer below, so they are byte quantities.
 */
5289static void
5290rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5291{
5292 char *sptr;
5293 long slen, vlen = RSTRING_LEN(val);
5294 int cr;
5295
    /* removing a prefix without inserting anything is delegated to rb_str_drop_bytes() */
5296 if (beg == 0 && vlen == 0) {
5297 rb_str_drop_bytes(str, len);
5298 return;
5299 }
5300
5301 str_modify_keep_cr(str);
5302 RSTRING_GETMEM(str, sptr, slen);
5303 if (len < vlen) {
5304 /* expand string */
5305 RESIZE_CAPA(str, slen + vlen - len);
    /* re-read the buffer pointer after the resize */
5306 sptr = RSTRING_PTR(str);
5307 }
5308
    /*
     * NOTE(review): the listing is incomplete here.  The embedded numbering
     * skips 5309 and 5312, i.e. the `if` condition that pairs with the
     * `else` below and the statement the `else` governs are both missing.
     * As shown, `cr` is only assigned from rb_enc_str_coderange(val).
     * Restore the missing lines from the upstream source.
     */
5310 cr = rb_enc_str_coderange(val);
5311 else
5313
    /* shift the tail that follows the replaced span to its new position */
5314 if (vlen != len) {
5315 memmove(sptr + beg + vlen,
5316 sptr + beg + len,
5317 slen - (beg + len));
5318 }
    /* NOTE(review): zero-fills -len bytes at the old end when len is negative and vlen < beg; intent not evident from here -- confirm. */
5319 if (vlen < beg && len < 0) {
5320 MEMZERO(sptr + slen, char, -len);
5321 }
    /* copy the replacement bytes into the gap */
5322 if (vlen > 0) {
5323 memmove(sptr + beg, RSTRING_PTR(val), vlen);
5324 }
    /* record the new length, re-terminate, and store the code range chosen above */
5325 slen += vlen - len;
5326 STR_SET_LEN(str, slen);
5327 TERM_FILL(&sptr[slen], TERM_LEN(str));
5328 ENC_CODERANGE_SET(str, cr);
5329}
5330
/*
 * Replaces the part of +str+ that starts at index +beg+ and spans +len+
 * units with +val+ (converted with StringValue).  +beg+ and +len+ are
 * measured against str_strlen() and translated to byte positions with
 * str_nth() before rb_str_splice_0() is called.
 *
 * Raises IndexError when +len+ is negative or when +beg+ lies outside the
 * string.  A negative +beg+ counts from the end.  +len+ is clamped to what
 * remains after +beg+.
 */
5331void
5332rb_str_update(VALUE str, long beg, long len, VALUE val)
5333{
5334 long slen;
5335 char *p, *e;
5336 rb_encoding *enc;
5337 int singlebyte = single_byte_optimizable(str);
5338 int cr;
5339
5340 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5341
5342 StringValue(val);
5343 enc = rb_enc_check(str, val);
5344 slen = str_strlen(str, enc); /* rb_enc_check */
5345
    /* reject indexes past the end, or further back than the start */
5346 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5347 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5348 }
5349 if (beg < 0) {
5350 beg += slen;
5351 }
5352 assert(beg >= 0);
5353 assert(beg <= slen);
5354 if (len > slen - beg) {
5355 len = slen - beg;
5356 }
    /* locate the start and end of the span; a NULL result from str_nth() is replaced by the end of the string */
5357 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5358 if (!p) p = RSTRING_END(str);
5359 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5360 if (!e) e = RSTRING_END(str);
5361 /* error check */
5362 beg = p - RSTRING_PTR(str); /* physical position */
5363 len = e - p; /* physical length */
5364 rb_str_splice_0(str, beg, len, val);
5365 rb_enc_associate(str, enc);
    /*
     * NOTE(review): numbering jumps 5365 -> 5367.  No visible line assigns
     * `cr` before it is read below, so the missing line is presumably that
     * assignment -- restore it from the upstream source.
     */
5367 if (cr != ENC_CODERANGE_BROKEN)
5368 ENC_CODERANGE_SET(str, cr);
5369}
5370
/* File-local alias: rb_str_splice(...) expands to rb_str_update(...) with the same four arguments. */
5371#define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5372
5373static void
5374rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5375{
5376 int nth;
5377 VALUE match;
5378 long start, end, len;
5379 rb_encoding *enc;
5380 struct re_registers *regs;
5381
5382 if (rb_reg_search(re, str, 0, 0) < 0) {
5383 rb_raise(rb_eIndexError, "regexp not matched");
5384 }
5385 match = rb_backref_get();
5386 nth = rb_reg_backref_number(match, backref);
5387 regs = RMATCH_REGS(match);
5388 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5389 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5390 }
5391 if (nth < 0) {
5392 nth += regs->num_regs;
5393 }
5394
5395 start = BEG(nth);
5396 if (start == -1) {
5397 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5398 }
5399 end = END(nth);
5400 len = end - start;
5401 StringValue(val);
5402 enc = rb_enc_check_str(str, val);
5403 rb_str_splice_0(str, start, len, val);
5404 rb_enc_associate(str, enc);
5405}
5406
5407static VALUE
5408rb_str_aset(VALUE str, VALUE indx, VALUE val)
5409{
5410 long idx, beg;
5411
5412 switch (TYPE(indx)) {
5413 case T_REGEXP:
5414 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5415 return val;
5416
5417 case T_STRING:
5418 beg = rb_str_index(str, indx, 0);
5419 if (beg < 0) {
5420 rb_raise(rb_eIndexError, "string not matched");
5421 }
5422 beg = rb_str_sublen(str, beg);
5423 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5424 return val;
5425
5426 default:
5427 /* check if indx is Range */
5428 {
5429 long beg, len;
5430 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5431 rb_str_splice(str, beg, len, val);
5432 return val;
5433 }
5434 }
5435 /* FALLTHROUGH */
5436
5437 case T_FIXNUM:
5438 idx = NUM2LONG(indx);
5439 rb_str_splice(str, idx, 1, val);
5440 return val;
5441 }
5442}
5443
5444/*
5445 * call-seq:
5446 * string[index] = new_string
5447 * string[start, length] = new_string
5448 * string[range] = new_string
5449 * string[regexp, capture = 0] = new_string
5450 * string[substring] = new_string
5451 *
5452 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5453 * See {String Slices}[rdoc-ref:String@String+Slices].
5454 *
5455 * A few examples:
5456 *
5457 * s = 'foo'
5458 * s[2] = 'rtune' # => "rtune"
5459 * s # => "fortune"
5460 * s[1, 5] = 'init' # => "init"
5461 * s # => "finite"
5462 * s[3..4] = 'al' # => "al"
5463 * s # => "finale"
5464 * s[/e$/] = 'ly' # => "ly"
5465 * s # => "finally"
5466 * s['lly'] = 'ncial' # => "ncial"
5467 * s # => "financial"
5468 *
5469 * String#slice is an alias for String#[].
5470 *
5471 */
5472
5473static VALUE
5474rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5475{
5476 if (argc == 3) {
5477 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5478 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5479 }
5480 else {
5481 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5482 }
5483 return argv[2];
5484 }
5485 rb_check_arity(argc, 2, 3);
5486 return rb_str_aset(str, argv[0], argv[1]);
5487}
5488
5489/*
5490 * call-seq:
5491 * insert(index, other_string) -> self
5492 *
5493 * Inserts the given +other_string+ into +self+; returns +self+.
5494 *
5495 * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5496 *
5497 * 'foo'.insert(1, 'bar') # => "fbaroo"
5498 *
5499 * If the \Integer +index+ is negative, counts backward from the end of +self+
5500 * and inserts +other_string+ at offset <tt>index+1</tt>
5501 * (that is, _after_ <tt>self[index]</tt>):
5502 *
5503 * 'foo'.insert(-2, 'bar') # => "fobaro"
5504 *
5505 */
5506
5507static VALUE
5508rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5509{
5510 long pos = NUM2LONG(idx);
5511
5512 if (pos == -1) {
5513 return rb_str_append(str, str2);
5514 }
5515 else if (pos < 0) {
5516 pos++;
5517 }
5518 rb_str_splice(str, pos, 0, str2);
5519 return str;
5520}
5521
5522
5523/*
5524 * call-seq:
5525 * slice!(index) -> new_string or nil
5526 * slice!(start, length) -> new_string or nil
5527 * slice!(range) -> new_string or nil
5528 * slice!(regexp, capture = 0) -> new_string or nil
5529 * slice!(substring) -> new_string or nil
5530 *
5531 * Removes and returns the substring of +self+ specified by the arguments.
5532 * See {String Slices}[rdoc-ref:String@String+Slices].
5533 *
5534 * A few examples:
5535 *
5536 * string = "This is a string"
5537 * string.slice!(2) #=> "i"
5538 * string.slice!(3..6) #=> " is "
5539 * string.slice!(/s.*t/) #=> "sa st"
5540 * string.slice!("r") #=> "r"
5541 * string #=> "Thing"
5542 *
5543 */
5544
5545static VALUE
5546rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5547{
    /*
     * Works in three stages, entered through the labels below:
     *   num_index: turn (beg, len) into a position inside the buffer via rb_str_subpos();
     *   subseq:    build the return value from len bytes at offset beg;
     *   squash:    close the gap left in str by the removed bytes.
     * Each argument kind jumps to the first stage it still needs.
     */
5548 VALUE result = Qnil;
5549 VALUE indx;
5550 long beg, len = 1;
5551 char *p;
5552
5553 rb_check_arity(argc, 1, 2);
5554 str_modify_keep_cr(str);
5555 indx = argv[0];
5556 if (RB_TYPE_P(indx, T_REGEXP)) {
    /* regexp [, capture]: nil when there is no match or the capture number is out of range */
5557 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5558 VALUE match = rb_backref_get();
5559 struct re_registers *regs = RMATCH_REGS(match);
5560 int nth = 0;
5561 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
    /* a negative capture number counts back from the number of registers */
5562 if ((nth += regs->num_regs) <= 0) return Qnil;
5563 }
5564 else if (nth >= regs->num_regs) return Qnil;
5565 beg = BEG(nth);
5566 len = END(nth) - beg;
5567 goto subseq;
5568 }
5569 else if (argc == 2) {
    /* (start, length) */
5570 beg = NUM2LONG(indx);
5571 len = NUM2LONG(argv[1]);
5572 goto num_index;
5573 }
5574 else if (FIXNUM_P(indx)) {
    /* single index: len is still 1 here; nil when out of range or nothing is selected */
5575 beg = FIX2LONG(indx);
5576 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5577 if (!len) return Qnil;
5578 beg = p - RSTRING_PTR(str);
5579 goto subseq;
5580 }
5581 else if (RB_TYPE_P(indx, T_STRING)) {
    /* substring: the result is a copy of the argument, so the subseq stage is skipped */
5582 beg = rb_str_index(str, indx, 0);
5583 if (beg == -1) return Qnil;
5584 len = RSTRING_LEN(indx);
5585 result = str_duplicate(rb_cString, indx);
5586 goto squash;
5587 }
5588 else {
    /* Range, or any other object NUM2LONG accepts as an index */
5589 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5590 case Qnil:
5591 return Qnil;
5592 case Qfalse:
5593 beg = NUM2LONG(indx);
5594 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5595 if (!len) return Qnil;
5596 beg = p - RSTRING_PTR(str);
5597 goto subseq;
5598 default:
5599 goto num_index;
5600 }
5601 }
5602
5603 num_index:
5604 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
    /* from here on beg is an offset from the start of the buffer */
5605 beg = p - RSTRING_PTR(str);
5606
5607 subseq:
5608 result = rb_str_new(RSTRING_PTR(str)+beg, len);
5609 rb_enc_cr_str_copy_for_substr(result, str);
5610
5611 squash:
5612 if (len > 0) {
5613 if (beg == 0) {
    /* removing a prefix is delegated to rb_str_drop_bytes() */
5614 rb_str_drop_bytes(str, len);
5615 }
5616 else {
5617 char *sptr = RSTRING_PTR(str);
5618 long slen = RSTRING_LEN(str);
5619 if (beg + len > slen) /* pathological check */
5620 len = slen - beg;
    /* slide the tail down over the removed bytes, then shorten and re-terminate */
5621 memmove(sptr + beg,
5622 sptr + beg + len,
5623 slen - (beg + len));
5624 slen -= len;
5625 STR_SET_LEN(str, slen);
5626 TERM_FILL(&sptr[slen], TERM_LEN(str));
5627 }
5628 }
5629 return result;
5630}
5631
5632static VALUE
5633get_pat(VALUE pat)
5634{
5635 VALUE val;
5636
5637 switch (OBJ_BUILTIN_TYPE(pat)) {
5638 case T_REGEXP:
5639 return pat;
5640
5641 case T_STRING:
5642 break;
5643
5644 default:
5645 val = rb_check_string_type(pat);
5646 if (NIL_P(val)) {
5647 Check_Type(pat, T_REGEXP);
5648 }
5649 pat = val;
5650 }
5651
5652 return rb_reg_regcomp(pat);
5653}
5654
5655static VALUE
5656get_pat_quoted(VALUE pat, int check)
5657{
5658 VALUE val;
5659
5660 switch (OBJ_BUILTIN_TYPE(pat)) {
5661 case T_REGEXP:
5662 return pat;
5663
5664 case T_STRING:
5665 break;
5666
5667 default:
5668 val = rb_check_string_type(pat);
5669 if (NIL_P(val)) {
5670 Check_Type(pat, T_REGEXP);
5671 }
5672 pat = val;
5673 }
5674 if (check && is_broken_string(pat)) {
5675 rb_exc_raise(rb_reg_check_preprocess(pat));
5676 }
5677 return pat;
5678}
5679
/*
 * Searches +str+ for +pat+ starting at offset +pos+ and returns the value
 * produced by the underlying search (callers in this file treat a negative
 * result as "no match").
 *
 * A String pattern is looked up with rb_strseq_index().  When
 * +set_backref_str+ is non-zero and the pattern was found, the match is
 * recorded through rb_backref_set_string() on the result of
 * rb_str_new_frozen_String(str).  Any other pattern is handed to
 * rb_reg_search0().
 */
5680static long
5681rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5682{
5683 if (BUILTIN_TYPE(pat) == T_STRING) {
5684 pos = rb_strseq_index(str, pat, pos, 1);
5685 if (set_backref_str) {
5686 if (pos >= 0) {
5687 str = rb_str_new_frozen_String(str);
5688 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5689 }
5690 else {
    /* NOTE(review): numbering jumps 5690 -> 5692; the statement for the "not found" case is missing from this listing. */
5692 }
5693 }
5694 return pos;
5695 }
5696 else {
5697 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5698 }
5699}
5700
5701
5702/*
5703 * call-seq:
5704 * sub!(pattern, replacement) -> self or nil
5705 * sub!(pattern) {|match| ... } -> self or nil
5706 *
5707 * Returns +self+ with only the first occurrence
5708 * (not all occurrences) of the given +pattern+ replaced.
5709 *
5710 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5711 *
5712 * Related: String#sub, String#gsub, String#gsub!.
5713 *
5714 */
5715
5716static VALUE
5717rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5718{
    /*
     * Replaces the first occurrence of the pattern in str in place.
     * Returns str when a replacement happened and nil when the pattern
     * was not found.
     */
5719 VALUE pat, repl, hash = Qnil;
5720 int iter = 0;
5721 long plen;
    /* with a block the replacement argument may be omitted */
5722 int min_arity = rb_block_given_p() ? 1 : 2;
5723 long beg;
5724
5725 rb_check_arity(argc, min_arity, 2);
5726 if (argc == 1) {
    /* block form: the replacement comes from yielding the match */
5727 iter = 1;
5728 }
5729 else {
    /* the second argument is either a Hash (looked up per match) or must convert to a String */
5730 repl = argv[1];
5731 hash = rb_check_hash_type(argv[1]);
5732 if (NIL_P(hash)) {
5733 StringValue(repl);
5734 }
5735 }
5736
5737 pat = get_pat_quoted(argv[0], 1);
5738
5739 str_modifiable(str);
5740 beg = rb_pat_search(pat, str, 0, 1);
5741 if (beg >= 0) {
5742 rb_encoding *enc;
5743 int cr = ENC_CODERANGE(str);
5744 long beg0, end0;
5745 VALUE match, match0 = Qnil;
5746 struct re_registers *regs;
5747 char *p, *rp;
5748 long len, rlen;
5749
5750 match = rb_backref_get();
5751 regs = RMATCH_REGS(match);
    /* beg0/end0 delimit the matched span; for a String pattern they follow from beg and the pattern length */
5752 if (RB_TYPE_P(pat, T_STRING)) {
5753 beg0 = beg;
5754 end0 = beg0 + RSTRING_LEN(pat);
5755 match0 = pat;
5756 }
5757 else {
5758 beg0 = BEG(0);
5759 end0 = END(0);
5760 if (iter) match0 = rb_reg_nth_match(0, match);
5761 }
5762
5763 if (iter || !NIL_P(hash)) {
    /* remember pointer and length so str_mod_check() can compare them after user code has run */
5764 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5765
5766 if (iter) {
5767 repl = rb_obj_as_string(rb_yield(match0));
5768 }
5769 else {
5770 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5771 repl = rb_obj_as_string(repl);
5772 }
5773 str_mod_check(str, p, len);
5774 rb_check_frozen(str);
5775 }
5776 else {
    /* String replacement: processed by rb_reg_regsub() against the match registers */
5777 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5778 }
5779
5780 enc = rb_enc_compatible(str, repl);
5781 if (!enc) {
    /*
     * No common encoding.  The replacement is still accepted when the
     * parts of str before and after the match scan as 7-bit; the result
     * then takes the replacement's encoding.  Otherwise raise.
     */
5782 rb_encoding *str_enc = STR_ENC_GET(str);
5783 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5784 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5785 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5786 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5787 rb_enc_name(str_enc),
5788 rb_enc_name(STR_ENC_GET(repl)));
5789 }
5790 enc = STR_ENC_GET(repl);
5791 }
5792 rb_str_modify(str);
5793 rb_enc_associate(str, enc);
    /*
     * NOTE(review): the listing is incomplete in this section.  The
     * embedded numbering skips 5794 (presumably the condition opening the
     * block closed at 5801) and 5798 (the statement governed by the `if`
     * at 5796-5797).  Restore both from the upstream source before
     * relying on the code-range logic below.
     */
5795 int cr2 = ENC_CODERANGE(repl);
5796 if (cr2 == ENC_CODERANGE_BROKEN ||
5797 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5799 else
5800 cr = cr2;
5801 }
    /* splice: plen bytes at beg0 are replaced by the rlen bytes of repl */
5802 plen = end0 - beg0;
5803 rlen = RSTRING_LEN(repl);
5804 len = RSTRING_LEN(str);
5805 if (rlen > plen) {
5806 RESIZE_CAPA(str, len + rlen - plen);
5807 }
    /* the buffer pointer is fetched only after the possible resize */
5808 p = RSTRING_PTR(str);
5809 if (rlen != plen) {
5810 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5811 }
5812 rp = RSTRING_PTR(repl);
5813 memmove(p + beg0, rp, rlen);
5814 len += rlen - plen;
5815 STR_SET_LEN(str, len);
5816 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5817 ENC_CODERANGE_SET(str, cr);
5818
5819 return str;
5820 }
5821 return Qnil;
5822}
5823
5824
5825/*
5826 * call-seq:
5827 * sub(pattern, replacement) -> new_string
5828 * sub(pattern) {|match| ... } -> new_string
5829 *
5830 * Returns a copy of +self+ with only the first occurrence
5831 * (not all occurrences) of the given +pattern+ replaced.
5832 *
5833 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5834 *
5835 * Related: String#sub!, String#gsub, String#gsub!.
5836 *
5837 */
5838
5839static VALUE
5840rb_str_sub(int argc, VALUE *argv, VALUE str)
5841{
5842 str = str_duplicate(rb_cString, str);
5843 rb_str_sub_bang(argc, argv, str);
5844 return str;
5845}
5846
/*
 * Shared implementation of String#gsub (bang == 0) and String#gsub!
 * (bang != 0).  Builds the result in a fresh buffer (dest) by alternately
 * appending the unmatched text and the replacement for each match.
 *
 * Returns an enumerator when called with one argument and no block (via
 * RETURN_ENUMERATOR).  When nothing matches, returns nil for the bang form
 * and a copy of str otherwise.
 */
5847static VALUE
5848str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5849{
5850 VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5851 struct re_registers *regs;
5852 long beg, beg0, end0;
5853 long offset, blen, slen, len, last;
    /* STR: String replacement, ITER: block, MAP: Hash lookup */
5854 enum {STR, ITER, MAP} mode = STR;
5855 char *sp, *cp;
    /* -1: not yet known; settled after the first rb_reg_regsub() call below */
5856 int need_backref = -1;
5857 rb_encoding *str_enc;
5858
5859 switch (argc) {
5860 case 1:
5861 RETURN_ENUMERATOR(str, argc, argv);
5862 mode = ITER;
5863 break;
5864 case 2:
5865 repl = argv[1];
5866 hash = rb_check_hash_type(argv[1]);
5867 if (NIL_P(hash)) {
5868 StringValue(repl);
5869 }
5870 else {
5871 mode = MAP;
5872 }
5873 break;
5874 default:
5875 rb_error_arity(argc, 1, 2);
5876 }
5877
5878 pat = get_pat_quoted(argv[0], 1);
5879 beg = rb_pat_search(pat, str, 0, need_backref);
5880 if (beg < 0) {
5881 if (bang) return Qnil; /* no match, no substitution */
5882 return str_duplicate(rb_cString, str);
5883 }
5884
5885 offset = 0;
5886 blen = RSTRING_LEN(str) + 30; /* len + margin */
5887 dest = rb_str_buf_new(blen);
    /* sp/slen snapshot the buffer so str_mod_check() can compare them after user code has run */
5888 sp = RSTRING_PTR(str);
5889 slen = RSTRING_LEN(str);
5890 cp = sp;
5891 str_enc = STR_ENC_GET(str);
5892 rb_enc_associate(dest, str_enc);
    /* NOTE(review): numbering jumps 5892 -> 5894; one line appears to be missing from this listing here. */
5894
5895 do {
5896 match = rb_backref_get();
5897 regs = RMATCH_REGS(match);
    /* beg0/end0 delimit the current match */
5898 if (RB_TYPE_P(pat, T_STRING)) {
5899 beg0 = beg;
5900 end0 = beg0 + RSTRING_LEN(pat);
5901 match0 = pat;
5902 }
5903 else {
5904 beg0 = BEG(0);
5905 end0 = END(0);
5906 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5907 }
5908
    /* mode is non-zero for ITER and MAP: the replacement comes from user code */
5909 if (mode) {
5910 if (mode == ITER) {
5911 val = rb_obj_as_string(rb_yield(match0));
5912 }
5913 else {
5914 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5915 val = rb_obj_as_string(val);
5916 }
5917 str_mod_check(str, sp, slen);
5918 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5919 rb_raise(rb_eRuntimeError, "block should not cheat");
5920 }
5921 }
5922 else if (need_backref) {
5923 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
    /* first pass only: if rb_reg_regsub() returned repl itself, later passes use repl directly */
5924 if (need_backref < 0) {
5925 need_backref = val != repl;
5926 }
5927 }
5928 else {
5929 val = repl;
5930 }
5931
5932 len = beg0 - offset; /* copy pre-match substr */
5933 if (len) {
5934 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5935 }
5936
5937 rb_str_buf_append(dest, val);
5938
    /* remember where this search started; it is replayed after the loop */
5939 last = offset;
5940 offset = end0;
5941 if (beg0 == end0) {
5942 /*
5943 * Always consume at least one character of the input string
5944 * in order to prevent infinite loops.
5945 */
5946 if (RSTRING_LEN(str) <= end0) break;
5947 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5948 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5949 offset = end0 + len;
5950 }
5951 cp = RSTRING_PTR(str) + offset;
5952 if (offset > RSTRING_LEN(str)) break;
5953 beg = rb_pat_search(pat, str, offset, need_backref);
5954 } while (beg >= 0);
    /* append whatever follows the last match */
5955 if (RSTRING_LEN(str) > offset) {
5956 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5957 }
    /* repeat the last successful search with set_backref_str = 1 (presumably to leave the backref describing that match -- confirm) */
5958 rb_pat_search(pat, str, last, 1);
5959 if (bang) {
5960 str_shared_replace(str, dest);
5961 }
5962 else {
5963 str = dest;
5964 }
5965
5966 return str;
5967}
5968
5969
5970/*
5971 * call-seq:
5972 * gsub!(pattern, replacement) -> self or nil
5973 * gsub!(pattern) {|match| ... } -> self or nil
5974 * gsub!(pattern) -> an_enumerator
5975 *
5976 * Performs the specified substring replacement(s) on +self+;
5977 * returns +self+ if any replacement occurred, +nil+ otherwise.
5978 *
5979 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5980 *
5981 * Returns an Enumerator if no +replacement+ and no block given.
5982 *
5983 * Related: String#sub, String#gsub, String#sub!.
5984 *
5985 */
5986
5987static VALUE
5988rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5989{
5990 str_modify_keep_cr(str);
5991 return str_gsub(argc, argv, str, 1);
5992}
5993
5994
5995/*
5996 * call-seq:
5997 * gsub(pattern, replacement) -> new_string
5998 * gsub(pattern) {|match| ... } -> new_string
5999 * gsub(pattern) -> enumerator
6000 *
6001 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6002 *
6003 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6004 *
6005 * Returns an Enumerator if no +replacement+ and no block given.
6006 *
6007 * Related: String#sub, String#sub!, String#gsub!.
6008 *
6009 */
6010
6011static VALUE
6012rb_str_gsub(int argc, VALUE *argv, VALUE str)
6013{
6014 return str_gsub(argc, argv, str, 0);
6015}
6016
6017
6018/*
6019 * call-seq:
6020 * replace(other_string) -> self
6021 *
6022 * Replaces the contents of +self+ with the contents of +other_string+:
6023 *
6024 * s = 'foo' # => "foo"
6025 * s.replace('bar') # => "bar"
6026 *
6027 */
6028
6029VALUE
6031{
    /*
     * NOTE(review): the line carrying the function name and parameter list
     * is absent from this listing (numbering jumps 6029 -> 6031).  The body
     * uses `str` and `str2`, and the preceding rdoc block documents
     * String#replace, so this is presumably
     * rb_str_replace(VALUE str, VALUE str2) -- confirm against upstream.
     */
6032 str_modifiable(str);
    /* replacing a string with itself is a no-op */
6033 if (str == str2) return str;
6034
6035 StringValue(str2);
    /* drop the current contents, then take over those of str2 */
6036 str_discard(str);
6037 return str_replace(str, str2);
6038}
6039
6040/*
6041 * call-seq:
6042 * clear -> self
6043 *
6044 * Removes the contents of +self+:
6045 *
6046 * s = 'foo' # => "foo"
6047 * s.clear # => ""
6048 *
6049 */
6050
6051static VALUE
6052rb_str_clear(VALUE str)
6053{
    /* Discard the current contents and reset str to an embedded string of length 0. */
6054 str_discard(str);
6055 STR_SET_EMBED(str);
6056 STR_SET_EMBED_LEN(str, 0);
6057 RSTRING_PTR(str)[0] = 0;
    /*
     * NOTE(review): the listing is incomplete here.  The numbering skips
     * 6059 and 6061, i.e. the statements governed by the `if` and by the
     * `else` are both missing; as shown the code is not well-formed.
     * Restore them from the upstream source.
     */
6058 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6060 else
6062 return str;
6063}
6064
6065/*
6066 * call-seq:
6067 * chr -> string
6068 *
6069 * Returns a string containing the first character of +self+:
6070 *
6071 * s = 'foo' # => "foo"
6072 * s.chr # => "f"
6073 *
6074 */
6075
6076static VALUE
6077rb_str_chr(VALUE str)
6078{
6079 return rb_str_substr(str, 0, 1);
6080}
6081
6082/*
6083 * call-seq:
6084 * getbyte(index) -> integer or nil
6085 *
6086 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6087 *
6088 * s = 'abcde' # => "abcde"
6089 * s.getbyte(0) # => 97
6090 * s.getbyte(-1) # => 101
6091 * s.getbyte(5) # => nil
6092 *
6093 * Related: String#setbyte.
6094 */
6095static VALUE
6096rb_str_getbyte(VALUE str, VALUE index)
6097{
6098 long pos = NUM2LONG(index);
6099
6100 if (pos < 0)
6101 pos += RSTRING_LEN(str);
6102 if (pos < 0 || RSTRING_LEN(str) <= pos)
6103 return Qnil;
6104
6105 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6106}
6107
6108/*
6109 * call-seq:
6110 * setbyte(index, integer) -> integer
6111 *
6112 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6113 *
6114 * s = 'abcde' # => "abcde"
6115 * s.setbyte(0, 98) # => 98
6116 * s # => "bbcde"
6117 *
6118 * Related: String#getbyte.
6119 */
6120static VALUE
6121rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6122{
    /*
     * Stores the low 8 bits of value at byte offset index and returns
     * value.  Raises IndexError when index is outside -len...len.
     *
     * NOTE(review): the code-range bookkeeping in the middle of this
     * function is incomplete in this listing; see the notes below.
     */
6123 long pos = NUM2LONG(index);
6124 long len = RSTRING_LEN(str);
6125 char *ptr, *head, *left = 0;
6126 rb_encoding *enc;
6127 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6128
6129 if (pos < -len || len <= pos)
6130 rb_raise(rb_eIndexError, "index %ld out of string", pos);
    /* a negative index counts from the end */
6131 if (pos < 0)
6132 pos += len;
6133
    /* convert to Integer, keep only the low byte */
6134 VALUE v = rb_to_int(value);
6135 VALUE w = rb_int_and(v, INT2FIX(0xff));
6136 char byte = (char)(NUM2INT(w) & 0xFF);
6137
6138 if (!str_independent(str))
6139 str_make_independent(str);
6140 enc = STR_ENC_GET(str);
6141 head = RSTRING_PTR(str);
6142 ptr = &head[pos];
6143 if (!STR_EMBED_P(str)) {
6144 cr = ENC_CODERANGE(str);
6145 switch (cr) {
6146 case ENC_CODERANGE_7BIT:
6147 left = ptr;
6148 *ptr = byte;
    /* an ASCII byte written into a 7-bit string needs no further bookkeeping */
6149 if (ISASCII(byte)) goto end;
6150 nlen = rb_enc_precise_mbclen(left, head+len, enc);
    /*
     * NOTE(review): numbering skips 6152, 6154 and 6156.  The statements
     * governed by the `if` and the `else` below are missing, and so is the
     * line before 6157 (presumably the next `case` label of this switch).
     * Restore them from the upstream source.
     */
6151 if (!MBCLEN_CHARFOUND_P(nlen))
6153 else
6155 goto end;
    /* measure the character containing ptr before and after the store, to see whether the write changed it */
6157 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6158 width = rb_enc_precise_mbclen(left, head+len, enc);
6159 *ptr = byte;
6160 nlen = rb_enc_precise_mbclen(left, head+len, enc);
    /* NOTE(review): numbering skips 6162 and 6164; the statements governed by the `if` and the `else if` below are missing. */
6161 if (!MBCLEN_CHARFOUND_P(nlen))
6163 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6165 goto end;
6166 }
6167 }
    /* NOTE(review): numbering skips 6168; one line before the fallback store appears to be missing. */
6169 *ptr = byte;
6170
6171 end:
6172 return value;
6173}
6174
/*
 * Returns the part of +str+ that starts at byte offset +beg+ and spans
 * +len+ bytes, as a new string carrying the encoding of +str+.
 *
 * Returns nil when +beg+ is past the end, when +len+ is negative, or when
 * a negative +beg+ reaches before the start.  A negative +beg+ counts from
 * the end, and +len+ is clamped to the bytes available.  When the
 * resulting length is zero or less, the result is nil if +empty+ is zero
 * and an empty string otherwise.
 */
6175static VALUE
6176str_byte_substr(VALUE str, long beg, long len, int empty)
6177{
6178 long n = RSTRING_LEN(str);
6179
6180 if (beg > n || len < 0) return Qnil;
6181 if (beg < 0) {
6182 beg += n;
6183 if (beg < 0) return Qnil;
6184 }
6185 if (len > n - beg)
6186 len = n - beg;
6187 if (len <= 0) {
6188 if (!empty) return Qnil;
6189 len = 0;
6190 }
6191
6192 VALUE str2 = str_subseq(str, beg, len);
6193
6194 str_enc_copy(str2, str);
6195
    /*
     * NOTE(review): the remainder of this function is incomplete in this
     * listing.  The numbering skips 6198, 6200, 6205 and 6208, i.e. every
     * statement governed by the if/else and by the two switch labels below
     * is missing; only the control skeleton is left.  Restore the
     * statements from the upstream source.
     */
6196 if (RSTRING_LEN(str2) == 0) {
6197 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6199 else
6201 }
6202 else {
6203 switch (ENC_CODERANGE(str)) {
6204 case ENC_CODERANGE_7BIT:
6206 break;
6207 default:
6209 break;
6210 }
6211 }
6212
6213 return str2;
6214}
6215
6216static VALUE
6217str_byte_aref(VALUE str, VALUE indx)
6218{
6219 long idx;
6220 if (FIXNUM_P(indx)) {
6221 idx = FIX2LONG(indx);
6222 }
6223 else {
6224 /* check if indx is Range */
6225 long beg, len = RSTRING_LEN(str);
6226
6227 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6228 case Qfalse:
6229 break;
6230 case Qnil:
6231 return Qnil;
6232 default:
6233 return str_byte_substr(str, beg, len, TRUE);
6234 }
6235
6236 idx = NUM2LONG(indx);
6237 }
6238 return str_byte_substr(str, idx, 1, FALSE);
6239}
6240
6241/*
6242 * call-seq:
6243 * byteslice(index, length = 1) -> string or nil
6244 * byteslice(range) -> string or nil
6245 *
6246 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6247 *
6248 * With integer arguments +index+ and +length+ given,
6249 * returns the substring beginning at the given +index+
6250 * of the given +length+ (if possible),
6251 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6252 *
6253 * s = '0123456789' # => "0123456789"
6254 * s.byteslice(2) # => "2"
6255 * s.byteslice(200) # => nil
6256 * s.byteslice(4, 3) # => "456"
6257 * s.byteslice(4, 30) # => "456789"
6258 * s.byteslice(4, -1) # => nil
6259 * s.byteslice(40, 2) # => nil
6260 *
6261 * In either case above, counts backwards from the end of +self+
6262 * if +index+ is negative:
6263 *
6264 * s = '0123456789' # => "0123456789"
6265 * s.byteslice(-4) # => "6"
6266 * s.byteslice(-4, 3) # => "678"
6267 *
6268 * With Range argument +range+ given, returns
6269 * <tt>byteslice(range.begin, range.size)</tt>:
6270 *
6271 * s = '0123456789' # => "0123456789"
6272 * s.byteslice(4..6) # => "456"
6273 * s.byteslice(-6..-4) # => "456"
6274 * s.byteslice(5..2) # => "" # range.size is zero.
6275 * s.byteslice(40..42) # => nil
6276 *
6277 * In all cases, a returned string has the same encoding as +self+:
6278 *
6279 * s.encoding # => #<Encoding:UTF-8>
6280 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6281 *
6282 */
6283
6284static VALUE
6285rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6286{
6287 if (argc == 2) {
6288 long beg = NUM2LONG(argv[0]);
6289 long len = NUM2LONG(argv[1]);
6290 return str_byte_substr(str, beg, len, TRUE);
6291 }
6292 rb_check_arity(argc, 1, 2);
6293 return str_byte_aref(str, argv[0]);
6294}
6295
6296/*
6297 * call-seq:
6298 * bytesplice(index, length, str) -> string
6299 * bytesplice(range, str) -> string
6300 *
6301 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6302 * The portion of the string affected is determined using
6303 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6304 * If the replacement string is not the same length as the text it is replacing,
6305 * the string will be adjusted accordingly.
6306 * The form that takes an Integer will raise an IndexError if the value is out
6307 * of range; the Range form will raise a RangeError.
6308 * If the beginning or ending offset does not land on character (codepoint)
6309 * boundary, an IndexError will be raised.
6310 */
6311
6312static VALUE
6313rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6314{
6315 long beg, end, len, slen;
6316 VALUE val;
6317 rb_encoding *enc;
6318 int cr;
6319
6320 rb_check_arity(argc, 2, 3);
6321 if (argc == 2) {
6322 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6323 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6324 rb_builtin_class_name(argv[0]));
6325 }
6326 val = argv[1];
6327 }
6328 else {
6329 beg = NUM2LONG(argv[0]);
6330 len = NUM2LONG(argv[1]);
6331 val = argv[2];
6332 }
6333 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
6334 slen = RSTRING_LEN(str);
6335 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
6336 rb_raise(rb_eIndexError, "index %ld out of string", beg);
6337 }
6338 if (beg < 0) {
6339 beg += slen;
6340 }
6341 assert(beg >= 0);
6342 assert(beg <= slen);
6343 if (len > slen - beg) {
6344 len = slen - beg;
6345 }
6346 end = beg + len;
6347 if (!str_check_byte_pos(str, beg)) {
6349 "offset %ld does not land on character boundary", beg);
6350 }
6351 if (!str_check_byte_pos(str, end)) {
6353 "offset %ld does not land on character boundary", end);
6354 }
6355 StringValue(val);
6356 enc = rb_enc_check(str, val);
6357 str_modify_keep_cr(str);
6358 rb_str_splice_0(str, beg, len, val);
6359 rb_enc_associate(str, enc);
6361 if (cr != ENC_CODERANGE_BROKEN)
6362 ENC_CODERANGE_SET(str, cr);
6363 return str;
6364}
6365
6366/*
6367 * call-seq:
6368 * reverse -> string
6369 *
6370 * Returns a new string with the characters from +self+ in reverse order.
6371 *
6372 * 'stressed'.reverse # => "desserts"
6373 *
6374 */
6375
6376static VALUE
6377rb_str_reverse(VALUE str)
6378{
6379 rb_encoding *enc;
6380 VALUE rev;
6381 char *s, *e, *p;
6382 int cr;
6383
6384 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6385 enc = STR_ENC_GET(str);
6386 rev = rb_str_new(0, RSTRING_LEN(str));
6387 s = RSTRING_PTR(str); e = RSTRING_END(str);
6388 p = RSTRING_END(rev);
6389 cr = ENC_CODERANGE(str);
6390
6391 if (RSTRING_LEN(str) > 1) {
6392 if (single_byte_optimizable(str)) {
6393 while (s < e) {
6394 *--p = *s++;
6395 }
6396 }
6397 else if (cr == ENC_CODERANGE_VALID) {
6398 while (s < e) {
6399 int clen = rb_enc_fast_mbclen(s, e, enc);
6400
6401 p -= clen;
6402 memcpy(p, s, clen);
6403 s += clen;
6404 }
6405 }
6406 else {
6407 cr = rb_enc_asciicompat(enc) ?
6409 while (s < e) {
6410 int clen = rb_enc_mbclen(s, e, enc);
6411
6412 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6413 p -= clen;
6414 memcpy(p, s, clen);
6415 s += clen;
6416 }
6417 }
6418 }
6419 STR_SET_LEN(rev, RSTRING_LEN(str));
6420 str_enc_copy(rev, str);
6421 ENC_CODERANGE_SET(rev, cr);
6422
6423 return rev;
6424}
6425
6426
6427/*
6428 * call-seq:
6429 * reverse! -> self
6430 *
6431 * Returns +self+ with its characters reversed:
6432 *
6433 * s = 'stressed'
6434 * s.reverse! # => "desserts"
6435 * s # => "desserts"
6436 *
6437 */
6438
6439static VALUE
6440rb_str_reverse_bang(VALUE str)
6441{
6442 if (RSTRING_LEN(str) > 1) {
6443 if (single_byte_optimizable(str)) {
6444 char *s, *e, c;
6445
6446 str_modify_keep_cr(str);
6447 s = RSTRING_PTR(str);
6448 e = RSTRING_END(str) - 1;
6449 while (s < e) {
6450 c = *s;
6451 *s++ = *e;
6452 *e-- = c;
6453 }
6454 }
6455 else {
6456 str_shared_replace(str, rb_str_reverse(str));
6457 }
6458 }
6459 else {
6460 str_modify_keep_cr(str);
6461 }
6462 return str;
6463}
6464
6465
6466/*
6467 * call-seq:
6468 * include? other_string -> true or false
6469 *
6470 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6471 *
6472 * s = 'foo'
6473 * s.include?('f') # => true
6474 * s.include?('fo') # => true
6475 * s.include?('food') # => false
6476 *
6477 */
6478
6479VALUE
6480rb_str_include(VALUE str, VALUE arg)
6481{
6482 long i;
6483
6484 StringValue(arg);
6485 i = rb_str_index(str, arg, 0);
6486
6487 return RBOOL(i != -1);
6488}
6489
6490
6491/*
6492 * call-seq:
6493 * to_i(base = 10) -> integer
6494 *
6495 * Returns the result of interpreting leading characters in +self+
6496 * as an integer in the given +base+ (which must be in (0, 2..36)):
6497 *
6498 * '123456'.to_i # => 123456
6499 * '123def'.to_i(16) # => 1195503
6500 *
6501 * With +base+ zero, +self+ may contain leading characters
6502 * to specify the actual base:
6503 *
6504 * '123def'.to_i(0) # => 123
6505 * '0123def'.to_i(0) # => 83
6506 * '0b123def'.to_i(0) # => 1
6507 * '0o123def'.to_i(0) # => 83
6508 * '0d123def'.to_i(0) # => 123
6509 * '0x123def'.to_i(0) # => 1195503
6510 *
6511 * Characters past a leading valid number (in the given +base+) are ignored:
6512 *
6513 * '12.345'.to_i # => 12
6514 * '12345'.to_i(2) # => 1
6515 *
6516 * Returns zero if there is no leading valid number:
6517 *
6518 * 'abcdef'.to_i # => 0
6519 * '2'.to_i(2) # => 0
6520 *
6521 */
6522
6523static VALUE
6524rb_str_to_i(int argc, VALUE *argv, VALUE str)
6525{
6526 int base = 10;
6527
6528 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6529 rb_raise(rb_eArgError, "invalid radix %d", base);
6530 }
6531 return rb_str_to_inum(str, base, FALSE);
6532}
6533
6534
6535/*
6536 * call-seq:
6537 * to_f -> float
6538 *
6539 * Returns the result of interpreting leading characters in +self+ as a Float:
6540 *
6541 * '3.14159'.to_f # => 3.14159
6542 * '1.234e-2'.to_f # => 0.01234
6543 *
6544 * Characters past a leading valid number are ignored:
6545 *
6546 * '3.14 (pi to two places)'.to_f # => 3.14
6547 *
6548 * Returns zero if there is no leading valid number:
6549 *
6550 * 'abcdef'.to_f # => 0.0
6551 *
6552 */
6553
6554static VALUE
6555rb_str_to_f(VALUE str)
6556{
6557 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6558}
6559
6560
6561/*
6562 * call-seq:
6563 * to_s -> self or string
6564 *
6565 * Returns +self+ if +self+ is a \String,
6566 * or +self+ converted to a \String if +self+ is a subclass of \String.
6567 *
6568 * String#to_str is an alias for String#to_s.
6569 *
6570 */
6571
6572static VALUE
6573rb_str_to_s(VALUE str)
6574{
6575 if (rb_obj_class(str) != rb_cString) {
6576 return str_duplicate(rb_cString, str);
6577 }
6578 return str;
6579}
6580
#if 0
/* Disabled helper: encodes codepoint +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
6592
6593#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6594
6595int
6596rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6597{
6598 char buf[CHAR_ESC_LEN + 1];
6599 int l;
6600
6601#if SIZEOF_INT > 4
6602 c &= 0xffffffff;
6603#endif
6604 if (unicode_p) {
6605 if (c < 0x7F && ISPRINT(c)) {
6606 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6607 }
6608 else if (c < 0x10000) {
6609 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6610 }
6611 else {
6612 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6613 }
6614 }
6615 else {
6616 if (c < 0x100) {
6617 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6618 }
6619 else {
6620 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6621 }
6622 }
6623 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6624 rb_str_buf_cat(result, buf, l);
6625 return l;
6626}
6627
/*
 * Returns the backslash spelling for the control characters that have a
 * named escape, or NULL when +c+ has none.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *spelling;
    } named_escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    const size_t count = sizeof(named_escapes) / sizeof(named_escapes[0]);

    for (size_t i = 0; i < count; i++) {
        if (named_escapes[i].code == c) return named_escapes[i].spelling;
    }
    return NULL;
}
6645
6646VALUE
6647rb_str_escape(VALUE str)
6648{
6649 int encidx = ENCODING_GET(str);
6650 rb_encoding *enc = rb_enc_from_index(encidx);
6651 const char *p = RSTRING_PTR(str);
6652 const char *pend = RSTRING_END(str);
6653 const char *prev = p;
6654 char buf[CHAR_ESC_LEN + 1];
6655 VALUE result = rb_str_buf_new(0);
6656 int unicode_p = rb_enc_unicode_p(enc);
6657 int asciicompat = rb_enc_asciicompat(enc);
6658
6659 while (p < pend) {
6660 unsigned int c;
6661 const char *cc;
6662 int n = rb_enc_precise_mbclen(p, pend, enc);
6663 if (!MBCLEN_CHARFOUND_P(n)) {
6664 if (p > prev) str_buf_cat(result, prev, p - prev);
6665 n = rb_enc_mbminlen(enc);
6666 if (pend < p + n)
6667 n = (int)(pend - p);
6668 while (n--) {
6669 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6670 str_buf_cat(result, buf, strlen(buf));
6671 prev = ++p;
6672 }
6673 continue;
6674 }
6675 n = MBCLEN_CHARFOUND_LEN(n);
6676 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6677 p += n;
6678 cc = ruby_escaped_char(c);
6679 if (cc) {
6680 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6681 str_buf_cat(result, cc, strlen(cc));
6682 prev = p;
6683 }
6684 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6685 }
6686 else {
6687 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6688 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6689 prev = p;
6690 }
6691 }
6692 if (p > prev) str_buf_cat(result, prev, p - prev);
6693 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6694
6695 return result;
6696}
6697
6698/*
6699 * call-seq:
6700 * inspect -> string
6701 *
6702 * Returns a printable version of +self+, enclosed in double-quotes,
6703 * and with special characters escaped:
6704 *
6705 * s = "foo\tbar\tbaz\n"
6706 * s.inspect
6707 * # => "\"foo\\tbar\\tbaz\\n\""
6708 *
6709 */
6710
6711VALUE
6713{
6714 int encidx = ENCODING_GET(str);
6715 rb_encoding *enc = rb_enc_from_index(encidx);
6716 const char *p, *pend, *prev;
6717 char buf[CHAR_ESC_LEN + 1];
6718 VALUE result = rb_str_buf_new(0);
6719 rb_encoding *resenc = rb_default_internal_encoding();
6720 int unicode_p = rb_enc_unicode_p(enc);
6721 int asciicompat = rb_enc_asciicompat(enc);
6722
6723 if (resenc == NULL) resenc = rb_default_external_encoding();
6724 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6725 rb_enc_associate(result, resenc);
6726 str_buf_cat2(result, "\"");
6727
6728 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6729 prev = p;
6730 while (p < pend) {
6731 unsigned int c, cc;
6732 int n;
6733
6734 n = rb_enc_precise_mbclen(p, pend, enc);
6735 if (!MBCLEN_CHARFOUND_P(n)) {
6736 if (p > prev) str_buf_cat(result, prev, p - prev);
6737 n = rb_enc_mbminlen(enc);
6738 if (pend < p + n)
6739 n = (int)(pend - p);
6740 while (n--) {
6741 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6742 str_buf_cat(result, buf, strlen(buf));
6743 prev = ++p;
6744 }
6745 continue;
6746 }
6747 n = MBCLEN_CHARFOUND_LEN(n);
6748 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6749 p += n;
6750 if ((asciicompat || unicode_p) &&
6751 (c == '"'|| c == '\\' ||
6752 (c == '#' &&
6753 p < pend &&
6754 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6755 (cc = rb_enc_codepoint(p,pend,enc),
6756 (cc == '$' || cc == '@' || cc == '{'))))) {
6757 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6758 str_buf_cat2(result, "\\");
6759 if (asciicompat || enc == resenc) {
6760 prev = p - n;
6761 continue;
6762 }
6763 }
6764 switch (c) {
6765 case '\n': cc = 'n'; break;
6766 case '\r': cc = 'r'; break;
6767 case '\t': cc = 't'; break;
6768 case '\f': cc = 'f'; break;
6769 case '\013': cc = 'v'; break;
6770 case '\010': cc = 'b'; break;
6771 case '\007': cc = 'a'; break;
6772 case 033: cc = 'e'; break;
6773 default: cc = 0; break;
6774 }
6775 if (cc) {
6776 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6777 buf[0] = '\\';
6778 buf[1] = (char)cc;
6779 str_buf_cat(result, buf, 2);
6780 prev = p;
6781 continue;
6782 }
6783 /* The special casing of 0x85 (NEXT_LINE) here is because
6784 * Oniguruma historically treats it as printable, but it
6785 * doesn't match the print POSIX bracket class or character
6786 * property in regexps.
6787 *
6788 * See Ruby Bug #16842 for details:
6789 * https://bugs.ruby-lang.org/issues/16842
6790 */
6791 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
6792 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6793 continue;
6794 }
6795 else {
6796 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6797 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6798 prev = p;
6799 continue;
6800 }
6801 }
6802 if (p > prev) str_buf_cat(result, prev, p - prev);
6803 str_buf_cat2(result, "\"");
6804
6805 return result;
6806}
6807
/* True when p lies before e and points at '$', '@' or '{'; the dump code
 * uses it to decide whether a preceding '#' needs a backslash. */
6808#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6809
6810/*
6811 * call-seq:
6812 * dump -> string
6813 *
6814 * Returns a printable version of +self+, enclosed in double-quotes,
6815 * with special characters escaped, and with non-printing characters
6816 * replaced by hexadecimal notation:
6817 *
6818 * "hello \n ''".dump # => "\"hello \\n ''\""
6819 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6820 *
6821 * Related: String#undump (inverse of String#dump).
6822 *
6823 */
6824
6825VALUE
6827{
6828 int encidx = rb_enc_get_index(str);
6829 rb_encoding *enc = rb_enc_from_index(encidx);
6830 long len;
6831 const char *p, *pend;
6832 char *q, *qend;
6833 VALUE result;
6834 int u8 = (encidx == rb_utf8_encindex());
6835 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6836
6837 len = 2; /* "" */
6838 if (!rb_enc_asciicompat(enc)) {
6839 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6840 len += strlen(enc->name);
6841 }
6842
6843 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6844 while (p < pend) {
6845 int clen;
6846 unsigned char c = *p++;
6847
6848 switch (c) {
6849 case '"': case '\\':
6850 case '\n': case '\r':
6851 case '\t': case '\f':
6852 case '\013': case '\010': case '\007': case '\033':
6853 clen = 2;
6854 break;
6855
6856 case '#':
6857 clen = IS_EVSTR(p, pend) ? 2 : 1;
6858 break;
6859
6860 default:
6861 if (ISPRINT(c)) {
6862 clen = 1;
6863 }
6864 else {
6865 if (u8 && c > 0x7F) { /* \u notation */
6866 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6867 if (MBCLEN_CHARFOUND_P(n)) {
6868 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6869 if (cc <= 0xFFFF)
6870 clen = 6; /* \uXXXX */
6871 else if (cc <= 0xFFFFF)
6872 clen = 9; /* \u{XXXXX} */
6873 else
6874 clen = 10; /* \u{XXXXXX} */
6875 p += MBCLEN_CHARFOUND_LEN(n)-1;
6876 break;
6877 }
6878 }
6879 clen = 4; /* \xNN */
6880 }
6881 break;
6882 }
6883
6884 if (clen > LONG_MAX - len) {
6885 rb_raise(rb_eRuntimeError, "string size too big");
6886 }
6887 len += clen;
6888 }
6889
6890 result = rb_str_new(0, len);
6891 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6892 q = RSTRING_PTR(result); qend = q + len + 1;
6893
6894 *q++ = '"';
6895 while (p < pend) {
6896 unsigned char c = *p++;
6897
6898 if (c == '"' || c == '\\') {
6899 *q++ = '\\';
6900 *q++ = c;
6901 }
6902 else if (c == '#') {
6903 if (IS_EVSTR(p, pend)) *q++ = '\\';
6904 *q++ = '#';
6905 }
6906 else if (c == '\n') {
6907 *q++ = '\\';
6908 *q++ = 'n';
6909 }
6910 else if (c == '\r') {
6911 *q++ = '\\';
6912 *q++ = 'r';
6913 }
6914 else if (c == '\t') {
6915 *q++ = '\\';
6916 *q++ = 't';
6917 }
6918 else if (c == '\f') {
6919 *q++ = '\\';
6920 *q++ = 'f';
6921 }
6922 else if (c == '\013') {
6923 *q++ = '\\';
6924 *q++ = 'v';
6925 }
6926 else if (c == '\010') {
6927 *q++ = '\\';
6928 *q++ = 'b';
6929 }
6930 else if (c == '\007') {
6931 *q++ = '\\';
6932 *q++ = 'a';
6933 }
6934 else if (c == '\033') {
6935 *q++ = '\\';
6936 *q++ = 'e';
6937 }
6938 else if (ISPRINT(c)) {
6939 *q++ = c;
6940 }
6941 else {
6942 *q++ = '\\';
6943 if (u8) {
6944 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6945 if (MBCLEN_CHARFOUND_P(n)) {
6946 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6947 p += n;
6948 if (cc <= 0xFFFF)
6949 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6950 else
6951 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6952 q += strlen(q);
6953 continue;
6954 }
6955 }
6956 snprintf(q, qend-q, "x%02X", c);
6957 q += 3;
6958 }
6959 }
6960 *q++ = '"';
6961 *q = '\0';
6962 if (!rb_enc_asciicompat(enc)) {
6963 snprintf(q, qend-q, nonascii_suffix, enc->name);
6964 encidx = rb_ascii8bit_encindex();
6965 }
6966 /* result from dump is ASCII */
6967 rb_enc_associate_index(result, encidx);
6969 return result;
6970}
6971
/*
 * Maps the letter of a single-character backslash escape (n, r, t, f,
 * v, b, a, e) to the control character it stands for.  Returns -1 for
 * any other value instead of running off the end of the function.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    /* the visible caller only passes the eight letters handled above */
    return -1;
}
6995
/*
 * Decodes one escape sequence of a dumped string.  On entry *ss points at
 * the character following the backslash; the decoded bytes are appended
 * to undumped and *ss is advanced past the sequence.  s_end bounds the
 * input.
 *
 * A \u escape sets *utf8, a \x escape sets *binary; using one kind after
 * the other has been seen raises RuntimeError, as does a malformed escape.
 * On a \u escape, *penc and the encoding of undumped are switched to
 * UTF-8 if they are not already.
 */
6996static void
6997undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6998{
6999 const char *s = *ss;
7000 unsigned int c;
7001 int codelen;
7002 size_t hexlen;
7003 unsigned char buf[6];
/* cached on first use so rb_utf8_encoding() is called only once */
7004 static rb_encoding *enc_utf8 = NULL;
7005
7006 switch (*s) {
7007 case '\\':
7008 case '"':
7009 case '#':
7010 rb_str_cat(undumped, s, 1); /* cat itself */
7011 s++;
7012 break;
7013 case 'n':
7014 case 'r':
7015 case 't':
7016 case 'f':
7017 case 'v':
7018 case 'b':
7019 case 'a':
7020 case 'e':
/* single-letter escapes are translated by unescape_ascii() */
7021 *buf = unescape_ascii(*s);
7022 rb_str_cat(undumped, (char *)buf, 1);
7023 s++;
7024 break;
7025 case 'u':
7026 if (*binary) {
7027 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7028 }
7029 *utf8 = true;
7030 if (++s >= s_end) {
7031 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7032 }
7033 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7034 if (*penc != enc_utf8) {
7035 *penc = enc_utf8;
7036 rb_enc_associate(undumped, enc_utf8);
7037 }
7038 if (*s == '{') { /* handle \u{...} form */
7039 s++;
/* the braces may hold several codepoints separated by whitespace */
7040 for (;;) {
7041 if (s >= s_end) {
7042 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7043 }
7044 if (*s == '}') {
7045 s++;
7046 break;
7047 }
7048 if (ISSPACE(*s)) {
7049 s++;
7050 continue;
7051 }
7052 c = scan_hex(s, s_end-s, &hexlen);
7053 if (hexlen == 0 || hexlen > 6) {
7054 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7055 }
7056 if (c > 0x10ffff) {
7057 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7058 }
/* the surrogate range is rejected */
7059 if (0xd800 <= c && c <= 0xdfff) {
7060 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7061 }
7062 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7063 rb_str_cat(undumped, (char *)buf, codelen);
7064 s += hexlen;
7065 }
7066 }
7067 else { /* handle \uXXXX form */
/* exactly four hex digits are required */
7068 c = scan_hex(s, 4, &hexlen);
7069 if (hexlen != 4) {
7070 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7071 }
7072 if (0xd800 <= c && c <= 0xdfff) {
7073 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7074 }
7075 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7076 rb_str_cat(undumped, (char *)buf, codelen);
7077 s += hexlen;
7078 }
7079 break;
7080 case 'x':
7081 if (*utf8) {
7082 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7083 }
7084 *binary = true;
7085 if (++s >= s_end) {
7086 rb_raise(rb_eRuntimeError, "invalid hex escape");
7087 }
/* exactly two hex digits are required */
7088 *buf = scan_hex(s, 2, &hexlen);
7089 if (hexlen != 2) {
7090 rb_raise(rb_eRuntimeError, "invalid hex escape");
7091 }
7092 rb_str_cat(undumped, (char *)buf, 1);
7093 s += hexlen;
7094 break;
7095 default:
/* unknown escape: copy two bytes starting at s-1 (the backslash in the
 * visible caller) unchanged */
7096 rb_str_cat(undumped, s-1, 2);
7097 s++;
7098 }
7099
7100 *ss = s;
7101}
7102
7103static VALUE rb_str_is_ascii_only_p(VALUE str);
7104
7105/*
7106 * call-seq:
7107 * undump -> string
7108 *
7109 * Returns an unescaped version of +self+:
7110 *
7111 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7112 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7113 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7114 * s_undumped == s_orig # => true
7115 *
7116 * Related: String#dump (inverse of String#undump).
7117 *
7118 */
7119
7120static VALUE
7121str_undump(VALUE str)
7122{
7123 const char *s = RSTRING_PTR(str);
7124 const char *s_end = RSTRING_END(str);
7125 rb_encoding *enc = rb_enc_get(str);
7126 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7127 bool utf8 = false;
7128 bool binary = false;
7129 int w;
7130
7132 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7133 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7134 }
7135 if (!str_null_check(str, &w)) {
7136 rb_raise(rb_eRuntimeError, "string contains null byte");
7137 }
7138 if (RSTRING_LEN(str) < 2) goto invalid_format;
7139 if (*s != '"') goto invalid_format;
7140
7141 /* strip '"' at the start */
7142 s++;
7143
7144 for (;;) {
7145 if (s >= s_end) {
7146 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7147 }
7148
7149 if (*s == '"') {
7150 /* epilogue */
7151 s++;
7152 if (s == s_end) {
7153 /* ascii compatible dumped string */
7154 break;
7155 }
7156 else {
7157 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7158 static const char dup_suffix[] = ".dup";
7159 const char *encname;
7160 int encidx;
7161 ptrdiff_t size;
7162
7163 /* check separately for strings dumped by older versions */
7164 size = sizeof(dup_suffix) - 1;
7165 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7166
7167 size = sizeof(force_encoding_suffix) - 1;
7168 if (s_end - s <= size) goto invalid_format;
7169 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7170 s += size;
7171
7172 if (utf8) {
7173 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7174 }
7175
7176 encname = s;
7177 s = memchr(s, '"', s_end-s);
7178 size = s - encname;
7179 if (!s) goto invalid_format;
7180 if (s_end - s != 2) goto invalid_format;
7181 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7182
7183 encidx = rb_enc_find_index2(encname, (long)size);
7184 if (encidx < 0) {
7185 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7186 }
7187 rb_enc_associate_index(undumped, encidx);
7188 }
7189 break;
7190 }
7191
7192 if (*s == '\\') {
7193 s++;
7194 if (s >= s_end) {
7195 rb_raise(rb_eRuntimeError, "invalid escape");
7196 }
7197 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7198 }
7199 else {
7200 rb_str_cat(undumped, s++, 1);
7201 }
7202 }
7203
7204 return undumped;
7205invalid_format:
7206 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7207}
7208
7209static void
7210rb_str_check_dummy_enc(rb_encoding *enc)
7211{
7212 if (rb_enc_dummy_p(enc)) {
7213 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7214 rb_enc_name(enc));
7215 }
7216}
7217
7218static rb_encoding *
7219str_true_enc(VALUE str)
7220{
7221 rb_encoding *enc = STR_ENC_GET(str);
7222 rb_str_check_dummy_enc(enc);
7223 return enc;
7224}
7225
7226static OnigCaseFoldType
7227check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7228{
7229 if (argc==0)
7230 return flags;
7231 if (argc>2)
7232 rb_raise(rb_eArgError, "too many options");
7233 if (argv[0]==sym_turkic) {
7234 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7235 if (argc==2) {
7236 if (argv[1]==sym_lithuanian)
7237 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7238 else
7239 rb_raise(rb_eArgError, "invalid second option");
7240 }
7241 }
7242 else if (argv[0]==sym_lithuanian) {
7243 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7244 if (argc==2) {
7245 if (argv[1]==sym_turkic)
7246 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7247 else
7248 rb_raise(rb_eArgError, "invalid second option");
7249 }
7250 }
7251 else if (argc>1)
7252 rb_raise(rb_eArgError, "too many options");
7253 else if (argv[0]==sym_ascii)
7254 flags |= ONIGENC_CASE_ASCII_ONLY;
7255 else if (argv[0]==sym_fold) {
7256 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7257 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7258 else
7259 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7260 }
7261 else
7262 rb_raise(rb_eArgError, "invalid option");
7263 return flags;
7264}
7265
7266static inline bool
7267case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7268{
7269 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7270 return true;
7271 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7272}
7273
7274/* 16 bytes should be long enough to absorb any kind of single character length increase; 20 leaves some margin */
7275#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7276#ifndef CASEMAP_DEBUG
7277# define CASEMAP_DEBUG 0
7278#endif
7279
7280struct mapping_buffer;
7281typedef struct mapping_buffer {
7282 size_t capa;
7283 size_t used;
7284 struct mapping_buffer *next;
7285 OnigUChar space[FLEX_ARY_LEN];
7287
7288static void
7289mapping_buffer_free(void *p)
7290{
7291 mapping_buffer *previous_buffer;
7292 mapping_buffer *current_buffer = p;
7293 while (current_buffer) {
7294 previous_buffer = current_buffer;
7295 current_buffer = current_buffer->next;
7296 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7297 }
7298}
7299
/* Typed-data descriptor for the object that anchors a mapping_buffer chain
 * in rb_str_casemap().  The first callback slot is left 0 and the second is
 * mapping_buffer_free. */
7300static const rb_data_type_t mapping_buffer_type = {
7301 "mapping_buffer",
7302 {0, mapping_buffer_free,}
7303};
7304
/*
 * Returns a new string holding source case-mapped by enc->case_map under
 * *flags (which case_map may update).  An empty source is simply
 * duplicated.  Raises ArgumentError when case_map reports invalid input.
 *
 * Output is collected in a chain of mapping_buffer nodes hung off a
 * typed-data object (buffer_anchor) -- presumably so the chain is released
 * by its dfree function if an exception unwinds past this function.
 * The encoding of source is copied to the result.
 */
7305static VALUE
7306rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7307{
7308 VALUE target;
7309
7310 const OnigUChar *source_current, *source_end;
7311 int target_length = 0;
7312 VALUE buffer_anchor;
7313 mapping_buffer *current_buffer = 0;
7314 mapping_buffer **pre_buffer;
7315 size_t buffer_count = 0;
7316 int buffer_length_or_invalid;
7317
7318 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7319
7320 source_current = (OnigUChar*)RSTRING_PTR(source);
7321 source_end = (OnigUChar*)RSTRING_END(source);
7322
7323 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
/* pre_buffer always points at the link that the next node is stored into */
7324 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
/* case_map advances source_current; loop until all input is consumed */
7325 while (source_current < source_end) {
7326 /* increase multiplier using buffer count to converge quickly */
7327 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7328 if (CASEMAP_DEBUG) {
7329 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7330 }
7331 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7332 *pre_buffer = current_buffer;
7333 pre_buffer = &current_buffer->next;
7334 current_buffer->next = NULL;
7335 current_buffer->capa = capa;
7336 buffer_length_or_invalid = enc->case_map(flags,
7337 &source_current, source_end,
7338 current_buffer->space,
7339 current_buffer->space+current_buffer->capa,
7340 enc);
7341 if (buffer_length_or_invalid < 0) {
/* detach the chain from the anchor before freeing it by hand */
7342 current_buffer = DATA_PTR(buffer_anchor);
7343 DATA_PTR(buffer_anchor) = 0;
7344 mapping_buffer_free(current_buffer);
7345 rb_raise(rb_eArgError, "input string invalid");
7346 }
7347 target_length += current_buffer->used = buffer_length_or_invalid;
7348 }
7349 if (CASEMAP_DEBUG) {
7350 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7351 }
7352
/* single buffer: build the result straight from it; otherwise
 * concatenate the used part of every node */
7353 if (buffer_count==1) {
7354 target = rb_str_new((const char*)current_buffer->space, target_length);
7355 }
7356 else {
7357 char *target_current;
7358
7359 target = rb_str_new(0, target_length);
7360 target_current = RSTRING_PTR(target);
7361 current_buffer = DATA_PTR(buffer_anchor);
7362 while (current_buffer) {
7363 memcpy(target_current, current_buffer->space, current_buffer->used);
7364 target_current += current_buffer->used;
7365 current_buffer = current_buffer->next;
7366 }
7367 }
/* detach and free the chain now that its contents are copied */
7368 current_buffer = DATA_PTR(buffer_anchor);
7369 DATA_PTR(buffer_anchor) = 0;
7370 mapping_buffer_free(current_buffer);
7371
7372 RB_GC_GUARD(buffer_anchor);
7373
7374 /* TODO: check about string terminator character */
7375 str_enc_copy(target, source);
7376 /*ENC_CODERANGE_SET(mapped, cr);*/
7377
7378 return target;
7379}
7380
7381static VALUE
7382rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7383{
7384 const OnigUChar *source_current, *source_end;
7385 OnigUChar *target_current, *target_end;
7386 long old_length = RSTRING_LEN(source);
7387 int length_or_invalid;
7388
7389 if (old_length == 0) return Qnil;
7390
7391 source_current = (OnigUChar*)RSTRING_PTR(source);
7392 source_end = (OnigUChar*)RSTRING_END(source);
7393 if (source == target) {
7394 target_current = (OnigUChar*)source_current;
7395 target_end = (OnigUChar*)source_end;
7396 }
7397 else {
7398 target_current = (OnigUChar*)RSTRING_PTR(target);
7399 target_end = (OnigUChar*)RSTRING_END(target);
7400 }
7401
7402 length_or_invalid = onigenc_ascii_only_case_map(flags,
7403 &source_current, source_end,
7404 target_current, target_end, enc);
7405 if (length_or_invalid < 0)
7406 rb_raise(rb_eArgError, "input string invalid");
7407 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7408 fprintf(stderr, "problem with rb_str_ascii_casemap"
7409 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7410 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7411 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7412 }
7413
7414 str_enc_copy(target, source);
7415
7416 return target;
7417}
7418
7419static bool
7420upcase_single(VALUE str)
7421{
7422 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7423 bool modified = false;
7424
7425 while (s < send) {
7426 unsigned int c = *(unsigned char*)s;
7427
7428 if ('a' <= c && c <= 'z') {
7429 *s = 'A' + (c - 'a');
7430 modified = true;
7431 }
7432 s++;
7433 }
7434 return modified;
7435}
7436
7437/*
7438 * call-seq:
7439 * upcase!(*options) -> self or nil
7440 *
7441 * Upcases the characters in +self+;
7442 * returns +self+ if any changes were made, +nil+ otherwise:
7443 *
7444 * s = 'Hello World!' # => "Hello World!"
7445 * s.upcase! # => "HELLO WORLD!"
7446 * s # => "HELLO WORLD!"
7447 * s.upcase! # => nil
7448 *
7449 * The casing may be affected by the given +options+;
7450 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7451 *
7452 * Related: String#upcase, String#downcase, String#downcase!.
7453 *
7454 */
7455
7456static VALUE
7457rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7458{
7459 rb_encoding *enc;
7460 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7461
7462 flags = check_case_options(argc, argv, flags);
7463 str_modify_keep_cr(str);
7464 enc = str_true_enc(str);
7465 if (case_option_single_p(flags, enc, str)) {
7466 if (upcase_single(str))
7467 flags |= ONIGENC_CASE_MODIFIED;
7468 }
7469 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7470 rb_str_ascii_casemap(str, str, &flags, enc);
7471 else
7472 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7473
7474 if (ONIGENC_CASE_MODIFIED&flags) return str;
7475 return Qnil;
7476}
7477
7478
7479/*
7480 * call-seq:
7481 * upcase(*options) -> string
7482 *
7483 * Returns a string containing the upcased characters in +self+:
7484 *
7485 * s = 'Hello World!' # => "Hello World!"
7486 * s.upcase # => "HELLO WORLD!"
7487 *
7488 * The casing may be affected by the given +options+;
7489 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7490 *
7491 * Related: String#upcase!, String#downcase, String#downcase!.
7492 *
7493 */
7494
7495static VALUE
7496rb_str_upcase(int argc, VALUE *argv, VALUE str)
7497{
7498 rb_encoding *enc;
7499 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7500 VALUE ret;
7501
7502 flags = check_case_options(argc, argv, flags);
7503 enc = str_true_enc(str);
7504 if (case_option_single_p(flags, enc, str)) {
7505 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7506 str_enc_copy(ret, str);
7507 upcase_single(ret);
7508 }
7509 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7510 ret = rb_str_new(0, RSTRING_LEN(str));
7511 rb_str_ascii_casemap(str, ret, &flags, enc);
7512 }
7513 else {
7514 ret = rb_str_casemap(str, &flags, enc);
7515 }
7516
7517 return ret;
7518}
7519
7520static bool
7521downcase_single(VALUE str)
7522{
7523 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7524 bool modified = false;
7525
7526 while (s < send) {
7527 unsigned int c = *(unsigned char*)s;
7528
7529 if ('A' <= c && c <= 'Z') {
7530 *s = 'a' + (c - 'A');
7531 modified = true;
7532 }
7533 s++;
7534 }
7535
7536 return modified;
7537}
7538
7539/*
7540 * call-seq:
7541 * downcase!(*options) -> self or nil
7542 *
7543 * Downcases the characters in +self+;
7544 * returns +self+ if any changes were made, +nil+ otherwise:
7545 *
7546 * s = 'Hello World!' # => "Hello World!"
7547 * s.downcase! # => "hello world!"
7548 * s # => "hello world!"
7549 * s.downcase! # => nil
7550 *
7551 * The casing may be affected by the given +options+;
7552 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7553 *
7554 * Related: String#downcase, String#upcase, String#upcase!.
7555 *
7556 */
7557
7558static VALUE
7559rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7560{
7561 rb_encoding *enc;
7562 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7563
7564 flags = check_case_options(argc, argv, flags);
7565 str_modify_keep_cr(str);
7566 enc = str_true_enc(str);
7567 if (case_option_single_p(flags, enc, str)) {
7568 if (downcase_single(str))
7569 flags |= ONIGENC_CASE_MODIFIED;
7570 }
7571 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7572 rb_str_ascii_casemap(str, str, &flags, enc);
7573 else
7574 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7575
7576 if (ONIGENC_CASE_MODIFIED&flags) return str;
7577 return Qnil;
7578}
7579
7580
7581/*
7582 * call-seq:
7583 * downcase(*options) -> string
7584 *
7585 * Returns a string containing the downcased characters in +self+:
7586 *
7587 * s = 'Hello World!' # => "Hello World!"
7588 * s.downcase # => "hello world!"
7589 *
7590 * The casing may be affected by the given +options+;
7591 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7592 *
7593 * Related: String#downcase!, String#upcase, String#upcase!.
7594 *
7595 */
7596
7597static VALUE
7598rb_str_downcase(int argc, VALUE *argv, VALUE str)
7599{
7600 rb_encoding *enc;
7601 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7602 VALUE ret;
7603
7604 flags = check_case_options(argc, argv, flags);
7605 enc = str_true_enc(str);
7606 if (case_option_single_p(flags, enc, str)) {
7607 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7608 str_enc_copy(ret, str);
7609 downcase_single(ret);
7610 }
7611 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7612 ret = rb_str_new(0, RSTRING_LEN(str));
7613 rb_str_ascii_casemap(str, ret, &flags, enc);
7614 }
7615 else {
7616 ret = rb_str_casemap(str, &flags, enc);
7617 }
7618
7619 return ret;
7620}
7621
7622
7623/*
7624 * call-seq:
7625 * capitalize!(*options) -> self or nil
7626 *
7627 * Upcases the first character in +self+;
7628 * downcases the remaining characters;
7629 * returns +self+ if any changes were made, +nil+ otherwise:
7630 *
7631 * s = 'hello World!' # => "hello World!"
7632 * s.capitalize! # => "Hello world!"
7633 * s # => "Hello world!"
7634 * s.capitalize! # => nil
7635 *
7636 * The casing may be affected by the given +options+;
7637 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7638 *
7639 * Related: String#capitalize.
7640 *
7641 */
7642
7643static VALUE
7644rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7645{
7646 rb_encoding *enc;
7647 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7648
7649 flags = check_case_options(argc, argv, flags);
7650 str_modify_keep_cr(str);
7651 enc = str_true_enc(str);
7652 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7653 if (flags&ONIGENC_CASE_ASCII_ONLY)
7654 rb_str_ascii_casemap(str, str, &flags, enc);
7655 else
7656 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7657
7658 if (ONIGENC_CASE_MODIFIED&flags) return str;
7659 return Qnil;
7660}
7661
7662
7663/*
7664 * call-seq:
7665 * capitalize(*options) -> string
7666 *
7667 * Returns a string containing the characters in +self+;
7668 * the first character is upcased;
7669 * the remaining characters are downcased:
7670 *
7671 * s = 'hello World!' # => "hello World!"
7672 * s.capitalize # => "Hello world!"
7673 *
7674 * The casing may be affected by the given +options+;
7675 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7676 *
7677 * Related: String#capitalize!.
7678 *
7679 */
7680
7681static VALUE
7682rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7683{
7684 rb_encoding *enc;
7685 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7686 VALUE ret;
7687
7688 flags = check_case_options(argc, argv, flags);
7689 enc = str_true_enc(str);
7690 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7691 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7692 ret = rb_str_new(0, RSTRING_LEN(str));
7693 rb_str_ascii_casemap(str, ret, &flags, enc);
7694 }
7695 else {
7696 ret = rb_str_casemap(str, &flags, enc);
7697 }
7698 return ret;
7699}
7700
7701
7702/*
7703 * call-seq:
7704 * swapcase!(*options) -> self or nil
7705 *
7706 * Upcases each lowercase character in +self+;
7707 * downcases each uppercase character;
7708 * returns +self+ if any changes were made, +nil+ otherwise:
7709 *
7710 * s = 'Hello World!' # => "Hello World!"
7711 * s.swapcase! # => "hELLO wORLD!"
7712 * s # => "hELLO wORLD!"
7713 * ''.swapcase! # => nil
7714 *
7715 * The casing may be affected by the given +options+;
7716 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7717 *
7718 * Related: String#swapcase.
7719 *
7720 */
7721
7722static VALUE
7723rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7724{
7725 rb_encoding *enc;
7726 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7727
7728 flags = check_case_options(argc, argv, flags);
7729 str_modify_keep_cr(str);
7730 enc = str_true_enc(str);
7731 if (flags&ONIGENC_CASE_ASCII_ONLY)
7732 rb_str_ascii_casemap(str, str, &flags, enc);
7733 else
7734 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7735
7736 if (ONIGENC_CASE_MODIFIED&flags) return str;
7737 return Qnil;
7738}
7739
7740
7741/*
7742 * call-seq:
7743 * swapcase(*options) -> string
7744 *
7745 * Returns a string containing the characters in +self+, with cases reversed;
7746 * each uppercase character is downcased;
7747 * each lowercase character is upcased:
7748 *
7749 * s = 'Hello World!' # => "Hello World!"
7750 * s.swapcase # => "hELLO wORLD!"
7751 *
7752 * The casing may be affected by the given +options+;
7753 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7754 *
7755 * Related: String#swapcase!.
7756 *
7757 */
7758
7759static VALUE
7760rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7761{
7762 rb_encoding *enc;
7763 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7764 VALUE ret;
7765
7766 flags = check_case_options(argc, argv, flags);
7767 enc = str_true_enc(str);
7768 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7769 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7770 ret = rb_str_new(0, RSTRING_LEN(str));
7771 rb_str_ascii_casemap(str, ret, &flags, enc);
7772 }
7773 else {
7774 ret = rb_str_casemap(str, &flags, enc);
7775 }
7776 return ret;
7777}
7778
typedef unsigned char *USTR;

/* Cursor over a character-selector string, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while trnext() is stepping through a range */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being stepped through */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* end of the selector */
};
7786
7787static unsigned int
7788trnext(struct tr *t, rb_encoding *enc)
7789{
7790 int n;
7791
7792 for (;;) {
7793 nextpart:
7794 if (!t->gen) {
7795 if (t->p == t->pend) return -1;
7796 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7797 t->p += n;
7798 }
7799 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7800 t->p += n;
7801 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7802 t->p += n;
7803 if (t->p < t->pend) {
7804 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7805 t->p += n;
7806 if (t->now > c) {
7807 if (t->now < 0x80 && c < 0x80) {
7809 "invalid range \"%c-%c\" in string transliteration",
7810 t->now, c);
7811 }
7812 else {
7813 rb_raise(rb_eArgError, "invalid range in string transliteration");
7814 }
7815 continue; /* not reached */
7816 }
7817 t->gen = 1;
7818 t->max = c;
7819 }
7820 }
7821 return t->now;
7822 }
7823 else {
7824 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7825 if (t->now == t->max) {
7826 t->gen = 0;
7827 goto nextpart;
7828 }
7829 }
7830 if (t->now < t->max) {
7831 return t->now;
7832 }
7833 else {
7834 t->gen = 0;
7835 return t->max;
7836 }
7837 }
7838 }
7839}
7840
7841static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7842
7843static VALUE
7844tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7845{
7846 const unsigned int errc = -1;
7847 unsigned int trans[256];
7848 rb_encoding *enc, *e1, *e2;
7849 struct tr trsrc, trrepl;
7850 int cflag = 0;
7851 unsigned int c, c0, last = 0;
7852 int modify = 0, i, l;
7853 unsigned char *s, *send;
7854 VALUE hash = 0;
7855 int singlebyte = single_byte_optimizable(str);
7856 int termlen;
7857 int cr;
7858
7859#define CHECK_IF_ASCII(c) \
7860 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7861 (cr = ENC_CODERANGE_VALID) : 0)
7862
7863 StringValue(src);
7864 StringValue(repl);
7865 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7866 if (RSTRING_LEN(repl) == 0) {
7867 return rb_str_delete_bang(1, &src, str);
7868 }
7869
7870 cr = ENC_CODERANGE(str);
7871 e1 = rb_enc_check(str, src);
7872 e2 = rb_enc_check(str, repl);
7873 if (e1 == e2) {
7874 enc = e1;
7875 }
7876 else {
7877 enc = rb_enc_check(src, repl);
7878 }
7879 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7880 if (RSTRING_LEN(src) > 1 &&
7881 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7882 trsrc.p + l < trsrc.pend) {
7883 cflag = 1;
7884 trsrc.p += l;
7885 }
7886 trrepl.p = RSTRING_PTR(repl);
7887 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7888 trsrc.gen = trrepl.gen = 0;
7889 trsrc.now = trrepl.now = 0;
7890 trsrc.max = trrepl.max = 0;
7891
7892 if (cflag) {
7893 for (i=0; i<256; i++) {
7894 trans[i] = 1;
7895 }
7896 while ((c = trnext(&trsrc, enc)) != errc) {
7897 if (c < 256) {
7898 trans[c] = errc;
7899 }
7900 else {
7901 if (!hash) hash = rb_hash_new();
7902 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7903 }
7904 }
7905 while ((c = trnext(&trrepl, enc)) != errc)
7906 /* retrieve last replacer */;
7907 last = trrepl.now;
7908 for (i=0; i<256; i++) {
7909 if (trans[i] != errc) {
7910 trans[i] = last;
7911 }
7912 }
7913 }
7914 else {
7915 unsigned int r;
7916
7917 for (i=0; i<256; i++) {
7918 trans[i] = errc;
7919 }
7920 while ((c = trnext(&trsrc, enc)) != errc) {
7921 r = trnext(&trrepl, enc);
7922 if (r == errc) r = trrepl.now;
7923 if (c < 256) {
7924 trans[c] = r;
7925 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7926 }
7927 else {
7928 if (!hash) hash = rb_hash_new();
7929 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7930 }
7931 }
7932 }
7933
7934 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7935 cr = ENC_CODERANGE_7BIT;
7936 str_modify_keep_cr(str);
7937 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7938 termlen = rb_enc_mbminlen(enc);
7939 if (sflag) {
7940 int clen, tlen;
7941 long offset, max = RSTRING_LEN(str);
7942 unsigned int save = -1;
7943 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7944
7945 while (s < send) {
7946 int may_modify = 0;
7947
7948 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7949 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7950
7951 s += clen;
7952 if (c < 256) {
7953 c = trans[c];
7954 }
7955 else if (hash) {
7956 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7957 if (NIL_P(tmp)) {
7958 if (cflag) c = last;
7959 else c = errc;
7960 }
7961 else if (cflag) c = errc;
7962 else c = NUM2INT(tmp);
7963 }
7964 else {
7965 c = errc;
7966 }
7967 if (c != (unsigned int)-1) {
7968 if (save == c) {
7969 CHECK_IF_ASCII(c);
7970 continue;
7971 }
7972 save = c;
7973 tlen = rb_enc_codelen(c, enc);
7974 modify = 1;
7975 }
7976 else {
7977 save = -1;
7978 c = c0;
7979 if (enc != e1) may_modify = 1;
7980 }
7981 if ((offset = t - buf) + tlen > max) {
7982 size_t MAYBE_UNUSED(old) = max + termlen;
7983 max = offset + tlen + (send - s);
7984 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7985 t = buf + offset;
7986 }
7987 rb_enc_mbcput(c, t, enc);
7988 if (may_modify && memcmp(s, t, tlen) != 0) {
7989 modify = 1;
7990 }
7991 CHECK_IF_ASCII(c);
7992 t += tlen;
7993 }
7994 if (!STR_EMBED_P(str)) {
7995 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7996 }
7997 TERM_FILL((char *)t, termlen);
7998 RSTRING(str)->as.heap.ptr = (char *)buf;
7999 RSTRING(str)->as.heap.len = t - buf;
8000 STR_SET_NOEMBED(str);
8001 RSTRING(str)->as.heap.aux.capa = max;
8002 }
8003 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8004 while (s < send) {
8005 c = (unsigned char)*s;
8006 if (trans[c] != errc) {
8007 if (!cflag) {
8008 c = trans[c];
8009 *s = c;
8010 modify = 1;
8011 }
8012 else {
8013 *s = last;
8014 modify = 1;
8015 }
8016 }
8017 CHECK_IF_ASCII(c);
8018 s++;
8019 }
8020 }
8021 else {
8022 int clen, tlen;
8023 long offset, max = (long)((send - s) * 1.2);
8024 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8025
8026 while (s < send) {
8027 int may_modify = 0;
8028 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
8029 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8030
8031 if (c < 256) {
8032 c = trans[c];
8033 }
8034 else if (hash) {
8035 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8036 if (NIL_P(tmp)) {
8037 if (cflag) c = last;
8038 else c = errc;
8039 }
8040 else if (cflag) c = errc;
8041 else c = NUM2INT(tmp);
8042 }
8043 else {
8044 c = cflag ? last : errc;
8045 }
8046 if (c != errc) {
8047 tlen = rb_enc_codelen(c, enc);
8048 modify = 1;
8049 }
8050 else {
8051 c = c0;
8052 if (enc != e1) may_modify = 1;
8053 }
8054 if ((offset = t - buf) + tlen > max) {
8055 size_t MAYBE_UNUSED(old) = max + termlen;
8056 max = offset + tlen + (long)((send - s) * 1.2);
8057 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8058 t = buf + offset;
8059 }
8060 if (s != t) {
8061 rb_enc_mbcput(c, t, enc);
8062 if (may_modify && memcmp(s, t, tlen) != 0) {
8063 modify = 1;
8064 }
8065 }
8066 CHECK_IF_ASCII(c);
8067 s += clen;
8068 t += tlen;
8069 }
8070 if (!STR_EMBED_P(str)) {
8071 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8072 }
8073 TERM_FILL((char *)t, termlen);
8074 RSTRING(str)->as.heap.ptr = (char *)buf;
8075 RSTRING(str)->as.heap.len = t - buf;
8076 STR_SET_NOEMBED(str);
8077 RSTRING(str)->as.heap.aux.capa = max;
8078 }
8079
8080 if (modify) {
8081 if (cr != ENC_CODERANGE_BROKEN)
8082 ENC_CODERANGE_SET(str, cr);
8083 rb_enc_associate(str, enc);
8084 return str;
8085 }
8086 return Qnil;
8087}
8088
8089
8090/*
8091 * call-seq:
8092 * tr!(selector, replacements) -> self or nil
8093 *
8094 * Like String#tr, but modifies +self+ in place.
8095 * Returns +self+ if any changes were made, +nil+ otherwise.
8096 *
8097 */
8098
8099static VALUE
8100rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8101{
8102 return tr_trans(str, src, repl, 0);
8103}
8104
8105
8106/*
8107 * call-seq:
8108 * tr(selector, replacements) -> new_string
8109 *
8110 * Returns a copy of +self+ with each character specified by string +selector+
8111 * translated to the corresponding character in string +replacements+.
8112 * The correspondence is _positional_:
8113 *
8114 * - Each occurrence of the first character specified by +selector+
8115 * is translated to the first character in +replacements+.
8116 * - Each occurrence of the second character specified by +selector+
8117 * is translated to the second character in +replacements+.
8118 * - And so on.
8119 *
8120 * Example:
8121 *
8122 * 'hello'.tr('el', 'ip') #=> "hippo"
8123 *
8124 * If +replacements+ is shorter than +selector+,
8125 * it is implicitly padded with its own last character:
8126 *
8127 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8128 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8129 *
8130 * Arguments +selector+ and +replacements+ must be valid character selectors
8131 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8132 * and may use any of its valid forms, including negation, ranges, and escaping:
8133 *
8134 * # Negation.
8135 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8136 * # Ranges.
8137 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8138 * # Escapes.
8139 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8140 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8141 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8142 *
8143 */
8144
8145static VALUE
8146rb_str_tr(VALUE str, VALUE src, VALUE repl)
8147{
8148 str = str_duplicate(rb_cString, str);
8149 tr_trans(str, src, repl, 0);
8150 return str;
8151}
8152
8153#define TR_TABLE_MAX (UCHAR_MAX+1)
8154#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8155static void
8156tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8157 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8158{
8159 const unsigned int errc = -1;
8160 char buf[TR_TABLE_MAX];
8161 struct tr tr;
8162 unsigned int c;
8163 VALUE table = 0, ptable = 0;
8164 int i, l, cflag = 0;
8165
8166 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8167 tr.gen = tr.now = tr.max = 0;
8168
8169 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8170 cflag = 1;
8171 tr.p += l;
8172 }
8173 if (first) {
8174 for (i=0; i<TR_TABLE_MAX; i++) {
8175 stable[i] = 1;
8176 }
8177 stable[TR_TABLE_MAX] = cflag;
8178 }
8179 else if (stable[TR_TABLE_MAX] && !cflag) {
8180 stable[TR_TABLE_MAX] = 0;
8181 }
8182 for (i=0; i<TR_TABLE_MAX; i++) {
8183 buf[i] = cflag;
8184 }
8185
8186 while ((c = trnext(&tr, enc)) != errc) {
8187 if (c < TR_TABLE_MAX) {
8188 buf[(unsigned char)c] = !cflag;
8189 }
8190 else {
8191 VALUE key = UINT2NUM(c);
8192
8193 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8194 if (cflag) {
8195 ptable = *ctablep;
8196 table = ptable ? ptable : rb_hash_new();
8197 *ctablep = table;
8198 }
8199 else {
8200 table = rb_hash_new();
8201 ptable = *tablep;
8202 *tablep = table;
8203 }
8204 }
8205 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8206 rb_hash_aset(table, key, Qtrue);
8207 }
8208 }
8209 }
8210 for (i=0; i<TR_TABLE_MAX; i++) {
8211 stable[i] = stable[i] && buf[i];
8212 }
8213 if (!table && !cflag) {
8214 *tablep = 0;
8215 }
8216}
8217
8218
8219static int
8220tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8221{
8222 if (c < TR_TABLE_MAX) {
8223 return table[c] != 0;
8224 }
8225 else {
8226 VALUE v = UINT2NUM(c);
8227
8228 if (del) {
8229 if (!NIL_P(rb_hash_lookup(del, v)) &&
8230 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8231 return TRUE;
8232 }
8233 }
8234 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8235 return FALSE;
8236 }
8237 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8238 }
8239}
8240
8241/*
8242 * call-seq:
8243 * delete!(*selectors) -> self or nil
8244 *
8245 * Like String#delete, but modifies +self+ in place.
8246 * Returns +self+ if any changes were made, +nil+ otherwise.
8247 *
8248 */
8249
8250static VALUE
8251rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8252{
8253 char squeez[TR_TABLE_SIZE];
8254 rb_encoding *enc = 0;
8255 char *s, *send, *t;
8256 VALUE del = 0, nodel = 0;
8257 int modify = 0;
8258 int i, ascompat, cr;
8259
8260 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8262 for (i=0; i<argc; i++) {
8263 VALUE s = argv[i];
8264
8265 StringValue(s);
8266 enc = rb_enc_check(str, s);
8267 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8268 }
8269
8270 str_modify_keep_cr(str);
8271 ascompat = rb_enc_asciicompat(enc);
8272 s = t = RSTRING_PTR(str);
8273 send = RSTRING_END(str);
8274 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8275 while (s < send) {
8276 unsigned int c;
8277 int clen;
8278
8279 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8280 if (squeez[c]) {
8281 modify = 1;
8282 }
8283 else {
8284 if (t != s) *t = c;
8285 t++;
8286 }
8287 s++;
8288 }
8289 else {
8290 c = rb_enc_codepoint_len(s, send, &clen, enc);
8291
8292 if (tr_find(c, squeez, del, nodel)) {
8293 modify = 1;
8294 }
8295 else {
8296 if (t != s) rb_enc_mbcput(c, t, enc);
8297 t += clen;
8299 }
8300 s += clen;
8301 }
8302 }
8303 TERM_FILL(t, TERM_LEN(str));
8304 STR_SET_LEN(str, t - RSTRING_PTR(str));
8305 ENC_CODERANGE_SET(str, cr);
8306
8307 if (modify) return str;
8308 return Qnil;
8309}
8310
8311
8312/*
8313 * call-seq:
8314 * delete(*selectors) -> new_string
8315 *
8316 * Returns a copy of +self+ with characters specified by +selectors+ removed
8317 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8318 *
8319 * "hello".delete "l","lo" #=> "heo"
8320 * "hello".delete "lo" #=> "he"
8321 * "hello".delete "aeiou", "^e" #=> "hell"
8322 * "hello".delete "ej-m" #=> "ho"
8323 *
8324 */
8325
8326static VALUE
8327rb_str_delete(int argc, VALUE *argv, VALUE str)
8328{
8329 str = str_duplicate(rb_cString, str);
8330 rb_str_delete_bang(argc, argv, str);
8331 return str;
8332}
8333
8334
8335/*
8336 * call-seq:
8337 * squeeze!(*selectors) -> self or nil
8338 *
8339 * Like String#squeeze, but modifies +self+ in place.
8340 * Returns +self+ if any changes were made, +nil+ otherwise.
8341 */
8342
8343static VALUE
8344rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8345{
8346 char squeez[TR_TABLE_SIZE];
8347 rb_encoding *enc = 0;
8348 VALUE del = 0, nodel = 0;
8349 unsigned char *s, *send, *t;
8350 int i, modify = 0;
8351 int ascompat, singlebyte = single_byte_optimizable(str);
8352 unsigned int save;
8353
8354 if (argc == 0) {
8355 enc = STR_ENC_GET(str);
8356 }
8357 else {
8358 for (i=0; i<argc; i++) {
8359 VALUE s = argv[i];
8360
8361 StringValue(s);
8362 enc = rb_enc_check(str, s);
8363 if (singlebyte && !single_byte_optimizable(s))
8364 singlebyte = 0;
8365 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8366 }
8367 }
8368
8369 str_modify_keep_cr(str);
8370 s = t = (unsigned char *)RSTRING_PTR(str);
8371 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8372 send = (unsigned char *)RSTRING_END(str);
8373 save = -1;
8374 ascompat = rb_enc_asciicompat(enc);
8375
8376 if (singlebyte) {
8377 while (s < send) {
8378 unsigned int c = *s++;
8379 if (c != save || (argc > 0 && !squeez[c])) {
8380 *t++ = save = c;
8381 }
8382 }
8383 }
8384 else {
8385 while (s < send) {
8386 unsigned int c;
8387 int clen;
8388
8389 if (ascompat && (c = *s) < 0x80) {
8390 if (c != save || (argc > 0 && !squeez[c])) {
8391 *t++ = save = c;
8392 }
8393 s++;
8394 }
8395 else {
8396 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8397
8398 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8399 if (t != s) rb_enc_mbcput(c, t, enc);
8400 save = c;
8401 t += clen;
8402 }
8403 s += clen;
8404 }
8405 }
8406 }
8407
8408 TERM_FILL((char *)t, TERM_LEN(str));
8409 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8410 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8411 modify = 1;
8412 }
8413
8414 if (modify) return str;
8415 return Qnil;
8416}
8417
8418
8419/*
8420 * call-seq:
8421 * squeeze(*selectors) -> new_string
8422 *
8423 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8424 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8425 *
8426 * "Squeezed" means that each multiple-character run of a selected character
8427 * is squeezed down to a single character;
8428 * with no arguments given, squeezes all characters:
8429 *
8430 * "yellow moon".squeeze #=> "yelow mon"
8431 * " now is the".squeeze(" ") #=> " now is the"
8432 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8433 *
8434 */
8435
8436static VALUE
8437rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8438{
8439 str = str_duplicate(rb_cString, str);
8440 rb_str_squeeze_bang(argc, argv, str);
8441 return str;
8442}
8443
8444
8445/*
8446 * call-seq:
8447 * tr_s!(selector, replacements) -> self or nil
8448 *
8449 * Like String#tr_s, but modifies +self+ in place.
8450 * Returns +self+ if any changes were made, +nil+ otherwise.
8451 *
8452 * Related: String#squeeze!.
8453 */
8454
8455static VALUE
8456rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8457{
8458 return tr_trans(str, src, repl, 1);
8459}
8460
8461
8462/*
8463 * call-seq:
8464 * tr_s(selector, replacements) -> string
8465 *
8466 * Like String#tr, but also squeezes the modified portions of the translated string;
8467 * returns a new string (translated and squeezed).
8468 *
8469 * 'hello'.tr_s('l', 'r') #=> "hero"
8470 * 'hello'.tr_s('el', '-') #=> "h-o"
8471 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8472 *
8473 * Related: String#squeeze.
8474 *
8475 */
8476
8477static VALUE
8478rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8479{
8480 str = str_duplicate(rb_cString, str);
8481 tr_trans(str, src, repl, 1);
8482 return str;
8483}
8484
8485
8486/*
8487 * call-seq:
8488 * count(*selectors) -> integer
8489 *
8490 * Returns the total number of characters in +self+
8491 * that are specified by the given +selectors+
8492 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8493 *
8494 * a = "hello world"
8495 * a.count "lo" #=> 5
8496 * a.count "lo", "o" #=> 2
8497 * a.count "hello", "^l" #=> 4
8498 * a.count "ej-m" #=> 4
8499 *
8500 * "hello^world".count "\\^aeiou" #=> 4
8501 * "hello-world".count "a\\-eo" #=> 4
8502 *
8503 * c = "hello world\\r\\n"
8504 * c.count "\\" #=> 2
8505 * c.count "\\A" #=> 0
8506 * c.count "X-\\w" #=> 3
8507 */
8508
8509static VALUE
8510rb_str_count(int argc, VALUE *argv, VALUE str)
8511{
8512 char table[TR_TABLE_SIZE];
8513 rb_encoding *enc = 0;
8514 VALUE del = 0, nodel = 0, tstr;
8515 char *s, *send;
8516 int i;
8517 int ascompat;
8518 size_t n = 0;
8519
8521
8522 tstr = argv[0];
8523 StringValue(tstr);
8524 enc = rb_enc_check(str, tstr);
8525 if (argc == 1) {
8526 const char *ptstr;
8527 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8528 (ptstr = RSTRING_PTR(tstr),
8529 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8530 !is_broken_string(str)) {
8531 int clen;
8532 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8533
8534 s = RSTRING_PTR(str);
8535 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8536 send = RSTRING_END(str);
8537 while (s < send) {
8538 if (*(unsigned char*)s++ == c) n++;
8539 }
8540 return SIZET2NUM(n);
8541 }
8542 }
8543
8544 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8545 for (i=1; i<argc; i++) {
8546 tstr = argv[i];
8547 StringValue(tstr);
8548 enc = rb_enc_check(str, tstr);
8549 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8550 }
8551
8552 s = RSTRING_PTR(str);
8553 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8554 send = RSTRING_END(str);
8555 ascompat = rb_enc_asciicompat(enc);
8556 while (s < send) {
8557 unsigned int c;
8558
8559 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8560 if (table[c]) {
8561 n++;
8562 }
8563 s++;
8564 }
8565 else {
8566 int clen;
8567 c = rb_enc_codepoint_len(s, send, &clen, enc);
8568 if (tr_find(c, table, del, nodel)) {
8569 n++;
8570 }
8571 s += clen;
8572 }
8573 }
8574
8575 return SIZET2NUM(n);
8576}
8577
8578static VALUE
8579rb_fs_check(VALUE val)
8580{
8581 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8582 val = rb_check_string_type(val);
8583 if (NIL_P(val)) return 0;
8584 }
8585 return val;
8586}
8587
/*
 * Byte-indexed lookup table: 1 for the six ASCII whitespace bytes
 * (0x09..0x0d and 0x20), 0 for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1,     /* \t */
    [0x0a] = 1,     /* \n */
    [0x0b] = 1,     /* \v */
    [0x0c] = 1,     /* \f */
    [0x0d] = 1,     /* \r */
    [0x20] = 1      /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8608
8609static long
8610split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8611{
8612 if (empty_count >= 0 && len == 0) {
8613 return empty_count + 1;
8614 }
8615 if (empty_count > 0) {
8616 /* make different substrings */
8617 if (result) {
8618 do {
8619 rb_ary_push(result, str_new_empty_String(str));
8620 } while (--empty_count > 0);
8621 }
8622 else {
8623 do {
8624 rb_yield(str_new_empty_String(str));
8625 } while (--empty_count > 0);
8626 }
8627 }
8628 str = rb_str_subseq(str, beg, len);
8629 if (result) {
8630 rb_ary_push(result, str);
8631 }
8632 else {
8633 rb_yield(str);
8634 }
8635 return empty_count;
8636}
8637
/* How a split pattern is to be interpreted. */
typedef enum {
    SPLIT_TYPE_AWK,     /* pattern is nil or a single space */
    SPLIT_TYPE_STRING,
    SPLIT_TYPE_REGEXP,
    SPLIT_TYPE_CHARS    /* empty pattern: split into characters */
} split_type_t;
8641
8642static split_type_t
8643literal_split_pattern(VALUE spat, split_type_t default_type)
8644{
8645 rb_encoding *enc = STR_ENC_GET(spat);
8646 const char *ptr;
8647 long len;
8648 RSTRING_GETMEM(spat, ptr, len);
8649 if (len == 0) {
8650 /* Special case - split into chars */
8651 return SPLIT_TYPE_CHARS;
8652 }
8653 else if (rb_enc_asciicompat(enc)) {
8654 if (len == 1 && ptr[0] == ' ') {
8655 return SPLIT_TYPE_AWK;
8656 }
8657 }
8658 else {
8659 int l;
8660 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8661 return SPLIT_TYPE_AWK;
8662 }
8663 }
8664 return default_type;
8665}
8666
8667/*
8668 * call-seq:
8669 * split(field_sep = $;, limit = nil) -> array
8670 * split(field_sep = $;, limit = nil) {|substring| ... } -> self
8671 *
8672 * :include: doc/string/split.rdoc
8673 *
8674 */
8675
8676static VALUE
8677rb_str_split_m(int argc, VALUE *argv, VALUE str)
8678{
8679 rb_encoding *enc;
8680 VALUE spat;
8681 VALUE limit;
8682 split_type_t split_type;
8683 long beg, end, i = 0, empty_count = -1;
8684 int lim = 0;
8685 VALUE result, tmp;
8686
8687 result = rb_block_given_p() ? Qfalse : Qnil;
8688 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8689 lim = NUM2INT(limit);
8690 if (lim <= 0) limit = Qnil;
8691 else if (lim == 1) {
8692 if (RSTRING_LEN(str) == 0)
8693 return result ? rb_ary_new2(0) : str;
8694 tmp = str_duplicate(rb_cString, str);
8695 if (!result) {
8696 rb_yield(tmp);
8697 return str;
8698 }
8699 return rb_ary_new3(1, tmp);
8700 }
8701 i = 1;
8702 }
8703 if (NIL_P(limit) && !lim) empty_count = 0;
8704
8705 enc = STR_ENC_GET(str);
8706 split_type = SPLIT_TYPE_REGEXP;
8707 if (!NIL_P(spat)) {
8708 spat = get_pat_quoted(spat, 0);
8709 }
8710 else if (NIL_P(spat = rb_fs)) {
8711 split_type = SPLIT_TYPE_AWK;
8712 }
8713 else if (!(spat = rb_fs_check(spat))) {
8714 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8715 }
8716 else {
8717 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8718 }
8719 if (split_type != SPLIT_TYPE_AWK) {
8720 switch (BUILTIN_TYPE(spat)) {
8721 case T_REGEXP:
8722 rb_reg_options(spat); /* check if uninitialized */
8723 tmp = RREGEXP_SRC(spat);
8724 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8725 if (split_type == SPLIT_TYPE_AWK) {
8726 spat = tmp;
8727 split_type = SPLIT_TYPE_STRING;
8728 }
8729 break;
8730
8731 case T_STRING:
8732 mustnot_broken(spat);
8733 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8734 break;
8735
8736 default:
8738 }
8739 }
8740
8741#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8742
8743 if (result) result = rb_ary_new();
8744 beg = 0;
8745 char *ptr = RSTRING_PTR(str);
8746 char *eptr = RSTRING_END(str);
8747 if (split_type == SPLIT_TYPE_AWK) {
8748 char *bptr = ptr;
8749 int skip = 1;
8750 unsigned int c;
8751
8752 end = beg;
8753 if (is_ascii_string(str)) {
8754 while (ptr < eptr) {
8755 c = (unsigned char)*ptr++;
8756 if (skip) {
8757 if (ascii_isspace(c)) {
8758 beg = ptr - bptr;
8759 }
8760 else {
8761 end = ptr - bptr;
8762 skip = 0;
8763 if (!NIL_P(limit) && lim <= i) break;
8764 }
8765 }
8766 else if (ascii_isspace(c)) {
8767 SPLIT_STR(beg, end-beg);
8768 skip = 1;
8769 beg = ptr - bptr;
8770 if (!NIL_P(limit)) ++i;
8771 }
8772 else {
8773 end = ptr - bptr;
8774 }
8775 }
8776 }
8777 else {
8778 while (ptr < eptr) {
8779 int n;
8780
8781 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8782 ptr += n;
8783 if (skip) {
8784 if (rb_isspace(c)) {
8785 beg = ptr - bptr;
8786 }
8787 else {
8788 end = ptr - bptr;
8789 skip = 0;
8790 if (!NIL_P(limit) && lim <= i) break;
8791 }
8792 }
8793 else if (rb_isspace(c)) {
8794 SPLIT_STR(beg, end-beg);
8795 skip = 1;
8796 beg = ptr - bptr;
8797 if (!NIL_P(limit)) ++i;
8798 }
8799 else {
8800 end = ptr - bptr;
8801 }
8802 }
8803 }
8804 }
8805 else if (split_type == SPLIT_TYPE_STRING) {
8806 char *str_start = ptr;
8807 char *substr_start = ptr;
8808 char *sptr = RSTRING_PTR(spat);
8809 long slen = RSTRING_LEN(spat);
8810
8811 mustnot_broken(str);
8812 enc = rb_enc_check(str, spat);
8813 while (ptr < eptr &&
8814 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8815 /* Check we are at the start of a char */
8816 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8817 if (t != ptr + end) {
8818 ptr = t;
8819 continue;
8820 }
8821 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8822 ptr += end + slen;
8823 substr_start = ptr;
8824 if (!NIL_P(limit) && lim <= ++i) break;
8825 }
8826 beg = ptr - str_start;
8827 }
8828 else if (split_type == SPLIT_TYPE_CHARS) {
8829 char *str_start = ptr;
8830 int n;
8831
8832 mustnot_broken(str);
8833 enc = rb_enc_get(str);
8834 while (ptr < eptr &&
8835 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8836 SPLIT_STR(ptr - str_start, n);
8837 ptr += n;
8838 if (!NIL_P(limit) && lim <= ++i) break;
8839 }
8840 beg = ptr - str_start;
8841 }
8842 else {
8843 long len = RSTRING_LEN(str);
8844 long start = beg;
8845 long idx;
8846 int last_null = 0;
8847 struct re_registers *regs;
8848 VALUE match = 0;
8849
8850 for (; rb_reg_search(spat, str, start, 0) >= 0;
8851 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8852 match = rb_backref_get();
8853 if (!result) rb_match_busy(match);
8854 regs = RMATCH_REGS(match);
8855 end = BEG(0);
8856 if (start == end && BEG(0) == END(0)) {
8857 if (!ptr) {
8858 SPLIT_STR(0, 0);
8859 break;
8860 }
8861 else if (last_null == 1) {
8862 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8863 beg = start;
8864 }
8865 else {
8866 if (start == len)
8867 start++;
8868 else
8869 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8870 last_null = 1;
8871 continue;
8872 }
8873 }
8874 else {
8875 SPLIT_STR(beg, end-beg);
8876 beg = start = END(0);
8877 }
8878 last_null = 0;
8879
8880 for (idx=1; idx < regs->num_regs; idx++) {
8881 if (BEG(idx) == -1) continue;
8882 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8883 }
8884 if (!NIL_P(limit) && lim <= ++i) break;
8885 }
8886 if (match) rb_match_unbusy(match);
8887 }
8888 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8889 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8890 }
8891
8892 return result ? result : str;
8893}
8894
8895VALUE
8896rb_str_split(VALUE str, const char *sep0)
8897{
8898 VALUE sep;
8899
8900 StringValue(str);
8901 sep = rb_str_new_cstr(sep0);
8902 return rb_str_split_m(1, &sep, str);
8903}
8904
/* Yields 0 when a block is given, else a new array with capacity +size+; +m+ is unused. */
#define WANTARRAY(m, size) (rb_block_given_p() ? 0 : rb_ary_new_capa(size))
8906
8907static inline int
8908enumerator_element(VALUE ary, VALUE e)
8909{
8910 if (ary) {
8911 rb_ary_push(ary, e);
8912 return 0;
8913 }
8914 else {
8915 rb_yield(e);
8916 return 1;
8917 }
8918}
8919
8920#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8921
8922static const char *
8923chomp_newline(const char *p, const char *e, rb_encoding *enc)
8924{
8925 const char *prev = rb_enc_prev_char(p, e, e, enc);
8926 if (rb_enc_is_newline(prev, e, enc)) {
8927 e = prev;
8928 prev = rb_enc_prev_char(p, e, e, enc);
8929 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8930 e = prev;
8931 }
8932 return e;
8933}
8934
8935static VALUE
8936get_rs(void)
8937{
8938 VALUE rs = rb_rs;
8939 if (!NIL_P(rs) &&
8940 (!RB_TYPE_P(rs, T_STRING) ||
8941 RSTRING_LEN(rs) != 1 ||
8942 RSTRING_PTR(rs)[0] != '\n')) {
8943 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8944 }
8945 return rs;
8946}
8947
8948#define rb_rs get_rs()
8949
8950static VALUE
8951rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8952{
8953 rb_encoding *enc;
8954 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8955 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8956 long pos, len, rslen;
8957 int rsnewline = 0;
8958
8959 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
8960 rs = rb_rs;
8961 if (!NIL_P(opts)) {
8962 static ID keywords[1];
8963 if (!keywords[0]) {
8964 keywords[0] = rb_intern_const("chomp");
8965 }
8966 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
8967 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
8968 }
8969
8970 if (NIL_P(rs)) {
8971 if (!ENUM_ELEM(ary, str)) {
8972 return ary;
8973 }
8974 else {
8975 return orig;
8976 }
8977 }
8978
8979 if (!RSTRING_LEN(str)) goto end;
8980 str = rb_str_new_frozen(str);
8981 ptr = subptr = RSTRING_PTR(str);
8982 pend = RSTRING_END(str);
8983 len = RSTRING_LEN(str);
8984 StringValue(rs);
8985 rslen = RSTRING_LEN(rs);
8986
8987 if (rs == rb_default_rs)
8988 enc = rb_enc_get(str);
8989 else
8990 enc = rb_enc_check(str, rs);
8991
8992 if (rslen == 0) {
8993 /* paragraph mode */
8994 int n;
8995 const char *eol = NULL;
8996 subend = subptr;
8997 while (subend < pend) {
8998 long chomp_rslen = 0;
8999 do {
9000 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9001 n = 0;
9002 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9003 if (rb_enc_is_newline(subend + n, pend, enc)) {
9004 if (eol == subend) break;
9005 subend += rslen;
9006 if (subptr) {
9007 eol = subend;
9008 chomp_rslen = -rslen;
9009 }
9010 }
9011 else {
9012 if (!subptr) subptr = subend;
9013 subend += rslen;
9014 }
9015 rslen = 0;
9016 } while (subend < pend);
9017 if (!subptr) break;
9018 if (rslen == 0) chomp_rslen = 0;
9019 line = rb_str_subseq(str, subptr - ptr,
9020 subend - subptr + (chomp ? chomp_rslen : rslen));
9021 if (ENUM_ELEM(ary, line)) {
9022 str_mod_check(str, ptr, len);
9023 }
9024 subptr = eol = NULL;
9025 }
9026 goto end;
9027 }
9028 else {
9029 rsptr = RSTRING_PTR(rs);
9030 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9031 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9032 rsnewline = 1;
9033 }
9034 }
9035
9036 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9037 rs = rb_str_new(rsptr, rslen);
9038 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9039 rsptr = RSTRING_PTR(rs);
9040 rslen = RSTRING_LEN(rs);
9041 }
9042
9043 while (subptr < pend) {
9044 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9045 if (pos < 0) break;
9046 hit = subptr + pos;
9047 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9048 if (hit != adjusted) {
9049 subptr = adjusted;
9050 continue;
9051 }
9052 subend = hit += rslen;
9053 if (chomp) {
9054 if (rsnewline) {
9055 subend = chomp_newline(subptr, subend, enc);
9056 }
9057 else {
9058 subend -= rslen;
9059 }
9060 }
9061 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9062 if (ENUM_ELEM(ary, line)) {
9063 str_mod_check(str, ptr, len);
9064 }
9065 subptr = hit;
9066 }
9067
9068 if (subptr != pend) {
9069 if (chomp) {
9070 if (rsnewline) {
9071 pend = chomp_newline(subptr, pend, enc);
9072 }
9073 else if (pend - subptr >= rslen &&
9074 memcmp(pend - rslen, rsptr, rslen) == 0) {
9075 pend -= rslen;
9076 }
9077 }
9078 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9079 ENUM_ELEM(ary, line);
9080 RB_GC_GUARD(str);
9081 }
9082
9083 end:
9084 if (ary)
9085 return ary;
9086 else
9087 return orig;
9088}
9089
9090/*
9091 * call-seq:
9092 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9093 * each_line(line_sep = $/, chomp: false) -> enumerator
9094 *
9095 * :include: doc/string/each_line.rdoc
9096 *
9097 */
9098
9099static VALUE
9100rb_str_each_line(int argc, VALUE *argv, VALUE str)
9101{
9102 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9103 return rb_str_enumerate_lines(argc, argv, str, 0);
9104}
9105
9106/*
9107 * call-seq:
9108 * lines(line_sep = $/, chomp: false) -> array_of_strings
9109 *
9110 * Forms substrings ("lines") of +self+ according to the given arguments
9111 * (see String#each_line for details); returns the lines in an array.
9112 *
9113 */
9114
9115static VALUE
9116rb_str_lines(int argc, VALUE *argv, VALUE str)
9117{
9118 VALUE ary = WANTARRAY("lines", 0);
9119 return rb_str_enumerate_lines(argc, argv, str, ary);
9120}
9121
9122static VALUE
9123rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9124{
9125 return LONG2FIX(RSTRING_LEN(str));
9126}
9127
9128static VALUE
9129rb_str_enumerate_bytes(VALUE str, VALUE ary)
9130{
9131 long i;
9132
9133 for (i=0; i<RSTRING_LEN(str); i++) {
9134 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9135 }
9136 if (ary)
9137 return ary;
9138 else
9139 return str;
9140}
9141
9142/*
9143 * call-seq:
9144 * each_byte {|byte| ... } -> self
9145 * each_byte -> enumerator
9146 *
9147 * :include: doc/string/each_byte.rdoc
9148 *
9149 */
9150
9151static VALUE
9152rb_str_each_byte(VALUE str)
9153{
9154 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9155 return rb_str_enumerate_bytes(str, 0);
9156}
9157
9158/*
9159 * call-seq:
9160 * bytes -> array_of_bytes
9161 *
9162 * :include: doc/string/bytes.rdoc
9163 *
9164 */
9165
9166static VALUE
9167rb_str_bytes(VALUE str)
9168{
9169 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9170 return rb_str_enumerate_bytes(str, ary);
9171}
9172
9173static VALUE
9174rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9175{
9176 return rb_str_length(str);
9177}
9178
9179static VALUE
9180rb_str_enumerate_chars(VALUE str, VALUE ary)
9181{
9182 VALUE orig = str;
9183 long i, len, n;
9184 const char *ptr;
9185 rb_encoding *enc;
9186
9187 str = rb_str_new_frozen(str);
9188 ptr = RSTRING_PTR(str);
9189 len = RSTRING_LEN(str);
9190 enc = rb_enc_get(str);
9191
9193 for (i = 0; i < len; i += n) {
9194 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9195 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9196 }
9197 }
9198 else {
9199 for (i = 0; i < len; i += n) {
9200 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9201 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9202 }
9203 }
9204 RB_GC_GUARD(str);
9205 if (ary)
9206 return ary;
9207 else
9208 return orig;
9209}
9210
9211/*
9212 * call-seq:
9213 * each_char {|c| ... } -> self
9214 * each_char -> enumerator
9215 *
9216 * :include: doc/string/each_char.rdoc
9217 *
9218 */
9219
9220static VALUE
9221rb_str_each_char(VALUE str)
9222{
9223 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9224 return rb_str_enumerate_chars(str, 0);
9225}
9226
9227/*
9228 * call-seq:
9229 * chars -> array_of_characters
9230 *
9231 * :include: doc/string/chars.rdoc
9232 *
9233 */
9234
9235static VALUE
9236rb_str_chars(VALUE str)
9237{
9238 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9239 return rb_str_enumerate_chars(str, ary);
9240}
9241
9242static VALUE
9243rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9244{
9245 VALUE orig = str;
9246 int n;
9247 unsigned int c;
9248 const char *ptr, *end;
9249 rb_encoding *enc;
9250
9251 if (single_byte_optimizable(str))
9252 return rb_str_enumerate_bytes(str, ary);
9253
9254 str = rb_str_new_frozen(str);
9255 ptr = RSTRING_PTR(str);
9256 end = RSTRING_END(str);
9257 enc = STR_ENC_GET(str);
9258
9259 while (ptr < end) {
9260 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9261 ENUM_ELEM(ary, UINT2NUM(c));
9262 ptr += n;
9263 }
9264 RB_GC_GUARD(str);
9265 if (ary)
9266 return ary;
9267 else
9268 return orig;
9269}
9270
9271/*
9272 * call-seq:
9273 * each_codepoint {|integer| ... } -> self
9274 * each_codepoint -> enumerator
9275 *
9276 * :include: doc/string/each_codepoint.rdoc
9277 *
9278 */
9279
9280static VALUE
9281rb_str_each_codepoint(VALUE str)
9282{
9283 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9284 return rb_str_enumerate_codepoints(str, 0);
9285}
9286
9287/*
9288 * call-seq:
9289 * codepoints -> array_of_integers
9290 *
9291 * :include: doc/string/codepoints.rdoc
9292 *
9293 */
9294
9295static VALUE
9296rb_str_codepoints(VALUE str)
9297{
9298 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9299 return rb_str_enumerate_codepoints(str, ary);
9300}
9301
9302static regex_t *
9303get_reg_grapheme_cluster(rb_encoding *enc)
9304{
9305 int encidx = rb_enc_to_index(enc);
9306
9307 const OnigUChar source_ascii[] = "\\X";
9308 const OnigUChar *source = source_ascii;
9309 size_t source_len = sizeof(source_ascii) - 1;
9310
9311 switch (encidx) {
9312#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9313#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9314#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9315#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9316#define CASE_UTF(e) \
9317 case ENCINDEX_UTF_##e: { \
9318 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9319 source = source_UTF_##e; \
9320 source_len = sizeof(source_UTF_##e); \
9321 break; \
9322 }
9323 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9324#undef CASE_UTF
9325#undef CHARS_16BE
9326#undef CHARS_16LE
9327#undef CHARS_32BE
9328#undef CHARS_32LE
9329 }
9330
9331 regex_t *reg_grapheme_cluster;
9332 OnigErrorInfo einfo;
9333 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9334 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9335 if (r) {
9336 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9337 onig_error_code_to_str(message, r, &einfo);
9338 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9339 }
9340
9341 return reg_grapheme_cluster;
9342}
9343
9344static regex_t *
9345get_cached_reg_grapheme_cluster(rb_encoding *enc)
9346{
9347 int encidx = rb_enc_to_index(enc);
9348 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9349
9350 if (encidx == rb_utf8_encindex()) {
9351 if (!reg_grapheme_cluster_utf8) {
9352 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9353 }
9354
9355 return reg_grapheme_cluster_utf8;
9356 }
9357
9358 return NULL;
9359}
9360
9361static VALUE
9362rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9363{
9364 size_t grapheme_cluster_count = 0;
9365 rb_encoding *enc = get_encoding(str);
9366 const char *ptr, *end;
9367
9368 if (!rb_enc_unicode_p(enc)) {
9369 return rb_str_length(str);
9370 }
9371
9372 bool cached_reg_grapheme_cluster = true;
9373 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9374 if (!reg_grapheme_cluster) {
9375 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9376 cached_reg_grapheme_cluster = false;
9377 }
9378
9379 ptr = RSTRING_PTR(str);
9380 end = RSTRING_END(str);
9381
9382 while (ptr < end) {
9383 OnigPosition len = onig_match(reg_grapheme_cluster,
9384 (const OnigUChar *)ptr, (const OnigUChar *)end,
9385 (const OnigUChar *)ptr, NULL, 0);
9386 if (len <= 0) break;
9387 grapheme_cluster_count++;
9388 ptr += len;
9389 }
9390
9391 if (!cached_reg_grapheme_cluster) {
9392 onig_free(reg_grapheme_cluster);
9393 }
9394
9395 return SIZET2NUM(grapheme_cluster_count);
9396}
9397
9398static VALUE
9399rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9400{
9401 VALUE orig = str;
9402 rb_encoding *enc = get_encoding(str);
9403 const char *ptr0, *ptr, *end;
9404
9405 if (!rb_enc_unicode_p(enc)) {
9406 return rb_str_enumerate_chars(str, ary);
9407 }
9408
9409 if (!ary) str = rb_str_new_frozen(str);
9410
9411 bool cached_reg_grapheme_cluster = true;
9412 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9413 if (!reg_grapheme_cluster) {
9414 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9415 cached_reg_grapheme_cluster = false;
9416 }
9417
9418 ptr0 = ptr = RSTRING_PTR(str);
9419 end = RSTRING_END(str);
9420
9421 while (ptr < end) {
9422 OnigPosition len = onig_match(reg_grapheme_cluster,
9423 (const OnigUChar *)ptr, (const OnigUChar *)end,
9424 (const OnigUChar *)ptr, NULL, 0);
9425 if (len <= 0) break;
9426 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9427 ptr += len;
9428 }
9429
9430 if (!cached_reg_grapheme_cluster) {
9431 onig_free(reg_grapheme_cluster);
9432 }
9433
9434 RB_GC_GUARD(str);
9435 if (ary)
9436 return ary;
9437 else
9438 return orig;
9439}
9440
9441/*
9442 * call-seq:
9443 * each_grapheme_cluster {|gc| ... } -> self
9444 * each_grapheme_cluster -> enumerator
9445 *
9446 * :include: doc/string/each_grapheme_cluster.rdoc
9447 *
9448 */
9449
9450static VALUE
9451rb_str_each_grapheme_cluster(VALUE str)
9452{
9453 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9454 return rb_str_enumerate_grapheme_clusters(str, 0);
9455}
9456
9457/*
9458 * call-seq:
9459 * grapheme_clusters -> array_of_grapheme_clusters
9460 *
9461 * :include: doc/string/grapheme_clusters.rdoc
9462 *
9463 */
9464
9465static VALUE
9466rb_str_grapheme_clusters(VALUE str)
9467{
9468 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9469 return rb_str_enumerate_grapheme_clusters(str, ary);
9470}
9471
9472static long
9473chopped_length(VALUE str)
9474{
9475 rb_encoding *enc = STR_ENC_GET(str);
9476 const char *p, *p2, *beg, *end;
9477
9478 beg = RSTRING_PTR(str);
9479 end = beg + RSTRING_LEN(str);
9480 if (beg >= end) return 0;
9481 p = rb_enc_prev_char(beg, end, end, enc);
9482 if (!p) return 0;
9483 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9484 p2 = rb_enc_prev_char(beg, p, end, enc);
9485 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9486 }
9487 return p - beg;
9488}
9489
9490/*
9491 * call-seq:
9492 * chop! -> self or nil
9493 *
9494 * Like String#chop, but modifies +self+ in place;
9495 * returns +nil+ if +self+ is empty, +self+ otherwise.
9496 *
9497 * Related: String#chomp!.
9498 */
9499
9500static VALUE
9501rb_str_chop_bang(VALUE str)
9502{
9503 str_modify_keep_cr(str);
9504 if (RSTRING_LEN(str) > 0) {
9505 long len;
9506 len = chopped_length(str);
9507 STR_SET_LEN(str, len);
9508 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9509 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9511 }
9512 return str;
9513 }
9514 return Qnil;
9515}
9516
9517
9518/*
9519 * call-seq:
9520 * chop -> new_string
9521 *
9522 * :include: doc/string/chop.rdoc
9523 *
9524 */
9525
9526static VALUE
9527rb_str_chop(VALUE str)
9528{
9529 return rb_str_subseq(str, 0, chopped_length(str));
9530}
9531
9532static long
9533smart_chomp(VALUE str, const char *e, const char *p)
9534{
9535 rb_encoding *enc = rb_enc_get(str);
9536 if (rb_enc_mbminlen(enc) > 1) {
9537 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9538 if (rb_enc_is_newline(pp, e, enc)) {
9539 e = pp;
9540 }
9541 pp = e - rb_enc_mbminlen(enc);
9542 if (pp >= p) {
9543 pp = rb_enc_left_char_head(p, pp, e, enc);
9544 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9545 e = pp;
9546 }
9547 }
9548 }
9549 else {
9550 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9551 case '\n':
9552 if (--e > p && *(e-1) == '\r') {
9553 --e;
9554 }
9555 break;
9556 case '\r':
9557 --e;
9558 break;
9559 }
9560 }
9561 return e - p;
9562}
9563
9564static long
9565chompped_length(VALUE str, VALUE rs)
9566{
9567 rb_encoding *enc;
9568 int newline;
9569 char *pp, *e, *rsptr;
9570 long rslen;
9571 char *const p = RSTRING_PTR(str);
9572 long len = RSTRING_LEN(str);
9573
9574 if (len == 0) return 0;
9575 e = p + len;
9576 if (rs == rb_default_rs) {
9577 return smart_chomp(str, e, p);
9578 }
9579
9580 enc = rb_enc_get(str);
9581 RSTRING_GETMEM(rs, rsptr, rslen);
9582 if (rslen == 0) {
9583 if (rb_enc_mbminlen(enc) > 1) {
9584 while (e > p) {
9585 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9586 if (!rb_enc_is_newline(pp, e, enc)) break;
9587 e = pp;
9588 pp -= rb_enc_mbminlen(enc);
9589 if (pp >= p) {
9590 pp = rb_enc_left_char_head(p, pp, e, enc);
9591 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9592 e = pp;
9593 }
9594 }
9595 }
9596 }
9597 else {
9598 while (e > p && *(e-1) == '\n') {
9599 --e;
9600 if (e > p && *(e-1) == '\r')
9601 --e;
9602 }
9603 }
9604 return e - p;
9605 }
9606 if (rslen > len) return len;
9607
9608 enc = rb_enc_get(rs);
9609 newline = rsptr[rslen-1];
9610 if (rslen == rb_enc_mbminlen(enc)) {
9611 if (rslen == 1) {
9612 if (newline == '\n')
9613 return smart_chomp(str, e, p);
9614 }
9615 else {
9616 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9617 return smart_chomp(str, e, p);
9618 }
9619 }
9620
9621 enc = rb_enc_check(str, rs);
9622 if (is_broken_string(rs)) {
9623 return len;
9624 }
9625 pp = e - rslen;
9626 if (p[len-1] == newline &&
9627 (rslen <= 1 ||
9628 memcmp(rsptr, pp, rslen) == 0)) {
9629 if (rb_enc_left_char_head(p, pp, e, enc) == pp)
9630 return len - rslen;
9631 RB_GC_GUARD(rs);
9632 }
9633 return len;
9634}
9635
9641static VALUE
9642chomp_rs(int argc, const VALUE *argv)
9643{
9644 rb_check_arity(argc, 0, 1);
9645 if (argc > 0) {
9646 VALUE rs = argv[0];
9647 if (!NIL_P(rs)) StringValue(rs);
9648 return rs;
9649 }
9650 else {
9651 return rb_rs;
9652 }
9653}
9654
9655VALUE
9656rb_str_chomp_string(VALUE str, VALUE rs)
9657{
9658 long olen = RSTRING_LEN(str);
9659 long len = chompped_length(str, rs);
9660 if (len >= olen) return Qnil;
9661 str_modify_keep_cr(str);
9662 STR_SET_LEN(str, len);
9663 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9664 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9666 }
9667 return str;
9668}
9669
9670/*
9671 * call-seq:
9672 * chomp!(line_sep = $/) -> self or nil
9673 *
9674 * Like String#chomp, but modifies +self+ in place;
9675 * returns +nil+ if no modification made, +self+ otherwise.
9676 *
9677 */
9678
9679static VALUE
9680rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9681{
9682 VALUE rs;
9683 str_modifiable(str);
9684 if (RSTRING_LEN(str) == 0) return Qnil;
9685 rs = chomp_rs(argc, argv);
9686 if (NIL_P(rs)) return Qnil;
9687 return rb_str_chomp_string(str, rs);
9688}
9689
9690
9691/*
9692 * call-seq:
9693 * chomp(line_sep = $/) -> new_string
9694 *
9695 * :include: doc/string/chomp.rdoc
9696 *
9697 */
9698
9699static VALUE
9700rb_str_chomp(int argc, VALUE *argv, VALUE str)
9701{
9702 VALUE rs = chomp_rs(argc, argv);
9703 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9704 return rb_str_subseq(str, 0, chompped_length(str, rs));
9705}
9706
9707static long
9708lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9709{
9710 const char *const start = s;
9711
9712 if (!s || s >= e) return 0;
9713
9714 /* remove spaces at head */
9715 if (single_byte_optimizable(str)) {
9716 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9717 }
9718 else {
9719 while (s < e) {
9720 int n;
9721 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9722
9723 if (cc && !rb_isspace(cc)) break;
9724 s += n;
9725 }
9726 }
9727 return s - start;
9728}
9729
9730/*
9731 * call-seq:
9732 * lstrip! -> self or nil
9733 *
9734 * Like String#lstrip, except that any modifications are made in +self+;
9735 * returns +self+ if any modifications are made, +nil+ otherwise.
9736 *
9737 * Related: String#rstrip!, String#strip!.
9738 */
9739
9740static VALUE
9741rb_str_lstrip_bang(VALUE str)
9742{
9743 rb_encoding *enc;
9744 char *start, *s;
9745 long olen, loffset;
9746
9747 str_modify_keep_cr(str);
9748 enc = STR_ENC_GET(str);
9749 RSTRING_GETMEM(str, start, olen);
9750 loffset = lstrip_offset(str, start, start+olen, enc);
9751 if (loffset > 0) {
9752 long len = olen-loffset;
9753 s = start + loffset;
9754 memmove(start, s, len);
9755 STR_SET_LEN(str, len);
9756 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9757 return str;
9758 }
9759 return Qnil;
9760}
9761
9762
9763/*
9764 * call-seq:
9765 * lstrip -> new_string
9766 *
9767 * Returns a copy of +self+ with leading whitespace removed;
9768 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9769 *
9770 * whitespace = "\x00\t\n\v\f\r "
9771 * s = whitespace + 'abc' + whitespace
9772 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9773 * s.lstrip # => "abc\u0000\t\n\v\f\r "
9774 *
9775 * Related: String#rstrip, String#strip.
9776 */
9777
9778static VALUE
9779rb_str_lstrip(VALUE str)
9780{
9781 char *start;
9782 long len, loffset;
9783 RSTRING_GETMEM(str, start, len);
9784 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9785 if (loffset <= 0) return str_duplicate(rb_cString, str);
9786 return rb_str_subseq(str, loffset, len - loffset);
9787}
9788
9789static long
9790rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9791{
9792 const char *t;
9793
9794 rb_str_check_dummy_enc(enc);
9796 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
9797 }
9798 if (!s || s >= e) return 0;
9799 t = e;
9800
9801 /* remove trailing spaces or '\0's */
9802 if (single_byte_optimizable(str)) {
9803 unsigned char c;
9804 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9805 }
9806 else {
9807 char *tp;
9808
9809 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9810 unsigned int c = rb_enc_codepoint(tp, e, enc);
9811 if (c && !rb_isspace(c)) break;
9812 t = tp;
9813 }
9814 }
9815 return e - t;
9816}
9817
9818/*
9819 * call-seq:
9820 * rstrip! -> self or nil
9821 *
9822 * Like String#rstrip, except that any modifications are made in +self+;
9823 * returns +self+ if any modifications are made, +nil+ otherwise.
9824 *
9825 * Related: String#lstrip!, String#strip!.
9826 */
9827
9828static VALUE
9829rb_str_rstrip_bang(VALUE str)
9830{
9831 rb_encoding *enc;
9832 char *start;
9833 long olen, roffset;
9834
9835 str_modify_keep_cr(str);
9836 enc = STR_ENC_GET(str);
9837 RSTRING_GETMEM(str, start, olen);
9838 roffset = rstrip_offset(str, start, start+olen, enc);
9839 if (roffset > 0) {
9840 long len = olen - roffset;
9841
9842 STR_SET_LEN(str, len);
9843 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9844 return str;
9845 }
9846 return Qnil;
9847}
9848
9849
9850/*
9851 * call-seq:
9852 * rstrip -> new_string
9853 *
9854 * Returns a copy of the receiver with trailing whitespace removed;
9855 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9856 *
9857 * whitespace = "\x00\t\n\v\f\r "
9858 * s = whitespace + 'abc' + whitespace
9859 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9860 * s.rstrip # => "\u0000\t\n\v\f\r abc"
9861 *
9862 * Related: String#lstrip, String#strip.
9863 */
9864
9865static VALUE
9866rb_str_rstrip(VALUE str)
9867{
9868 rb_encoding *enc;
9869 char *start;
9870 long olen, roffset;
9871
9872 enc = STR_ENC_GET(str);
9873 RSTRING_GETMEM(str, start, olen);
9874 roffset = rstrip_offset(str, start, start+olen, enc);
9875
9876 if (roffset <= 0) return str_duplicate(rb_cString, str);
9877 return rb_str_subseq(str, 0, olen-roffset);
9878}
9879
9880
9881/*
9882 * call-seq:
9883 * strip! -> self or nil
9884 *
9885 * Like String#strip, except that any modifications are made in +self+;
9886 * returns +self+ if any modifications are made, +nil+ otherwise.
9887 *
9888 * Related: String#lstrip!, String#rstrip!.
9889 */
9890
9891static VALUE
9892rb_str_strip_bang(VALUE str)
9893{
9894 char *start;
9895 long olen, loffset, roffset;
9896 rb_encoding *enc;
9897
9898 str_modify_keep_cr(str);
9899 enc = STR_ENC_GET(str);
9900 RSTRING_GETMEM(str, start, olen);
9901 loffset = lstrip_offset(str, start, start+olen, enc);
9902 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9903
9904 if (loffset > 0 || roffset > 0) {
9905 long len = olen-roffset;
9906 if (loffset > 0) {
9907 len -= loffset;
9908 memmove(start, start + loffset, len);
9909 }
9910 STR_SET_LEN(str, len);
9911 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9912 return str;
9913 }
9914 return Qnil;
9915}
9916
9917
9918/*
9919 * call-seq:
9920 * strip -> new_string
9921 *
9922 * Returns a copy of the receiver with leading and trailing whitespace removed;
9923 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9924 *
9925 * whitespace = "\x00\t\n\v\f\r "
9926 * s = whitespace + 'abc' + whitespace
9927 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9928 * s.strip # => "abc"
9929 *
9930 * Related: String#lstrip, String#rstrip.
9931 */
9932
9933static VALUE
9934rb_str_strip(VALUE str)
9935{
9936 char *start;
9937 long olen, loffset, roffset;
9938 rb_encoding *enc = STR_ENC_GET(str);
9939
9940 RSTRING_GETMEM(str, start, olen);
9941 loffset = lstrip_offset(str, start, start+olen, enc);
9942 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9943
9944 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9945 return rb_str_subseq(str, loffset, olen-loffset-roffset);
9946}
9947
9948static VALUE
9949scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9950{
9951 VALUE result, match;
9952 struct re_registers *regs;
9953 int i;
9954 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9955 if (pos >= 0) {
9956 if (BUILTIN_TYPE(pat) == T_STRING) {
9957 regs = NULL;
9958 end = pos + RSTRING_LEN(pat);
9959 }
9960 else {
9961 match = rb_backref_get();
9962 regs = RMATCH_REGS(match);
9963 pos = BEG(0);
9964 end = END(0);
9965 }
9966 if (pos == end) {
9967 rb_encoding *enc = STR_ENC_GET(str);
9968 /*
9969 * Always consume at least one character of the input string
9970 */
9971 if (RSTRING_LEN(str) > end)
9972 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9973 RSTRING_END(str), enc);
9974 else
9975 *start = end + 1;
9976 }
9977 else {
9978 *start = end;
9979 }
9980 if (!regs || regs->num_regs == 1) {
9981 result = rb_str_subseq(str, pos, end - pos);
9982 return result;
9983 }
9984 result = rb_ary_new2(regs->num_regs);
9985 for (i=1; i < regs->num_regs; i++) {
9986 VALUE s = Qnil;
9987 if (BEG(i) >= 0) {
9988 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9989 }
9990 rb_ary_push(result, s);
9991 }
9992
9993 return result;
9994 }
9995 return Qnil;
9996}
9997
9998
9999/*
10000 * call-seq:
10001 * scan(string_or_regexp) -> array
10002 * scan(string_or_regexp) {|matches| ... } -> self
10003 *
10004 * Matches a pattern against +self+; the pattern is:
10005 *
10006 * - +string_or_regexp+ itself, if it is a Regexp.
10007 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10008 *
10009 * Iterates through +self+, generating a collection of matching results:
10010 *
10011 * - If the pattern contains no groups, each result is the
10012 * matched string, <code>$&</code>.
10013 * - If the pattern contains groups, each result is an array
10014 * containing one entry per group.
10015 *
10016 * With no block given, returns an array of the results:
10017 *
10018 * s = 'cruel world'
10019 * s.scan(/\w+/) # => ["cruel", "world"]
10020 * s.scan(/.../) # => ["cru", "el ", "wor"]
10021 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10022 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10023 *
10024 * With a block given, calls the block with each result; returns +self+:
10025 *
10026 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10027 * print "\n"
10028 * s.scan(/(.)(.)/) {|x,y| print y, x }
10029 * print "\n"
10030 *
10031 * Output:
10032 *
10033 * <<cruel>> <<world>>
10034 * rceu lowlr
10035 *
10036 */
10037
10038static VALUE
10039rb_str_scan(VALUE str, VALUE pat)
10040{
10041 VALUE result;
10042 long start = 0;
10043 long last = -1, prev = 0;
10044 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10045
10046 pat = get_pat_quoted(pat, 1);
10047 mustnot_broken(str);
10048 if (!rb_block_given_p()) {
10049 VALUE ary = rb_ary_new();
10050
10051 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10052 last = prev;
10053 prev = start;
10054 rb_ary_push(ary, result);
10055 }
10056 if (last >= 0) rb_pat_search(pat, str, last, 1);
10057 else rb_backref_set(Qnil);
10058 return ary;
10059 }
10060
10061 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10062 last = prev;
10063 prev = start;
10064 rb_yield(result);
10065 str_mod_check(str, p, len);
10066 }
10067 if (last >= 0) rb_pat_search(pat, str, last, 1);
10068 return str;
10069}
10070
10071
10072/*
10073 * call-seq:
10074 * hex -> integer
10075 *
10076 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10077 * (with an optional sign and an optional <code>0x</code>) and returns the
10078 * corresponding number;
10079 * returns zero if there is no such leading substring:
10080 *
10081 * '0x0a'.hex # => 10
10082 * '-1234'.hex # => -4660
10083 * '0'.hex # => 0
10084 * 'non-numeric'.hex # => 0
10085 *
10086 * Related: String#oct.
10087 *
10088 */
10089
10090static VALUE
10091rb_str_hex(VALUE str)
10092{
10093 return rb_str_to_inum(str, 16, FALSE);
10094}
10095
10096
10097/*
10098 * call-seq:
10099 * oct -> integer
10100 *
10101 * Interprets the leading substring of +self+ as a string of octal digits
10102 * (with an optional sign) and returns the corresponding number;
10103 * returns zero if there is no such leading substring:
10104 *
10105 * '123'.oct # => 83
10106 * '-377'.oct # => -255
10107 * '0377non-numeric'.oct # => 255
10108 * 'non-numeric'.oct # => 0
10109 *
10110 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10111 * see Kernel#Integer.
10112 *
10113 * Related: String#hex.
10114 *
10115 */
10116
10117static VALUE
10118rb_str_oct(VALUE str)
10119{
10120 return rb_str_to_inum(str, -8, FALSE);
10121}
10122
10123#ifndef HAVE_CRYPT_R
10124# include "ruby/thread_native.h"
10125# include "ruby/atomic.h"
10126
10127static struct {
10128 rb_nativethread_lock_t lock;
10129} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10130
10131static void
10132crypt_mutex_initialize(void)
10133{
10134}
10135#endif
10136
10137/*
10138 * call-seq:
10139 * crypt(salt_str) -> new_string
10140 *
10141 * Returns the string generated by calling <code>crypt(3)</code>
10142 * standard library function with <code>str</code> and
10143 * <code>salt_str</code>, in this order, as its arguments. Please do
10144 * not use this method any longer. It is legacy; provided only for
10145 * backward compatibility with ruby scripts in earlier days. It is
10146 * bad to use in contemporary programs for several reasons:
10147 *
10148 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10149 * run. The generated string lacks data portability.
10150 *
10151 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10152 * (i.e. silently ends up in unexpected results).
10153 *
10154 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10155 * thread safe.
10156 *
10157 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10158 * very very weak. According to its manpage, Linux's traditional
10159 * <code>crypt(3)</code> output has only 2**56 variations; too
10160 * easy to brute force today. And this is the default behaviour.
10161 *
10162 * * In order to make things robust some OSes implement so-called
10163 * "modular" usage. To go through, you have to do a complex
10164 * build-up of the <code>salt_str</code> parameter, by hand.
10165 * Failure in generation of a proper salt string tends not to
10166 * yield any errors; typos in parameters are normally not
10167 * detectable.
10168 *
10169 * * For instance, in the following example, the second invocation
10170 * of String#crypt is wrong; it has a typo in "round=" (lacks
10171 * "s"). However the call does not fail and something unexpected
10172 * is generated.
10173 *
10174 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10175 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10176 *
10177 * * Even in the "modular" mode, some hash functions are considered
10178 * archaic and no longer recommended at all; for instance module
10179 * <code>$1$</code> is officially abandoned by its author: see
10180 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10181 * instance module <code>$3$</code> is considered completely
10182 * broken: see the manpage of FreeBSD.
10183 *
10184 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10185 * written above, <code>crypt(3)</code> on Mac OS never fails.
10186 * This means even if you build up a proper salt string it
10187 * generates a traditional DES hash anyways, and there is no way
10188 * for you to be aware of.
10189 *
10190 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10191 *
10192 * If for some reason you cannot migrate to other secure contemporary
10193 * password hashing algorithms, install the string-crypt gem and
10194 * <code>require 'string/crypt'</code> to continue using it.
10195 */
10196
10197static VALUE
10198rb_str_crypt(VALUE str, VALUE salt)
10199{
10200#ifdef HAVE_CRYPT_R
10201 VALUE databuf;
10202 struct crypt_data *data;
10203# define CRYPT_END() ALLOCV_END(databuf)
10204#else
10205 extern char *crypt(const char *, const char *);
10206# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10207#endif
10208 VALUE result;
10209 const char *s, *saltp;
10210 char *res;
10211#ifdef BROKEN_CRYPT
10212 char salt_8bit_clean[3];
10213#endif
10214
10215 StringValue(salt);
10216 mustnot_wchar(str);
10217 mustnot_wchar(salt);
10218 s = StringValueCStr(str);
10219 saltp = RSTRING_PTR(salt);
10220 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10221 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10222 }
10223
10224#ifdef BROKEN_CRYPT
10225 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10226 salt_8bit_clean[0] = saltp[0] & 0x7f;
10227 salt_8bit_clean[1] = saltp[1] & 0x7f;
10228 salt_8bit_clean[2] = '\0';
10229 saltp = salt_8bit_clean;
10230 }
10231#endif
10232#ifdef HAVE_CRYPT_R
10233 data = ALLOCV(databuf, sizeof(struct crypt_data));
10234# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10235 data->initialized = 0;
10236# endif
10237 res = crypt_r(s, saltp, data);
10238#else
10239 crypt_mutex_initialize();
10240 rb_nativethread_lock_lock(&crypt_mutex.lock);
10241 res = crypt(s, saltp);
10242#endif
10243 if (!res) {
10244 int err = errno;
10245 CRYPT_END();
10246 rb_syserr_fail(err, "crypt");
10247 }
10248 result = rb_str_new_cstr(res);
10249 CRYPT_END();
10250 return result;
10251}
10252
10253
10254/*
10255 * call-seq:
10256 * ord -> integer
10257 *
10258 * :include: doc/string/ord.rdoc
10259 *
10260 */
10261
10262static VALUE
10263rb_str_ord(VALUE s)
10264{
10265 unsigned int c;
10266
10267 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10268 return UINT2NUM(c);
10269}
10270/*
10271 * call-seq:
10272 * sum(n = 16) -> integer
10273 *
10274 * :include: doc/string/sum.rdoc
10275 *
10276 */
10277
10278static VALUE
10279rb_str_sum(int argc, VALUE *argv, VALUE str)
10280{
10281 int bits = 16;
10282 char *ptr, *p, *pend;
10283 long len;
10284 VALUE sum = INT2FIX(0);
10285 unsigned long sum0 = 0;
10286
10287 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10288 bits = 0;
10289 }
10290 ptr = p = RSTRING_PTR(str);
10291 len = RSTRING_LEN(str);
10292 pend = p + len;
10293
10294 while (p < pend) {
10295 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10296 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10297 str_mod_check(str, ptr, len);
10298 sum0 = 0;
10299 }
10300 sum0 += (unsigned char)*p;
10301 p++;
10302 }
10303
10304 if (bits == 0) {
10305 if (sum0) {
10306 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10307 }
10308 }
10309 else {
10310 if (sum == INT2FIX(0)) {
10311 if (bits < (int)sizeof(long)*CHAR_BIT) {
10312 sum0 &= (((unsigned long)1)<<bits)-1;
10313 }
10314 sum = LONG2FIX(sum0);
10315 }
10316 else {
10317 VALUE mod;
10318
10319 if (sum0) {
10320 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10321 }
10322
10323 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10324 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10325 sum = rb_funcall(sum, '&', 1, mod);
10326 }
10327 }
10328 return sum;
10329}
10330
10331static VALUE
10332rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10333{
10334 rb_encoding *enc;
10335 VALUE w;
10336 long width, len, flen = 1, fclen = 1;
10337 VALUE res;
10338 char *p;
10339 const char *f = " ";
10340 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10341 VALUE pad;
10342 int singlebyte = 1, cr;
10343 int termlen;
10344
10345 rb_scan_args(argc, argv, "11", &w, &pad);
10346 enc = STR_ENC_GET(str);
10347 termlen = rb_enc_mbminlen(enc);
10348 width = NUM2LONG(w);
10349 if (argc == 2) {
10350 StringValue(pad);
10351 enc = rb_enc_check(str, pad);
10352 f = RSTRING_PTR(pad);
10353 flen = RSTRING_LEN(pad);
10354 fclen = str_strlen(pad, enc); /* rb_enc_check */
10355 singlebyte = single_byte_optimizable(pad);
10356 if (flen == 0 || fclen == 0) {
10357 rb_raise(rb_eArgError, "zero width padding");
10358 }
10359 }
10360 len = str_strlen(str, enc); /* rb_enc_check */
10361 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10362 n = width - len;
10363 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10364 rlen = n - llen;
10365 cr = ENC_CODERANGE(str);
10366 if (flen > 1) {
10367 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10368 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10369 }
10370 size = RSTRING_LEN(str);
10371 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10372 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10373 (len += llen2 + rlen2) >= LONG_MAX - size) {
10374 rb_raise(rb_eArgError, "argument too big");
10375 }
10376 len += size;
10377 res = str_new0(rb_cString, 0, len, termlen);
10378 p = RSTRING_PTR(res);
10379 if (flen <= 1) {
10380 memset(p, *f, llen);
10381 p += llen;
10382 }
10383 else {
10384 while (llen >= fclen) {
10385 memcpy(p,f,flen);
10386 p += flen;
10387 llen -= fclen;
10388 }
10389 if (llen > 0) {
10390 memcpy(p, f, llen2);
10391 p += llen2;
10392 }
10393 }
10394 memcpy(p, RSTRING_PTR(str), size);
10395 p += size;
10396 if (flen <= 1) {
10397 memset(p, *f, rlen);
10398 p += rlen;
10399 }
10400 else {
10401 while (rlen >= fclen) {
10402 memcpy(p,f,flen);
10403 p += flen;
10404 rlen -= fclen;
10405 }
10406 if (rlen > 0) {
10407 memcpy(p, f, rlen2);
10408 p += rlen2;
10409 }
10410 }
10411 TERM_FILL(p, termlen);
10412 STR_SET_LEN(res, p-RSTRING_PTR(res));
10413 rb_enc_associate(res, enc);
10414 if (argc == 2)
10415 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10416 if (cr != ENC_CODERANGE_BROKEN)
10417 ENC_CODERANGE_SET(res, cr);
10418
10419 RB_GC_GUARD(pad);
10420 return res;
10421}
10422
10423
10424/*
10425 * call-seq:
10426 * ljust(size, pad_string = ' ') -> new_string
10427 *
10428 * :include: doc/string/ljust.rdoc
10429 *
10430 * Related: String#rjust, String#center.
10431 *
10432 */
10433
10434static VALUE
10435rb_str_ljust(int argc, VALUE *argv, VALUE str)
10436{
10437 return rb_str_justify(argc, argv, str, 'l');
10438}
10439
10440/*
10441 * call-seq:
10442 * rjust(size, pad_string = ' ') -> new_string
10443 *
10444 * :include: doc/string/rjust.rdoc
10445 *
10446 * Related: String#ljust, String#center.
10447 *
10448 */
10449
10450static VALUE
10451rb_str_rjust(int argc, VALUE *argv, VALUE str)
10452{
10453 return rb_str_justify(argc, argv, str, 'r');
10454}
10455
10456
10457/*
10458 * call-seq:
10459 * center(size, pad_string = ' ') -> new_string
10460 *
10461 * :include: doc/string/center.rdoc
10462 *
10463 * Related: String#ljust, String#rjust.
10464 *
10465 */
10466
10467static VALUE
10468rb_str_center(int argc, VALUE *argv, VALUE str)
10469{
10470 return rb_str_justify(argc, argv, str, 'c');
10471}
10472
10473/*
10474 * call-seq:
10475 * partition(string_or_regexp) -> [head, match, tail]
10476 *
10477 * :include: doc/string/partition.rdoc
10478 *
10479 */
10480
10481static VALUE
10482rb_str_partition(VALUE str, VALUE sep)
10483{
10484 long pos;
10485
10486 sep = get_pat_quoted(sep, 0);
10487 if (RB_TYPE_P(sep, T_REGEXP)) {
10488 if (rb_reg_search(sep, str, 0, 0) < 0) {
10489 goto failed;
10490 }
10491 VALUE match = rb_backref_get();
10492 struct re_registers *regs = RMATCH_REGS(match);
10493
10494 pos = BEG(0);
10495 sep = rb_str_subseq(str, pos, END(0) - pos);
10496 }
10497 else {
10498 pos = rb_str_index(str, sep, 0);
10499 if (pos < 0) goto failed;
10500 }
10501 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10502 sep,
10503 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10504 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10505
10506 failed:
10507 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10508}
10509
10510/*
10511 * call-seq:
10512 * rpartition(sep) -> [head, match, tail]
10513 *
10514 * :include: doc/string/rpartition.rdoc
10515 *
10516 */
10517
10518static VALUE
10519rb_str_rpartition(VALUE str, VALUE sep)
10520{
10521 long pos = RSTRING_LEN(str);
10522
10523 sep = get_pat_quoted(sep, 0);
10524 if (RB_TYPE_P(sep, T_REGEXP)) {
10525 if (rb_reg_search(sep, str, pos, 1) < 0) {
10526 goto failed;
10527 }
10528 VALUE match = rb_backref_get();
10529 struct re_registers *regs = RMATCH_REGS(match);
10530
10531 pos = BEG(0);
10532 sep = rb_str_subseq(str, pos, END(0) - pos);
10533 }
10534 else {
10535 pos = rb_str_sublen(str, pos);
10536 pos = rb_str_rindex(str, sep, pos);
10537 if (pos < 0) {
10538 goto failed;
10539 }
10540 pos = rb_str_offset(str, pos);
10541 }
10542
10543 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10544 sep,
10545 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10546 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10547 failed:
10548 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10549}
10550
10551/*
10552 * call-seq:
10553 * start_with?(*string_or_regexp) -> true or false
10554 *
10555 * :include: doc/string/start_with_p.rdoc
10556 *
10557 */
10558
10559static VALUE
10560rb_str_start_with(int argc, VALUE *argv, VALUE str)
10561{
10562 int i;
10563
10564 for (i=0; i<argc; i++) {
10565 VALUE tmp = argv[i];
10566 if (RB_TYPE_P(tmp, T_REGEXP)) {
10567 if (rb_reg_start_with_p(tmp, str))
10568 return Qtrue;
10569 }
10570 else {
10571 StringValue(tmp);
10572 rb_enc_check(str, tmp);
10573 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10574 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10575 return Qtrue;
10576 }
10577 }
10578 return Qfalse;
10579}
10580
10581/*
10582 * call-seq:
10583 * end_with?(*strings) -> true or false
10584 *
10585 * :include: doc/string/end_with_p.rdoc
10586 *
10587 */
10588
10589static VALUE
10590rb_str_end_with(int argc, VALUE *argv, VALUE str)
10591{
10592 int i;
10593 char *p, *s, *e;
10594 rb_encoding *enc;
10595
10596 for (i=0; i<argc; i++) {
10597 VALUE tmp = argv[i];
10598 long slen, tlen;
10599 StringValue(tmp);
10600 enc = rb_enc_check(str, tmp);
10601 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10602 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10603 p = RSTRING_PTR(str);
10604 e = p + slen;
10605 s = e - tlen;
10606 if (rb_enc_left_char_head(p, s, e, enc) != s)
10607 continue;
10608 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10609 return Qtrue;
10610 }
10611 return Qfalse;
10612}
10613
10623static long
10624deleted_prefix_length(VALUE str, VALUE prefix)
10625{
10626 char *strptr, *prefixptr;
10627 long olen, prefixlen;
10628
10629 StringValue(prefix);
10630 if (is_broken_string(prefix)) return 0;
10631 rb_enc_check(str, prefix);
10632
10633 /* return 0 if not start with prefix */
10634 prefixlen = RSTRING_LEN(prefix);
10635 if (prefixlen <= 0) return 0;
10636 olen = RSTRING_LEN(str);
10637 if (olen < prefixlen) return 0;
10638 strptr = RSTRING_PTR(str);
10639 prefixptr = RSTRING_PTR(prefix);
10640 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10641
10642 return prefixlen;
10643}
10644
10645/*
10646 * call-seq:
10647 * delete_prefix!(prefix) -> self or nil
10648 *
10649 * Like String#delete_prefix, except that +self+ is modified in place.
10650 * Returns +self+ if the prefix is removed, +nil+ otherwise.
10651 *
10652 */
10653
10654static VALUE
10655rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10656{
10657 long prefixlen;
10658 str_modify_keep_cr(str);
10659
10660 prefixlen = deleted_prefix_length(str, prefix);
10661 if (prefixlen <= 0) return Qnil;
10662
10663 return rb_str_drop_bytes(str, prefixlen);
10664}
10665
10666/*
10667 * call-seq:
10668 * delete_prefix(prefix) -> new_string
10669 *
10670 * :include: doc/string/delete_prefix.rdoc
10671 *
10672 */
10673
10674static VALUE
10675rb_str_delete_prefix(VALUE str, VALUE prefix)
10676{
10677 long prefixlen;
10678
10679 prefixlen = deleted_prefix_length(str, prefix);
10680 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10681
10682 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10683}
10684
10694static long
10695deleted_suffix_length(VALUE str, VALUE suffix)
10696{
10697 char *strptr, *suffixptr, *s;
10698 long olen, suffixlen;
10699 rb_encoding *enc;
10700
10701 StringValue(suffix);
10702 if (is_broken_string(suffix)) return 0;
10703 enc = rb_enc_check(str, suffix);
10704
10705 /* return 0 if not start with suffix */
10706 suffixlen = RSTRING_LEN(suffix);
10707 if (suffixlen <= 0) return 0;
10708 olen = RSTRING_LEN(str);
10709 if (olen < suffixlen) return 0;
10710 strptr = RSTRING_PTR(str);
10711 suffixptr = RSTRING_PTR(suffix);
10712 s = strptr + olen - suffixlen;
10713 if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10714 if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10715
10716 return suffixlen;
10717}
10718
10719/*
10720 * call-seq:
10721 * delete_suffix!(suffix) -> self or nil
10722 *
10723 * Like String#delete_suffix, except that +self+ is modified in place.
10724 * Returns +self+ if the suffix is removed, +nil+ otherwise.
10725 *
10726 */
10727
10728static VALUE
10729rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10730{
10731 long olen, suffixlen, len;
10732 str_modifiable(str);
10733
10734 suffixlen = deleted_suffix_length(str, suffix);
10735 if (suffixlen <= 0) return Qnil;
10736
10737 olen = RSTRING_LEN(str);
10738 str_modify_keep_cr(str);
10739 len = olen - suffixlen;
10740 STR_SET_LEN(str, len);
10741 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10742 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10744 }
10745 return str;
10746}
10747
10748/*
10749 * call-seq:
10750 * delete_suffix(suffix) -> new_string
10751 *
10752 * :include: doc/string/delete_suffix.rdoc
10753 *
10754 */
10755
10756static VALUE
10757rb_str_delete_suffix(VALUE str, VALUE suffix)
10758{
10759 long suffixlen;
10760
10761 suffixlen = deleted_suffix_length(str, suffix);
10762 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10763
10764 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10765}
10766
10767void
10768rb_str_setter(VALUE val, ID id, VALUE *var)
10769{
10770 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10771 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10772 }
10773 *var = val;
10774}
10775
10776static void
10777rb_fs_setter(VALUE val, ID id, VALUE *var)
10778{
10779 val = rb_fs_check(val);
10780 if (!val) {
10782 "value of %"PRIsVALUE" must be String or Regexp",
10783 rb_id2str(id));
10784 }
10785 if (!NIL_P(val)) {
10786 rb_warn_deprecated("`$;'", NULL);
10787 }
10788 *var = val;
10789}
10790
10791
10792/*
10793 * call-seq:
10794 * force_encoding(encoding) -> self
10795 *
10796 * :include: doc/string/force_encoding.rdoc
10797 *
10798 */
10799
10800static VALUE
10801rb_str_force_encoding(VALUE str, VALUE enc)
10802{
10803 str_modifiable(str);
10804 rb_enc_associate(str, rb_to_encoding(enc));
10806 return str;
10807}
10808
10809/*
10810 * call-seq:
10811 * b -> string
10812 *
10813 * :include: doc/string/b.rdoc
10814 *
10815 */
10816
10817static VALUE
10818rb_str_b(VALUE str)
10819{
10820 VALUE str2;
10821 if (FL_TEST(str, STR_NOEMBED)) {
10822 str2 = str_alloc_heap(rb_cString);
10823 }
10824 else {
10825 str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10826 }
10827 str_replace_shared_without_enc(str2, str);
10828
10829 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
10830 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
10831 // If we know the receiver's code range then we know the result's code range.
10832 int cr = ENC_CODERANGE(str);
10833 switch (cr) {
10834 case ENC_CODERANGE_7BIT:
10836 break;
10840 break;
10841 default:
10842 ENC_CODERANGE_CLEAR(str2);
10843 break;
10844 }
10845 }
10846
10847 return str2;
10848}
10849
10850/*
10851 * call-seq:
10852 * valid_encoding? -> true or false
10853 *
10854 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
10855 *
10856 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
10857 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
10858 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
10859 */
10860
10861static VALUE
10862rb_str_valid_encoding_p(VALUE str)
10863{
10864 int cr = rb_enc_str_coderange(str);
10865
10866 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10867}
10868
10869/*
10870 * call-seq:
10871 * ascii_only? -> true or false
10872 *
10873 * Returns +true+ if +self+ contains only ASCII characters,
10874 * +false+ otherwise:
10875 *
10876 * 'abc'.ascii_only? # => true
10877 * "abc\u{6666}".ascii_only? # => false
10878 *
10879 */
10880
10881static VALUE
10882rb_str_is_ascii_only_p(VALUE str)
10883{
10884 int cr = rb_enc_str_coderange(str);
10885
10886 return RBOOL(cr == ENC_CODERANGE_7BIT);
10887}
10888
10889VALUE
10891{
10892 static const char ellipsis[] = "...";
10893 const long ellipsislen = sizeof(ellipsis) - 1;
10894 rb_encoding *const enc = rb_enc_get(str);
10895 const long blen = RSTRING_LEN(str);
10896 const char *const p = RSTRING_PTR(str), *e = p + blen;
10897 VALUE estr, ret = 0;
10898
10899 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10900 if (len * rb_enc_mbminlen(enc) >= blen ||
10901 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10902 ret = str;
10903 }
10904 else if (len <= ellipsislen ||
10905 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10906 if (rb_enc_asciicompat(enc)) {
10907 ret = rb_str_new(ellipsis, len);
10908 rb_enc_associate(ret, enc);
10909 }
10910 else {
10911 estr = rb_usascii_str_new(ellipsis, len);
10912 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10913 }
10914 }
10915 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10916 rb_str_cat(ret, ellipsis, ellipsislen);
10917 }
10918 else {
10919 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10920 rb_enc_from_encoding(enc), 0, Qnil);
10921 rb_str_append(ret, estr);
10922 }
10923 return ret;
10924}
10925
10926static VALUE
10927str_compat_and_valid(VALUE str, rb_encoding *enc)
10928{
10929 int cr;
10930 str = StringValue(str);
10931 cr = rb_enc_str_coderange(str);
10932 if (cr == ENC_CODERANGE_BROKEN) {
10933 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10934 }
10935 else {
10936 rb_encoding *e = STR_ENC_GET(str);
10937 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10938 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10939 rb_enc_name(enc), rb_enc_name(e));
10940 }
10941 }
10942 return str;
10943}
10944
10945static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10946
10947VALUE
10949{
10950 rb_encoding *enc = STR_ENC_GET(str);
10951 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10952}
10953
10954VALUE
10955rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10956{
10957 int cr = ENC_CODERANGE_UNKNOWN;
10958 if (enc == STR_ENC_GET(str)) {
10959 /* cached coderange makes sense only when enc equals the
10960 * actual encoding of str */
10961 cr = ENC_CODERANGE(str);
10962 }
10963 return enc_str_scrub(enc, str, repl, cr);
10964}
10965
10966static VALUE
10967enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10968{
10969 int encidx;
10970 VALUE buf = Qnil;
10971 const char *rep, *p, *e, *p1, *sp;
10972 long replen = -1;
10973 long slen;
10974
10975 if (rb_block_given_p()) {
10976 if (!NIL_P(repl))
10977 rb_raise(rb_eArgError, "both of block and replacement given");
10978 replen = 0;
10979 }
10980
10981 if (ENC_CODERANGE_CLEAN_P(cr))
10982 return Qnil;
10983
10984 if (!NIL_P(repl)) {
10985 repl = str_compat_and_valid(repl, enc);
10986 }
10987
10988 if (rb_enc_dummy_p(enc)) {
10989 return Qnil;
10990 }
10991 encidx = rb_enc_to_index(enc);
10992
10993#define DEFAULT_REPLACE_CHAR(str) do { \
10994 static const char replace[sizeof(str)-1] = str; \
10995 rep = replace; replen = (int)sizeof(replace); \
10996 } while (0)
10997
10998 slen = RSTRING_LEN(str);
10999 p = RSTRING_PTR(str);
11000 e = RSTRING_END(str);
11001 p1 = p;
11002 sp = p;
11003
11004 if (rb_enc_asciicompat(enc)) {
11005 int rep7bit_p;
11006 if (!replen) {
11007 rep = NULL;
11008 rep7bit_p = FALSE;
11009 }
11010 else if (!NIL_P(repl)) {
11011 rep = RSTRING_PTR(repl);
11012 replen = RSTRING_LEN(repl);
11013 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11014 }
11015 else if (encidx == rb_utf8_encindex()) {
11016 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11017 rep7bit_p = FALSE;
11018 }
11019 else {
11020 DEFAULT_REPLACE_CHAR("?");
11021 rep7bit_p = TRUE;
11022 }
11023 cr = ENC_CODERANGE_7BIT;
11024
11025 p = search_nonascii(p, e);
11026 if (!p) {
11027 p = e;
11028 }
11029 while (p < e) {
11030 int ret = rb_enc_precise_mbclen(p, e, enc);
11031 if (MBCLEN_NEEDMORE_P(ret)) {
11032 break;
11033 }
11034 else if (MBCLEN_CHARFOUND_P(ret)) {
11036 p += MBCLEN_CHARFOUND_LEN(ret);
11037 }
11038 else if (MBCLEN_INVALID_P(ret)) {
11039 /*
11040 * p1~p: valid ascii/multibyte chars
11041 * p ~e: invalid bytes + unknown bytes
11042 */
11043 long clen = rb_enc_mbmaxlen(enc);
11044 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11045 if (p > p1) {
11046 rb_str_buf_cat(buf, p1, p - p1);
11047 }
11048
11049 if (e - p < clen) clen = e - p;
11050 if (clen <= 2) {
11051 clen = 1;
11052 }
11053 else {
11054 const char *q = p;
11055 clen--;
11056 for (; clen > 1; clen--) {
11057 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11058 if (MBCLEN_NEEDMORE_P(ret)) break;
11059 if (MBCLEN_INVALID_P(ret)) continue;
11061 }
11062 }
11063 if (rep) {
11064 rb_str_buf_cat(buf, rep, replen);
11065 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11066 }
11067 else {
11068 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11069 str_mod_check(str, sp, slen);
11070 repl = str_compat_and_valid(repl, enc);
11071 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11074 }
11075 p += clen;
11076 p1 = p;
11077 p = search_nonascii(p, e);
11078 if (!p) {
11079 p = e;
11080 break;
11081 }
11082 }
11083 else {
11085 }
11086 }
11087 if (NIL_P(buf)) {
11088 if (p == e) {
11089 ENC_CODERANGE_SET(str, cr);
11090 return Qnil;
11091 }
11092 buf = rb_str_buf_new(RSTRING_LEN(str));
11093 }
11094 if (p1 < p) {
11095 rb_str_buf_cat(buf, p1, p - p1);
11096 }
11097 if (p < e) {
11098 if (rep) {
11099 rb_str_buf_cat(buf, rep, replen);
11100 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11101 }
11102 else {
11103 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11104 str_mod_check(str, sp, slen);
11105 repl = str_compat_and_valid(repl, enc);
11106 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11109 }
11110 }
11111 }
11112 else {
11113 /* ASCII incompatible */
11114 long mbminlen = rb_enc_mbminlen(enc);
11115 if (!replen) {
11116 rep = NULL;
11117 }
11118 else if (!NIL_P(repl)) {
11119 rep = RSTRING_PTR(repl);
11120 replen = RSTRING_LEN(repl);
11121 }
11122 else if (encidx == ENCINDEX_UTF_16BE) {
11123 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11124 }
11125 else if (encidx == ENCINDEX_UTF_16LE) {
11126 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11127 }
11128 else if (encidx == ENCINDEX_UTF_32BE) {
11129 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11130 }
11131 else if (encidx == ENCINDEX_UTF_32LE) {
11132 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11133 }
11134 else {
11135 DEFAULT_REPLACE_CHAR("?");
11136 }
11137
11138 while (p < e) {
11139 int ret = rb_enc_precise_mbclen(p, e, enc);
11140 if (MBCLEN_NEEDMORE_P(ret)) {
11141 break;
11142 }
11143 else if (MBCLEN_CHARFOUND_P(ret)) {
11144 p += MBCLEN_CHARFOUND_LEN(ret);
11145 }
11146 else if (MBCLEN_INVALID_P(ret)) {
11147 const char *q = p;
11148 long clen = rb_enc_mbmaxlen(enc);
11149 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11150 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11151
11152 if (e - p < clen) clen = e - p;
11153 if (clen <= mbminlen * 2) {
11154 clen = mbminlen;
11155 }
11156 else {
11157 clen -= mbminlen;
11158 for (; clen > mbminlen; clen-=mbminlen) {
11159 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11160 if (MBCLEN_NEEDMORE_P(ret)) break;
11161 if (MBCLEN_INVALID_P(ret)) continue;
11163 }
11164 }
11165 if (rep) {
11166 rb_str_buf_cat(buf, rep, replen);
11167 }
11168 else {
11169 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11170 str_mod_check(str, sp, slen);
11171 repl = str_compat_and_valid(repl, enc);
11172 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11173 }
11174 p += clen;
11175 p1 = p;
11176 }
11177 else {
11179 }
11180 }
11181 if (NIL_P(buf)) {
11182 if (p == e) {
11184 return Qnil;
11185 }
11186 buf = rb_str_buf_new(RSTRING_LEN(str));
11187 }
11188 if (p1 < p) {
11189 rb_str_buf_cat(buf, p1, p - p1);
11190 }
11191 if (p < e) {
11192 if (rep) {
11193 rb_str_buf_cat(buf, rep, replen);
11194 }
11195 else {
11196 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11197 str_mod_check(str, sp, slen);
11198 repl = str_compat_and_valid(repl, enc);
11199 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11200 }
11201 }
11203 }
11204 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11205 return buf;
11206}
11207
11208/*
11209 * call-seq:
11210 * scrub(replacement_string = default_replacement) -> new_string
11211 * scrub{|bytes| ... } -> new_string
11212 *
11213 * :include: doc/string/scrub.rdoc
11214 *
11215 */
11216static VALUE
11217str_scrub(int argc, VALUE *argv, VALUE str)
11218{
11219 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11220 VALUE new = rb_str_scrub(str, repl);
11221 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11222}
11223
11224/*
11225 * call-seq:
11226 * scrub! -> self
11227 * scrub!(replacement_string = default_replacement) -> self
11228 * scrub!{|bytes| ... } -> self
11229 *
11230 * Like String#scrub, except that any replacements are made in +self+.
11231 *
11232 */
11233static VALUE
11234str_scrub_bang(int argc, VALUE *argv, VALUE str)
11235{
11236 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11237 VALUE new = rb_str_scrub(str, repl);
11238 if (!NIL_P(new)) rb_str_replace(str, new);
11239 return str;
11240}
11241
11242static ID id_normalize;
11243static ID id_normalized_p;
11244static VALUE mUnicodeNormalize;
11245
11246static VALUE
11247unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11248{
11249 static int UnicodeNormalizeRequired = 0;
11250 VALUE argv2[2];
11251
11252 if (!UnicodeNormalizeRequired) {
11253 rb_require("unicode_normalize/normalize.rb");
11254 UnicodeNormalizeRequired = 1;
11255 }
11256 argv2[0] = str;
11257 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11258 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11259}
11260
11261/*
11262 * call-seq:
11263 * unicode_normalize(form = :nfc) -> string
11264 *
11265 * Returns a copy of +self+ with
11266 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11267 *
11268 * Argument +form+ must be one of the following symbols
11269 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11270 *
11271 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11272 * - +:nfd+: Canonical decomposition.
11273 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11274 * - +:nfkd+: Compatibility decomposition.
11275 *
11276 * The encoding of +self+ must be one of:
11277 *
11278 * - Encoding::UTF_8
11279 * - Encoding::UTF_16BE
11280 * - Encoding::UTF_16LE
11281 * - Encoding::UTF_32BE
11282 * - Encoding::UTF_32LE
11283 * - Encoding::GB18030
11284 * - Encoding::UCS_2BE
11285 * - Encoding::UCS_4BE
11286 *
11287 * Examples:
11288 *
 *   "a\u0300".unicode_normalize       # => "\u00E0"
 *   "\u00E0".unicode_normalize(:nfd)  # => "a\u0300"
11291 *
11292 * Related: String#unicode_normalize!, String#unicode_normalized?.
11293 */
11294static VALUE
11295rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11296{
11297 return unicode_normalize_common(argc, argv, str, id_normalize);
11298}
11299
11300/*
11301 * call-seq:
11302 * unicode_normalize!(form = :nfc) -> self
11303 *
11304 * Like String#unicode_normalize, except that the normalization
11305 * is performed on +self+.
11306 *
 * Related: String#unicode_normalized?.
11308 *
11309 */
11310static VALUE
11311rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11312{
11313 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11314}
11315
11316/* call-seq:
11317 * unicode_normalized?(form = :nfc) -> true or false
11318 *
11319 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11320 * +false+ otherwise.
11321 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11322 *
11323 * Examples:
11324 *
11325 * "a\u0300".unicode_normalized? # => false
11326 * "a\u0300".unicode_normalized?(:nfd) # => true
11327 * "\u00E0".unicode_normalized? # => true
11328 * "\u00E0".unicode_normalized?(:nfd) # => false
11329 *
11330 *
11331 * Raises an exception if +self+ is not in a Unicode encoding:
11332 *
11333 * s = "\xE0".force_encoding('ISO-8859-1')
11334 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11335 *
11336 * Related: String#unicode_normalize, String#unicode_normalize!.
11337 *
11338 */
11339static VALUE
11340rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11341{
11342 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11343}
11344
11345/**********************************************************************
11346 * Document-class: Symbol
11347 *
11348 * Symbol objects represent named identifiers inside the Ruby interpreter.
11349 *
11350 * You can create a \Symbol object explicitly with:
11351 *
11352 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11353 *
11354 * The same Symbol object will be
11355 * created for a given name or string for the duration of a program's
11356 * execution, regardless of the context or meaning of that name. Thus
11357 * if <code>Fred</code> is a constant in one context, a method in
11358 * another, and a class in a third, the Symbol <code>:Fred</code>
11359 * will be the same object in all three contexts.
11360 *
11361 * module One
11362 * class Fred
11363 * end
11364 * $f1 = :Fred
11365 * end
11366 * module Two
11367 * Fred = 1
11368 * $f2 = :Fred
11369 * end
11370 * def Fred()
11371 * end
11372 * $f3 = :Fred
11373 * $f1.object_id #=> 2514190
11374 * $f2.object_id #=> 2514190
11375 * $f3.object_id #=> 2514190
11376 *
11377 * Constant, method, and variable names are returned as symbols:
11378 *
11379 * module One
11380 * Two = 2
11381 * def three; 3 end
11382 * @four = 4
11383 * @@five = 5
11384 * $six = 6
11385 * end
11386 * seven = 7
11387 *
11388 * One.constants
11389 * # => [:Two]
11390 * One.instance_methods(true)
11391 * # => [:three]
11392 * One.instance_variables
11393 * # => [:@four]
11394 * One.class_variables
11395 * # => [:@@five]
11396 * global_variables.grep(/six/)
11397 * # => [:$six]
11398 * local_variables
11399 * # => [:seven]
11400 *
11401 * Symbol objects are different from String objects in that
11402 * Symbol objects represent identifiers, while String objects
11403 * represent text or data.
11404 *
11405 * == What's Here
11406 *
11407 * First, what's elsewhere. \Class \Symbol:
11408 *
11409 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11410 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11411 *
11412 * Here, class \Symbol provides methods that are useful for:
11413 *
11414 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11415 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11416 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11417 *
11418 * === Methods for Querying
11419 *
11420 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11421 * - #=~: Returns the index of the first substring in symbol that matches a
11422 * given Regexp or other object; returns +nil+ if no match is found.
11423 * - #[], #slice : Returns a substring of symbol
11424 * determined by a given index, start/length, or range, or string.
11425 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11426 * - #encoding: Returns the Encoding object that represents the encoding
11427 * of symbol.
11428 * - #end_with?: Returns +true+ if symbol ends with
11429 * any of the given strings.
11430 * - #match: Returns a MatchData object if symbol
11431 * matches a given Regexp; +nil+ otherwise.
11432 * - #match?: Returns +true+ if symbol
11433 * matches a given Regexp; +false+ otherwise.
11434 * - #length, #size: Returns the number of characters in symbol.
11435 * - #start_with?: Returns +true+ if symbol starts with
11436 * any of the given strings.
11437 *
11438 * === Methods for Comparing
11439 *
11440 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11441 * or larger than symbol.
11442 * - #==, #===: Returns +true+ if a given symbol has the same content and
11443 * encoding.
11444 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
11445 * symbol is smaller than, equal to, or larger than symbol.
11446 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
11447 * after Unicode case folding; +false+ otherwise.
11448 *
11449 * === Methods for Converting
11450 *
11451 * - #capitalize: Returns symbol with the first character upcased
11452 * and all other characters downcased.
11453 * - #downcase: Returns symbol with all characters downcased.
11454 * - #inspect: Returns the string representation of +self+ as a symbol literal.
11455 * - #name: Returns the frozen string corresponding to symbol.
11456 * - #succ, #next: Returns the symbol that is the successor to symbol.
11457 * - #swapcase: Returns symbol with all upcase characters downcased
11458 * and all downcase characters upcased.
11459 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
11460 * - #to_s, #id2name: Returns the string corresponding to +self+.
11461 * - #to_sym, #intern: Returns +self+.
11462 * - #upcase: Returns symbol with all characters upcased.
11463 *
11464 */
11465
11466
11467/*
11468 * call-seq:
11469 * symbol == object -> true or false
11470 *
11471 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
11472 *
11473 * Symbol#=== is an alias for Symbol#==.
11474 *
11475 */
11476
11477#define sym_equal rb_obj_equal
11478
11479static int
11480sym_printable(const char *s, const char *send, rb_encoding *enc)
11481{
11482 while (s < send) {
11483 int n;
11484 int c = rb_enc_precise_mbclen(s, send, enc);
11485
11486 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11487 n = MBCLEN_CHARFOUND_LEN(c);
11488 c = rb_enc_mbc_to_codepoint(s, send, enc);
11489 if (!rb_enc_isprint(c, enc)) return FALSE;
11490 s += n;
11491 }
11492 return TRUE;
11493}
11494
11495int
11496rb_str_symname_p(VALUE sym)
11497{
11498 rb_encoding *enc;
11499 const char *ptr;
11500 long len;
11501 rb_encoding *resenc = rb_default_internal_encoding();
11502
11503 if (resenc == NULL) resenc = rb_default_external_encoding();
11504 enc = STR_ENC_GET(sym);
11505 ptr = RSTRING_PTR(sym);
11506 len = RSTRING_LEN(sym);
11507 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11508 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11509 return FALSE;
11510 }
11511 return TRUE;
11512}
11513
11514VALUE
11515rb_str_quote_unprintable(VALUE str)
11516{
11517 rb_encoding *enc;
11518 const char *ptr;
11519 long len;
11520 rb_encoding *resenc;
11521
11522 Check_Type(str, T_STRING);
11523 resenc = rb_default_internal_encoding();
11524 if (resenc == NULL) resenc = rb_default_external_encoding();
11525 enc = STR_ENC_GET(str);
11526 ptr = RSTRING_PTR(str);
11527 len = RSTRING_LEN(str);
11528 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11529 !sym_printable(ptr, ptr + len, enc)) {
11530 return rb_str_escape(str);
11531 }
11532 return str;
11533}
11534
11535MJIT_FUNC_EXPORTED VALUE
11536rb_id_quote_unprintable(ID id)
11537{
11538 VALUE str = rb_id2str(id);
11539 if (!rb_str_symname_p(str)) {
11540 return rb_str_escape(str);
11541 }
11542 return str;
11543}
11544
11545/*
11546 * call-seq:
11547 * inspect -> string
11548 *
11549 * Returns a string representation of +self+ (including the leading colon):
11550 *
11551 * :foo.inspect # => ":foo"
11552 *
11553 * Related: Symbol#to_s, Symbol#name.
11554 *
11555 */
11556
11557static VALUE
11558sym_inspect(VALUE sym)
11559{
11560 VALUE str = rb_sym2str(sym);
11561 const char *ptr;
11562 long len;
11563 char *dest;
11564
11565 if (!rb_str_symname_p(str)) {
11566 str = rb_str_inspect(str);
11567 len = RSTRING_LEN(str);
11568 rb_str_resize(str, len + 1);
11569 dest = RSTRING_PTR(str);
11570 memmove(dest + 1, dest, len);
11571 }
11572 else {
11573 rb_encoding *enc = STR_ENC_GET(str);
11574 RSTRING_GETMEM(str, ptr, len);
11575 str = rb_enc_str_new(0, len + 1, enc);
11576 dest = RSTRING_PTR(str);
11577 memcpy(dest + 1, ptr, len);
11578 }
11579 dest[0] = ':';
11580 return str;
11581}
11582
11583/*
11584 * call-seq:
11585 * to_s -> string
11586 *
11587 * Returns a string representation of +self+ (not including the leading colon):
11588 *
11589 * :foo.to_s # => "foo"
11590 *
11591 * Symbol#id2name is an alias for Symbol#to_s.
11592 *
11593 * Related: Symbol#inspect, Symbol#name.
11594 */
11595
11596VALUE
11598{
11599 return str_new_shared(rb_cString, rb_sym2str(sym));
11600}
11601
11602MJIT_FUNC_EXPORTED VALUE
11603rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11604{
11605 VALUE obj;
11606
11607 if (argc < 1) {
11608 rb_raise(rb_eArgError, "no receiver given");
11609 }
11610 obj = argv[0];
11611 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11612}
11613
11614/*
11615 * call-seq:
11616 * succ
11617 *
11618 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
11619 *
11620 * :foo.succ # => :fop
11621 *
11622 * Symbol#next is an alias for Symbol#succ.
11623 *
11624 * Related: String#succ.
11625 */
11626
11627static VALUE
11628sym_succ(VALUE sym)
11629{
11630 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11631}
11632
11633/*
11634 * call-seq:
11635 * symbol <=> object -> -1, 0, +1, or nil
11636 *
11637 * If +object+ is a symbol,
11638 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
11639 *
11640 * :bar <=> :foo # => -1
11641 * :foo <=> :foo # => 0
11642 * :foo <=> :bar # => 1
11643 *
11644 * Otherwise, returns +nil+:
11645 *
11646 * :foo <=> 'bar' # => nil
11647 *
11648 * Related: String#<=>.
11649 */
11650
11651static VALUE
11652sym_cmp(VALUE sym, VALUE other)
11653{
11654 if (!SYMBOL_P(other)) {
11655 return Qnil;
11656 }
11657 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11658}
11659
11660/*
11661 * call-seq:
11662 * casecmp(object) -> -1, 0, 1, or nil
11663 *
11664 * :include: doc/symbol/casecmp.rdoc
11665 *
11666 */
11667
11668static VALUE
11669sym_casecmp(VALUE sym, VALUE other)
11670{
11671 if (!SYMBOL_P(other)) {
11672 return Qnil;
11673 }
11674 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11675}
11676
11677/*
11678 * call-seq:
11679 * casecmp?(object) -> true, false, or nil
11680 *
11681 * :include: doc/symbol/casecmp_p.rdoc
11682 *
11683 */
11684
11685static VALUE
11686sym_casecmp_p(VALUE sym, VALUE other)
11687{
11688 if (!SYMBOL_P(other)) {
11689 return Qnil;
11690 }
11691 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11692}
11693
11694/*
11695 * call-seq:
11696 * symbol =~ object -> integer or nil
11697 *
11698 * Equivalent to <tt>symbol.to_s =~ object</tt>,
11699 * including possible updates to global variables;
11700 * see String#=~.
11701 *
11702 */
11703
11704static VALUE
11705sym_match(VALUE sym, VALUE other)
11706{
11707 return rb_str_match(rb_sym2str(sym), other);
11708}
11709
11710/*
11711 * call-seq:
11712 * match(pattern, offset = 0) -> matchdata or nil
11713 * match(pattern, offset = 0) {|matchdata| } -> object
11714 *
11715 * Equivalent to <tt>self.to_s.match</tt>,
11716 * including possible updates to global variables;
11717 * see String#match.
11718 *
11719 */
11720
11721static VALUE
11722sym_match_m(int argc, VALUE *argv, VALUE sym)
11723{
11724 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11725}
11726
11727/*
11728 * call-seq:
11729 * match?(pattern, offset) -> true or false
11730 *
11731 * Equivalent to <tt>sym.to_s.match?</tt>;
 * see String#match?.
11733 *
11734 */
11735
11736static VALUE
11737sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11738{
11739 return rb_str_match_m_p(argc, argv, sym);
11740}
11741
11742/*
11743 * call-seq:
11744 * symbol[index] -> string or nil
11745 * symbol[start, length] -> string or nil
11746 * symbol[range] -> string or nil
11747 * symbol[regexp, capture = 0] -> string or nil
11748 * symbol[substring] -> string or nil
11749 *
11750 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
11751 *
11752 */
11753
11754static VALUE
11755sym_aref(int argc, VALUE *argv, VALUE sym)
11756{
11757 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11758}
11759
11760/*
11761 * call-seq:
11762 * length -> integer
11763 *
11764 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
11765 *
11766 * Symbol#size is an alias for Symbol#length.
11767 *
11768 */
11769
11770static VALUE
11771sym_length(VALUE sym)
11772{
11773 return rb_str_length(rb_sym2str(sym));
11774}
11775
11776/*
11777 * call-seq:
11778 * empty? -> true or false
11779 *
11780 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
11781 *
11782 */
11783
11784static VALUE
11785sym_empty(VALUE sym)
11786{
11787 return rb_str_empty(rb_sym2str(sym));
11788}
11789
11790/*
11791 * call-seq:
11792 * upcase(*options) -> symbol
11793 *
11794 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11795 *
11796 * See String#upcase.
11797 *
11798 */
11799
11800static VALUE
11801sym_upcase(int argc, VALUE *argv, VALUE sym)
11802{
11803 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11804}
11805
11806/*
11807 * call-seq:
11808 * downcase(*options) -> symbol
11809 *
11810 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11811 *
11812 * See String#downcase.
11813 *
11814 * Related: Symbol#upcase.
11815 *
11816 */
11817
11818static VALUE
11819sym_downcase(int argc, VALUE *argv, VALUE sym)
11820{
11821 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11822}
11823
11824/*
11825 * call-seq:
11826 * capitalize(*options) -> symbol
11827 *
11828 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11829 *
11830 * See String#capitalize.
11831 *
11832 */
11833
11834static VALUE
11835sym_capitalize(int argc, VALUE *argv, VALUE sym)
11836{
11837 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11838}
11839
11840/*
11841 * call-seq:
11842 * swapcase(*options) -> symbol
11843 *
11844 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11845 *
11846 * See String#swapcase.
11847 *
11848 */
11849
11850static VALUE
11851sym_swapcase(int argc, VALUE *argv, VALUE sym)
11852{
11853 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11854}
11855
11856/*
11857 * call-seq:
11858 * start_with?(*string_or_regexp) -> true or false
11859 *
11860 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
11861 *
11862 */
11863
11864static VALUE
11865sym_start_with(int argc, VALUE *argv, VALUE sym)
11866{
11867 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11868}
11869
11870/*
11871 * call-seq:
11872 * end_with?(*string_or_regexp) -> true or false
11873 *
11874 *
11875 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
11876 *
11877 */
11878
11879static VALUE
11880sym_end_with(int argc, VALUE *argv, VALUE sym)
11881{
11882 return rb_str_end_with(argc, argv, rb_sym2str(sym));
11883}
11884
11885/*
11886 * call-seq:
11887 * encoding -> encoding
11888 *
11889 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
11890 *
11891 */
11892
11893static VALUE
11894sym_encoding(VALUE sym)
11895{
11896 return rb_obj_encoding(rb_sym2str(sym));
11897}
11898
11899static VALUE
11900string_for_symbol(VALUE name)
11901{
11902 if (!RB_TYPE_P(name, T_STRING)) {
11903 VALUE tmp = rb_check_string_type(name);
11904 if (NIL_P(tmp)) {
11905 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11906 name);
11907 }
11908 name = tmp;
11909 }
11910 return name;
11911}
11912
11913ID
11915{
11916 if (SYMBOL_P(name)) {
11917 return SYM2ID(name);
11918 }
11919 name = string_for_symbol(name);
11920 return rb_intern_str(name);
11921}
11922
11923VALUE
11925{
11926 if (SYMBOL_P(name)) {
11927 return name;
11928 }
11929 name = string_for_symbol(name);
11930 return rb_str_intern(name);
11931}
11932
11933/*
11934 * call-seq:
11935 * Symbol.all_symbols -> array_of_symbols
11936 *
11937 * Returns an array of all symbols currently in Ruby's symbol table:
11938 *
11939 * Symbol.all_symbols.size # => 9334
11940 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
11941 *
11942 */
11943
11944static VALUE
11945sym_all_symbols(VALUE _)
11946{
11947 return rb_sym_all_symbols();
11948}
11949
11950VALUE
11952{
11953 return rb_fstring(str);
11954}
11955
11956VALUE
11957rb_interned_str(const char *ptr, long len)
11958{
11959 struct RString fake_str;
11960 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11961}
11962
11963VALUE
11965{
11966 return rb_interned_str(ptr, strlen(ptr));
11967}
11968
11969VALUE
11970rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11971{
11972 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
11973 rb_enc_autoload(enc);
11974 }
11975
11976 struct RString fake_str;
11977 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11978}
11979
11980VALUE
11982{
11983 return rb_enc_interned_str(ptr, strlen(ptr), enc);
11984}
11985
11986void
11987Init_String(void)
11988{
11989 rb_cString = rb_define_class("String", rb_cObject);
11990 assert(rb_vm_fstring_table());
11991 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
11993 rb_define_alloc_func(rb_cString, empty_str_alloc);
11994 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
11995 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
11996 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
11997 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12000 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12001 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12002 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12003 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12006 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12007 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12008 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12009 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12012 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12013 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12014 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12015 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12016 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12018 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12020 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12021 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12022 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12023 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12024 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12025 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12027 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12028 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12029 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12030 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12031 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12032 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12033 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12034 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12036 rb_define_method(rb_cString, "+@", str_uplus, 0);
12037 rb_define_method(rb_cString, "-@", str_uminus, 0);
12038 rb_define_alias(rb_cString, "dedup", "-@");
12039
12040 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12041 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12042 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12043 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12046 rb_define_method(rb_cString, "undump", str_undump, 0);
12047
12048 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12049 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12050 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12051 sym_fold = ID2SYM(rb_intern_const("fold"));
12052
12053 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12054 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12055 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12056 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12057
12058 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12059 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12060 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12061 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12062
12063 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12064 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12065 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12066 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12067 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12068 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12069 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12070 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12071 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12072 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12073 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12075 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12076 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12077 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12078 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12079 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12080
12081 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12082 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12083 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12084
12085 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12086
12087 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12088 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12089 rb_define_method(rb_cString, "center", rb_str_center, -1);
12090
12091 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12092 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12093 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12094 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12095 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12096 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12097 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12098 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12099 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12100
12101 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12102 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12103 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12104 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12105 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12106 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12107 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12108 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12109 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12110
12111 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12112 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12113 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12114 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12115 rb_define_method(rb_cString, "count", rb_str_count, -1);
12116
12117 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12118 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12119 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12120 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12121
12122 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12123 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12124 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12125 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12126 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12127
12128 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12129
12130 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12131 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12132
12133 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12134 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12135
12136 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12137 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12138 rb_define_method(rb_cString, "b", rb_str_b, 0);
12139 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12140 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12141
12142 /* define UnicodeNormalize module here so that we don't have to look it up */
12143 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12144 id_normalize = rb_intern_const("normalize");
12145 id_normalized_p = rb_intern_const("normalized?");
12146
12147 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12148 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12149 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12150
12151 rb_fs = Qnil;
12152 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12153 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12154 rb_gc_register_address(&rb_fs);
12155
12156 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12160 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12161
12162 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12163 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12164 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12166 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12167 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */
12168 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12169 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12170 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12171
12172 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12173 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12174 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12175 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12176
12177 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12178 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12179 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12180 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12181 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12182 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12183 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12184
12185 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12186 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12187 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12188 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12189
12190 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12191 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12192
12193 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12194}
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:177
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:167
Atomic operations.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition ctype.h:82
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1200
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:906
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:356
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1130
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:923
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1038
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2289
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2113
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv to given VALUE references according to the format string.
Definition class.c:2579
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:868
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2368
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:107
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:105
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:142
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:67
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:398
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:145
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
Definition fl_type.h:144
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:143
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:108
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:395
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:533
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:140
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:137
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:652
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:66
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:534
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:535
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:97
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:532
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:67
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:139
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:68
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:141
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:109
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:138
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:146
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:68
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it always reports, regardless of the runtime -W flag.
Definition error.c:421
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition error.c:3150
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:688
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3262
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition error.c:794
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1095
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1091
void rb_fatal(const char *fmt,...)
Raises the unsung "fatal" exception.
Definition error.c:3201
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1098
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1089
VALUE rb_eArgError
ArgumentError exception.
Definition error.c:1092
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1093
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:590
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:1940
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1195
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3417
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:191
VALUE rb_cSymbol
Symbol class.
Definition string.c:80
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:123
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1183
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:79
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3027
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition rgengc.h:220
Encoding-related APIs.
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition encoding.h:433
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:699
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
Definition encoding.h:678
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:720
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition encoding.h:784
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:587
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
Definition encoding.h:659
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:463
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition encoding.h:607
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:448
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:635
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:742
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1208
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:821
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1074
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2716
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1093
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:11970
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:249
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2060
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3288
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1021
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
Definition string.c:981
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1313
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1214
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:833
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:11981
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:719
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:411
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1453
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2630
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2884
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1709
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1102
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1189
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
Definition error.h:264
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:280
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:604
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
Definition io.c:200
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1662
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1010
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1668
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1578
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1229
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4118
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3601
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1435
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1861
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby's string instead of C's.
Definition string.c:11951
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1571
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition string.c:1376
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2211
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3353
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1289
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:11597
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2283
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "defaultexternal" encoding.
Definition string.c:1265
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1565
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2744
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:4828
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:3581
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:2826
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:10890
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1741
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1618
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1056
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:871
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1382
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1834
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition string.c:2437
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:3571
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3177
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2149
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1840
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6030
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:2834
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:11964
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1295
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "defaultexternal" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3319
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:2791
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:3683
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3020
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:6712
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2489
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:11957
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:3637
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3453
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
Definition string.c:3612
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3295
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:2942
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5332
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:10948
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1513
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2640
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:2921
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3003
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3064
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1068
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2445
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:6826
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1277
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1532
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2163
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5258
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:8896
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1062
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:851
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1682
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2805
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1142
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:276
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
Definition symbol.c:796
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition symbol.c:943
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:11924
ID rb_to_id(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition string.c:11914
ID rb_intern_str(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition symbol.c:802
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1765
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3380
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4362
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:214
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1357
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:366
#define ALLOCA_N(type, n)
Definition memory.h:286
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:354
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:161
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:343
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:69
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:152
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:71
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition rgengc.h:107
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:139
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
@ RSTRING_EMBED_LEN_MAX
Max possible number of characters that can be embedded.
Definition rstring.h:215
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:72
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1307
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2617
static long RSTRING_EMBED_LEN(VALUE str)
Queries the length of the string.
Definition rstring.h:423
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except that it returns an int.
Definition rstring.h:554
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:528
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:574
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2501
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition rstring.h:484
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into the default external encoding.
Definition string.c:1301
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2512
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1609
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition rstring.h:498
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks the contents for viability as a C string.
Definition rstring.h:95
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:441
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1329
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:77
Ruby's String.
Definition rstring.h:231
struct RString::@49::@51 embed
Embedded contents.
union RString::@49 as
String's specific fields.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:234
long capa
Capacity of *ptr.
Definition rstring.h:268
struct RString::@49::@50 heap
Strings that use a separate memory region for their contents use this pattern.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:250
char ary[RSTRING_EMBED_LEN_MAX+1]
When a string is short enough, it uses this area to store the contents themselves.
Definition rstring.h:298
VALUE shared
Parent of the string.
Definition rstring.h:276
char * ptr
Pointer to the contents of the string.
Definition rstring.h:258
union RString::@49::@50::@52 aux
Auxiliary info.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:190
Definition st.h:79
Definition string.c:7781
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:302
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises an exception when the predicate fails.
Definition value_type.h:432
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:375