Ruby 3.2.3p157 (2024-01-18 revision 52bb2ac0a6971d0391efa2275f7a66bff319087c)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "gc.h"
27#include "id.h"
28#include "internal.h"
29#include "internal/array.h"
30#include "internal/compar.h"
31#include "internal/compilers.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
42#include "probes.h"
43#include "ruby/encoding.h"
44#include "ruby/re.h"
45#include "ruby/util.h"
46#include "ruby_assert.h"
47#include "vm_sync.h"
48
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
51# include <crypt.h>
52# endif
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
56#endif
57
/* Entry `idx` of regs->beg / regs->end.  Both macros expect a variable
 * named `regs` to be in scope where they are expanded. */
#define BEG(idx) (regs->beg[(idx)])
#define END(idx) (regs->end[(idx)])
60
61#undef rb_str_new
62#undef rb_usascii_str_new
63#undef rb_utf8_str_new
64#undef rb_enc_str_new
65#undef rb_str_new_cstr
66#undef rb_usascii_str_new_cstr
67#undef rb_utf8_str_new_cstr
68#undef rb_enc_str_new_cstr
69#undef rb_external_str_new_cstr
70#undef rb_locale_str_new_cstr
71#undef rb_str_dup_frozen
72#undef rb_str_buf_new_cstr
73#undef rb_str_buf_cat
74#undef rb_str_buf_cat2
75#undef rb_str_cat2
76#undef rb_str_cat_cstr
77#undef rb_fstring_cstr
78
81
82/* FLAGS of RString
83 *
84 * 1: RSTRING_NOEMBED
85 * 2: STR_SHARED (== ELTS_SHARED)
86 * 2-6: RSTRING_EMBED_LEN (5 bits == 32)
87 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
88 * other strings that rely on this string's buffer)
89 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
90 * early, specific to rb_str_tmp_frozen_{acquire,release})
91 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
92 * such as read(2). Any modification and realloc is prohibited)
93 *
94 * 8-9: ENC_CODERANGE (2 bits)
95 * 10-16: ENCODING (7 bits == 128)
96 * 17: RSTRING_FSTR
97 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
98 * used for a string object based on C string literal)
99 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
100 * object header is temporarily allocated on C stack)
101 */
102
#define RUBY_MAX_CHAR_LEN 16

/* String-specific flag bits; their meaning is described in the
 * "FLAGS of RString" table above. */
#define STR_SHARED_ROOT FL_USER5
#define STR_BORROWED    FL_USER6
#define STR_TMPLOCK     FL_USER7
#define STR_NOFREE      FL_USER18
#define STR_FAKESTR     FL_USER19
109
/* Flag `str` as heap-backed (not embedded).  With USE_RVARGC the
 * STR_SHARED / STR_SHARED_ROOT / STR_BORROWED bits are cleared; without it
 * the embedded-length bits are reset to 0 instead. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    if (!USE_RVARGC) {\
        STR_SET_EMBED_LEN((str), 0);\
    }\
    else {\
        FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
    }\
} while (0)
/* Flag `str` as embedded by clearing both STR_NOEMBED and STR_NOFREE. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOFREE|STR_NOEMBED))
/* Store `n` as the length of an embedded string. */
#if USE_RVARGC
/* The length lives in the embed struct; it must be strictly smaller than
 * the embed capacity of the slot. */
# define STR_SET_EMBED_LEN(str, n) do { \
    assert(str_embed_capa(str) > (n));\
    RSTRING(str)->as.embed.len = (n);\
} while (0)
#else
/* The length lives in the RSTRING_EMBED_LEN bit-field of the flags word. */
# define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags = \
        (RBASIC(str)->flags & ~RSTRING_EMBED_LEN_MASK) | \
        ((tmp_n) << RSTRING_EMBED_LEN_SHIFT);\
} while (0)
#endif
132
/* Store `n` as the length of `str`, in whichever representation
 * (heap or embedded) the string currently uses. */
#define STR_SET_LEN(str, n) do { \
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len = (n);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
} while (0)
141
/* Decrease the stored length of `str` by one. */
#define STR_DEC_LEN(str) do {\
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len--;\
    }\
    else {\
        const long n = RSTRING_LEN(str) - 1;\
        STR_SET_EMBED_LEN((str), n);\
    }\
} while (0)
152
153static inline bool
154str_enc_fastpath(VALUE str)
155{
156 // The overwhelming majority of strings are in one of these 3 encodings.
157 switch (ENCODING_GET_INLINED(str)) {
158 case ENCINDEX_ASCII_8BIT:
159 case ENCINDEX_UTF_8:
160 case ENCINDEX_US_ASCII:
161 return true;
162 default:
163 return false;
164 }
165}
166
/* Terminator width of `str`: 1 for the fast-path encodings, otherwise the
 * minimum character length of the string's encoding. */
#define TERM_LEN(str) (!str_enc_fastpath(str) ? rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))) : 1)
/* Write `termlen` zero bytes at `ptr`.  The first byte is always written
 * directly; memset is only used for wider terminators. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    term_fill_ptr[0] = '\0';\
    if (UNLIKELY(term_fill_len > 1)) {\
        memset(term_fill_ptr, 0, term_fill_len);\
    }\
} while (0)
175
/* Resize the buffer of `str` to hold `capacity` bytes plus the terminator
 * of its current encoding. */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM((str),(capacity),termlen);\
} while (0)
/* Resize the buffer of `str` to hold `capacity` bytes plus `termlen`
 * terminator bytes.
 *  - embedded string: nothing happens while the slot is big enough;
 *    otherwise the bytes are copied to a fresh heap buffer and the string
 *    is switched to the heap representation.
 *  - heap string: the buffer is reallocated in place (it must not be
 *    shared).
 * Every use of a macro argument is parenthesized so that expression
 * arguments (e.g. `a + b` or `c ? x : y`) keep their intended grouping. */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
199
/* Record `shared_str` as the buffer owner of `str` and set the matching
 * flags on both objects.  Does nothing for fake strings (STR_FAKESTR).
 * The assertions check that str's pointer lies inside shared_str's bytes. */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST((str), STR_FAKESTR)) { \
        assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (!RBASIC_CLASS((shared_str))) { /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
        } \
    } \
} while (0)
211
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Allocated size of the heap buffer: capacity plus terminator. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

#define STR_ENC_GET(str) get_encoding(str)

/* SHARABLE_MIDDLE_SUBSTRING defaults to 0.  When it is 0, a substring is
 * considered sharable only if it ends exactly at `end`; otherwise every
 * substring is. */
#ifndef SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#endif
226
227
228static inline long
229str_embed_capa(VALUE str)
230{
231#if USE_RVARGC
232 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
233#else
234 return RSTRING_EMBED_LEN_MAX + 1;
235#endif
236}
237
238bool
239rb_str_reembeddable_p(VALUE str)
240{
241 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
242}
243
244static inline size_t
245rb_str_embed_size(long capa)
246{
247 return offsetof(struct RString, as.embed.ary) + capa;
248}
249
250size_t
251rb_str_size_as_embedded(VALUE str)
252{
253 size_t real_size;
254#if USE_RVARGC
255 if (STR_EMBED_P(str)) {
256 real_size = rb_str_embed_size(RSTRING(str)->as.embed.len) + TERM_LEN(str);
257 }
258 /* if the string is not currently embedded, but it can be embedded, how
259 * much space would it require */
260 else if (rb_str_reembeddable_p(str)) {
261 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
262 }
263 else {
264#endif
265 real_size = sizeof(struct RString);
266#if USE_RVARGC
267 }
268#endif
269 return real_size;
270}
271
272static inline bool
273STR_EMBEDDABLE_P(long len, long termlen)
274{
275#if USE_RVARGC
276 return rb_gc_size_allocatable_p(rb_str_embed_size(len + termlen));
277#else
278 return len <= RSTRING_EMBED_LEN_MAX + 1 - termlen;
279#endif
280}
281
282static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
283static VALUE str_new_frozen(VALUE klass, VALUE orig);
284static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
285static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
286static VALUE str_new(VALUE klass, const char *ptr, long len);
287static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
288static inline void str_modifiable(VALUE str);
289static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
290
291static inline void
292str_make_independent(VALUE str)
293{
294 long len = RSTRING_LEN(str);
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str), len, 0L, termlen);
297}
298
299static inline int str_dependent_p(VALUE str);
300
301void
302rb_str_make_independent(VALUE str)
303{
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
306 }
307}
308
309void
310rb_str_make_embedded(VALUE str)
311{
312 RUBY_ASSERT(rb_str_reembeddable_p(str));
313 RUBY_ASSERT(!STR_EMBED_P(str));
314
315 char *buf = RSTRING(str)->as.heap.ptr;
316 long len = RSTRING(str)->as.heap.len;
317
318 STR_SET_EMBED(str);
319 STR_SET_EMBED_LEN(str, len);
320
321 if (len > 0) {
322 memcpy(RSTRING_PTR(str), buf, len);
323 ruby_xfree(buf);
324 }
325
326 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
327}
328
329void
330rb_str_update_shared_ary(VALUE str, VALUE old_root, VALUE new_root)
331{
332 // if the root location hasn't changed, we don't need to update
333 if (new_root == old_root) {
334 return;
335 }
336
337 // if the root string isn't embedded, we don't need to touch the ponter.
338 // it already points to the shame shared buffer
339 if (!STR_EMBED_P(new_root)) {
340 return;
341 }
342
343 size_t offset = (size_t)((uintptr_t)RSTRING(str)->as.heap.ptr - (uintptr_t)RSTRING(old_root)->as.embed.ary);
344
345 RUBY_ASSERT(RSTRING(str)->as.heap.ptr >= RSTRING(old_root)->as.embed.ary);
346 RSTRING(str)->as.heap.ptr = RSTRING(new_root)->as.embed.ary + offset;
347}
348
/* Print a diagnostic to stderr saying that `func` is about to return NULL. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    FILE *const out = stderr;

    fprintf(out,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
358
359/* symbols for [up|down|swap]case/capitalize options */
360static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
361
362static rb_encoding *
363get_encoding(VALUE str)
364{
365 return rb_enc_from_index(ENCODING_GET(str));
366}
367
368static void
369mustnot_broken(VALUE str)
370{
371 if (is_broken_string(str)) {
372 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
373 }
374}
375
376static void
377mustnot_wchar(VALUE str)
378{
379 rb_encoding *enc = STR_ENC_GET(str);
380 if (rb_enc_mbminlen(enc) > 1) {
381 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
382 }
383}
384
385static int fstring_cmp(VALUE a, VALUE b);
386
387static VALUE register_fstring(VALUE str, bool copy);
388
389const struct st_hash_type rb_fstring_hash_type = {
390 fstring_cmp,
392};
393
394#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
395
397 VALUE fstr;
398 bool copy;
399};
400
401static int
402fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
403{
404
405 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
406 VALUE str = (VALUE)*key;
407
408 if (existing) {
409 /* because of lazy sweep, str may be unmarked already and swept
410 * at next time */
411
412 if (rb_objspace_garbage_object_p(str)) {
413 arg->fstr = Qundef;
414 return ST_DELETE;
415 }
416
417 arg->fstr = str;
418 return ST_STOP;
419 }
420 else {
421 if (FL_TEST_RAW(str, STR_FAKESTR)) {
422 if (arg->copy) {
423 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->as.heap.len);
424 rb_enc_copy(new_str, str);
425 str = new_str;
426 }
427 else {
428 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
429 RSTRING(str)->as.heap.len,
430 ENCODING_GET(str));
431 }
432 OBJ_FREEZE_RAW(str);
433 }
434 else {
435 if (!OBJ_FROZEN(str))
436 str = str_new_frozen(rb_cString, str);
437 if (STR_SHARED_P(str)) { /* str should not be shared */
438 /* shared substring */
439 str_make_independent(str);
440 assert(OBJ_FROZEN(str));
441 }
442 if (!BARE_STRING_P(str)) {
443 str = str_new_frozen(rb_cString, str);
444 }
445 }
446 RBASIC(str)->flags |= RSTRING_FSTR;
447
448 *key = *value = arg->fstr = str;
449 return ST_CONTINUE;
450 }
451}
452
453RUBY_FUNC_EXPORTED
454VALUE
455rb_fstring(VALUE str)
456{
457 VALUE fstr;
458 int bare;
459
460 Check_Type(str, T_STRING);
461
462 if (FL_TEST(str, RSTRING_FSTR))
463 return str;
464
465 bare = BARE_STRING_P(str);
466 if (!bare) {
467 if (STR_EMBED_P(str)) {
468 OBJ_FREEZE_RAW(str);
469 return str;
470 }
471 if (FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
472 assert(OBJ_FROZEN(str));
473 return str;
474 }
475 }
476
477 if (!OBJ_FROZEN(str))
478 rb_str_resize(str, RSTRING_LEN(str));
479
480 fstr = register_fstring(str, FALSE);
481
482 if (!bare) {
483 str_replace_shared_without_enc(str, fstr);
484 OBJ_FREEZE_RAW(str);
485 return str;
486 }
487 return fstr;
488}
489
490static VALUE
491register_fstring(VALUE str, bool copy)
492{
493 struct fstr_update_arg args;
494 args.copy = copy;
495
496 RB_VM_LOCK_ENTER();
497 {
498 st_table *frozen_strings = rb_vm_fstring_table();
499 do {
500 args.fstr = str;
501 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
502 } while (UNDEF_P(args.fstr));
503 }
504 RB_VM_LOCK_LEAVE();
505
506 assert(OBJ_FROZEN(args.fstr));
507 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
508 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
509 assert(RBASIC_CLASS(args.fstr) == rb_cString);
510 return args.fstr;
511}
512
513static VALUE
514setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
515{
516 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
517 /* SHARED to be allocated by the callback */
518
519 if (!name) {
520 RUBY_ASSERT_ALWAYS(len == 0);
521 name = "";
522 }
523
524 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
525
526 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
527 fake_str->as.heap.len = len;
528 fake_str->as.heap.ptr = (char *)name;
529 fake_str->as.heap.aux.capa = len;
530 return (VALUE)fake_str;
531}
532
533/*
534 * set up a fake string which refers a static string literal.
535 */
536VALUE
537rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
538{
539 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
540}
541
542/*
543 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
544 * shared string which refers a static string literal. `ptr` must
545 * point a constant string.
546 */
547MJIT_FUNC_EXPORTED VALUE
548rb_fstring_new(const char *ptr, long len)
549{
550 struct RString fake_str;
551 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
552}
553
554VALUE
555rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
556{
557 struct RString fake_str;
558 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
559}
560
561VALUE
562rb_fstring_cstr(const char *ptr)
563{
564 return rb_fstring_new(ptr, strlen(ptr));
565}
566
567static int
568fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
569{
570 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
571 return ST_CONTINUE;
572}
573
574static int
575fstring_cmp(VALUE a, VALUE b)
576{
577 long alen, blen;
578 const char *aptr, *bptr;
579 RSTRING_GETMEM(a, aptr, alen);
580 RSTRING_GETMEM(b, bptr, blen);
581 return (alen != blen ||
582 ENCODING_GET(a) != ENCODING_GET(b) ||
583 memcmp(aptr, bptr, alen) != 0);
584}
585
586static inline int
587single_byte_optimizable(VALUE str)
588{
589 rb_encoding *enc;
590
591 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
593 return 1;
594
595 enc = STR_ENC_GET(str);
596 if (rb_enc_mbmaxlen(enc) == 1)
597 return 1;
598
599 /* Conservative. Possibly single byte.
600 * "\xa1" in Shift_JIS for example. */
601 return 0;
602}
603
605
606static inline const char *
607search_nonascii(const char *p, const char *e)
608{
609 const uintptr_t *s, *t;
610
611#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
612# if SIZEOF_UINTPTR_T == 8
613# define NONASCII_MASK UINT64_C(0x8080808080808080)
614# elif SIZEOF_UINTPTR_T == 4
615# define NONASCII_MASK UINT32_C(0x80808080)
616# else
617# error "don't know what to do."
618# endif
619#else
620# if SIZEOF_UINTPTR_T == 8
621# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
622# elif SIZEOF_UINTPTR_T == 4
623# define NONASCII_MASK 0x80808080UL /* or...? */
624# else
625# error "don't know what to do."
626# endif
627#endif
628
629 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
630#if !UNALIGNED_WORD_ACCESS
631 if ((uintptr_t)p % SIZEOF_VOIDP) {
632 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
633 p += l;
634 switch (l) {
635 default: UNREACHABLE;
636#if SIZEOF_VOIDP > 4
637 case 7: if (p[-7]&0x80) return p-7;
638 case 6: if (p[-6]&0x80) return p-6;
639 case 5: if (p[-5]&0x80) return p-5;
640 case 4: if (p[-4]&0x80) return p-4;
641#endif
642 case 3: if (p[-3]&0x80) return p-3;
643 case 2: if (p[-2]&0x80) return p-2;
644 case 1: if (p[-1]&0x80) return p-1;
645 case 0: break;
646 }
647 }
648#endif
649#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
650#define aligned_ptr(value) \
651 __builtin_assume_aligned((value), sizeof(uintptr_t))
652#else
653#define aligned_ptr(value) (uintptr_t *)(value)
654#endif
655 s = aligned_ptr(p);
656 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
657#undef aligned_ptr
658 for (;s < t; s++) {
659 if (*s & NONASCII_MASK) {
660#ifdef WORDS_BIGENDIAN
661 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
662#else
663 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
664#endif
665 }
666 }
667 p = (const char *)s;
668 }
669
670 switch (e - p) {
671 default: UNREACHABLE;
672#if SIZEOF_VOIDP > 4
673 case 7: if (e[-7]&0x80) return e-7;
674 case 6: if (e[-6]&0x80) return e-6;
675 case 5: if (e[-5]&0x80) return e-5;
676 case 4: if (e[-4]&0x80) return e-4;
677#endif
678 case 3: if (e[-3]&0x80) return e-3;
679 case 2: if (e[-2]&0x80) return e-2;
680 case 1: if (e[-1]&0x80) return e-1;
681 case 0: return NULL;
682 }
683}
684
685static int
686coderange_scan(const char *p, long len, rb_encoding *enc)
687{
688 const char *e = p + len;
689
690 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
691 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
692 p = search_nonascii(p, e);
694 }
695
696 if (rb_enc_asciicompat(enc)) {
697 p = search_nonascii(p, e);
698 if (!p) return ENC_CODERANGE_7BIT;
699 for (;;) {
700 int ret = rb_enc_precise_mbclen(p, e, enc);
702 p += MBCLEN_CHARFOUND_LEN(ret);
703 if (p == e) break;
704 p = search_nonascii(p, e);
705 if (!p) break;
706 }
707 }
708 else {
709 while (p < e) {
710 int ret = rb_enc_precise_mbclen(p, e, enc);
712 p += MBCLEN_CHARFOUND_LEN(ret);
713 }
714 }
715 return ENC_CODERANGE_VALID;
716}
717
718long
719rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
720{
721 const char *p = s;
722
723 if (*cr == ENC_CODERANGE_BROKEN)
724 return e - s;
725
726 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
727 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
728 if (*cr == ENC_CODERANGE_VALID) return e - s;
729 p = search_nonascii(p, e);
731 return e - s;
732 }
733 else if (rb_enc_asciicompat(enc)) {
734 p = search_nonascii(p, e);
735 if (!p) {
736 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
737 return e - s;
738 }
739 for (;;) {
740 int ret = rb_enc_precise_mbclen(p, e, enc);
741 if (!MBCLEN_CHARFOUND_P(ret)) {
743 return p - s;
744 }
745 p += MBCLEN_CHARFOUND_LEN(ret);
746 if (p == e) break;
747 p = search_nonascii(p, e);
748 if (!p) break;
749 }
750 }
751 else {
752 while (p < e) {
753 int ret = rb_enc_precise_mbclen(p, e, enc);
754 if (!MBCLEN_CHARFOUND_P(ret)) {
756 return p - s;
757 }
758 p += MBCLEN_CHARFOUND_LEN(ret);
759 }
760 }
762 return e - s;
763}
764
765static inline void
766str_enc_copy(VALUE str1, VALUE str2)
767{
768 rb_enc_set_index(str1, ENCODING_GET(str2));
769}
770
771static void
772rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
773{
774 /* this function is designed for copying encoding and coderange
775 * from src to new string "dest" which is made from the part of src.
776 */
777 str_enc_copy(dest, src);
778 if (RSTRING_LEN(dest) == 0) {
779 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
781 else
783 return;
784 }
785 switch (ENC_CODERANGE(src)) {
788 break;
790 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
791 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
793 else
795 break;
796 default:
797 break;
798 }
799}
800
801static void
802rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
803{
804 str_enc_copy(dest, src);
806}
807
808static int
809enc_coderange_scan(VALUE str, rb_encoding *enc)
810{
811 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
812}
813
814int
815rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
816{
817 return enc_coderange_scan(str, enc);
818}
819
820int
822{
823 int cr = ENC_CODERANGE(str);
824
825 if (cr == ENC_CODERANGE_UNKNOWN) {
826 cr = enc_coderange_scan(str, get_encoding(str));
827 ENC_CODERANGE_SET(str, cr);
828 }
829 return cr;
830}
831
832int
834{
835 rb_encoding *enc = STR_ENC_GET(str);
836
837 if (!rb_enc_asciicompat(enc))
838 return FALSE;
839 else if (is_ascii_string(str))
840 return TRUE;
841 return FALSE;
842}
843
844static inline void
845str_mod_check(VALUE s, const char *p, long len)
846{
847 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
848 rb_raise(rb_eRuntimeError, "string modified");
849 }
850}
851
852static size_t
853str_capacity(VALUE str, const int termlen)
854{
855 if (STR_EMBED_P(str)) {
856#if USE_RVARGC
857 return str_embed_capa(str) - termlen;
858#else
859 return (RSTRING_EMBED_LEN_MAX + 1 - termlen);
860#endif
861 }
862 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
863 return RSTRING(str)->as.heap.len;
864 }
865 else {
866 return RSTRING(str)->as.heap.aux.capa;
867 }
868}
869
870size_t
872{
873 return str_capacity(str, TERM_LEN(str));
874}
875
876static inline void
877must_not_null(const char *ptr)
878{
879 if (!ptr) {
880 rb_raise(rb_eArgError, "NULL pointer given");
881 }
882}
883
884static inline VALUE
885str_alloc_embed(VALUE klass, size_t capa)
886{
887 size_t size = rb_str_embed_size(capa);
888 assert(size > 0);
889 assert(rb_gc_size_allocatable_p(size));
890#if !USE_RVARGC
891 assert(size <= sizeof(struct RString));
892#endif
893
894 RVARGC_NEWOBJ_OF(str, struct RString, klass,
896
897 return (VALUE)str;
898}
899
900static inline VALUE
901str_alloc_heap(VALUE klass)
902{
903 RVARGC_NEWOBJ_OF(str, struct RString, klass,
904 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString));
905
906 return (VALUE)str;
907}
908
909static inline VALUE
910empty_str_alloc(VALUE klass)
911{
912 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
913 VALUE str = str_alloc_embed(klass, 0);
914 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
915 return str;
916}
917
918static VALUE
919str_new0(VALUE klass, const char *ptr, long len, int termlen)
920{
921 VALUE str;
922
923 if (len < 0) {
924 rb_raise(rb_eArgError, "negative string size (or size too big)");
925 }
926
927 RUBY_DTRACE_CREATE_HOOK(STRING, len);
928
929 if (STR_EMBEDDABLE_P(len, termlen)) {
930 str = str_alloc_embed(klass, len + termlen);
931 if (len == 0) {
933 }
934 }
935 else {
936 str = str_alloc_heap(klass);
937 RSTRING(str)->as.heap.aux.capa = len;
938 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
939 * integer overflow. If we can STATIC_ASSERT that, the following
940 * mul_add_mul can be reverted to a simple ALLOC_N. */
941 RSTRING(str)->as.heap.ptr =
942 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
943 }
944 if (ptr) {
945 memcpy(RSTRING_PTR(str), ptr, len);
946 }
947 STR_SET_LEN(str, len);
948 TERM_FILL(RSTRING_PTR(str) + len, termlen);
949 return str;
950}
951
952static VALUE
953str_new(VALUE klass, const char *ptr, long len)
954{
955 return str_new0(klass, ptr, len, 1);
956}
957
958VALUE
959rb_str_new(const char *ptr, long len)
960{
961 return str_new(rb_cString, ptr, len);
962}
963
964VALUE
965rb_usascii_str_new(const char *ptr, long len)
966{
967 VALUE str = rb_str_new(ptr, len);
968 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
969 return str;
970}
971
972VALUE
973rb_utf8_str_new(const char *ptr, long len)
974{
975 VALUE str = str_new(rb_cString, ptr, len);
976 rb_enc_associate_index(str, rb_utf8_encindex());
977 return str;
978}
979
980VALUE
981rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
982{
983 VALUE str;
984
985 if (!enc) return rb_str_new(ptr, len);
986
987 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
988 rb_enc_associate(str, enc);
989 return str;
990}
991
992VALUE
994{
995 must_not_null(ptr);
996 /* rb_str_new_cstr() can take pointer from non-malloc-generated
997 * memory regions, and that cannot be detected by the MSAN. Just
998 * trust the programmer that the argument passed here is a sane C
999 * string. */
1000 __msan_unpoison_string(ptr);
1001 return rb_str_new(ptr, strlen(ptr));
1002}
1003
1004VALUE
1006{
1007 VALUE str = rb_str_new_cstr(ptr);
1008 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
1009 return str;
1010}
1011
1012VALUE
1014{
1015 VALUE str = rb_str_new_cstr(ptr);
1016 rb_enc_associate_index(str, rb_utf8_encindex());
1017 return str;
1018}
1019
1020VALUE
1022{
1023 must_not_null(ptr);
1024 if (rb_enc_mbminlen(enc) != 1) {
1025 rb_raise(rb_eArgError, "wchar encoding given");
1026 }
1027 return rb_enc_str_new(ptr, strlen(ptr), enc);
1028}
1029
1030static VALUE
1031str_new_static(VALUE klass, const char *ptr, long len, int encindex)
1032{
1033 VALUE str;
1034
1035 if (len < 0) {
1036 rb_raise(rb_eArgError, "negative string size (or size too big)");
1037 }
1038
1039 if (!ptr) {
1040 rb_encoding *enc = rb_enc_get_from_index(encindex);
1041 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
1042 }
1043 else {
1044 RUBY_DTRACE_CREATE_HOOK(STRING, len);
1045 str = str_alloc_heap(klass);
1046 RSTRING(str)->as.heap.len = len;
1047 RSTRING(str)->as.heap.ptr = (char *)ptr;
1048 RSTRING(str)->as.heap.aux.capa = len;
1049 RBASIC(str)->flags |= STR_NOFREE;
1050 }
1051 rb_enc_associate_index(str, encindex);
1052 return str;
1053}
1054
1055VALUE
1056rb_str_new_static(const char *ptr, long len)
1057{
1058 return str_new_static(rb_cString, ptr, len, 0);
1059}
1060
1061VALUE
1063{
1064 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1065}
1066
1067VALUE
1069{
1070 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1071}
1072
1073VALUE
1075{
1076 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1077}
1078
1079static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1080 rb_encoding *from, rb_encoding *to,
1081 int ecflags, VALUE ecopts);
1082
1083static inline bool
1084is_enc_ascii_string(VALUE str, rb_encoding *enc)
1085{
1086 int encidx = rb_enc_to_index(enc);
1087 if (rb_enc_get_index(str) == encidx)
1088 return is_ascii_string(str);
1089 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1090}
1091
1092VALUE
1093rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1094{
1095 long len;
1096 const char *ptr;
1097 VALUE newstr;
1098
1099 if (!to) return str;
1100 if (!from) from = rb_enc_get(str);
1101 if (from == to) return str;
1102 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1103 rb_is_ascii8bit_enc(to)) {
1104 if (STR_ENC_GET(str) != to) {
1105 str = rb_str_dup(str);
1106 rb_enc_associate(str, to);
1107 }
1108 return str;
1109 }
1110
1111 RSTRING_GETMEM(str, ptr, len);
1112 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1113 from, to, ecflags, ecopts);
1114 if (NIL_P(newstr)) {
1115 /* some error, return original */
1116 return str;
1117 }
1118 return newstr;
1119}
1120
1121VALUE
1122rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1123 rb_encoding *from, int ecflags, VALUE ecopts)
1124{
1125 long olen;
1126
1127 olen = RSTRING_LEN(newstr);
1128 if (ofs < -olen || olen < ofs)
1129 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1130 if (ofs < 0) ofs += olen;
1131 if (!from) {
1132 STR_SET_LEN(newstr, ofs);
1133 return rb_str_cat(newstr, ptr, len);
1134 }
1135
1136 rb_str_modify(newstr);
1137 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1138 rb_enc_get(newstr),
1139 ecflags, ecopts);
1140}
1141
1142VALUE
1143rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1144{
1145 STR_SET_LEN(str, 0);
1146 rb_enc_associate(str, enc);
1147 rb_str_cat(str, ptr, len);
1148 return str;
1149}
1150
1151static VALUE
1152str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1153 rb_encoding *from, rb_encoding *to,
1154 int ecflags, VALUE ecopts)
1155{
1156 rb_econv_t *ec;
1158 long olen;
1159 VALUE econv_wrapper;
1160 const unsigned char *start, *sp;
1161 unsigned char *dest, *dp;
1162 size_t converted_output = (size_t)ofs;
1163
1164 olen = rb_str_capacity(newstr);
1165
1166 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1167 RBASIC_CLEAR_CLASS(econv_wrapper);
1168 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1169 if (!ec) return Qnil;
1170 DATA_PTR(econv_wrapper) = ec;
1171
1172 sp = (unsigned char*)ptr;
1173 start = sp;
1174 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1175 (dp = dest + converted_output),
1176 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1178 /* destination buffer short */
1179 size_t converted_input = sp - start;
1180 size_t rest = len - converted_input;
1181 converted_output = dp - dest;
1182 rb_str_set_len(newstr, converted_output);
1183 if (converted_input && converted_output &&
1184 rest < (LONG_MAX / converted_output)) {
1185 rest = (rest * converted_output) / converted_input;
1186 }
1187 else {
1188 rest = olen;
1189 }
1190 olen += rest < 2 ? 2 : rest;
1191 rb_str_resize(newstr, olen);
1192 }
1193 DATA_PTR(econv_wrapper) = 0;
1194 rb_econv_close(ec);
1195 switch (ret) {
1196 case econv_finished:
1197 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1198 rb_str_set_len(newstr, len);
1199 rb_enc_associate(newstr, to);
1200 return newstr;
1201
1202 default:
1203 return Qnil;
1204 }
1205}
1206
1207VALUE
1209{
1210 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1211}
1212
/*
 * Builds a new string from the len bytes at ptr, which are labelled with
 * the encoding eenc, converting them to the default internal encoding when
 * one is set and a conversion is actually needed.  If the conversion
 * reports failure the bytes are kept as-is under eenc.
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing.  Later callers invoke
 * rb_external_str_new_with_enc(ptr, len, enc), which presumably is this
 * function -- confirm against the original file.
 */
1213VALUE
1215{
1216 rb_encoding *ienc;
1217 VALUE str;
1218 const int eidx = rb_enc_to_index(eenc);
1219
    /* A NULL ptr cannot be scanned below; delegate immediately. */
1220 if (!ptr) {
1221 return rb_enc_str_new(ptr, len, eenc);
1222 }
1223
1224 /* ASCII-8BIT case, no conversion */
    /* Also taken when the data is labelled US-ASCII but search_nonascii()
     * finds a non-ASCII byte; rb_str_new() is called without an encoding. */
1225 if ((eidx == rb_ascii8bit_encindex()) ||
1226 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1227 return rb_str_new(ptr, len);
1228 }
1229 /* no default_internal or same encoding, no conversion */
1230 ienc = rb_default_internal_encoding();
1231 if (!ienc || eenc == ienc) {
1232 return rb_enc_str_new(ptr, len, eenc);
1233 }
1234 /* ASCII compatible, and ASCII only string, no conversion in
1235 * default_internal */
    /* NOTE(review): the ASCII-8BIT operand below looks unreachable, since
     * that index already returned from the first check above. */
1236 if ((eidx == rb_ascii8bit_encindex()) ||
1237 (eidx == rb_usascii_encindex()) ||
1238 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1239 return rb_enc_str_new(ptr, len, ienc);
1240 }
1241 /* convert from the given encoding to default_internal */
1242 str = rb_enc_str_new(NULL, 0, ienc);
1243 /* when the conversion failed for some reason, just ignore the
1244 * default_internal and result in the given encoding as-is. */
1245 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1246 rb_str_initialize(str, ptr, len, eenc);
1247 }
1248 return str;
1249}
1250
1251VALUE
1252rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1253{
1254 int eidx = rb_enc_to_index(eenc);
1255 if (eidx == rb_usascii_encindex() &&
1256 !is_ascii_string(str)) {
1257 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1258 return str;
1259 }
1260 rb_enc_associate_index(str, eidx);
1261 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1262}
1263
1264VALUE
1265rb_external_str_new(const char *ptr, long len)
1266{
1267 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1268}
1269
/*
 * NUL-terminated variant: measures ptr with strlen() and builds the string
 * through rb_external_str_new_with_enc() using
 * rb_default_external_encoding().
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1270VALUE
1272{
1273 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1274}
1275
1276VALUE
1277rb_locale_str_new(const char *ptr, long len)
1278{
1279 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1280}
1281
/*
 * NUL-terminated variant: measures ptr with strlen() and builds the string
 * through rb_external_str_new_with_enc() using rb_locale_encoding().
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1282VALUE
1284{
1285 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1286}
1287
/*
 * Builds a string from len bytes at ptr through
 * rb_external_str_new_with_enc() using rb_filesystem_encoding().
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1288VALUE
1290{
1291 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1292}
1293
/*
 * NUL-terminated variant: measures ptr with strlen() and builds the string
 * through rb_external_str_new_with_enc() using rb_filesystem_encoding().
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1294VALUE
1296{
1297 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1298}
1299
/*
 * Forwards str to rb_str_export_to_enc() with
 * rb_default_external_encoding() as the target encoding.
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1300VALUE
1302{
1303 return rb_str_export_to_enc(str, rb_default_external_encoding());
1304}
1305
/*
 * Forwards str to rb_str_export_to_enc() with rb_locale_encoding() as the
 * target encoding.
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1306VALUE
1308{
1309 return rb_str_export_to_enc(str, rb_locale_encoding());
1310}
1311
/*
 * Converts str from its current encoding (STR_ENC_GET(str)) to enc by
 * calling rb_str_conv_enc().
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing.  The two wrappers above call
 * rb_str_export_to_enc(str, enc), presumably this function -- confirm.
 */
1312VALUE
1314{
1315 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1316}
1317
/*
 * Makes str2 hold the same bytes as str, either by copying them into
 * str2's embedded storage (when they fit together with the terminator) or
 * by pointing str2 at a frozen root string's buffer.  Encoding and
 * coderange are NOT touched here.  Returns str2.
 */
1318static VALUE
1319str_replace_shared_without_enc(VALUE str2, VALUE str)
1320{
1321 const int termlen = TERM_LEN(str);
1322 char *ptr;
1323 long len;
1324
1325 RSTRING_GETMEM(str, ptr, len);
    /* Small enough: copy into str2's embedded array and terminate it. */
1326 if (str_embed_capa(str2) >= len + termlen) {
1327 char *ptr2 = RSTRING(str2)->as.embed.ary;
1328 STR_SET_EMBED(str2);
1329 memcpy(ptr2, RSTRING_PTR(str), len);
1330 STR_SET_EMBED_LEN(str2, len);
1331 TERM_FILL(ptr2+len, termlen);
1332 }
1333 else {
1334 VALUE root;
        /* Reuse str's existing root, or freeze str into one; ptr/len are
         * re-read from whichever object now describes the buffer. */
1335 if (STR_SHARED_P(str)) {
1336 root = RSTRING(str)->as.heap.aux.shared;
1337 RSTRING_GETMEM(str, ptr, len);
1338 }
1339 else {
1340 root = rb_str_new_frozen(str);
1341 RSTRING_GETMEM(root, ptr, len);
1342 }
1343 assert(OBJ_FROZEN(root));
        /* str2 owns a private heap buffer: release it unless it is the
         * very buffer we are about to point at. */
1344 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1345 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1346 rb_fatal("about to free a possible shared root");
1347 }
1348 char *ptr2 = STR_HEAP_PTR(str2);
1349 if (ptr2 != ptr) {
1350 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1351 }
1352 }
1353 FL_SET(str2, STR_NOEMBED);
1354 RSTRING(str2)->as.heap.len = len;
1355 RSTRING(str2)->as.heap.ptr = ptr;
1356 STR_SET_SHARED(str2, root);
1357 }
1358 return str2;
1359}
1360
1361static VALUE
1362str_replace_shared(VALUE str2, VALUE str)
1363{
1364 str_replace_shared_without_enc(str2, str);
1365 rb_enc_cr_str_exact_copy(str2, str);
1366 return str2;
1367}
1368
1369static VALUE
1370str_new_shared(VALUE klass, VALUE str)
1371{
1372 return str_replace_shared(str_alloc_heap(klass), str);
1373}
1374
/*
 * Calls str_new_shared() with str's own class (rb_obj_class(str)).
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1375VALUE
1377{
1378 return str_new_shared(rb_obj_class(str), str);
1379}
1380
/*
 * Returns orig itself when it is already frozen; otherwise returns the
 * result of str_new_frozen() for orig's class.
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing.  str_replace_shared_without_enc() calls
 * rb_str_new_frozen(str), presumably this function -- confirm.
 */
1381VALUE
1383{
1384 if (OBJ_FROZEN(orig)) return orig;
1385 return str_new_frozen(rb_obj_class(orig), orig);
1386}
1387
1388static VALUE
1389rb_str_new_frozen_String(VALUE orig)
1390{
1391 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1392 return str_new_frozen(rb_cString, orig);
1393}
1394
1395VALUE
1396rb_str_tmp_frozen_acquire(VALUE orig)
1397{
1398 if (OBJ_FROZEN_RAW(orig)) return orig;
1399 return str_new_frozen_buffer(0, orig, FALSE);
1400}
1401
/*
 * Undoes the sharing set up for a temporary frozen string tmp of orig:
 * when tmp is a class-less heap root used only by orig, orig takes back
 * ownership of the buffer and tmp is reduced to an empty embedded string.
 * Presumably tmp comes from rb_str_tmp_frozen_acquire(), which passes
 * class 0 -- confirm against callers.
 */
1402void
1403rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1404{
    /* tmp has a class, so it is not a class-less temporary; leave it. */
1405 if (RBASIC_CLASS(tmp) != 0)
1406 return;
1407
1408 if (STR_EMBED_P(tmp)) {
1409 assert(OBJ_FROZEN_RAW(tmp));
1410 }
1411 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1412 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1413 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1414
        /* Only unshare when tmp is orig's root and is not flagged
         * STR_BORROWED. */
1415 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1416 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1417 assert(RSTRING(orig)->as.heap.len == RSTRING(tmp)->as.heap.len);
1418
1419 /* Unshare orig since the root (tmp) only has this one child. */
1420 FL_UNSET_RAW(orig, STR_SHARED);
1421 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1422 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1423 assert(OBJ_FROZEN_RAW(tmp));
1424
1425 /* Make tmp embedded and empty so it is safe for sweeping. */
1426 STR_SET_EMBED(tmp);
1427 STR_SET_EMBED_LEN(tmp, 0);
1428 }
1429 }
1430}
1431
1432static VALUE
1433str_new_frozen(VALUE klass, VALUE orig)
1434{
1435 return str_new_frozen_buffer(klass, orig, TRUE);
1436}
1437
/*
 * Turns the non-embedded, non-shared string orig into a child of a newly
 * allocated heap string of class klass.  The new string receives orig's
 * len, ptr and capa, and orig is then marked as sharing it.  Returns the
 * new string.
 */
1438static VALUE
1439heap_str_make_shared(VALUE klass, VALUE orig)
1440{
1441 assert(!STR_EMBED_P(orig));
1442 assert(!STR_SHARED_P(orig));
1443
1444 VALUE str = str_alloc_heap(klass);
1445 RSTRING(str)->as.heap.len = RSTRING_LEN(orig);
1446 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1447 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
    /* Move the STR_NOFREE flag from orig to the new string. */
1448 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1449 RBASIC(orig)->flags &= ~STR_NOFREE;
    /* Must come after capa was read: aux.shared and aux.capa are members
     * of the same `aux` field. */
1450 STR_SET_SHARED(orig, str);
1451 if (klass == 0)
1452 FL_UNSET_RAW(str, STR_BORROWED);
1453 return str;
1454}
1455
/*
 * Returns a frozen string of class klass holding orig's bytes, copying
 * when the content is embeddable and sharing the buffer otherwise.
 * copy_encoding selects whether orig's terminator length is used and
 * whether encoding/coderange are copied onto the result.
 */
1456static VALUE
1457str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1458{
1459 VALUE str;
1460
1461 long len = RSTRING_LEN(orig);
1462 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1463
    /* Embedded source or embeddable content: make an independent copy. */
1464 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1465 str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
1466 assert(STR_EMBED_P(str));
1467 }
1468 else {
1469 if (FL_TEST_RAW(orig, STR_SHARED)) {
1470 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
            /* ofs/rest: bytes of the root before/after orig's window. */
1471 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1472 long rest = RSTRING_LEN(shared) - ofs - RSTRING(orig)->as.heap.len;
1473 assert(ofs >= 0);
1474 assert(rest >= 0);
1475 assert(ofs + rest <= RSTRING_LEN(shared));
1476#if !USE_RVARGC
1477 assert(!STR_EMBED_P(shared));
1478#endif
1479 assert(OBJ_FROZEN(shared));
1480
            /* orig differs from its root (window, class or encoding):
             * create another sharer of the root, narrowed to orig's
             * window. */
1481 if ((ofs > 0) || (rest > 0) ||
1482 (klass != RBASIC(shared)->klass) ||
1483 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1484 str = str_new_shared(klass, shared);
1485 assert(!STR_EMBED_P(str));
1486 RSTRING(str)->as.heap.ptr += ofs;
1487 RSTRING(str)->as.heap.len -= ofs + rest;
1488 }
1489 else {
                /* The root itself already matches; return it directly.
                 * This path skips the encoding copy and OBJ_FREEZE at the
                 * end (the root is asserted frozen above). */
1490 if (RBASIC_CLASS(shared) == 0)
1491 FL_SET_RAW(shared, STR_BORROWED);
1492 return shared;
1493 }
1494 }
        /* Embeddable with orig's own terminator length (can differ from
         * the termlen computed above when copy_encoding is false). */
1495 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1496 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1497 STR_SET_EMBED(str);
1498 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1499 STR_SET_EMBED_LEN(str, RSTRING_LEN(orig));
1500 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1501 }
1502 else {
            /* Plain heap string: make orig share a new root. */
1503 str = heap_str_make_shared(klass, orig);
1504 }
1505 }
1506
1507 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1508 OBJ_FREEZE(str);
1509 return str;
1510}
1511
1512VALUE
1513rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1514{
1515 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1516}
1517
1518static VALUE
1519str_new_empty_String(VALUE str)
1520{
1521 VALUE v = rb_str_new(0, 0);
1522 rb_enc_copy(v, str);
1523 return v;
1524}
1525
/*
 * Minimum capacity applied to string buffers: used below as a lower bound
 * for capa in the buffer constructor (non-RVARGC builds) and for the
 * capacity: keyword in rb_str_init().  Without RVARGC it must exceed the
 * embedded-string limit, which the static assertion enforces.
 */
1526#define STR_BUF_MIN_SIZE 63
1527#if !USE_RVARGC
1528STATIC_ASSERT(STR_BUF_MIN_SIZE, STR_BUF_MIN_SIZE > RSTRING_EMBED_LEN_MAX);
1529#endif
1530
/*
 * Allocates an rb_cString with room for capa bytes: embedded when
 * capa + 1 bytes fit, otherwise on the heap with capa + 1 bytes allocated
 * and the first byte set to NUL.
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing.  The function below calls rb_str_buf_new(len),
 * presumably this one -- confirm against the original file.
 */
1531VALUE
1533{
1534 if (STR_EMBEDDABLE_P(capa, 1)) {
1535 return str_alloc_embed(rb_cString, capa + 1);
1536 }
1537
1538 VALUE str = str_alloc_heap(rb_cString);
1539
1540#if !USE_RVARGC
    /* Without RVARGC, never allocate less than STR_BUF_MIN_SIZE. */
1541 if (capa < STR_BUF_MIN_SIZE) {
1542 capa = STR_BUF_MIN_SIZE;
1543 }
1544#endif
1545 RSTRING(str)->as.heap.aux.capa = capa;
1546 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1547 RSTRING(str)->as.heap.ptr[0] = '\0';
1548
1549 return str;
1550}
1551
/*
 * Creates a buffer string sized to strlen(ptr) with rb_str_buf_new() and
 * appends the bytes of ptr with rb_str_buf_cat().
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1552VALUE
1554{
1555 VALUE str;
1556 long len = strlen(ptr);
1557
1558 str = rb_str_buf_new(len);
1559 rb_str_buf_cat(str, ptr, len);
1560
1561 return str;
1562}
1563
/*
 * Returns str_new(0, 0, len): a string of length len created with class 0
 * and a NULL source pointer.
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing, so the function's name cannot be confirmed here --
 * restore it from the original file.
 */
1564VALUE
1566{
1567 return str_new(0, 0, len);
1568}
1569
/*
 * Releases the resources held by str: removes it from the fstring table
 * when flagged RSTRING_FSTR, and frees its heap buffer only when the
 * string is neither embedded, shared nor STR_NOFREE.
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1570void
1572{
1573 if (FL_TEST(str, RSTRING_FSTR)) {
1574 st_data_t fstr = (st_data_t)str;
1575
        /* The table update happens between VM lock enter/leave. */
1576 RB_VM_LOCK_ENTER();
1577 {
1578 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1579 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1580 }
1581 RB_VM_LOCK_LEAVE();
1582 }
1583
1584 if (STR_EMBED_P(str)) {
1585 RB_DEBUG_COUNTER_INC(obj_str_embed);
1586 }
1587 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
        /* Buffer is not owned by this object: count only, free nothing.
         * NOTE(review): both lines bump obj_str_shared, including the
         * STR_NOFREE case -- confirm that is intended. */
1588 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1589 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1590 }
1591 else {
1592 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1593 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1594 }
1595}
1596
1597RUBY_FUNC_EXPORTED size_t
1598rb_str_memsize(VALUE str)
1599{
1600 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1601 return STR_HEAP_SIZE(str);
1602 }
1603 else {
1604 return 0;
1605 }
1606}
1607
/*
 * Converts str to a T_STRING through rb_convert_type_with_id(), using the
 * method id idTo_str and the type name "String".
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing.  A later function calls rb_str_to_str(s), presumably
 * this one -- confirm against the original file.
 */
1608VALUE
1610{
1611 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1612}
1613
/* Forward declarations: both functions are defined later in this file. */
1614static inline void str_discard(VALUE str);
1615static void str_shared_replace(VALUE str, VALUE str2);
1616
/*
 * Calls str_shared_replace(str, str2) unless both arguments are the same
 * object, in which case nothing happens.
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1617void
1619{
1620 if (str != str2) str_shared_replace(str, str2);
1621}
1622
/*
 * Replaces the contents of str with those of str2 (a different object).
 * When the bytes fit in str's embedded storage they are copied and str2 is
 * left untouched.  Otherwise str takes over str2's heap buffer (or its
 * shared root) and str2 is reset to an empty embedded string.  Encoding
 * and coderange of str2 are applied to str in both cases.
 */
1623static void
1624str_shared_replace(VALUE str, VALUE str2)
1625{
1626 rb_encoding *enc;
1627 int cr;
1628 int termlen;
1629
1630 RUBY_ASSERT(str2 != str);
1631 enc = STR_ENC_GET(str2);
1632 cr = ENC_CODERANGE(str2);
    /* Drop str's current buffer before it is overwritten. */
1633 str_discard(str);
1634 termlen = rb_enc_mbminlen(enc);
1635
1636 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1637 STR_SET_EMBED(str);
        /* Copies the terminator bytes along with the content. */
1638 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1639 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
1640 rb_enc_associate(str, enc);
1641 ENC_CODERANGE_SET(str, cr);
1642 }
1643 else {
1644#if USE_RVARGC
        /* An embedded str2 too large for str's slot is moved to the heap
         * first so that its buffer can be handed over below. */
1645 if (STR_EMBED_P(str2)) {
1646 assert(!FL_TEST(str2, STR_SHARED));
1647 long len = RSTRING(str2)->as.embed.len;
1648 assert(len + termlen <= str_embed_capa(str2));
1649
1650 char *new_ptr = ALLOC_N(char, len + termlen);
1651 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1652 RSTRING(str2)->as.heap.ptr = new_ptr;
1653 RSTRING(str2)->as.heap.len = len;
1654 RSTRING(str2)->as.heap.aux.capa = len;
1655 STR_SET_NOEMBED(str2);
1656 }
1657#endif
1658
1659 STR_SET_NOEMBED(str);
1660 FL_UNSET(str, STR_SHARED);
1661 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1662 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
1663
        /* Inherit either the shared root or the capacity from str2. */
1664 if (FL_TEST(str2, STR_SHARED)) {
1665 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1666 STR_SET_SHARED(str, shared);
1667 }
1668 else {
1669 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1670 }
1671
1672 /* abandon str2 */
1673 STR_SET_EMBED(str2);
1674 RSTRING_PTR(str2)[0] = 0;
1675 STR_SET_EMBED_LEN(str2, 0);
1676 rb_enc_associate(str, enc);
1677 ENC_CODERANGE_SET(str, cr);
1678 }
1679}
1680
/*
 * Returns obj itself when it is already a T_STRING; otherwise calls the
 * method idTo_s on obj and passes the result through
 * rb_obj_as_string_result().
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1681VALUE
1683{
1684 VALUE str;
1685
1686 if (RB_TYPE_P(obj, T_STRING)) {
1687 return obj;
1688 }
1689 str = rb_funcall(obj, idTo_s, 0);
1690 return rb_obj_as_string_result(str, obj);
1691}
1692
1693MJIT_FUNC_EXPORTED VALUE
1694rb_obj_as_string_result(VALUE str, VALUE obj)
1695{
1696 if (!RB_TYPE_P(str, T_STRING))
1697 return rb_any_to_s(obj);
1698 return str;
1699}
1700
1701static VALUE
1702str_replace(VALUE str, VALUE str2)
1703{
1704 long len;
1705
1706 len = RSTRING_LEN(str2);
1707 if (STR_SHARED_P(str2)) {
1708 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1709 assert(OBJ_FROZEN(shared));
1710 STR_SET_NOEMBED(str);
1711 RSTRING(str)->as.heap.len = len;
1712 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1713 STR_SET_SHARED(str, shared);
1714 rb_enc_cr_str_exact_copy(str, str2);
1715 }
1716 else {
1717 str_replace_shared(str, str2);
1718 }
1719
1720 return str;
1721}
1722
/*
 * Allocates a string object of class klass through the execution context
 * ec, sized by rb_str_embed_size(capa).
 * NOTE(review): the RB_RVARGC_EC_NEWOBJ_OF(...) invocation below is cut
 * off in this listing -- the line carrying its remaining arguments and the
 * closing parenthesis is absent.  Restore it from the original file.
 */
1723static inline VALUE
1724ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1725{
1726 size_t size = rb_str_embed_size(capa);
1727 assert(size > 0);
1728 assert(rb_gc_size_allocatable_p(size));
1729#if !USE_RVARGC
1730 assert(size <= sizeof(struct RString));
1731#endif
1732
1733 RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1735
1736 return (VALUE)str;
1737}
1738
/*
 * Allocates a non-embedded (STR_NOEMBED) T_STRING object of class klass
 * through the execution context ec.  The object is flagged
 * FL_WB_PROTECTED when RGENGC_WB_PROTECTED_STRING is non-zero.
 */
1739static inline VALUE
1740ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1741{
    /* `str` is not declared in this function, so the macro presumably
     * introduces it -- confirm against the macro definition. */
1742 RB_RVARGC_EC_NEWOBJ_OF(ec, str, struct RString, klass,
1743 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString));
1744
1745 return (VALUE)str;
1746}
1747
/*
 * Fills the freshly allocated dup so that it duplicates str: embedded
 * content is copied, heap content is shared through a frozen root.
 * Selected flags of str are transferred to dup with FL_FREEZE removed.
 * Returns dup.
 * NOTE(review): one line of the flag_mask initializer is absent from this
 * listing (between the #endif and FL_FREEZE) -- restore it from the
 * original file.
 */
1748static inline VALUE
1749str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1750{
1751 const VALUE flag_mask =
1752#if !USE_RVARGC
1753 RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1754#endif
1756 FL_FREEZE
1757 ;
1758 VALUE flags = FL_TEST_RAW(str, flag_mask);
1759 int encidx = 0;
1760 if (STR_EMBED_P(str)) {
1761 long len = RSTRING_EMBED_LEN(str);
1762
1763 assert(STR_EMBED_P(dup));
1764 assert(str_embed_capa(dup) >= len + 1);
1765 STR_SET_EMBED_LEN(dup, len);
        /* len + 1 bytes: the content plus one trailing byte. */
1766 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1767 }
1768 else {
1769 VALUE root = str;
        /* Pick the frozen root to share: str's existing root, a new frozen
         * copy of an unfrozen str, or str itself when already frozen. */
1770 if (FL_TEST_RAW(str, STR_SHARED)) {
1771 root = RSTRING(str)->as.heap.aux.shared;
1772 }
1773 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1774 root = str = str_new_frozen(klass, str);
1775 flags = FL_TEST_RAW(str, flag_mask);
1776 }
1777 assert(!STR_SHARED_P(root));
1778 assert(RB_OBJ_FROZEN_RAW(root));
        /* Empty `if` so the conditionally compiled branch can chain as
         * `else if`. */
1779 if (0) {}
1780#if !USE_RVARGC
1781 else if (STR_EMBED_P(root)) {
1782 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(root)->as.embed.ary,
1783 char, RSTRING_EMBED_LEN_MAX + 1);
1784 FL_UNSET(dup, STR_NOEMBED);
1785 }
1786#endif
1787 else {
1788 RSTRING(dup)->as.heap.len = RSTRING_LEN(str);
1789 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1790 FL_SET(root, STR_SHARED_ROOT);
1791 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1792 flags |= RSTRING_NOEMBED | STR_SHARED;
1793 }
1794 }
1795
    /* When the inline encoding bits hold ENCODING_INLINE_MAX, fetch the
     * index with rb_enc_get_index() and associate it separately below
     * instead of copying those bits. */
1796 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1797 encidx = rb_enc_get_index(str);
1798 flags &= ~ENCODING_MASK;
1799 }
1800 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1801 if (encidx) rb_enc_associate_index(dup, encidx);
1802 return dup;
1803}
1804
1805static inline VALUE
1806ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1807{
1808 VALUE dup;
1809 if (FL_TEST(str, STR_NOEMBED)) {
1810 dup = ec_str_alloc_heap(ec, klass);
1811 }
1812 else {
1813 dup = ec_str_alloc_embed(ec, klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1814 }
1815
1816 return str_duplicate_setup(klass, str, dup);
1817}
1818
1819static inline VALUE
1820str_duplicate(VALUE klass, VALUE str)
1821{
1822 VALUE dup;
1823 if (FL_TEST(str, STR_NOEMBED)) {
1824 dup = str_alloc_heap(klass);
1825 }
1826 else {
1827 dup = str_alloc_embed(klass, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
1828 }
1829
1830 return str_duplicate_setup(klass, str, dup);
1831}
1832
/*
 * Duplicates str, keeping its class (rb_obj_class(str)), by calling
 * str_duplicate().
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1833VALUE
1835{
1836 return str_duplicate(rb_obj_class(str), str);
1837}
1838
/*
 * Fires the DTrace string-creation hook with str's byte length, then
 * duplicates str as an rb_cString via str_duplicate().
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
1839VALUE
1841{
1842 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1843 return str_duplicate(rb_cString, str);
1844}
1845
1846VALUE
1847rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1848{
1849 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1850 return ec_str_duplicate(ec, rb_cString, str);
1851}
1852
1853/*
1854 *
1855 * call-seq:
1856 * String.new(string = '', **opts) -> new_string
1857 *
1858 * :include: doc/string/new.rdoc
1859 *
1860 */
1861
1862static VALUE
1863rb_str_init(int argc, VALUE *argv, VALUE str)
1864{
    /*
     * Implements String.new(string = '', encoding:, capacity:).
     * With capacity: the receiver is always switched to a heap buffer of
     * at least STR_BUF_MIN_SIZE bytes; without it the optional source
     * string simply replaces the receiver.  Returns str.
     * NOTE(review): one line inside the `if (enc)` block near the end is
     * absent from this listing -- restore it from the original file.
     */
1865 static ID keyword_ids[2];
1866 VALUE orig, opt, venc, vcapa;
1867 VALUE kwargs[2];
1868 rb_encoding *enc = 0;
1869 int n;
1870
    /* Keyword IDs are resolved once and cached in the static array. */
1871 if (!keyword_ids[0]) {
1872 keyword_ids[0] = rb_id_encoding();
1873 CONST_ID(keyword_ids[1], "capacity");
1874 }
1875
1876 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1877 if (!NIL_P(opt)) {
1878 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1879 venc = kwargs[0];
1880 vcapa = kwargs[1];
1881 if (!UNDEF_P(venc) && !NIL_P(venc)) {
1882 enc = rb_to_encoding(venc);
1883 }
1884 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
1885 long capa = NUM2LONG(vcapa);
1886 long len = 0;
1887 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1888
1889 if (capa < STR_BUF_MIN_SIZE) {
1890 capa = STR_BUF_MIN_SIZE;
1891 }
1892 if (n == 1) {
1893 StringValue(orig);
1894 len = RSTRING_LEN(orig);
                /* The capacity must at least hold the source string. */
1895 if (capa < len) {
1896 capa = len;
1897 }
                /* Source is the receiver itself: skip the copy below. */
1898 if (orig == str) n = 0;
1899 }
1900 str_modifiable(str);
1901 if (STR_EMBED_P(str)) { /* make noembed always */
1902 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1903#if USE_RVARGC
1904 assert(RSTRING(str)->as.embed.len + 1 <= str_embed_capa(str));
1905 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING(str)->as.embed.len + 1);
1906#else
1907 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_EMBED_LEN_MAX + 1);
1908#endif
1909 RSTRING(str)->as.heap.ptr = new_ptr;
1910 }
1911 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
                /* Buffer not owned by str: copy into a private one,
                 * clamped to the new size, and drop the sharing flags. */
1912 const size_t size = (size_t)capa + termlen;
1913 const char *const old_ptr = RSTRING_PTR(str);
1914 const size_t osize = RSTRING(str)->as.heap.len + TERM_LEN(str);
1915 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1916 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1917 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1918 RSTRING(str)->as.heap.ptr = new_ptr;
1919 }
1920 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
                /* Owned heap buffer of a different size: resize it. */
1921 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1922 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1923 }
1924 RSTRING(str)->as.heap.len = len;
1925 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1926 if (n == 1) {
1927 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1928 rb_enc_cr_str_exact_copy(str, orig);
1929 }
1930 FL_SET(str, STR_NOEMBED);
1931 RSTRING(str)->as.heap.aux.capa = capa;
1932 }
1933 else if (n == 1) {
1934 rb_str_replace(str, orig);
1935 }
1936 if (enc) {
1937 rb_enc_associate(str, enc);
1939 }
1940 }
1941 else if (n == 1) {
1942 rb_str_replace(str, orig);
1943 }
1944 return str;
1945}
1946
1947#ifdef NONASCII_MASK
/* True for every byte that is not a UTF-8 continuation byte (10xxxxxx). */
1948#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1949

/*
 * UTF-8 leading bytes have either the 0xxxxxxx or the 11xxxxxx
 * bit representation (see https://en.wikipedia.org/wiki/UTF-8).
 * Therefore, the following pseudocode can detect UTF-8 leading bytes:
 *
 *   if (!(byte & 0x80))
 *       byte |= 0x40;          // turn on bit6
 *   return ((byte>>6) & 1);    // bit6 represents whether this byte is
 *                              // a leading byte or not.
 *
 * This function applies that logic to all bytes of the word pointed to by
 * s at once and returns the number of leading bytes in the word.
 */
1963static inline uintptr_t
1964count_utf8_lead_bytes_with_word(const uintptr_t *s)
1965{
1966 uintptr_t d = *s;
1967
1968 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
1969 d = (d>>6) | (~d>>7);
1970 d &= NONASCII_MASK >> 7;
1971
1972 /* Gather all bytes. */
1973#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1974 /* use only if it can use POPCNT */
1975 return rb_popcount_intptr(d);
1976#else
    /* Fold the per-byte flags together by shift-and-add; the low four
     * bits then hold the total. */
1977 d += (d>>8);
1978 d += (d>>16);
1979# if SIZEOF_VOIDP == 8
1980 d += (d>>32);
1981# endif
1982 return (d&0xF);
1983#endif
1984}
1985#endif
1986
/*
 * Counts the characters in the byte range [p, e) under encoding enc.
 * cr is the caller's coderange for the data and selects the fast paths.
 */
1987static inline long
1988enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
1989{
1990 long c;
1991 const char *q;
1992
    /* Fixed-width encoding: byte length divided by the character width,
     * rounded up so that a trailing partial character still counts. */
1993 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
1994 long diff = (long)(e - p);
1995 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
1996 }
1997#ifdef NONASCII_MASK
    /* Valid UTF-8: count lead bytes -- bytewise up to the first aligned
     * word, wordwise through the aligned middle, bytewise for the tail. */
1998 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
1999 uintptr_t len = 0;
2000 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2001 const uintptr_t *s, *t;
2002 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
            /* s: p rounded up, t: e rounded down to word alignment. */
2003 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2004 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2005 while (p < (const char *)s) {
2006 if (is_utf8_lead_byte(*p)) len++;
2007 p++;
2008 }
2009 while (s < t) {
2010 len += count_utf8_lead_bytes_with_word(s);
2011 s++;
2012 }
2013 p = (const char *)s;
2014 }
2015 while (p < e) {
2016 if (is_utf8_lead_byte(*p)) len++;
2017 p++;
2018 }
2019 return (long)len;
2020 }
2021#endif
    /* ASCII-compatible: skip ASCII runs with search_nonascii(), then step
     * over one multibyte character at a time.  The two loops differ only
     * in the length function used. */
2022 else if (rb_enc_asciicompat(enc)) {
2023 c = 0;
2024 if (ENC_CODERANGE_CLEAN_P(cr)) {
2025 while (p < e) {
2026 if (ISASCII(*p)) {
2027 q = search_nonascii(p, e);
2028 if (!q)
2029 return c + (e - p);
2030 c += q - p;
2031 p = q;
2032 }
2033 p += rb_enc_fast_mbclen(p, e, enc);
2034 c++;
2035 }
2036 }
2037 else {
2038 while (p < e) {
2039 if (ISASCII(*p)) {
2040 q = search_nonascii(p, e);
2041 if (!q)
2042 return c + (e - p);
2043 c += q - p;
2044 p = q;
2045 }
2046 p += rb_enc_mbclen(p, e, enc);
2047 c++;
2048 }
2049 }
2050 return c;
2051 }
2052
    /* Remaining encodings: step with rb_enc_mbclen() per character. */
2053 for (c=0; p<e; c++) {
2054 p += rb_enc_mbclen(p, e, enc);
2055 }
2056 return c;
2057}
2058
2059long
2060rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2061{
2062 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2063}
2064
/*
 * Counts the characters in [p, e) under enc and reports a coderange
 * through *cr.  *cr is output-only: it is reset to 0 on entry, so any
 * value the caller stored there is ignored.
 * NOTE(review): in this listing one line is absent from each of the two
 * `else` branches that handle a character rb_enc_precise_mbclen() did not
 * accept -- restore them from the original file.
 */
2068long
2069rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2070{
2071 long c;
2072 const char *q;
2073 int ret;
2074
2075 *cr = 0;
    /* Fixed-width encoding: rounded-up division; *cr stays 0 here. */
2076 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2077 long diff = (long)(e - p);
2078 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2079 }
2080 else if (rb_enc_asciicompat(enc)) {
2081 c = 0;
2082 while (p < e) {
2083 if (ISASCII(*p)) {
2084 q = search_nonascii(p, e);
2085 if (!q) {
                    /* Only ASCII remained and nothing else was seen. */
2086 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2087 return c + (e - p);
2088 }
2089 c += q - p;
2090 p = q;
2091 }
2092 ret = rb_enc_precise_mbclen(p, e, enc);
2093 if (MBCLEN_CHARFOUND_P(ret)) {
2094 *cr |= ENC_CODERANGE_VALID;
2095 p += MBCLEN_CHARFOUND_LEN(ret);
2096 }
2097 else {
                /* Not a well-formed character: advance a single byte. */
2099 p++;
2100 }
2101 c++;
2102 }
2103 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2104 return c;
2105 }
2106
2107 for (c=0; p<e; c++) {
2108 ret = rb_enc_precise_mbclen(p, e, enc);
2109 if (MBCLEN_CHARFOUND_P(ret)) {
2110 *cr |= ENC_CODERANGE_VALID;
2111 p += MBCLEN_CHARFOUND_LEN(ret);
2112 }
2113 else {
            /* Advance by the minimum character width, clamped to e. */
2115 if (p + rb_enc_mbminlen(enc) <= e)
2116 p += rb_enc_mbminlen(enc);
2117 else
2118 p = e;
2119 }
2120 }
2121 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2122 return c;
2123}
2124
2125/* enc must be str's enc or rb_enc_check(str, str2) */
2126static long
2127str_strlen(VALUE str, rb_encoding *enc)
2128{
2129 const char *p, *e;
2130 int cr;
2131
2132 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2133 if (!enc) enc = STR_ENC_GET(str);
2134 p = RSTRING_PTR(str);
2135 e = RSTRING_END(str);
2136 cr = ENC_CODERANGE(str);
2137
2138 if (cr == ENC_CODERANGE_UNKNOWN) {
2139 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2140 if (cr) ENC_CODERANGE_SET(str, cr);
2141 return n;
2142 }
2143 else {
2144 return enc_strlen(p, e, enc, cr);
2145 }
2146}
2147
/*
 * Returns the character length of str as a C long, using str's own
 * encoding (NULL is passed to str_strlen()).
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
2148long
2150{
2151 return str_strlen(str, NULL);
2152}
2153
2154/*
2155 * call-seq:
2156 * length -> integer
2157 *
2158 * :include: doc/string/length.rdoc
2159 *
2160 */
2161
2162VALUE
2164{
    /* Character length in str's own encoding, boxed with LONG2NUM().
     * NOTE(review): the declarator line (name and parameter list) is
     * absent from this listing -- restore it from the original file. */
2165 return LONG2NUM(str_strlen(str, NULL));
2166}
2167
2168/*
2169 * call-seq:
2170 * bytesize -> integer
2171 *
2172 * :include: doc/string/bytesize.rdoc
2173 *
2174 */
2175
2176static VALUE
2177rb_str_bytesize(VALUE str)
2178{
2179 return LONG2NUM(RSTRING_LEN(str));
2180}
2181
2182/*
2183 * call-seq:
2184 * empty? -> true or false
2185 *
2186 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2187 *
2188 * "hello".empty? # => false
2189 * " ".empty? # => false
2190 * "".empty? # => true
2191 *
2192 */
2193
2194static VALUE
2195rb_str_empty(VALUE str)
2196{
2197 return RBOOL(RSTRING_LEN(str) == 0);
2198}
2199
2200/*
2201 * call-seq:
2202 * string + other_string -> new_string
2203 *
2204 * Returns a new \String containing +other_string+ concatenated to +self+:
2205 *
2206 * "Hello from " + self.to_s # => "Hello from main"
2207 *
2208 */
2209
2210VALUE
2212{
    /*
     * Concatenates str1 and str2 into a new rb_cString.  Raises
     * ArgumentError when the combined byte length would exceed LONG_MAX.
     * NOTE(review): two lines are absent from this listing -- the
     * declarator line above the opening brace, and the continuation line
     * of the ENCODING_CODERANGE_SET(...) call below.  Restore both from
     * the original file.
     */
2213 VALUE str3;
2214 rb_encoding *enc;
2215 char *ptr1, *ptr2, *ptr3;
2216 long len1, len2;
2217 int termlen;
2218
2219 StringValue(str2);
2220 enc = rb_enc_check_str(str1, str2);
2221 RSTRING_GETMEM(str1, ptr1, len1);
2222 RSTRING_GETMEM(str2, ptr2, len2);
2223 termlen = rb_enc_mbminlen(enc);
    /* Overflow-safe form of len1 + len2 > LONG_MAX. */
2224 if (len1 > LONG_MAX - len2) {
2225 rb_raise(rb_eArgError, "string size too big");
2226 }
2227 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2228 ptr3 = RSTRING_PTR(str3);
2229 memcpy(ptr3, ptr1, len1);
2230 memcpy(ptr3+len1, ptr2, len2);
2231 TERM_FILL(&ptr3[len1+len2], termlen);
2232
2233 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
    /* ptr1/ptr2 point into str1/str2, so keep both objects alive until
     * the copies above are done. */
2235 RB_GC_GUARD(str1);
2236 RB_GC_GUARD(str2);
2237 return str3;
2238}
2239
2240/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2241MJIT_FUNC_EXPORTED VALUE
2242rb_str_opt_plus(VALUE str1, VALUE str2)
2243{
2244 assert(RBASIC_CLASS(str1) == rb_cString);
2245 assert(RBASIC_CLASS(str2) == rb_cString);
2246 long len1, len2;
2247 MAYBE_UNUSED(char) *ptr1, *ptr2;
2248 RSTRING_GETMEM(str1, ptr1, len1);
2249 RSTRING_GETMEM(str2, ptr2, len2);
2250 int enc1 = rb_enc_get_index(str1);
2251 int enc2 = rb_enc_get_index(str2);
2252
2253 if (enc1 < 0) {
2254 return Qundef;
2255 }
2256 else if (enc2 < 0) {
2257 return Qundef;
2258 }
2259 else if (enc1 != enc2) {
2260 return Qundef;
2261 }
2262 else if (len1 > LONG_MAX - len2) {
2263 return Qundef;
2264 }
2265 else {
2266 return rb_str_plus(str1, str2);
2267 }
2268
2269}
2270
2271/*
2272 * call-seq:
2273 * string * integer -> new_string
2274 *
2275 * Returns a new \String containing +integer+ copies of +self+:
2276 *
2277 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2278 * "Ho! " * 0 # => ""
2279 *
2280 */
2281
2282VALUE
2284{
    /*
     * Builds a new rb_cString holding `times` copies of str.  Raises
     * ArgumentError for a negative count or when the resulting byte
     * length would overflow a long.
     * NOTE(review): the declarator line (name and parameter list) is
     * absent from this listing -- restore it from the original file.
     */
2285 VALUE str2;
2286 long n, len;
2287 char *ptr2;
2288 int termlen;
2289
    /* Fast paths for the fixnums 1 and 0. */
2290 if (times == INT2FIX(1)) {
2291 return str_duplicate(rb_cString, str);
2292 }
2293 if (times == INT2FIX(0)) {
2294 str2 = str_alloc_embed(rb_cString, 0);
2295 rb_enc_copy(str2, str);
2296 return str2;
2297 }
2298 len = NUM2LONG(times);
2299 if (len < 0) {
2300 rb_raise(rb_eArgError, "negative argument");
2301 }
    /* A single NUL byte repeated: a zero-filled buffer is the result. */
2302 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2303 if (STR_EMBEDDABLE_P(len, 1)) {
2304 str2 = str_alloc_embed(rb_cString, len + 1);
2305 memset(RSTRING_PTR(str2), 0, len + 1);
2306 }
2307 else {
2308 str2 = str_alloc_heap(rb_cString);
2309 RSTRING(str2)->as.heap.aux.capa = len;
2310 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2311 }
2312 STR_SET_LEN(str2, len);
2313 rb_enc_copy(str2, str);
2314 return str2;
2315 }
    /* Overflow check for the multiplication below. */
2316 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2317 rb_raise(rb_eArgError, "argument too big");
2318 }
2319
    /* From here on len is the result's byte length, not the count. */
2320 len *= RSTRING_LEN(str);
2321 termlen = TERM_LEN(str);
2322 str2 = str_new0(rb_cString, 0, len, termlen);
2323 ptr2 = RSTRING_PTR(str2);
2324 if (len) {
        /* Copy str once, then keep doubling the filled prefix; the last
         * memcpy fills whatever remains. */
2325 n = RSTRING_LEN(str);
2326 memcpy(ptr2, RSTRING_PTR(str), n);
2327 while (n <= len/2) {
2328 memcpy(ptr2 + n, ptr2, n);
2329 n *= 2;
2330 }
2331 memcpy(ptr2 + n, ptr2, len-n);
2332 }
2333 STR_SET_LEN(str2, len);
2334 TERM_FILL(&ptr2[len], termlen);
2335 rb_enc_cr_str_copy_for_substr(str2, str);
2336
2337 return str2;
2338}
2339
2340/*
2341 * call-seq:
2342 * string % object -> new_string
2343 *
2344 * Returns the result of formatting +object+ into the format specification +self+
2345 * (see Kernel#sprintf for formatting details):
2346 *
2347 * "%05d" % 123 # => "00123"
2348 *
2349 * If +self+ contains multiple substitutions, +object+ must be
2350 * an \Array or \Hash containing the values to be substituted:
2351 *
2352 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2353 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2354 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2355 *
2356 */
2357
2358static VALUE
2359rb_str_format_m(VALUE str, VALUE arg)
2360{
2361 VALUE tmp = rb_check_array_type(arg);
2362
2363 if (!NIL_P(tmp)) {
2364 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2365 }
2366 return rb_str_format(1, &arg, str);
2367}
2368
2369static inline void
2370rb_check_lockedtmp(VALUE str)
2371{
2372 if (FL_TEST(str, STR_TMPLOCK)) {
2373 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2374 }
2375}
2376
2377static inline void
2378str_modifiable(VALUE str)
2379{
2380 rb_check_lockedtmp(str);
2381 rb_check_frozen(str);
2382}
2383
2384static inline int
2385str_dependent_p(VALUE str)
2386{
2387 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2388 return 0;
2389 }
2390 else {
2391 return 1;
2392 }
2393}
2394
2395static inline int
2396str_independent(VALUE str)
2397{
2398 str_modifiable(str);
2399 return !str_dependent_p(str);
2400}
2401
/*
 * Gives str a private buffer holding its first len bytes, with capacity
 * len + expand and a terminator of termlen bytes.  Embedded storage is
 * used when the result fits; otherwise a new heap buffer is allocated and
 * the STR_SHARED/STR_NOFREE flags are cleared.
 */
2402static void
2403str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2404{
2405 char *ptr;
2406 char *oldptr;
2407 long capa = len + expand;
2408
    /* A negative expand shrinks: keep at most capa bytes. */
2409 if (len > capa) len = capa;
2410
    /* Heap string whose content fits the embedded area: move it inline.
     * The old heap pointer is read before STR_SET_EMBED() because the
     * embedded layout reuses the same storage. */
2411 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2412 ptr = RSTRING(str)->as.heap.ptr;
2413 STR_SET_EMBED(str);
2414 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2415 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2416 STR_SET_EMBED_LEN(str, len);
2417 return;
2418 }
2419
2420 ptr = ALLOC_N(char, (size_t)capa + termlen);
2421 oldptr = RSTRING_PTR(str);
2422 if (oldptr) {
2423 memcpy(ptr, oldptr, len);
2424 }
    /* Free the old buffer only if str is a plain heap string that owns
     * it. */
2425 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2426 xfree(oldptr);
2427 }
2428 STR_SET_NOEMBED(str);
2429 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2430 TERM_FILL(ptr + len, termlen);
2431 RSTRING(str)->as.heap.ptr = ptr;
2432 RSTRING(str)->as.heap.len = len;
2433 RSTRING(str)->as.heap.aux.capa = capa;
2434}
2435
/*
 * Prepares str for in-place modification: str_independent() runs the
 * modifiability checks, and a string that does not own its buffer is made
 * independent.
 * NOTE(review): two lines are absent from this listing -- the declarator
 * line above the opening brace and one statement before the closing
 * brace.  Restore both from the original file.
 */
2436void
2438{
2439 if (!str_independent(str))
2440 str_make_independent(str);
2442}
2443
/*
 * Prepares str for modification and makes room for `expand` more bytes
 * beyond its current length.  Raises ArgumentError for a negative expand
 * or when len + expand would reach LONG_MAX.
 * NOTE(review): two lines are absent from this listing -- the declarator
 * line above the opening brace and one statement before the closing
 * brace.  Restore both from the original file.
 */
2444void
2446{
2447 int termlen = TERM_LEN(str);
2448 long len = RSTRING_LEN(str);
2449
2450 if (expand < 0) {
2451 rb_raise(rb_eArgError, "negative expanding string size");
2452 }
    /* Overflow-safe form of len + expand >= LONG_MAX. */
2453 if (expand >= LONG_MAX - len) {
2454 rb_raise(rb_eArgError, "string size too big");
2455 }
2456
    /* Not owning the buffer: copy and grow in one step.  Owning it: only
     * resize when extra room was actually requested. */
2457 if (!str_independent(str)) {
2458 str_make_independent_expand(str, len, expand, termlen);
2459 }
2460 else if (expand > 0) {
2461 RESIZE_CAPA_TERM(str, len + expand, termlen);
2462 }
2464}
2465
/*
 * As rb_str_modify(), but does not unconditionally clear the coderange.
 * NOTE(review): two lines are absent from this listing, one on each side
 * of the "Force re-scan later" comment; presumably they hold the
 * condition and the statement that comment describes -- restore them from
 * the original file.
 */
2467static void
2468str_modify_keep_cr(VALUE str)
2469{
2470 if (!str_independent(str))
2471 str_make_independent(str);
2473 /* Force re-scan later */
2475}
2476
2477static inline void
2478str_discard(VALUE str)
2479{
2480 str_modifiable(str);
2481 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2482 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2483 RSTRING(str)->as.heap.ptr = 0;
2484 RSTRING(str)->as.heap.len = 0;
2485 }
2486}
2487
/*
 * Raises TypeError when rb_enc_get(str) yields no encoding, and
 * EncCompatError when that encoding is not ASCII-compatible; returns
 * normally otherwise.
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
2488void
2490{
2491 rb_encoding *enc = rb_enc_get(str);
2492 if (!enc) {
2493 rb_raise(rb_eTypeError, "not encoding capable object");
2494 }
2495 if (!rb_enc_asciicompat(enc)) {
2496 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2497 }
2498}
2499
/*
 * Ensures *ptr holds a T_STRING: a non-string value is converted with
 * rb_str_to_str() and written back through ptr.  Returns the string.
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing.  The function below calls rb_string_value(ptr),
 * presumably this one -- confirm against the original file.
 */
2500VALUE
2502{
2503 VALUE s = *ptr;
2504 if (!RB_TYPE_P(s, T_STRING)) {
2505 s = rb_str_to_str(s);
2506 *ptr = s;
2507 }
2508 return s;
2509}
2510
/*
 * Converts *ptr to a string via rb_string_value() and returns that
 * string's byte pointer (RSTRING_PTR).
 * NOTE(review): the declarator line (name and parameter list) is absent
 * from this listing -- restore it from the original file.
 */
2511char *
2513{
2514 VALUE str = rb_string_value(ptr);
2515 return RSTRING_PTR(str);
2516}
2517
/*
 * Returns 1 when the first +n+ bytes at +s+ are all zero, 0 as soon as a
 * non-zero byte is found.  A non-positive +n+ examines nothing and
 * yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        if (s[i] != '\0') return 0;
    }
    return 1;
}
2526
2527static const char *
2528str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2529{
2530 const char *e = s + len;
2531
2532 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2533 if (zero_filled(s, minlen)) return s;
2534 }
2535 return 0;
2536}
2537
2538static char *
2539str_fill_term(VALUE str, char *s, long len, int termlen)
2540{
2541 /* This function assumes that (capa + termlen) bytes of memory
2542 * is allocated, like many other functions in this file.
2543 */
2544 if (str_dependent_p(str)) {
2545 if (!zero_filled(s + len, termlen))
2546 str_make_independent_expand(str, len, 0L, termlen);
2547 }
2548 else {
2549 TERM_FILL(s + len, termlen);
2550 return s;
2551 }
2552 return RSTRING_PTR(str);
2553}
2554
2555void
2556rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2557{
2558 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2559 long len = RSTRING_LEN(str);
2560
2561 assert(capa >= len);
2562 if (capa - len < termlen) {
2563 rb_check_lockedtmp(str);
2564 str_make_independent_expand(str, len, 0L, termlen);
2565 }
2566 else if (str_dependent_p(str)) {
2567 if (termlen > oldtermlen)
2568 str_make_independent_expand(str, len, 0L, termlen);
2569 }
2570 else {
2571 if (!STR_EMBED_P(str)) {
2572 /* modify capa instead of realloc */
2573 assert(!FL_TEST((str), STR_SHARED));
2574 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2575 }
2576 if (termlen > oldtermlen) {
2577 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2578 }
2579 }
2580
2581 return;
2582}
2583
2584static char *
2585str_null_check(VALUE str, int *w)
2586{
2587 char *s = RSTRING_PTR(str);
2588 long len = RSTRING_LEN(str);
2589 rb_encoding *enc = rb_enc_get(str);
2590 const int minlen = rb_enc_mbminlen(enc);
2591
2592 if (minlen > 1) {
2593 *w = 1;
2594 if (str_null_char(s, len, minlen, enc)) {
2595 return NULL;
2596 }
2597 return str_fill_term(str, s, len, minlen);
2598 }
2599 *w = 0;
2600 if (!s || memchr(s, 0, len)) {
2601 return NULL;
2602 }
2603 if (s[len]) {
2604 s = str_fill_term(str, s, len, minlen);
2605 }
2606 return s;
2607}
2608
2609char *
2610rb_str_to_cstr(VALUE str)
2611{
2612 int w;
2613 return str_null_check(str, &w);
2614}
2615
2616char *
2618{
2619 VALUE str = rb_string_value(ptr);
2620 int w;
2621 char *s = str_null_check(str, &w);
2622 if (!s) {
2623 if (w) {
2624 rb_raise(rb_eArgError, "string contains null char");
2625 }
2626 rb_raise(rb_eArgError, "string contains null byte");
2627 }
2628 return s;
2629}
2630
2631char *
2632rb_str_fill_terminator(VALUE str, const int newminlen)
2633{
2634 char *s = RSTRING_PTR(str);
2635 long len = RSTRING_LEN(str);
2636 return str_fill_term(str, s, len, newminlen);
2637}
2638
2639VALUE
2641{
2642 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2643 return str;
2644}
2645
2646/*
2647 * call-seq:
2648 * String.try_convert(object) -> object, new_string, or nil
2649 *
2650 * If +object+ is a \String object, returns +object+.
2651 *
2652 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2653 * calls <tt>object.to_str</tt> and returns the result.
2654 *
2655 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2656 *
2657 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2658 */
2659static VALUE
2660rb_str_s_try_convert(VALUE dummy, VALUE str)
2661{
2662 return rb_check_string_type(str);
2663}
2664
2665static char*
2666str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2667{
2668 long nth = *nthp;
2669 if (rb_enc_mbmaxlen(enc) == 1) {
2670 p += nth;
2671 }
2672 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2673 p += nth * rb_enc_mbmaxlen(enc);
2674 }
2675 else if (rb_enc_asciicompat(enc)) {
2676 const char *p2, *e2;
2677 int n;
2678
2679 while (p < e && 0 < nth) {
2680 e2 = p + nth;
2681 if (e < e2) {
2682 *nthp = nth;
2683 return (char *)e;
2684 }
2685 if (ISASCII(*p)) {
2686 p2 = search_nonascii(p, e2);
2687 if (!p2) {
2688 nth -= e2 - p;
2689 *nthp = nth;
2690 return (char *)e2;
2691 }
2692 nth -= p2 - p;
2693 p = p2;
2694 }
2695 n = rb_enc_mbclen(p, e, enc);
2696 p += n;
2697 nth--;
2698 }
2699 *nthp = nth;
2700 if (nth != 0) {
2701 return (char *)e;
2702 }
2703 return (char *)p;
2704 }
2705 else {
2706 while (p < e && nth--) {
2707 p += rb_enc_mbclen(p, e, enc);
2708 }
2709 }
2710 if (p > e) p = e;
2711 *nthp = nth;
2712 return (char*)p;
2713}
2714
2715char*
2716rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2717{
2718 return str_nth_len(p, e, &nth, enc);
2719}
2720
2721static char*
2722str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2723{
2724 if (singlebyte)
2725 p += nth;
2726 else {
2727 p = str_nth_len(p, e, &nth, enc);
2728 }
2729 if (!p) return 0;
2730 if (p > e) p = e;
2731 return (char *)p;
2732}
2733
2734/* char offset to byte offset */
2735static long
2736str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2737{
2738 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2739 if (!pp) return e - p;
2740 return pp - p;
2741}
2742
2743long
2744rb_str_offset(VALUE str, long pos)
2745{
2746 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2747 STR_ENC_GET(str), single_byte_optimizable(str));
2748}
2749
#ifdef NONASCII_MASK
/*
 * Advances from +p+ past *nthp UTF-8 lead bytes without passing +e+
 * and returns the resulting position; *nthp receives the count that
 * was left over.
 *
 * When both the byte range and the count exceed two pointer-sized
 * words, +p+ is first aligned and lead bytes are counted a word at a
 * time with count_utf8_lead_bytes_with_word(); the rest is handled
 * byte by byte.
 */
static char *
str_utf8_nth(const char *p, const char *e, long *nthp)
{
    long nth = *nthp;
    const int word_size = (int)SIZEOF_VOIDP;

    if (word_size * 2 < e - p && word_size * 2 < nth) {
        const uintptr_t lowbits = SIZEOF_VOIDP - 1;
        /* First word boundary at or after p, last one at or before e. */
        const uintptr_t *word = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
        const uintptr_t *const word_end = (const uintptr_t*)(~lowbits & (uintptr_t)e);

        for (; p < (const char *)word; p++) {
            if (is_utf8_lead_byte(*p)) nth--;
        }
        do {
            nth -= count_utf8_lead_bytes_with_word(word);
            word++;
        } while (word < word_end && word_size <= nth);
        p = (char *)word;
    }

    for (; p < e; p++) {
        if (is_utf8_lead_byte(*p)) {
            if (nth == 0) break;
            nth--;
        }
    }
    *nthp = nth;
    return (char *)p;
}

/*
 * Byte distance from +p+ to the position str_utf8_nth() reaches for
 * +nth+.
 */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    const char *target = str_utf8_nth(p, e, &nth);

    return target - p;
}
#endif
2788
2789/* byte offset to char offset */
2790long
2791rb_str_sublen(VALUE str, long pos)
2792{
2793 if (single_byte_optimizable(str) || pos < 0)
2794 return pos;
2795 else {
2796 char *p = RSTRING_PTR(str);
2797 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2798 }
2799}
2800
2801static VALUE
2802str_subseq(VALUE str, long beg, long len)
2803{
2804 VALUE str2;
2805
2806 const long rstring_embed_capa_max = ((sizeof(struct RString) - offsetof(struct RString, as.embed.ary)) / sizeof(char)) - 1;
2807
2808 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str)) ||
2809 len <= rstring_embed_capa_max) {
2810 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
2811 RB_GC_GUARD(str);
2812 }
2813 else {
2814 str2 = str_new_shared(rb_cString, str);
2815 ENC_CODERANGE_CLEAR(str2);
2816 RSTRING(str2)->as.heap.ptr += beg;
2817 if (RSTRING(str2)->as.heap.len > len) {
2818 RSTRING(str2)->as.heap.len = len;
2819 }
2820 }
2821
2822 return str2;
2823}
2824
2825VALUE
2826rb_str_subseq(VALUE str, long beg, long len)
2827{
2828 VALUE str2 = str_subseq(str, beg, len);
2829 rb_enc_cr_str_copy_for_substr(str2, str);
2830 return str2;
2831}
2832
2833char *
2834rb_str_subpos(VALUE str, long beg, long *lenp)
2835{
2836 long len = *lenp;
2837 long slen = -1L;
2838 long blen = RSTRING_LEN(str);
2839 rb_encoding *enc = STR_ENC_GET(str);
2840 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2841
2842 if (len < 0) return 0;
2843 if (!blen) {
2844 len = 0;
2845 }
2846 if (single_byte_optimizable(str)) {
2847 if (beg > blen) return 0;
2848 if (beg < 0) {
2849 beg += blen;
2850 if (beg < 0) return 0;
2851 }
2852 if (len > blen - beg)
2853 len = blen - beg;
2854 if (len < 0) return 0;
2855 p = s + beg;
2856 goto end;
2857 }
2858 if (beg < 0) {
2859 if (len > -beg) len = -beg;
2860 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2861 beg = -beg;
2862 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2863 p = e;
2864 if (!p) return 0;
2865 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2866 if (!p) return 0;
2867 len = e - p;
2868 goto end;
2869 }
2870 else {
2871 slen = str_strlen(str, enc);
2872 beg += slen;
2873 if (beg < 0) return 0;
2874 p = s + beg;
2875 if (len == 0) goto end;
2876 }
2877 }
2878 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2879 return 0;
2880 }
2881 if (len == 0) {
2882 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2883 p = s + beg;
2884 }
2885#ifdef NONASCII_MASK
2886 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2887 enc == rb_utf8_encoding()) {
2888 p = str_utf8_nth(s, e, &beg);
2889 if (beg > 0) return 0;
2890 len = str_utf8_offset(p, e, len);
2891 }
2892#endif
2893 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2894 int char_sz = rb_enc_mbmaxlen(enc);
2895
2896 p = s + beg * char_sz;
2897 if (p > e) {
2898 return 0;
2899 }
2900 else if (len * char_sz > e - p)
2901 len = e - p;
2902 else
2903 len *= char_sz;
2904 }
2905 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2906 if (beg > 0) return 0;
2907 len = 0;
2908 }
2909 else {
2910 len = str_offset(p, e, len, enc, 0);
2911 }
2912 end:
2913 *lenp = len;
2914 RB_GC_GUARD(str);
2915 return p;
2916}
2917
2918static VALUE str_substr(VALUE str, long beg, long len, int empty);
2919
2920VALUE
2921rb_str_substr(VALUE str, long beg, long len)
2922{
2923 return str_substr(str, beg, len, TRUE);
2924}
2925
2926static VALUE
2927str_substr(VALUE str, long beg, long len, int empty)
2928{
2929 char *p = rb_str_subpos(str, beg, &len);
2930
2931 if (!p) return Qnil;
2932 if (!len && !empty) return Qnil;
2933
2934 beg = p - RSTRING_PTR(str);
2935
2936 VALUE str2 = str_subseq(str, beg, len);
2937 rb_enc_cr_str_copy_for_substr(str2, str);
2938 return str2;
2939}
2940
2941VALUE
2943{
2944 if (OBJ_FROZEN(str)) return str;
2945 rb_str_resize(str, RSTRING_LEN(str));
2946 return rb_obj_freeze(str);
2947}
2948
2949
2950/*
2951 * call-seq:
2952 * +string -> new_string or self
2953 *
2954 * Returns +self+ if +self+ is not frozen.
2955 *
2956 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
2957 */
2958static VALUE
2959str_uplus(VALUE str)
2960{
2961 if (OBJ_FROZEN(str)) {
2962 return rb_str_dup(str);
2963 }
2964 else {
2965 return str;
2966 }
2967}
2968
2969/*
2970 * call-seq:
2971 * -string -> frozen_string
2972 *
2973 * Returns a frozen, possibly pre-existing copy of the string.
2974 *
2975 * The returned \String will be deduplicated as long as it does not have
2976 * any instance variables set on it and is not a String subclass.
2977 *
2978 * String#dedup is an alias for String#-@.
2979 */
2980static VALUE
2981str_uminus(VALUE str)
2982{
2983 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
2984 str = rb_str_dup(str);
2985 }
2986 return rb_fstring(str);
2987}
2988
2989RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
2990#define rb_str_dup_frozen rb_str_new_frozen
2991
2992VALUE
2994{
2995 if (FL_TEST(str, STR_TMPLOCK)) {
2996 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
2997 }
2998 FL_SET(str, STR_TMPLOCK);
2999 return str;
3000}
3001
3002VALUE
3004{
3005 if (!FL_TEST(str, STR_TMPLOCK)) {
3006 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3007 }
3008 FL_UNSET(str, STR_TMPLOCK);
3009 return str;
3010}
3011
3012RUBY_FUNC_EXPORTED VALUE
3013rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3014{
3015 rb_str_locktmp(str);
3016 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3017}
3018
3019void
3021{
3022 long capa;
3023 const int termlen = TERM_LEN(str);
3024
3025 str_modifiable(str);
3026 if (STR_SHARED_P(str)) {
3027 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3028 }
3029 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3030 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3031 }
3032
3033 int cr = ENC_CODERANGE(str);
3034 if (cr == ENC_CODERANGE_UNKNOWN) {
3035 /* Leave unknown. */
3036 }
3037 else if (len > RSTRING_LEN(str)) {
3038 if (ENC_CODERANGE_CLEAN_P(cr)) {
3039 /* Update the coderange regarding the extended part. */
3040 const char *const prev_end = RSTRING_END(str);
3041 const char *const new_end = RSTRING_PTR(str) + len;
3042 rb_encoding *enc = rb_enc_get(str);
3043 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3044 ENC_CODERANGE_SET(str, cr);
3045 }
3046 else if (cr == ENC_CODERANGE_BROKEN) {
3047 /* May be valid now, by appended part. */
3049 }
3050 }
3051 else if (len < RSTRING_LEN(str)) {
3052 if (cr != ENC_CODERANGE_7BIT) {
3053 /* ASCII-only string is keeping after truncated. Valid
3054 * and broken may be invalid or valid, leave unknown. */
3056 }
3057 }
3058
3059 STR_SET_LEN(str, len);
3060 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3061}
3062
3063VALUE
3065{
3066 if (len < 0) {
3067 rb_raise(rb_eArgError, "negative string size (or size too big)");
3068 }
3069
3070 int independent = str_independent(str);
3071 long slen = RSTRING_LEN(str);
3072
3073 if (slen > len && ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3075 }
3076
3077 {
3078 long capa;
3079 const int termlen = TERM_LEN(str);
3080 if (STR_EMBED_P(str)) {
3081 if (len == slen) return str;
3082 if (str_embed_capa(str) >= len + termlen) {
3083 STR_SET_EMBED_LEN(str, len);
3084 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3085 return str;
3086 }
3087 str_make_independent_expand(str, slen, len - slen, termlen);
3088 }
3089 else if (str_embed_capa(str) >= len + termlen) {
3090 char *ptr = STR_HEAP_PTR(str);
3091 STR_SET_EMBED(str);
3092 if (slen > len) slen = len;
3093 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3094 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3095 STR_SET_EMBED_LEN(str, len);
3096 if (independent) ruby_xfree(ptr);
3097 return str;
3098 }
3099 else if (!independent) {
3100 if (len == slen) return str;
3101 str_make_independent_expand(str, slen, len - slen, termlen);
3102 }
3103 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3104 (capa - len) > (len < 1024 ? len : 1024)) {
3105 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3106 (size_t)len + termlen, STR_HEAP_SIZE(str));
3107 RSTRING(str)->as.heap.aux.capa = len;
3108 }
3109 else if (len == slen) return str;
3110 RSTRING(str)->as.heap.len = len;
3111 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3112 }
3113 return str;
3114}
3115
3116static VALUE
3117str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3118{
3119 if (keep_cr) {
3120 str_modify_keep_cr(str);
3121 }
3122 else {
3123 rb_str_modify(str);
3124 }
3125 if (len == 0) return 0;
3126
3127 long capa, total, olen, off = -1;
3128 char *sptr;
3129 const int termlen = TERM_LEN(str);
3130#if !USE_RVARGC
3131 assert(termlen < RSTRING_EMBED_LEN_MAX + 1); /* < (LONG_MAX/2) */
3132#endif
3133
3134 RSTRING_GETMEM(str, sptr, olen);
3135 if (ptr >= sptr && ptr <= sptr + olen) {
3136 off = ptr - sptr;
3137 }
3138
3139 if (STR_EMBED_P(str)) {
3140 capa = str_embed_capa(str) - termlen;
3141 sptr = RSTRING(str)->as.embed.ary;
3142 olen = RSTRING_EMBED_LEN(str);
3143 }
3144 else {
3145 capa = RSTRING(str)->as.heap.aux.capa;
3146 sptr = RSTRING(str)->as.heap.ptr;
3147 olen = RSTRING(str)->as.heap.len;
3148 }
3149 if (olen > LONG_MAX - len) {
3150 rb_raise(rb_eArgError, "string sizes too big");
3151 }
3152 total = olen + len;
3153 if (capa < total) {
3154 if (total >= LONG_MAX / 2) {
3155 capa = total;
3156 }
3157 while (total > capa) {
3158 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3159 }
3160 RESIZE_CAPA_TERM(str, capa, termlen);
3161 sptr = RSTRING_PTR(str);
3162 }
3163 if (off != -1) {
3164 ptr = sptr + off;
3165 }
3166 memcpy(sptr + olen, ptr, len);
3167 STR_SET_LEN(str, total);
3168 TERM_FILL(sptr + total, termlen); /* sentinel */
3169
3170 return str;
3171}
3172
/* Shorthands for str_buf_cat4() with keep_cr == false; str_buf_cat2() takes a string literal. */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3175
3176VALUE
3177rb_str_cat(VALUE str, const char *ptr, long len)
3178{
3179 if (len == 0) return str;
3180 if (len < 0) {
3181 rb_raise(rb_eArgError, "negative string size (or size too big)");
3182 }
3183 return str_buf_cat(str, ptr, len);
3184}
3185
3186VALUE
3187rb_str_cat_cstr(VALUE str, const char *ptr)
3188{
3189 must_not_null(ptr);
3190 return rb_str_buf_cat(str, ptr, strlen(ptr));
3191}
3192
3193RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3194RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3195RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3196
3197static VALUE
3198rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3199 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3200{
3201 int str_encindex = ENCODING_GET(str);
3202 int res_encindex;
3203 int str_cr, res_cr;
3204 rb_encoding *str_enc, *ptr_enc;
3205
3206 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3207
3208 if (str_encindex == ptr_encindex) {
3209 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3210 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3211 }
3212 }
3213 else {
3214 str_enc = rb_enc_from_index(str_encindex);
3215 ptr_enc = rb_enc_from_index(ptr_encindex);
3216 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3217 if (len == 0)
3218 return str;
3219 if (RSTRING_LEN(str) == 0) {
3220 rb_str_buf_cat(str, ptr, len);
3221 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3222 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3223 return str;
3224 }
3225 goto incompatible;
3226 }
3227 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3228 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3229 }
3230 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3231 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3232 str_cr = rb_enc_str_coderange(str);
3233 }
3234 }
3235 }
3236 if (ptr_cr_ret)
3237 *ptr_cr_ret = ptr_cr;
3238
3239 if (str_encindex != ptr_encindex &&
3240 str_cr != ENC_CODERANGE_7BIT &&
3241 ptr_cr != ENC_CODERANGE_7BIT) {
3242 str_enc = rb_enc_from_index(str_encindex);
3243 ptr_enc = rb_enc_from_index(ptr_encindex);
3244 goto incompatible;
3245 }
3246
3247 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3248 res_encindex = str_encindex;
3249 res_cr = ENC_CODERANGE_UNKNOWN;
3250 }
3251 else if (str_cr == ENC_CODERANGE_7BIT) {
3252 if (ptr_cr == ENC_CODERANGE_7BIT) {
3253 res_encindex = str_encindex;
3254 res_cr = ENC_CODERANGE_7BIT;
3255 }
3256 else {
3257 res_encindex = ptr_encindex;
3258 res_cr = ptr_cr;
3259 }
3260 }
3261 else if (str_cr == ENC_CODERANGE_VALID) {
3262 res_encindex = str_encindex;
3263 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3264 res_cr = str_cr;
3265 else
3266 res_cr = ptr_cr;
3267 }
3268 else { /* str_cr == ENC_CODERANGE_BROKEN */
3269 res_encindex = str_encindex;
3270 res_cr = str_cr;
3271 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3272 }
3273
3274 if (len < 0) {
3275 rb_raise(rb_eArgError, "negative string size (or size too big)");
3276 }
3277 str_buf_cat(str, ptr, len);
3278 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3279 return str;
3280
3281 incompatible:
3282 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3283 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3285}
3286
3287VALUE
3288rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3289{
3290 return rb_enc_cr_str_buf_cat(str, ptr, len,
3291 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3292}
3293
3294VALUE
3296{
3297 /* ptr must reference NUL terminated ASCII string. */
3298 int encindex = ENCODING_GET(str);
3299 rb_encoding *enc = rb_enc_from_index(encindex);
3300 if (rb_enc_asciicompat(enc)) {
3301 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3302 encindex, ENC_CODERANGE_7BIT, 0);
3303 }
3304 else {
3305 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3306 while (*ptr) {
3307 unsigned int c = (unsigned char)*ptr;
3308 int len = rb_enc_codelen(c, enc);
3309 rb_enc_mbcput(c, buf, enc);
3310 rb_enc_cr_str_buf_cat(str, buf, len,
3311 encindex, ENC_CODERANGE_VALID, 0);
3312 ptr++;
3313 }
3314 return str;
3315 }
3316}
3317
3318VALUE
3320{
3321 int str2_cr = rb_enc_str_coderange(str2);
3322
3323 if (str_enc_fastpath(str)) {
3324 switch (str2_cr) {
3325 case ENC_CODERANGE_7BIT:
3326 // If RHS is 7bit we can do simple concatenation
3327 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3328 RB_GC_GUARD(str2);
3329 return str;
3331 // If RHS is valid, we can do simple concatenation if encodings are the same
3332 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3333 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3334 int str_cr = ENC_CODERANGE(str);
3335 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3336 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3337 }
3338 RB_GC_GUARD(str2);
3339 return str;
3340 }
3341 }
3342 }
3343
3344 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3345 ENCODING_GET(str2), str2_cr, &str2_cr);
3346
3347 ENC_CODERANGE_SET(str2, str2_cr);
3348
3349 return str;
3350}
3351
3352VALUE
3354{
3355 StringValue(str2);
3356 return rb_str_buf_append(str, str2);
3357}
3358
3359#define MIN_PRE_ALLOC_SIZE 48
3360
3361MJIT_FUNC_EXPORTED VALUE
3362rb_str_concat_literals(size_t num, const VALUE *strary)
3363{
3364 VALUE str;
3365 size_t i, s;
3366 long len = 1;
3367
3368 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3369 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3370
3371 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3372 if (LIKELY(len < MIN_PRE_ALLOC_SIZE)) {
3373 str = rb_str_resurrect(strary[0]);
3374 s = 1;
3375 }
3376 else {
3377 str = rb_str_buf_new(len);
3378 rb_enc_copy(str, strary[0]);
3379 s = 0;
3380 }
3381
3382 for (i = s; i < num; ++i) {
3383 const VALUE v = strary[i];
3384 int encidx = ENCODING_GET(v);
3385
3386 rb_str_buf_append(str, v);
3387 if (encidx != ENCINDEX_US_ASCII) {
3388 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3389 rb_enc_set_index(str, encidx);
3390 }
3391 }
3392 return str;
3393}
3394
3395/*
3396 * call-seq:
3397 * concat(*objects) -> string
3398 *
3399 * Concatenates each object in +objects+ to +self+ and returns +self+:
3400 *
3401 * s = 'foo'
3402 * s.concat('bar', 'baz') # => "foobarbaz"
3403 * s # => "foobarbaz"
3404 *
3405 * For each given object +object+ that is an \Integer,
3406 * the value is considered a codepoint and converted to a character before concatenation:
3407 *
3408 * s = 'foo'
3409 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3410 *
3411 * Related: String#<<, which takes a single argument.
3412 */
3413static VALUE
3414rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3415{
3416 str_modifiable(str);
3417
3418 if (argc == 1) {
3419 return rb_str_concat(str, argv[0]);
3420 }
3421 else if (argc > 1) {
3422 int i;
3423 VALUE arg_str = rb_str_tmp_new(0);
3424 rb_enc_copy(arg_str, str);
3425 for (i = 0; i < argc; i++) {
3426 rb_str_concat(arg_str, argv[i]);
3427 }
3428 rb_str_buf_append(str, arg_str);
3429 }
3430
3431 return str;
3432}
3433
3434/*
3435 * call-seq:
3436 * string << object -> string
3437 *
3438 * Concatenates +object+ to +self+ and returns +self+:
3439 *
3440 * s = 'foo'
3441 * s << 'bar' # => "foobar"
3442 * s # => "foobar"
3443 *
3444 * If +object+ is an \Integer,
3445 * the value is considered a codepoint and converted to a character before concatenation:
3446 *
3447 * s = 'foo'
3448 * s << 33 # => "foo!"
3449 *
3450 * Related: String#concat, which takes multiple arguments.
3451 */
3452VALUE
3454{
3455 unsigned int code;
3456 rb_encoding *enc = STR_ENC_GET(str1);
3457 int encidx;
3458
3459 if (RB_INTEGER_TYPE_P(str2)) {
3460 if (rb_num_to_uint(str2, &code) == 0) {
3461 }
3462 else if (FIXNUM_P(str2)) {
3463 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3464 }
3465 else {
3466 rb_raise(rb_eRangeError, "bignum out of char range");
3467 }
3468 }
3469 else {
3470 return rb_str_append(str1, str2);
3471 }
3472
3473 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3474 if (encidx >= 0) {
3475 char buf[1];
3476 buf[0] = (char)code;
3477 rb_str_cat(str1, buf, 1);
3478 if (encidx != rb_enc_to_index(enc)) {
3479 rb_enc_associate_index(str1, encidx);
3481 }
3482 }
3483 else {
3484 long pos = RSTRING_LEN(str1);
3485 int cr = ENC_CODERANGE(str1);
3486 int len;
3487 char *buf;
3488
3489 switch (len = rb_enc_codelen(code, enc)) {
3490 case ONIGERR_INVALID_CODE_POINT_VALUE:
3491 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3492 break;
3493 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3494 case 0:
3495 rb_raise(rb_eRangeError, "%u out of char range", code);
3496 break;
3497 }
3498 buf = ALLOCA_N(char, len + 1);
3499 rb_enc_mbcput(code, buf, enc);
3500 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3501 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3502 }
3503 rb_str_resize(str1, pos+len);
3504 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3505 if (cr == ENC_CODERANGE_7BIT && code > 127)
3507 ENC_CODERANGE_SET(str1, cr);
3508 }
3509 return str1;
3510}
3511
3512int
3513rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3514{
3515 int encidx = rb_enc_to_index(enc);
3516
3517 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3518 /* US-ASCII automatically extended to ASCII-8BIT */
3519 if (code > 0xFF) {
3520 rb_raise(rb_eRangeError, "%u out of char range", code);
3521 }
3522 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3523 return ENCINDEX_ASCII_8BIT;
3524 }
3525 return encidx;
3526 }
3527 else {
3528 return -1;
3529 }
3530}
3531
3532/*
3533 * call-seq:
3534 * prepend(*other_strings) -> string
3535 *
3536 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3537 *
3538 * s = 'foo'
3539 * s.prepend('bar', 'baz') # => "barbazfoo"
3540 * s # => "barbazfoo"
3541 *
3542 * Related: String#concat.
3543 */
3544
3545static VALUE
3546rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3547{
3548 str_modifiable(str);
3549
3550 if (argc == 1) {
3551 rb_str_update(str, 0L, 0L, argv[0]);
3552 }
3553 else if (argc > 1) {
3554 int i;
3555 VALUE arg_str = rb_str_tmp_new(0);
3556 rb_enc_copy(arg_str, str);
3557 for (i = 0; i < argc; i++) {
3558 rb_str_append(arg_str, argv[i]);
3559 }
3560 rb_str_update(str, 0L, 0L, arg_str);
3561 }
3562
3563 return str;
3564}
3565
3566st_index_t
3568{
3569 int e = ENCODING_GET(str);
3570 if (e && is_ascii_string(str)) {
3571 e = 0;
3572 }
3573 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
3574}
3575
3576int
3578{
3579 long len1, len2;
3580 const char *ptr1, *ptr2;
3581 RSTRING_GETMEM(str1, ptr1, len1);
3582 RSTRING_GETMEM(str2, ptr2, len2);
3583 return (len1 != len2 ||
3584 !rb_str_comparable(str1, str2) ||
3585 memcmp(ptr1, ptr2, len1) != 0);
3586}
3587
3588/*
3589 * call-seq:
3590 * hash -> integer
3591 *
3592 * Returns the integer hash value for +self+.
3593 * The value is based on the length, content and encoding of +self+.
3594 *
3595 * Related: Object#hash.
3596 */
3597
3598static VALUE
3599rb_str_hash_m(VALUE str)
3600{
3601 st_index_t hval = rb_str_hash(str);
3602 return ST2FIX(hval);
3603}
3604
/* Yields the smaller of a and b (b when they compare equal); arguments may be evaluated twice. */
#define lesser(a,b) (((a) > (b)) ? (b) : (a))
3606
3607int
3609{
3610 int idx1, idx2;
3611 int rc1, rc2;
3612
3613 if (RSTRING_LEN(str1) == 0) return TRUE;
3614 if (RSTRING_LEN(str2) == 0) return TRUE;
3615 idx1 = ENCODING_GET(str1);
3616 idx2 = ENCODING_GET(str2);
3617 if (idx1 == idx2) return TRUE;
3618 rc1 = rb_enc_str_coderange(str1);
3619 rc2 = rb_enc_str_coderange(str2);
3620 if (rc1 == ENC_CODERANGE_7BIT) {
3621 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3622 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3623 return TRUE;
3624 }
3625 if (rc2 == ENC_CODERANGE_7BIT) {
3626 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3627 return TRUE;
3628 }
3629 return FALSE;
3630}
3631
3632int
3634{
3635 long len1, len2;
3636 const char *ptr1, *ptr2;
3637 int retval;
3638
3639 if (str1 == str2) return 0;
3640 RSTRING_GETMEM(str1, ptr1, len1);
3641 RSTRING_GETMEM(str2, ptr2, len2);
3642 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3643 if (len1 == len2) {
3644 if (!rb_str_comparable(str1, str2)) {
3645 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3646 return 1;
3647 return -1;
3648 }
3649 return 0;
3650 }
3651 if (len1 > len2) return 1;
3652 return -1;
3653 }
3654 if (retval > 0) return 1;
3655 return -1;
3656}
3657
3658/*
3659 * call-seq:
3660 * string == object -> true or false
3661 * string === object -> true or false
3662 *
3663 * Returns +true+ if +object+ has the same length and content;
3664 * as +self+; +false+ otherwise:
3665 *
3666 * s = 'foo'
3667 * s == 'foo' # => true
3668 * s == 'food' # => false
3669 * s == 'FOO' # => false
3670 *
3671 * Returns +false+ if the two strings' encodings are not compatible:
3672 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3673 *
3674 * If +object+ is not an instance of \String but responds to +to_str+, then the
3675 * two strings are compared using <code>object.==</code>.
3676 */
3677
3678VALUE
3680{
3681 if (str1 == str2) return Qtrue;
3682 if (!RB_TYPE_P(str2, T_STRING)) {
3683 if (!rb_respond_to(str2, idTo_str)) {
3684 return Qfalse;
3685 }
3686 return rb_equal(str2, str1);
3687 }
3688 return rb_str_eql_internal(str1, str2);
3689}
3690
3691/*
3692 * call-seq:
3693 * eql?(object) -> true or false
3694 *
3695 * Returns +true+ if +object+ has the same length and content;
3696 * as +self+; +false+ otherwise:
3697 *
3698 * s = 'foo'
3699 * s.eql?('foo') # => true
3700 * s.eql?('food') # => false
3701 * s.eql?('FOO') # => false
3702 *
3703 * Returns +false+ if the two strings' encodings are not compatible:
3704 *
3705 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3706 *
3707 */
3708
3709MJIT_FUNC_EXPORTED VALUE
3710rb_str_eql(VALUE str1, VALUE str2)
3711{
3712 if (str1 == str2) return Qtrue;
3713 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3714 return rb_str_eql_internal(str1, str2);
3715}
3716
3717/*
3718 * call-seq:
3719 * string <=> other_string -> -1, 0, 1, or nil
3720 *
3721 * Compares +self+ and +other_string+, returning:
3722 *
3723 * - -1 if +other_string+ is larger.
3724 * - 0 if the two are equal.
3725 * - 1 if +other_string+ is smaller.
3726 * - +nil+ if the two are incomparable.
3727 *
3728 * Examples:
3729 *
3730 * 'foo' <=> 'foo' # => 0
3731 * 'foo' <=> 'food' # => -1
3732 * 'food' <=> 'foo' # => 1
3733 * 'FOO' <=> 'foo' # => -1
3734 * 'foo' <=> 'FOO' # => 1
3735 * 'foo' <=> 1 # => nil
3736 *
3737 */
3738
3739static VALUE
3740rb_str_cmp_m(VALUE str1, VALUE str2)
3741{
3742 int result;
3743 VALUE s = rb_check_string_type(str2);
3744 if (NIL_P(s)) {
3745 return rb_invcmp(str1, str2);
3746 }
3747 result = rb_str_cmp(str1, s);
3748 return INT2FIX(result);
3749}
3750
3751static VALUE str_casecmp(VALUE str1, VALUE str2);
3752static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3753
3754/*
3755 * call-seq:
3756 * casecmp(other_string) -> -1, 0, 1, or nil
3757 *
3758 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3759 *
3760 * - -1 if <tt>other_string.downcase</tt> is larger.
3761 * - 0 if the two are equal.
3762 * - 1 if <tt>other_string.downcase</tt> is smaller.
3763 * - +nil+ if the two are incomparable.
3764 *
3765 * Examples:
3766 *
3767 * 'foo'.casecmp('foo') # => 0
3768 * 'foo'.casecmp('food') # => -1
3769 * 'food'.casecmp('foo') # => 1
3770 * 'FOO'.casecmp('foo') # => 0
3771 * 'foo'.casecmp('FOO') # => 0
3772 * 'foo'.casecmp(1) # => nil
3773 *
3774 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3775 *
3776 * Related: String#casecmp?.
3777 *
3778 */
3779
3780static VALUE
3781rb_str_casecmp(VALUE str1, VALUE str2)
3782{
3783 VALUE s = rb_check_string_type(str2);
3784 if (NIL_P(s)) {
3785 return Qnil;
3786 }
3787 return str_casecmp(str1, s);
3788}
3789
3790static VALUE
3791str_casecmp(VALUE str1, VALUE str2)
3792{
3793 long len;
3794 rb_encoding *enc;
3795 const char *p1, *p1end, *p2, *p2end;
3796
3797 enc = rb_enc_compatible(str1, str2);
3798 if (!enc) {
3799 return Qnil;
3800 }
3801
3802 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3803 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3804 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3805 while (p1 < p1end && p2 < p2end) {
3806 if (*p1 != *p2) {
3807 unsigned int c1 = TOLOWER(*p1 & 0xff);
3808 unsigned int c2 = TOLOWER(*p2 & 0xff);
3809 if (c1 != c2)
3810 return INT2FIX(c1 < c2 ? -1 : 1);
3811 }
3812 p1++;
3813 p2++;
3814 }
3815 }
3816 else {
3817 while (p1 < p1end && p2 < p2end) {
3818 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3819 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3820
3821 if (0 <= c1 && 0 <= c2) {
3822 c1 = TOLOWER(c1);
3823 c2 = TOLOWER(c2);
3824 if (c1 != c2)
3825 return INT2FIX(c1 < c2 ? -1 : 1);
3826 }
3827 else {
3828 int r;
3829 l1 = rb_enc_mbclen(p1, p1end, enc);
3830 l2 = rb_enc_mbclen(p2, p2end, enc);
3831 len = l1 < l2 ? l1 : l2;
3832 r = memcmp(p1, p2, len);
3833 if (r != 0)
3834 return INT2FIX(r < 0 ? -1 : 1);
3835 if (l1 != l2)
3836 return INT2FIX(l1 < l2 ? -1 : 1);
3837 }
3838 p1 += l1;
3839 p2 += l2;
3840 }
3841 }
3842 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3843 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3844 return INT2FIX(-1);
3845}
3846
3847/*
3848 * call-seq:
3849 * casecmp?(other_string) -> true, false, or nil
3850 *
3851 * Returns +true+ if +self+ and +other_string+ are equal after
3852 * Unicode case folding, otherwise +false+:
3853 *
3854 * 'foo'.casecmp?('foo') # => true
3855 * 'foo'.casecmp?('food') # => false
3856 * 'food'.casecmp?('foo') # => false
3857 * 'FOO'.casecmp?('foo') # => true
3858 * 'foo'.casecmp?('FOO') # => true
3859 *
3860 * Returns +nil+ if the two values are incomparable:
3861 *
3862 * 'foo'.casecmp?(1) # => nil
3863 *
3864 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3865 *
3866 * Related: String#casecmp.
3867 *
3868 */
3869
3870static VALUE
3871rb_str_casecmp_p(VALUE str1, VALUE str2)
3872{
3873 VALUE s = rb_check_string_type(str2);
3874 if (NIL_P(s)) {
3875 return Qnil;
3876 }
3877 return str_casecmp_p(str1, s);
3878}
3879
3880static VALUE
3881str_casecmp_p(VALUE str1, VALUE str2)
3882{
3883 rb_encoding *enc;
3884 VALUE folded_str1, folded_str2;
3885 VALUE fold_opt = sym_fold;
3886
3887 enc = rb_enc_compatible(str1, str2);
3888 if (!enc) {
3889 return Qnil;
3890 }
3891
3892 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3893 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3894
3895 return rb_str_eql(folded_str1, folded_str2);
3896}
3897
3898static long
3899strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3900 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3901{
3902 const char *search_start = str_ptr;
3903 long pos, search_len = str_len - offset;
3904
3905 for (;;) {
3906 const char *t;
3907 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3908 if (pos < 0) return pos;
3909 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3910 if (t == search_start + pos) break;
3911 search_len -= t - search_start;
3912 if (search_len <= 0) return -1;
3913 offset += t - search_start;
3914 search_start = t;
3915 }
3916 return pos + offset;
3917}
3918
3919#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3920
3921static long
3922rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3923{
3924 const char *str_ptr, *str_ptr_end, *sub_ptr;
3925 long str_len, sub_len;
3926 rb_encoding *enc;
3927
3928 enc = rb_enc_check(str, sub);
3929 if (is_broken_string(sub)) return -1;
3930
3931 str_ptr = RSTRING_PTR(str);
3932 str_ptr_end = RSTRING_END(str);
3933 str_len = RSTRING_LEN(str);
3934 sub_ptr = RSTRING_PTR(sub);
3935 sub_len = RSTRING_LEN(sub);
3936
3937 if (str_len < sub_len) return -1;
3938
3939 if (offset != 0) {
3940 long str_len_char, sub_len_char;
3941 int single_byte = single_byte_optimizable(str);
3942 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3943 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3944 if (offset < 0) {
3945 offset += str_len_char;
3946 if (offset < 0) return -1;
3947 }
3948 if (str_len_char - offset < sub_len_char) return -1;
3949 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3950 str_ptr += offset;
3951 }
3952 if (sub_len == 0) return offset;
3953
3954 /* need proceed one character at a time */
3955 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3956}
3957
3958
3959/*
3960 * call-seq:
3961 * index(substring, offset = 0) -> integer or nil
3962 * index(regexp, offset = 0) -> integer or nil
3963 *
3964 * :include: doc/string/index.rdoc
3965 *
3966 */
3967
3968static VALUE
3969rb_str_index_m(int argc, VALUE *argv, VALUE str)
3970{
3971 VALUE sub;
3972 VALUE initpos;
3973 long pos;
3974
3975 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
3976 pos = NUM2LONG(initpos);
3977 }
3978 else {
3979 pos = 0;
3980 }
3981 if (pos < 0) {
3982 pos += str_strlen(str, NULL);
3983 if (pos < 0) {
3984 if (RB_TYPE_P(sub, T_REGEXP)) {
3986 }
3987 return Qnil;
3988 }
3989 }
3990
3991 if (RB_TYPE_P(sub, T_REGEXP)) {
3992 if (pos > str_strlen(str, NULL))
3993 return Qnil;
3994 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
3995 rb_enc_check(str, sub), single_byte_optimizable(str));
3996
3997 if (rb_reg_search(sub, str, pos, 0) < 0) {
3998 return Qnil;
3999 }
4000 else {
4001 VALUE match = rb_backref_get();
4002 struct re_registers *regs = RMATCH_REGS(match);
4003 pos = rb_str_sublen(str, BEG(0));
4004 return LONG2NUM(pos);
4005 }
4006 }
4007 else {
4008 StringValue(sub);
4009 pos = rb_str_index(str, sub, pos);
4010 pos = rb_str_sublen(str, pos);
4011 }
4012
4013 if (pos == -1) return Qnil;
4014 return LONG2NUM(pos);
4015}
4016
4017/* whether given pos is valid character boundary or not
4018 * Note that in this function, "character" means a code point
4019 * (Unicode scalar value), not a grapheme cluster.
4020 */
4021static bool
4022str_check_byte_pos(VALUE str, long pos)
4023{
4024 const char *s = RSTRING_PTR(str);
4025 const char *e = RSTRING_END(str);
4026 const char *p = s + pos;
4027 const char *pp = rb_enc_left_char_head(s, p, e, rb_enc_get(str));
4028 return p == pp;
4029}
4030
4031/*
4032 * call-seq:
4033 * byteindex(substring, offset = 0) -> integer or nil
4034 * byteindex(regexp, offset = 0) -> integer or nil
4035 *
4036 * Returns the \Integer byte-based index of the first occurrence of the given +substring+,
4037 * or +nil+ if none found:
4038 *
4039 * 'foo'.byteindex('f') # => 0
4040 * 'foo'.byteindex('o') # => 1
4041 * 'foo'.byteindex('oo') # => 1
4042 * 'foo'.byteindex('ooo') # => nil
4043 *
4044 * Returns the \Integer byte-based index of the first match for the given \Regexp +regexp+,
4045 * or +nil+ if none found:
4046 *
4047 * 'foo'.byteindex(/f/) # => 0
4048 * 'foo'.byteindex(/o/) # => 1
4049 * 'foo'.byteindex(/oo/) # => 1
4050 * 'foo'.byteindex(/ooo/) # => nil
4051 *
4052 * \Integer argument +offset+, if given, specifies the byte-based position in the
4053 * string to begin the search:
4054 *
4055 * 'foo'.byteindex('o', 1) # => 1
4056 * 'foo'.byteindex('o', 2) # => 2
4057 * 'foo'.byteindex('o', 3) # => nil
4058 *
4059 * If +offset+ is negative, counts backward from the end of +self+:
4060 *
4061 * 'foo'.byteindex('o', -1) # => 2
4062 * 'foo'.byteindex('o', -2) # => 1
4063 * 'foo'.byteindex('o', -3) # => 1
4064 * 'foo'.byteindex('o', -4) # => nil
4065 *
4066 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4067 * raised.
4068 *
4069 * Related: String#index, String#byterindex.
4070 */
4071
4072static VALUE
4073rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4074{
4075 VALUE sub;
4076 VALUE initpos;
4077 long pos;
4078
4079 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4080 long slen = RSTRING_LEN(str);
4081 pos = NUM2LONG(initpos);
4082 if (pos < 0) {
4083 pos += slen;
4084 }
4085 if (pos < 0 || pos > slen) {
4086 if (RB_TYPE_P(sub, T_REGEXP)) {
4088 }
4089 return Qnil;
4090 }
4091 }
4092 else {
4093 pos = 0;
4094 }
4095
4096 if (!str_check_byte_pos(str, pos)) {
4098 "offset %ld does not land on character boundary", pos);
4099 }
4100
4101 if (RB_TYPE_P(sub, T_REGEXP)) {
4102 if (rb_reg_search(sub, str, pos, 0) < 0) {
4103 return Qnil;
4104 }
4105 else {
4106 VALUE match = rb_backref_get();
4107 struct re_registers *regs = RMATCH_REGS(match);
4108 pos = BEG(0);
4109 return LONG2NUM(pos);
4110 }
4111 }
4112 else {
4113 StringValue(sub);
4114 pos = rb_strseq_index(str, sub, pos, 1);
4115 }
4116
4117 if (pos == -1) return Qnil;
4118 return LONG2NUM(pos);
4119}
4120
4121#ifdef HAVE_MEMRCHR
4122static long
4123str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4124{
4125 char *hit, *adjusted;
4126 int c;
4127 long slen, searchlen;
4128 char *sbeg, *e, *t;
4129
4130 sbeg = RSTRING_PTR(str);
4131 slen = RSTRING_LEN(sub);
4132 if (slen == 0) return s - sbeg;
4133 e = RSTRING_END(str);
4134 t = RSTRING_PTR(sub);
4135 c = *t & 0xff;
4136 searchlen = s - sbeg + 1;
4137
4138 do {
4139 hit = memrchr(sbeg, c, searchlen);
4140 if (!hit) break;
4141 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4142 if (hit != adjusted) {
4143 searchlen = adjusted - sbeg;
4144 continue;
4145 }
4146 if (memcmp(hit, t, slen) == 0)
4147 return hit - sbeg;
4148 searchlen = adjusted - sbeg;
4149 } while (searchlen > 0);
4150
4151 return -1;
4152}
4153#else
4154static long
4155str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4156{
4157 long slen;
4158 char *sbeg, *e, *t;
4159
4160 sbeg = RSTRING_PTR(str);
4161 e = RSTRING_END(str);
4162 t = RSTRING_PTR(sub);
4163 slen = RSTRING_LEN(sub);
4164
4165 while (s) {
4166 if (memcmp(s, t, slen) == 0) {
4167 return s - sbeg;
4168 }
4169 if (s <= sbeg) break;
4170 s = rb_enc_prev_char(sbeg, s, e, enc);
4171 }
4172
4173 return -1;
4174}
4175#endif
4176
4177static long
4178rb_str_rindex(VALUE str, VALUE sub, long pos)
4179{
4180 long len, slen;
4181 char *sbeg, *s;
4182 rb_encoding *enc;
4183 int singlebyte;
4184
4185 enc = rb_enc_check(str, sub);
4186 if (is_broken_string(sub)) return -1;
4187 singlebyte = single_byte_optimizable(str);
4188 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4189 slen = str_strlen(sub, enc); /* rb_enc_check */
4190
4191 /* substring longer than string */
4192 if (len < slen) return -1;
4193 if (len - pos < slen) pos = len - slen;
4194 if (len == 0) return pos;
4195
4196 sbeg = RSTRING_PTR(str);
4197
4198 if (pos == 0) {
4199 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4200 return 0;
4201 else
4202 return -1;
4203 }
4204
4205 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4206 return rb_str_sublen(str, str_rindex(str, sub, s, enc));
4207}
4208
4209/*
4210 * call-seq:
4211 * rindex(substring, offset = self.length) -> integer or nil
4212 * rindex(regexp, offset = self.length) -> integer or nil
4213 *
4214 * Returns the \Integer index of the _last_ occurrence of the given +substring+,
4215 * or +nil+ if none found:
4216 *
4217 * 'foo'.rindex('f') # => 0
4218 * 'foo'.rindex('o') # => 2
4219 * 'foo'.rindex('oo') # => 1
4220 * 'foo'.rindex('ooo') # => nil
4221 *
4222 * Returns the \Integer index of the _last_ match for the given \Regexp +regexp+,
4223 * or +nil+ if none found:
4224 *
4225 * 'foo'.rindex(/f/) # => 0
4226 * 'foo'.rindex(/o/) # => 2
4227 * 'foo'.rindex(/oo/) # => 1
4228 * 'foo'.rindex(/ooo/) # => nil
4229 *
4230 * The _last_ match means starting at the possible last position, not
4231 * the last of longest matches.
4232 *
4233 * 'foo'.rindex(/o+/) # => 2
4234 * $~ #=> #<MatchData "o">
4235 *
 * To get the last longest match, combine the pattern with a negative
 * lookbehind:
4238 *
4239 * 'foo'.rindex(/(?<!o)o+/) # => 1
4240 * $~ #=> #<MatchData "oo">
4241 *
 * Or use String#index with a negative lookahead:
4243 *
4244 * 'foo'.index(/o+(?!.*o)/) # => 1
4245 * $~ #=> #<MatchData "oo">
4246 *
4247 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4248 * string to _end_ the search:
4249 *
4250 * 'foo'.rindex('o', 0) # => nil
4251 * 'foo'.rindex('o', 1) # => 1
4252 * 'foo'.rindex('o', 2) # => 2
4253 * 'foo'.rindex('o', 3) # => 2
4254 *
4255 * If +offset+ is a negative \Integer, the maximum starting position in the
4256 * string to _end_ the search is the sum of the string's length and +offset+:
4257 *
4258 * 'foo'.rindex('o', -1) # => 2
4259 * 'foo'.rindex('o', -2) # => 1
4260 * 'foo'.rindex('o', -3) # => nil
4261 * 'foo'.rindex('o', -4) # => nil
4262 *
4263 * Related: String#index.
4264 */
4265
4266static VALUE
4267rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4268{
4269 VALUE sub;
4270 VALUE vpos;
4271 rb_encoding *enc = STR_ENC_GET(str);
4272 long pos, len = str_strlen(str, enc); /* str's enc */
4273
4274 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4275 pos = NUM2LONG(vpos);
4276 if (pos < 0) {
4277 pos += len;
4278 if (pos < 0) {
4279 if (RB_TYPE_P(sub, T_REGEXP)) {
4281 }
4282 return Qnil;
4283 }
4284 }
4285 if (pos > len) pos = len;
4286 }
4287 else {
4288 pos = len;
4289 }
4290
4291 if (RB_TYPE_P(sub, T_REGEXP)) {
4292 /* enc = rb_get_check(str, sub); */
4293 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4294 enc, single_byte_optimizable(str));
4295
4296 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4297 VALUE match = rb_backref_get();
4298 struct re_registers *regs = RMATCH_REGS(match);
4299 pos = rb_str_sublen(str, BEG(0));
4300 return LONG2NUM(pos);
4301 }
4302 }
4303 else {
4304 StringValue(sub);
4305 pos = rb_str_rindex(str, sub, pos);
4306 if (pos >= 0) return LONG2NUM(pos);
4307 }
4308 return Qnil;
4309}
4310
4311static long
4312rb_str_byterindex(VALUE str, VALUE sub, long pos)
4313{
4314 long len, slen;
4315 char *sbeg, *s;
4316 rb_encoding *enc;
4317
4318 enc = rb_enc_check(str, sub);
4319 if (is_broken_string(sub)) return -1;
4320 len = RSTRING_LEN(str);
4321 slen = RSTRING_LEN(sub);
4322
4323 /* substring longer than string */
4324 if (len < slen) return -1;
4325 if (len - pos < slen) pos = len - slen;
4326 if (len == 0) return pos;
4327
4328 sbeg = RSTRING_PTR(str);
4329
4330 if (pos == 0) {
4331 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4332 return 0;
4333 else
4334 return -1;
4335 }
4336
4337 s = sbeg + pos;
4338 return str_rindex(str, sub, s, enc);
4339}
4340
4341
4342/*
4343 * call-seq:
4344 * byterindex(substring, offset = self.bytesize) -> integer or nil
4345 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4346 *
4347 * Returns the \Integer byte-based index of the _last_ occurrence of the given +substring+,
4348 * or +nil+ if none found:
4349 *
4350 * 'foo'.byterindex('f') # => 0
4351 * 'foo'.byterindex('o') # => 2
4352 * 'foo'.byterindex('oo') # => 1
4353 * 'foo'.byterindex('ooo') # => nil
4354 *
4355 * Returns the \Integer byte-based index of the _last_ match for the given \Regexp +regexp+,
4356 * or +nil+ if none found:
4357 *
4358 * 'foo'.byterindex(/f/) # => 0
4359 * 'foo'.byterindex(/o/) # => 2
4360 * 'foo'.byterindex(/oo/) # => 1
4361 * 'foo'.byterindex(/ooo/) # => nil
4362 *
4363 * The _last_ match means starting at the possible last position, not
4364 * the last of longest matches.
4365 *
4366 * 'foo'.byterindex(/o+/) # => 2
4367 * $~ #=> #<MatchData "o">
4368 *
 * To get the last longest match, combine the pattern with a negative
 * lookbehind:
4371 *
4372 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4373 * $~ #=> #<MatchData "oo">
4374 *
 * Or use String#byteindex with a negative lookahead:
4376 *
4377 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4378 * $~ #=> #<MatchData "oo">
4379 *
4380 * \Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4381 * string to _end_ the search:
4382 *
4383 * 'foo'.byterindex('o', 0) # => nil
4384 * 'foo'.byterindex('o', 1) # => 1
4385 * 'foo'.byterindex('o', 2) # => 2
4386 * 'foo'.byterindex('o', 3) # => 2
4387 *
4388 * If +offset+ is a negative \Integer, the maximum starting position in the
4389 * string to _end_ the search is the sum of the string's length and +offset+:
4390 *
4391 * 'foo'.byterindex('o', -1) # => 2
4392 * 'foo'.byterindex('o', -2) # => 1
4393 * 'foo'.byterindex('o', -3) # => nil
4394 * 'foo'.byterindex('o', -4) # => nil
4395 *
4396 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4397 * raised.
4398 *
4399 * Related: String#byteindex.
4400 */
4401
4402static VALUE
4403rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4404{
4405 VALUE sub;
4406 VALUE vpos;
4407 long pos, len = RSTRING_LEN(str);
4408
4409 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
4410 pos = NUM2LONG(vpos);
4411 if (pos < 0) {
4412 pos += len;
4413 if (pos < 0) {
4414 if (RB_TYPE_P(sub, T_REGEXP)) {
4416 }
4417 return Qnil;
4418 }
4419 }
4420 if (pos > len) pos = len;
4421 }
4422 else {
4423 pos = len;
4424 }
4425
4426 if (!str_check_byte_pos(str, pos)) {
4428 "offset %ld does not land on character boundary", pos);
4429 }
4430
4431 if (RB_TYPE_P(sub, T_REGEXP)) {
4432 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4433 VALUE match = rb_backref_get();
4434 struct re_registers *regs = RMATCH_REGS(match);
4435 pos = BEG(0);
4436 return LONG2NUM(pos);
4437 }
4438 }
4439 else {
4440 StringValue(sub);
4441 pos = rb_str_byterindex(str, sub, pos);
4442 if (pos >= 0) return LONG2NUM(pos);
4443 }
4444 return Qnil;
4445}
4446
4447/*
4448 * call-seq:
4449 * string =~ regexp -> integer or nil
4450 * string =~ object -> integer or nil
4451 *
4452 * Returns the \Integer index of the first substring that matches
4453 * the given +regexp+, or +nil+ if no match found:
4454 *
4455 * 'foo' =~ /f/ # => 0
4456 * 'foo' =~ /o/ # => 1
4457 * 'foo' =~ /x/ # => nil
4458 *
4459 * Note: also updates Regexp@Special+global+variables.
4460 *
4461 * If the given +object+ is not a \Regexp, returns the value
4462 * returned by <tt>object =~ self</tt>.
4463 *
4464 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4465 * (see Regexp#=~):
4466 *
4467 * number= nil
4468 * "no. 9" =~ /(?<number>\d+)/
4469 * number # => nil (not assigned)
4470 * /(?<number>\d+)/ =~ "no. 9"
4471 * number #=> "9"
4472 *
4473 */
4474
4475static VALUE
4476rb_str_match(VALUE x, VALUE y)
4477{
4478 switch (OBJ_BUILTIN_TYPE(y)) {
4479 case T_STRING:
4480 rb_raise(rb_eTypeError, "type mismatch: String given");
4481
4482 case T_REGEXP:
4483 return rb_reg_match(y, x);
4484
4485 default:
4486 return rb_funcall(y, idEqTilde, 1, x);
4487 }
4488}
4489
4490
4491static VALUE get_pat(VALUE);
4492
4493
4494/*
4495 * call-seq:
4496 * match(pattern, offset = 0) -> matchdata or nil
4497 * match(pattern, offset = 0) {|matchdata| ... } -> object
4498 *
4499 * Returns a \MatchData object (or +nil+) based on +self+ and the given +pattern+.
4500 *
4501 * Note: also updates Regexp@Special+global+variables.
4502 *
4503 * - Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4504 * regexp = Regexp.new(pattern)
4505 * - Computes +matchdata+, which will be either a \MatchData object or +nil+
4506 * (see Regexp#match):
 * matchdata = regexp.match(self)
4508 *
4509 * With no block given, returns the computed +matchdata+:
4510 *
4511 * 'foo'.match('f') # => #<MatchData "f">
4512 * 'foo'.match('o') # => #<MatchData "o">
4513 * 'foo'.match('x') # => nil
4514 *
4515 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4516 *
4517 * 'foo'.match('f', 1) # => nil
4518 * 'foo'.match('o', 1) # => #<MatchData "o">
4519 *
4520 * With a block given, calls the block with the computed +matchdata+
4521 * and returns the block's return value:
4522 *
4523 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4524 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4525 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4526 *
4527 */
4528
4529static VALUE
4530rb_str_match_m(int argc, VALUE *argv, VALUE str)
4531{
4532 VALUE re, result;
4533 if (argc < 1)
4534 rb_check_arity(argc, 1, 2);
4535 re = argv[0];
4536 argv[0] = str;
4537 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4538 if (!NIL_P(result) && rb_block_given_p()) {
4539 return rb_yield(result);
4540 }
4541 return result;
4542}
4543
4544/*
4545 * call-seq:
4546 * match?(pattern, offset = 0) -> true or false
4547 *
4548 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4549 *
4550 * Note: does not update Regexp@Special+global+variables.
4551 *
4552 * Computes +regexp+ by converting +pattern+ (if not already a \Regexp).
4553 * regexp = Regexp.new(pattern)
4554 *
 * Returns +true+ if <tt>self.match(regexp)</tt> returns a \MatchData object,
4556 * +false+ otherwise:
4557 *
4558 * 'foo'.match?(/o/) # => true
4559 * 'foo'.match?('o') # => true
4560 * 'foo'.match?(/x/) # => false
4561 *
4562 * If \Integer argument +offset+ is given, the search begins at index +offset+:
4563 * 'foo'.match?('f', 1) # => false
4564 * 'foo'.match?('o', 1) # => true
4565 *
4566 */
4567
4568static VALUE
4569rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4570{
4571 VALUE re;
4572 rb_check_arity(argc, 1, 2);
4573 re = get_pat(argv[0]);
4574 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4575}
4576
/* Result of stepping a character to its successor or predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0, /* the bytes are not a usable character */
    NEIGHBOR_FOUND    = 1, /* a neighbor of the same byte length was stored */
    NEIGHBOR_WRAPPED  = 2  /* stepping ran off the end of the range */
};
4582
4583static enum neighbor_char
4584enc_succ_char(char *p, long len, rb_encoding *enc)
4585{
4586 long i;
4587 int l;
4588
4589 if (rb_enc_mbminlen(enc) > 1) {
4590 /* wchar, trivial case */
4591 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4592 if (!MBCLEN_CHARFOUND_P(r)) {
4593 return NEIGHBOR_NOT_CHAR;
4594 }
4595 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4596 l = rb_enc_code_to_mbclen(c, enc);
4597 if (!l) return NEIGHBOR_NOT_CHAR;
4598 if (l != len) return NEIGHBOR_WRAPPED;
4599 rb_enc_mbcput(c, p, enc);
4600 r = rb_enc_precise_mbclen(p, p + len, enc);
4601 if (!MBCLEN_CHARFOUND_P(r)) {
4602 return NEIGHBOR_NOT_CHAR;
4603 }
4604 return NEIGHBOR_FOUND;
4605 }
4606 while (1) {
4607 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4608 p[i] = '\0';
4609 if (i < 0)
4610 return NEIGHBOR_WRAPPED;
4611 ++((unsigned char*)p)[i];
4612 l = rb_enc_precise_mbclen(p, p+len, enc);
4613 if (MBCLEN_CHARFOUND_P(l)) {
4614 l = MBCLEN_CHARFOUND_LEN(l);
4615 if (l == len) {
4616 return NEIGHBOR_FOUND;
4617 }
4618 else {
4619 memset(p+l, 0xff, len-l);
4620 }
4621 }
4622 if (MBCLEN_INVALID_P(l) && i < len-1) {
4623 long len2;
4624 int l2;
4625 for (len2 = len-1; 0 < len2; len2--) {
4626 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4627 if (!MBCLEN_INVALID_P(l2))
4628 break;
4629 }
4630 memset(p+len2+1, 0xff, len-(len2+1));
4631 }
4632 }
4633}
4634
4635static enum neighbor_char
4636enc_pred_char(char *p, long len, rb_encoding *enc)
4637{
4638 long i;
4639 int l;
4640 if (rb_enc_mbminlen(enc) > 1) {
4641 /* wchar, trivial case */
4642 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4643 if (!MBCLEN_CHARFOUND_P(r)) {
4644 return NEIGHBOR_NOT_CHAR;
4645 }
4646 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4647 if (!c) return NEIGHBOR_NOT_CHAR;
4648 --c;
4649 l = rb_enc_code_to_mbclen(c, enc);
4650 if (!l) return NEIGHBOR_NOT_CHAR;
4651 if (l != len) return NEIGHBOR_WRAPPED;
4652 rb_enc_mbcput(c, p, enc);
4653 r = rb_enc_precise_mbclen(p, p + len, enc);
4654 if (!MBCLEN_CHARFOUND_P(r)) {
4655 return NEIGHBOR_NOT_CHAR;
4656 }
4657 return NEIGHBOR_FOUND;
4658 }
4659 while (1) {
4660 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4661 p[i] = '\xff';
4662 if (i < 0)
4663 return NEIGHBOR_WRAPPED;
4664 --((unsigned char*)p)[i];
4665 l = rb_enc_precise_mbclen(p, p+len, enc);
4666 if (MBCLEN_CHARFOUND_P(l)) {
4667 l = MBCLEN_CHARFOUND_LEN(l);
4668 if (l == len) {
4669 return NEIGHBOR_FOUND;
4670 }
4671 else {
4672 memset(p+l, 0, len-l);
4673 }
4674 }
4675 if (MBCLEN_INVALID_P(l) && i < len-1) {
4676 long len2;
4677 int l2;
4678 for (len2 = len-1; 0 < len2; len2--) {
4679 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4680 if (!MBCLEN_INVALID_P(l2))
4681 break;
4682 }
4683 memset(p+len2+1, 0, len-(len2+1));
4684 }
4685 }
4686}
4687
4688/*
4689 overwrite +p+ by succeeding letter in +enc+ and returns
4690 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4691 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4692 assuming each ranges are successive, and mbclen
4693 never change in each ranges.
4694 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4695 character.
4696 */
4697static enum neighbor_char
4698enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4699{
4700 enum neighbor_char ret;
4701 unsigned int c;
4702 int ctype;
4703 int range;
4704 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4705
4706 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4707 int try;
4708 const int max_gaps = 1;
4709
4710 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4711 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4712 ctype = ONIGENC_CTYPE_DIGIT;
4713 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4714 ctype = ONIGENC_CTYPE_ALPHA;
4715 else
4716 return NEIGHBOR_NOT_CHAR;
4717
4718 MEMCPY(save, p, char, len);
4719 for (try = 0; try <= max_gaps; ++try) {
4720 ret = enc_succ_char(p, len, enc);
4721 if (ret == NEIGHBOR_FOUND) {
4722 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4723 if (rb_enc_isctype(c, ctype, enc))
4724 return NEIGHBOR_FOUND;
4725 }
4726 }
4727 MEMCPY(p, save, char, len);
4728 range = 1;
4729 while (1) {
4730 MEMCPY(save, p, char, len);
4731 ret = enc_pred_char(p, len, enc);
4732 if (ret == NEIGHBOR_FOUND) {
4733 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4734 if (!rb_enc_isctype(c, ctype, enc)) {
4735 MEMCPY(p, save, char, len);
4736 break;
4737 }
4738 }
4739 else {
4740 MEMCPY(p, save, char, len);
4741 break;
4742 }
4743 range++;
4744 }
4745 if (range == 1) {
4746 return NEIGHBOR_NOT_CHAR;
4747 }
4748
4749 if (ctype != ONIGENC_CTYPE_DIGIT) {
4750 MEMCPY(carry, p, char, len);
4751 return NEIGHBOR_WRAPPED;
4752 }
4753
4754 MEMCPY(carry, p, char, len);
4755 enc_succ_char(carry, len, enc);
4756 return NEIGHBOR_WRAPPED;
4757}
4758
4759
4760static VALUE str_succ(VALUE str);
4761
4762/*
4763 * call-seq:
4764 * succ -> new_str
4765 *
4766 * Returns the successor to +self+. The successor is calculated by
4767 * incrementing characters.
4768 *
 * The first character to be incremented is the rightmost alphanumeric,
 * or, if there are no alphanumerics, the rightmost character:
4771 *
4772 * 'THX1138'.succ # => "THX1139"
4773 * '<<koala>>'.succ # => "<<koalb>>"
4774 * '***'.succ # => '**+'
4775 *
4776 * The successor to a digit is another digit, "carrying" to the next-left
4777 * character for a "rollover" from 9 to 0, and prepending another digit
4778 * if necessary:
4779 *
4780 * '00'.succ # => "01"
4781 * '09'.succ # => "10"
4782 * '99'.succ # => "100"
4783 *
4784 * The successor to a letter is another letter of the same case,
4785 * carrying to the next-left character for a rollover,
4786 * and prepending another same-case letter if necessary:
4787 *
4788 * 'aa'.succ # => "ab"
4789 * 'az'.succ # => "ba"
4790 * 'zz'.succ # => "aaa"
4791 * 'AA'.succ # => "AB"
4792 * 'AZ'.succ # => "BA"
4793 * 'ZZ'.succ # => "AAA"
4794 *
4795 * The successor to a non-alphanumeric character is the next character
4796 * in the underlying character set's collating sequence,
4797 * carrying to the next-left character for a rollover,
4798 * and prepending another character if necessary:
4799 *
4800 * s = 0.chr * 3
4801 * s # => "\x00\x00\x00"
4802 * s.succ # => "\x00\x00\x01"
4803 * s = 255.chr * 3
4804 * s # => "\xFF\xFF\xFF"
4805 * s.succ # => "\x01\x00\x00\x00"
4806 *
4807 * Carrying can occur between and among mixtures of alphanumeric characters:
4808 *
4809 * s = 'zz99zz99'
4810 * s.succ # => "aaa00aa00"
4811 * s = '99zz99zz'
4812 * s.succ # => "100aa00aa"
4813 *
4814 * The successor to an empty \String is a new empty \String:
4815 *
4816 * ''.succ # => ""
4817 *
4818 * String#next is an alias for String#succ.
4819 */
4820
4821VALUE
4823{
4824 VALUE str;
4825 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4826 rb_enc_cr_str_copy_for_substr(str, orig);
4827 return str_succ(str);
4828}
4829
4830static VALUE
4831str_succ(VALUE str)
4832{
4833 rb_encoding *enc;
4834 char *sbeg, *s, *e, *last_alnum = 0;
4835 int found_alnum = 0;
4836 long l, slen;
4837 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4838 long carry_pos = 0, carry_len = 1;
4839 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4840
4841 slen = RSTRING_LEN(str);
4842 if (slen == 0) return str;
4843
4844 enc = STR_ENC_GET(str);
4845 sbeg = RSTRING_PTR(str);
4846 s = e = sbeg + slen;
4847
4848 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4849 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4850 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4851 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4852 break;
4853 }
4854 }
4855 l = rb_enc_precise_mbclen(s, e, enc);
4856 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4857 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4858 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4859 switch (neighbor) {
4860 case NEIGHBOR_NOT_CHAR:
4861 continue;
4862 case NEIGHBOR_FOUND:
4863 return str;
4864 case NEIGHBOR_WRAPPED:
4865 last_alnum = s;
4866 break;
4867 }
4868 found_alnum = 1;
4869 carry_pos = s - sbeg;
4870 carry_len = l;
4871 }
4872 if (!found_alnum) { /* str contains no alnum */
4873 s = e;
4874 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4875 enum neighbor_char neighbor;
4876 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4877 l = rb_enc_precise_mbclen(s, e, enc);
4878 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4879 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4880 MEMCPY(tmp, s, char, l);
4881 neighbor = enc_succ_char(tmp, l, enc);
4882 switch (neighbor) {
4883 case NEIGHBOR_FOUND:
4884 MEMCPY(s, tmp, char, l);
4885 return str;
4886 break;
4887 case NEIGHBOR_WRAPPED:
4888 MEMCPY(s, tmp, char, l);
4889 break;
4890 case NEIGHBOR_NOT_CHAR:
4891 break;
4892 }
4893 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4894 /* wrapped to \0...\0. search next valid char. */
4895 enc_succ_char(s, l, enc);
4896 }
4897 if (!rb_enc_asciicompat(enc)) {
4898 MEMCPY(carry, s, char, l);
4899 carry_len = l;
4900 }
4901 carry_pos = s - sbeg;
4902 }
4904 }
4905 RESIZE_CAPA(str, slen + carry_len);
4906 sbeg = RSTRING_PTR(str);
4907 s = sbeg + carry_pos;
4908 memmove(s + carry_len, s, slen - carry_pos);
4909 memmove(s, carry, carry_len);
4910 slen += carry_len;
4911 STR_SET_LEN(str, slen);
4912 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4914 return str;
4915}
4916
4917
4918/*
4919 * call-seq:
4920 * succ! -> self
4921 *
4922 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4923 *
4924 * String#next! is an alias for String#succ!.
4925 */
4926
4927static VALUE
4928rb_str_succ_bang(VALUE str)
4929{
4930 rb_str_modify(str);
4931 str_succ(str);
4932 return str;
4933}
4934
/* Returns 1 when each of the first +len+ bytes of +s+ satisfies ISDIGIT()
 * (trivially so for len <= 0), otherwise 0. */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
4944
4945static int
4946str_upto_i(VALUE str, VALUE arg)
4947{
4948 rb_yield(str);
4949 return 0;
4950}
4951
4952/*
4953 * call-seq:
4954 * upto(other_string, exclusive = false) {|string| ... } -> self
4955 * upto(other_string, exclusive = false) -> new_enumerator
4956 *
4957 * With a block given, calls the block with each \String value
4958 * returned by successive calls to String#succ;
4959 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4960 * the sequence terminates when value +other_string+ is reached;
4961 * returns +self+:
4962 *
4963 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4964 * Output:
4965 *
4966 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4967 *
4968 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4969 *
4970 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4971 *
4972 * Output:
4973 *
4974 * a8 a9 b0 b1 b2 b3 b4 b5
4975 *
4976 * If +other_string+ would not be reached, does not call the block:
4977 *
4978 * '25'.upto('5') {|s| fail s }
4979 * 'aa'.upto('a') {|s| fail s }
4980 *
4981 * With no block given, returns a new \Enumerator:
4982 *
4983 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4984 *
4985 */
4986
4987static VALUE
4988rb_str_upto(int argc, VALUE *argv, VALUE beg)
4989{
4990 VALUE end, exclusive;
4991
4992 rb_scan_args(argc, argv, "11", &end, &exclusive);
4993 RETURN_ENUMERATOR(beg, argc, argv);
4994 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4995}
4996
/*
 * Walks from +beg+ toward +end+, passing every generated string to
 * each(value, arg).  Iteration stops early as soon as +each+ returns
 * non-zero.  A non-zero +excl+ leaves +end+ itself out.  Always returns +beg+.
 *
 * Three strategies are tried in order:
 *   1. both strings are one byte long and is_ascii_string() holds for both:
 *      step a single char;
 *   2. both strings consist only of digits: count numerically and format
 *      each value with at least as many digits as +beg+ is long;
 *   3. otherwise: repeatedly call #succ on the current string.
 *
 * +end+ is converted with StringValue, and rb_enc_check(beg, end) is applied
 * to the pair before anything is yielded.
 */
4997VALUE
4998rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
4999{
5000 VALUE current, after_end;
5001 ID succ;
5002 int n, ascii;
5003 rb_encoding *enc;
5004
5005 CONST_ID(succ, "succ");
5006 StringValue(end);
5007 enc = rb_enc_check(beg, end);
5008 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5009 /* single character */
5010 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5011 char c = RSTRING_PTR(beg)[0];
5012 char e = RSTRING_PTR(end)[0];
5013
    /* nothing to yield when the range is empty */
5014 if (c > e || (excl && c == e)) return beg;
5015 for (;;) {
5016 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
    /* inclusive range: stop after yielding e; exclusive: stop before it */
5017 if (!excl && c == e) break;
5018 c++;
5019 if (excl && c == e) break;
5020 }
5021 return beg;
5022 }
5023 /* both edges are all digits */
5024 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5025 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5026 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5027 VALUE b, e;
5028 int width;
5029
    /* the length of beg is the minimum number of digits in every result */
5030 width = RSTRING_LENINT(beg);
5031 b = rb_str_to_inum(beg, 10, FALSE);
5032 e = rb_str_to_inum(end, 10, FALSE);
5033 if (FIXNUM_P(b) && FIXNUM_P(e)) {
    /* both bounds are Fixnums: iterate with plain C longs */
5034 long bi = FIX2LONG(b);
5035 long ei = FIX2LONG(e);
5036 rb_encoding *usascii = rb_usascii_encoding();
5037
5038 while (bi <= ei) {
5039 if (excl && bi == ei) break;
5040 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5041 bi++;
5042 }
5043 }
5044 else {
    /* at least one bound is not a Fixnum: compare and step via method calls */
5045 ID op = excl ? '<' : idLE;
5046 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5047
5048 args[0] = INT2FIX(width);
5049 while (rb_funcall(b, op, 1, e)) {
5050 args[1] = b;
5051 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5052 b = rb_funcallv(b, succ, 0, 0);
5053 }
5054 }
5055 return beg;
5056 }
5057 /* normal case */
5058 n = rb_str_cmp(beg, end);
5059 if (n > 0 || (excl && n == 0)) return beg;
5060
    /* iterate until the value following end would be produced */
5061 after_end = rb_funcallv(end, succ, 0, 0);
5062 current = str_duplicate(rb_cString, beg);
5063 while (!rb_str_equal(current, after_end)) {
5064 VALUE next = Qnil;
    /* the successor is computed before the callback sees current;
     * it stays nil when current is end in an inclusive range */
5065 if (excl || !rb_str_equal(current, end))
5066 next = rb_funcallv(current, succ, 0, 0);
5067 if ((*each)(current, arg)) break;
5068 if (NIL_P(next)) break;
5069 current = next;
5070 StringValue(current);
5071 if (excl && rb_str_equal(current, end)) break;
    /* give up once current is longer than end, or empty */
5072 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5073 break;
5074 }
5075
5076 return beg;
5077}
5078
5079VALUE
5080rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5081{
5082 VALUE current;
5083 ID succ;
5084
5085 CONST_ID(succ, "succ");
5086 /* both edges are all digits */
5087 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5088 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5089 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5090 int width = RSTRING_LENINT(beg);
5091 b = rb_str_to_inum(beg, 10, FALSE);
5092 if (FIXNUM_P(b)) {
5093 long bi = FIX2LONG(b);
5094 rb_encoding *usascii = rb_usascii_encoding();
5095
5096 while (FIXABLE(bi)) {
5097 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5098 bi++;
5099 }
5100 b = LONG2NUM(bi);
5101 }
5102 args[0] = INT2FIX(width);
5103 while (1) {
5104 args[1] = b;
5105 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5106 b = rb_funcallv(b, succ, 0, 0);
5107 }
5108 }
5109 /* normal case */
5110 current = str_duplicate(rb_cString, beg);
5111 while (1) {
5112 VALUE next = rb_funcallv(current, succ, 0, 0);
5113 if ((*each)(current, arg)) break;
5114 current = next;
5115 StringValue(current);
5116 if (RSTRING_LEN(current) == 0)
5117 break;
5118 }
5119
5120 return beg;
5121}
5122
5123static int
5124include_range_i(VALUE str, VALUE arg)
5125{
5126 VALUE *argp = (VALUE *)arg;
5127 if (!rb_equal(str, *argp)) return 0;
5128 *argp = Qnil;
5129 return 1;
5130}
5131
/*
 * Tests whether +val+ is one of the strings produced by iterating from
 * +beg+ to +end+ (leaving out +end+ when +exclusive+ is truthy).
 * Returns Qtrue or Qfalse.
 *
 * Qfalse is returned right away when +val+ is nil or rb_check_string_type()
 * yields nil for it.  When all three encodings are ASCII compatible and both
 * bounds are one byte long, the answer is computed by comparing bytes;
 * otherwise the range is walked with rb_str_upto_each().
 */
5132VALUE
5133rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5134{
5135 beg = rb_str_new_frozen(beg);
5136 StringValue(end);
5137 end = rb_str_new_frozen(end);
5138 if (NIL_P(val)) return Qfalse;
5139 val = rb_check_string_type(val);
5140 if (NIL_P(val)) return Qfalse;
5141 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5142 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5143 rb_enc_asciicompat(STR_ENC_GET(val))) {
5144 const char *bp = RSTRING_PTR(beg);
5145 const char *ep = RSTRING_PTR(end);
5146 const char *vp = RSTRING_PTR(val);
5147 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
    /* single-byte bounds can only contain a single-byte value */
5148 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5149 return Qfalse;
5150 else {
5151 char b = *bp;
5152 char e = *ep;
5153 char v = *vp;
5154
    /* if any of the three bytes is not ASCII, fall through to the
     * iteration at the bottom of the function */
5155 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5156 if (b <= v && v < e) return Qtrue;
5157 return RBOOL(!RTEST(exclusive) && v == e);
5158 }
5159 }
5160 }
5161#if 0
5162 /* both edges are all digits */
5163 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5164 all_digits_p(bp, RSTRING_LEN(beg)) &&
5165 all_digits_p(ep, RSTRING_LEN(end))) {
5166 /* TODO */
5167 }
5168#endif
5169 }
    /* include_range_i sets val to nil when it meets an equal string,
     * so a nil val afterwards means "found" */
5170 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5171
5172 return RBOOL(NIL_P(val));
5173}
5174
5175static VALUE
5176rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5177{
5178 if (rb_reg_search(re, str, 0, 0) >= 0) {
5179 VALUE match = rb_backref_get();
5180 int nth = rb_reg_backref_number(match, backref);
5181 return rb_reg_nth_match(nth, match);
5182 }
5183 return Qnil;
5184}
5185
5186static VALUE
5187rb_str_aref(VALUE str, VALUE indx)
5188{
5189 long idx;
5190
5191 if (FIXNUM_P(indx)) {
5192 idx = FIX2LONG(indx);
5193 }
5194 else if (RB_TYPE_P(indx, T_REGEXP)) {
5195 return rb_str_subpat(str, indx, INT2FIX(0));
5196 }
5197 else if (RB_TYPE_P(indx, T_STRING)) {
5198 if (rb_str_index(str, indx, 0) != -1)
5199 return str_duplicate(rb_cString, indx);
5200 return Qnil;
5201 }
5202 else {
5203 /* check if indx is Range */
5204 long beg, len = str_strlen(str, NULL);
5205 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5206 case Qfalse:
5207 break;
5208 case Qnil:
5209 return Qnil;
5210 default:
5211 return rb_str_substr(str, beg, len);
5212 }
5213 idx = NUM2LONG(indx);
5214 }
5215
5216 return str_substr(str, idx, 1, FALSE);
5217}
5218
5219
5220/*
5221 * call-seq:
5222 * string[index] -> new_string or nil
5223 * string[start, length] -> new_string or nil
5224 * string[range] -> new_string or nil
5225 * string[regexp, capture = 0] -> new_string or nil
5226 * string[substring] -> new_string or nil
5227 *
5228 * Returns the substring of +self+ specified by the arguments.
5229 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5230 *
5231 *
5232 */
5233
5234static VALUE
5235rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5236{
5237 if (argc == 2) {
5238 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5239 return rb_str_subpat(str, argv[0], argv[1]);
5240 }
5241 else {
5242 long beg = NUM2LONG(argv[0]);
5243 long len = NUM2LONG(argv[1]);
5244 return rb_str_substr(str, beg, len);
5245 }
5246 }
5247 rb_check_arity(argc, 1, 2);
5248 return rb_str_aref(str, argv[0]);
5249}
5250
/*
 * Removes the first +len+ bytes of +str+ in place (clamped to the string's
 * length) and returns +str+.
 *
 * NOTE(review): listing line 5252 is absent between the return type and the
 * opening brace; it presumably held the name and parameter list.  The body
 * uses `str` and `len`, and other functions in this file call
 * rb_str_drop_bytes(str, len) -- confirm against the upstream source.
 */
5251VALUE
5253{
5254 char *ptr = RSTRING_PTR(str);
5255 long olen = RSTRING_LEN(str), nlen;
5256
5257 str_modifiable(str);
5258 if (len > olen) len = olen;
5259 nlen = olen - len;
5260 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
    /* the remainder plus terminator fits the embed capacity: switch to
     * the embedded representation and copy the tail there */
5261 char *oldptr = ptr;
5262 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5263 STR_SET_EMBED(str);
5264 STR_SET_EMBED_LEN(str, nlen);
5265 ptr = RSTRING(str)->as.embed.ary;
5266 memmove(ptr, oldptr + len, nlen);
    /* free the old buffer only when NOEMBED was the sole flag set,
     * i.e. neither STR_SHARED nor STR_NOFREE */
5267 if (fl == STR_NOEMBED) xfree(oldptr);
5268 }
5269 else {
    /* otherwise make the buffer shared first (unless it already is)
     * and then simply advance the heap pointer past the dropped bytes */
5270 if (!STR_SHARED_P(str)) {
5271 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5272 rb_enc_cr_str_exact_copy(shared, str);
5273 OBJ_FREEZE(shared);
5274 }
5275 ptr = RSTRING(str)->as.heap.ptr += len;
5276 RSTRING(str)->as.heap.len = nlen;
5277 }
5278 ptr[nlen] = 0;
    /* NOTE(review): listing line 5279 is absent here; a statement may be
     * missing before the return -- verify against the upstream source. */
5280 return str;
5281}
5282
/*
 * Replaces the +len+ bytes of +str+ starting at byte offset +beg+ with the
 * bytes of +val+, resizing +str+ as needed.  Offsets are byte positions;
 * callers in this file convert character positions before calling.
 * When +beg+ is 0 and +val+ is empty the work is delegated to
 * rb_str_drop_bytes().
 */
5283static void
5284rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
5285{
5286 char *sptr;
5287 long slen, vlen = RSTRING_LEN(val);
5288 int cr;
5289
5290 if (beg == 0 && vlen == 0) {
5291 rb_str_drop_bytes(str, len);
5292 return;
5293 }
5294
5295 str_modify_keep_cr(str);
5296 RSTRING_GETMEM(str, sptr, slen);
5297 if (len < vlen) {
5298 /* expand string */
5299 RESIZE_CAPA(str, slen + vlen - len);
    /* re-read the pointer after the resize */
5300 sptr = RSTRING_PTR(str);
5301 }
5302
    /* NOTE(review): listing lines 5303 and 5306 are absent.  The dangling
     * `else` shows that an `if (...)` condition preceded the assignment
     * below and that the else branch had a statement (presumably another
     * assignment to cr) -- restore both from the upstream source. */
5304 cr = rb_enc_str_coderange(val);
5305 else
5307
5308 if (vlen != len) {
    /* shift the tail so it starts right after the inserted bytes */
5309 memmove(sptr + beg + vlen,
5310 sptr + beg + len,
5311 slen - (beg + len));
5312 }
5313 if (vlen < beg && len < 0) {
5314 MEMZERO(sptr + slen, char, -len);
5315 }
5316 if (vlen > 0) {
5317 memmove(sptr + beg, RSTRING_PTR(val), vlen);
5318 }
5319 slen += vlen - len;
5320 STR_SET_LEN(str, slen);
5321 TERM_FILL(&sptr[slen], TERM_LEN(str));
5322 ENC_CODERANGE_SET(str, cr);
5323}
5324
/*
 * Replaces +len+ characters of +str+ starting at character index +beg+ with
 * +val+ (converted with StringValue).
 *
 * Raises IndexError when +len+ is negative or when +beg+ lies outside the
 * string; a negative +beg+ counts from the end.  +len+ is clamped to the
 * characters available after +beg+.  Character positions are translated to
 * byte positions with str_nth() before rb_str_splice_0() does the work.
 */
5325void
5326rb_str_update(VALUE str, long beg, long len, VALUE val)
5327{
5328 long slen;
5329 char *p, *e;
5330 rb_encoding *enc;
5331 int singlebyte = single_byte_optimizable(str);
5332 int cr;
5333
5334 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5335
5336 StringValue(val);
5337 enc = rb_enc_check(str, val);
5338 slen = str_strlen(str, enc); /* rb_enc_check */
5339
5340 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5341 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5342 }
5343 if (beg < 0) {
5344 beg += slen;
5345 }
5346 assert(beg >= 0);
5347 assert(beg <= slen);
5348 if (len > slen - beg) {
5349 len = slen - beg;
5350 }
    /* locate the byte positions of the first and one-past-last character;
     * a NULL result from str_nth is replaced by the end of the string */
5351 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5352 if (!p) p = RSTRING_END(str);
5353 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5354 if (!e) e = RSTRING_END(str);
5355 /* error check */
5356 beg = p - RSTRING_PTR(str); /* physical position */
5357 len = e - p; /* physical length */
5358 rb_str_splice_0(str, beg, len, val);
5359 rb_enc_associate(str, enc);
    /* NOTE(review): listing line 5360 is absent here.  `cr` is read below
     * but never assigned in the visible text, so the missing line
     * presumably computed it -- restore from the upstream source. */
5361 if (cr != ENC_CODERANGE_BROKEN)
5362 ENC_CODERANGE_SET(str, cr);
5363}
5364
5365#define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5366
5367static void
5368rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5369{
5370 int nth;
5371 VALUE match;
5372 long start, end, len;
5373 rb_encoding *enc;
5374 struct re_registers *regs;
5375
5376 if (rb_reg_search(re, str, 0, 0) < 0) {
5377 rb_raise(rb_eIndexError, "regexp not matched");
5378 }
5379 match = rb_backref_get();
5380 nth = rb_reg_backref_number(match, backref);
5381 regs = RMATCH_REGS(match);
5382 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5383 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5384 }
5385 if (nth < 0) {
5386 nth += regs->num_regs;
5387 }
5388
5389 start = BEG(nth);
5390 if (start == -1) {
5391 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5392 }
5393 end = END(nth);
5394 len = end - start;
5395 StringValue(val);
5396 enc = rb_enc_check_str(str, val);
5397 rb_str_splice_0(str, start, len, val);
5398 rb_enc_associate(str, enc);
5399}
5400
/*
 * Two-argument element assignment: replaces the part of +str+ selected by
 * +indx+ with +val+ and returns +val+.
 *
 * +indx+ may be a Regexp (whole match replaced), a String (its first
 * occurrence replaced; IndexError if absent), a Range, or anything
 * convertible with NUM2LONG (a single character replaced).
 */
5401static VALUE
5402rb_str_aset(VALUE str, VALUE indx, VALUE val)
5403{
5404 long idx, beg;
5405
5406 switch (TYPE(indx)) {
5407 case T_REGEXP:
5408 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5409 return val;
5410
5411 case T_STRING:
5412 beg = rb_str_index(str, indx, 0);
5413 if (beg < 0) {
5414 rb_raise(rb_eIndexError, "string not matched");
5415 }
    /* rb_str_splice works on character positions: convert the offset
     * with rb_str_sublen and measure indx in characters */
5416 beg = rb_str_sublen(str, beg);
5417 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5418 return val;
5419
5420 default:
5421 /* check if indx is Range */
5422 {
5423 long beg, len;
5424 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5425 rb_str_splice(str, beg, len, val);
5426 return val;
5427 }
5428 }
    /* not a Range: handled like an integer index below */
5429 /* FALLTHROUGH */
5430
5431 case T_FIXNUM:
5432 idx = NUM2LONG(indx);
5433 rb_str_splice(str, idx, 1, val);
5434 return val;
5435 }
5436}
5437
5438/*
5439 * call-seq:
5440 * string[index] = new_string
5441 * string[start, length] = new_string
5442 * string[range] = new_string
5443 * string[regexp, capture = 0] = new_string
5444 * string[substring] = new_string
5445 *
5446 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5447 * See {String Slices}[rdoc-ref:String@String+Slices].
5448 *
5449 * A few examples:
5450 *
5451 * s = 'foo'
5452 * s[2] = 'rtune' # => "rtune"
5453 * s # => "fortune"
5454 * s[1, 5] = 'init' # => "init"
5455 * s # => "finite"
5456 * s[3..4] = 'al' # => "al"
5457 * s # => "finale"
5458 * s[/e$/] = 'ly' # => "ly"
5459 * s # => "finally"
5460 * s['lly'] = 'ncial' # => "ncial"
5461 * s # => "financial"
5462 *
5463 * String#slice is an alias for String#[].
5464 *
5465 */
5466
5467static VALUE
5468rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5469{
5470 if (argc == 3) {
5471 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5472 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5473 }
5474 else {
5475 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5476 }
5477 return argv[2];
5478 }
5479 rb_check_arity(argc, 2, 3);
5480 return rb_str_aset(str, argv[0], argv[1]);
5481}
5482
5483/*
5484 * call-seq:
5485 * insert(index, other_string) -> self
5486 *
5487 * Inserts the given +other_string+ into +self+; returns +self+.
5488 *
5489 * If the \Integer +index+ is positive, inserts +other_string+ at offset +index+:
5490 *
5491 * 'foo'.insert(1, 'bar') # => "fbaroo"
5492 *
5493 * If the \Integer +index+ is negative, counts backward from the end of +self+
5494 * and inserts +other_string+ at offset <tt>index+1</tt>
5495 * (that is, _after_ <tt>self[index]</tt>):
5496 *
5497 * 'foo'.insert(-2, 'bar') # => "fobaro"
5498 *
5499 */
5500
5501static VALUE
5502rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5503{
5504 long pos = NUM2LONG(idx);
5505
5506 if (pos == -1) {
5507 return rb_str_append(str, str2);
5508 }
5509 else if (pos < 0) {
5510 pos++;
5511 }
5512 rb_str_splice(str, pos, 0, str2);
5513 return str;
5514}
5515
5516
5517/*
5518 * call-seq:
5519 * slice!(index) -> new_string or nil
5520 * slice!(start, length) -> new_string or nil
5521 * slice!(range) -> new_string or nil
5522 * slice!(regexp, capture = 0) -> new_string or nil
5523 * slice!(substring) -> new_string or nil
5524 *
5525 * Removes and returns the substring of +self+ specified by the arguments.
5526 * See {String Slices}[rdoc-ref:String@String+Slices].
5527 *
5528 * A few examples:
5529 *
5530 * string = "This is a string"
5531 * string.slice!(2) #=> "i"
5532 * string.slice!(3..6) #=> " is "
5533 * string.slice!(/s.*t/) #=> "sa st"
5534 * string.slice!("r") #=> "r"
5535 * string #=> "Thing"
5536 *
5537 */
5538
5539static VALUE
5540rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5541{
    /*
     * Flow: each argument form computes a byte offset `beg` and byte
     * length `len`, then jumps to one of three labels:
     *   num_index - beg/len are still character based; convert them;
     *   subseq    - beg/len are byte based; build the result string;
     *   squash    - the result already exists; remove the bytes from str.
     */
5542 VALUE result = Qnil;
5543 VALUE indx;
5544 long beg, len = 1;
5545 char *p;
5546
5547 rb_check_arity(argc, 1, 2);
5548 str_modify_keep_cr(str);
5549 indx = argv[0];
5550 if (RB_TYPE_P(indx, T_REGEXP)) {
5551 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5552 VALUE match = rb_backref_get();
5553 struct re_registers *regs = RMATCH_REGS(match);
5554 int nth = 0;
    /* an optional second argument selects a capture group; a negative
     * number counts from the last group */
5555 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5556 if ((nth += regs->num_regs) <= 0) return Qnil;
5557 }
5558 else if (nth >= regs->num_regs) return Qnil;
5559 beg = BEG(nth);
5560 len = END(nth) - beg;
5561 goto subseq;
5562 }
5563 else if (argc == 2) {
5564 beg = NUM2LONG(indx);
5565 len = NUM2LONG(argv[1]);
5566 goto num_index;
5567 }
5568 else if (FIXNUM_P(indx)) {
    /* single index: len keeps its initial value of 1 */
5569 beg = FIX2LONG(indx);
5570 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5571 if (!len) return Qnil;
5572 beg = p - RSTRING_PTR(str);
5573 goto subseq;
5574 }
5575 else if (RB_TYPE_P(indx, T_STRING)) {
5576 beg = rb_str_index(str, indx, 0);
5577 if (beg == -1) return Qnil;
5578 len = RSTRING_LEN(indx);
    /* the returned string is a duplicate of the argument itself */
5579 result = str_duplicate(rb_cString, indx);
5580 goto squash;
5581 }
5582 else {
5583 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5584 case Qnil:
5585 return Qnil;
5586 case Qfalse:
    /* not a Range: treat indx as an integer index */
5587 beg = NUM2LONG(indx);
5588 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5589 if (!len) return Qnil;
5590 beg = p - RSTRING_PTR(str);
5591 goto subseq;
5592 default:
5593 goto num_index;
5594 }
5595 }
5596
5597 num_index:
5598 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5599 beg = p - RSTRING_PTR(str);
5600
5601 subseq:
5602 result = rb_str_new(RSTRING_PTR(str)+beg, len);
5603 rb_enc_cr_str_copy_for_substr(result, str);
5604
5605 squash:
5606 if (len > 0) {
5607 if (beg == 0) {
5608 rb_str_drop_bytes(str, len);
5609 }
5610 else {
    /* close the gap by moving the tail down over the removed bytes */
5611 char *sptr = RSTRING_PTR(str);
5612 long slen = RSTRING_LEN(str);
5613 if (beg + len > slen) /* pathological check */
5614 len = slen - beg;
5615 memmove(sptr + beg,
5616 sptr + beg + len,
5617 slen - (beg + len));
5618 slen -= len;
5619 STR_SET_LEN(str, slen);
5620 TERM_FILL(&sptr[slen], TERM_LEN(str));
5621 }
5622 }
5623 return result;
5624}
5625
5626static VALUE
5627get_pat(VALUE pat)
5628{
5629 VALUE val;
5630
5631 switch (OBJ_BUILTIN_TYPE(pat)) {
5632 case T_REGEXP:
5633 return pat;
5634
5635 case T_STRING:
5636 break;
5637
5638 default:
5639 val = rb_check_string_type(pat);
5640 if (NIL_P(val)) {
5641 Check_Type(pat, T_REGEXP);
5642 }
5643 pat = val;
5644 }
5645
5646 return rb_reg_regcomp(pat);
5647}
5648
5649static VALUE
5650get_pat_quoted(VALUE pat, int check)
5651{
5652 VALUE val;
5653
5654 switch (OBJ_BUILTIN_TYPE(pat)) {
5655 case T_REGEXP:
5656 return pat;
5657
5658 case T_STRING:
5659 break;
5660
5661 default:
5662 val = rb_check_string_type(pat);
5663 if (NIL_P(val)) {
5664 Check_Type(pat, T_REGEXP);
5665 }
5666 pat = val;
5667 }
5668 if (check && is_broken_string(pat)) {
5669 rb_exc_raise(rb_reg_check_preprocess(pat));
5670 }
5671 return pat;
5672}
5673
/*
 * Searches +str+ for +pat+ starting at offset +pos+ and returns the result
 * of the underlying search (negative when nothing was found, as the callers
 * in this file test).  +pat+ is either a String, searched with
 * rb_strseq_index(), or a pattern handed to rb_reg_search0().
 * A non-zero +set_backref_str+ asks for the backref to be updated as well.
 */
5674static long
5675rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5676{
5677 if (BUILTIN_TYPE(pat) == T_STRING) {
5678 pos = rb_strseq_index(str, pat, pos, 1);
5679 if (set_backref_str) {
5680 if (pos >= 0) {
    /* record the match against a frozen copy of the subject */
5681 str = rb_str_new_frozen_String(str);
5682 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5683 }
5684 else {
    /* NOTE(review): listing line 5685 is absent, leaving this branch
     * empty; presumably it updated the backref for the no-match case --
     * restore from the upstream source. */
5686 }
5687 }
5688 return pos;
5689 }
5690 else {
5691 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5692 }
5693}
5694
5695
5696/*
5697 * call-seq:
5698 * sub!(pattern, replacement) -> self or nil
5699 * sub!(pattern) {|match| ... } -> self or nil
5700 *
5701 * Returns +self+ with only the first occurrence
5702 * (not all occurrences) of the given +pattern+ replaced.
5703 *
5704 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5705 *
5706 * Related: String#sub, String#gsub, String#gsub!.
5707 *
5708 */
5709
5710static VALUE
5711rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5712{
    /*
     * Replaces the first match of the pattern in +str+ in place.
     * Returns +str+ when a replacement happened and nil otherwise.
     * The replacement comes from a block (one argument), from a Hash
     * lookup keyed by the matched text, or from a replacement String.
     */
5713 VALUE pat, repl, hash = Qnil;
5714 int iter = 0;
5715 long plen;
5716 int min_arity = rb_block_given_p() ? 1 : 2;
5717 long beg;
5718
5719 rb_check_arity(argc, min_arity, 2);
5720 if (argc == 1) {
5721 iter = 1;
5722 }
5723 else {
5724 repl = argv[1];
5725 hash = rb_check_hash_type(argv[1]);
5726 if (NIL_P(hash)) {
5727 StringValue(repl);
5728 }
5729 }
5730
5731 pat = get_pat_quoted(argv[0], 1);
5732
5733 str_modifiable(str);
5734 beg = rb_pat_search(pat, str, 0, 1);
5735 if (beg >= 0) {
5736 rb_encoding *enc;
5737 int cr = ENC_CODERANGE(str);
5738 long beg0, end0;
5739 VALUE match, match0 = Qnil;
5740 struct re_registers *regs;
5741 char *p, *rp;
5742 long len, rlen;
5743
5744 match = rb_backref_get();
5745 regs = RMATCH_REGS(match);
    /* determine the byte range [beg0, end0) of the matched text */
5746 if (RB_TYPE_P(pat, T_STRING)) {
5747 beg0 = beg;
5748 end0 = beg0 + RSTRING_LEN(pat);
5749 match0 = pat;
5750 }
5751 else {
5752 beg0 = BEG(0);
5753 end0 = END(0);
5754 if (iter) match0 = rb_reg_nth_match(0, match);
5755 }
5756
5757 if (iter || !NIL_P(hash)) {
    /* remember pointer and length so str_mod_check can detect a
     * modification of str made by the block or the hash lookup */
5758 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5759
5760 if (iter) {
5761 repl = rb_obj_as_string(rb_yield(match0));
5762 }
5763 else {
5764 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5765 repl = rb_obj_as_string(repl);
5766 }
5767 str_mod_check(str, p, len);
5768 rb_check_frozen(str);
5769 }
5770 else {
5771 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5772 }
5773
5774 enc = rb_enc_compatible(str, repl);
5775 if (!enc) {
    /* encodings are incompatible: still allowed when everything outside
     * the replaced range scans as 7-bit, in which case the result takes
     * the replacement's encoding */
5776 rb_encoding *str_enc = STR_ENC_GET(str);
5777 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5778 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5779 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5780 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5781 rb_enc_name(str_enc),
5782 rb_enc_name(STR_ENC_GET(repl)));
5783 }
5784 enc = STR_ENC_GET(repl);
5785 }
5786 rb_str_modify(str);
5787 rb_enc_associate(str, enc);
    /* NOTE(review): listing lines 5788 and 5792 are absent.  The closing
     * brace after `cr = cr2;` has no visible opener, so 5788 presumably
     * opened a conditional block, and 5792 held the statement executed
     * when the condition below is true -- restore from upstream. */
5789 int cr2 = ENC_CODERANGE(repl);
5790 if (cr2 == ENC_CODERANGE_BROKEN ||
5791 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5793 else
5794 cr = cr2;
5795 }
5796 plen = end0 - beg0;
5797 rlen = RSTRING_LEN(repl);
5798 len = RSTRING_LEN(str);
5799 if (rlen > plen) {
5800 RESIZE_CAPA(str, len + rlen - plen);
5801 }
5802 p = RSTRING_PTR(str);
5803 if (rlen != plen) {
    /* move the tail so it follows the replacement */
5804 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5805 }
5806 rp = RSTRING_PTR(repl);
5807 memmove(p + beg0, rp, rlen);
5808 len += rlen - plen;
5809 STR_SET_LEN(str, len);
5810 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5811 ENC_CODERANGE_SET(str, cr);
5812
5813 return str;
5814 }
5815 return Qnil;
5816}
5817
5818
5819/*
5820 * call-seq:
5821 * sub(pattern, replacement) -> new_string
5822 * sub(pattern) {|match| ... } -> new_string
5823 *
5824 * Returns a copy of +self+ with only the first occurrence
5825 * (not all occurrences) of the given +pattern+ replaced.
5826 *
5827 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5828 *
5829 * Related: String#sub!, String#gsub, String#gsub!.
5830 *
5831 */
5832
5833static VALUE
5834rb_str_sub(int argc, VALUE *argv, VALUE str)
5835{
5836 str = str_duplicate(rb_cString, str);
5837 rb_str_sub_bang(argc, argv, str);
5838 return str;
5839}
5840
/*
 * Shared implementation of String#gsub and String#gsub!.
 *
 * Builds a new buffer +dest+ from the unmatched pieces of +str+ and the
 * replacement for every match.  With a non-zero +bang+ the buffer replaces
 * the contents of +str+ (nil is returned when nothing matched); otherwise
 * the buffer itself is returned (a duplicate of +str+ when nothing matched).
 *
 * The replacement source is selected by +mode+: STR (replacement string),
 * ITER (block) or MAP (hash lookup keyed by the matched text).
 */
5841static VALUE
5842str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5843{
5844 VALUE pat, val = Qnil, repl, match, match0 = Qnil, dest, hash = Qnil;
5845 struct re_registers *regs;
5846 long beg, beg0, end0;
5847 long offset, blen, slen, len, last;
5848 enum {STR, ITER, MAP} mode = STR;
5849 char *sp, *cp;
    /* -1: not yet known whether the replacement string uses backrefs */
5850 int need_backref = -1;
5851 rb_encoding *str_enc;
5852
5853 switch (argc) {
5854 case 1:
5855 RETURN_ENUMERATOR(str, argc, argv);
5856 mode = ITER;
5857 break;
5858 case 2:
5859 repl = argv[1];
5860 hash = rb_check_hash_type(argv[1]);
5861 if (NIL_P(hash)) {
5862 StringValue(repl);
5863 }
5864 else {
5865 mode = MAP;
5866 }
5867 break;
5868 default:
5869 rb_error_arity(argc, 1, 2);
5870 }
5871
5872 pat = get_pat_quoted(argv[0], 1);
5873 beg = rb_pat_search(pat, str, 0, need_backref);
5874 if (beg < 0) {
5875 if (bang) return Qnil; /* no match, no substitution */
5876 return str_duplicate(rb_cString, str);
5877 }
5878
5879 offset = 0;
5880 blen = RSTRING_LEN(str) + 30; /* len + margin */
5881 dest = rb_str_buf_new(blen);
5882 sp = RSTRING_PTR(str);
5883 slen = RSTRING_LEN(str);
5884 cp = sp;
5885 str_enc = STR_ENC_GET(str);
5886 rb_enc_associate(dest, str_enc);
    /* NOTE(review): listing line 5887 is absent here; a statement
     * (presumably further initialization of dest) may be missing --
     * verify against the upstream source. */
5888
5889 do {
5890 match = rb_backref_get();
5891 regs = RMATCH_REGS(match);
    /* determine the byte range [beg0, end0) of this match */
5892 if (RB_TYPE_P(pat, T_STRING)) {
5893 beg0 = beg;
5894 end0 = beg0 + RSTRING_LEN(pat);
5895 match0 = pat;
5896 }
5897 else {
5898 beg0 = BEG(0);
5899 end0 = END(0);
5900 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5901 }
5902
    /* mode is non-zero for ITER and MAP */
5903 if (mode) {
5904 if (mode == ITER) {
5905 val = rb_obj_as_string(rb_yield(match0));
5906 }
5907 else {
5908 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5909 val = rb_obj_as_string(val);
5910 }
5911 str_mod_check(str, sp, slen);
5912 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5913 rb_raise(rb_eRuntimeError, "block should not cheat");
5914 }
5915 }
5916 else if (need_backref) {
5917 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
    /* first round only: if regsub returned repl itself, later rounds
     * can use repl directly without calling regsub again */
5918 if (need_backref < 0) {
5919 need_backref = val != repl;
5920 }
5921 }
5922 else {
5923 val = repl;
5924 }
5925
5926 len = beg0 - offset; /* copy pre-match substr */
5927 if (len) {
5928 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5929 }
5930
5931 rb_str_buf_append(dest, val);
5932
5933 last = offset;
5934 offset = end0;
5935 if (beg0 == end0) {
5936 /*
5937 * Always consume at least one character of the input string
5938 * in order to prevent infinite loops.
5939 */
5940 if (RSTRING_LEN(str) <= end0) break;
5941 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5942 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5943 offset = end0 + len;
5944 }
5945 cp = RSTRING_PTR(str) + offset;
5946 if (offset > RSTRING_LEN(str)) break;
5947 beg = rb_pat_search(pat, str, offset, need_backref);
5948 } while (beg >= 0);
    /* append whatever follows the last match */
5949 if (RSTRING_LEN(str) > offset) {
5950 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5951 }
    /* search once more from the start of the last round with backref
     * setting requested (last argument 1) */
5952 rb_pat_search(pat, str, last, 1);
5953 if (bang) {
5954 str_shared_replace(str, dest);
5955 }
5956 else {
5957 str = dest;
5958 }
5959
5960 return str;
5961}
5962
5963
5964/*
5965 * call-seq:
5966 * gsub!(pattern, replacement) -> self or nil
5967 * gsub!(pattern) {|match| ... } -> self or nil
5968 * gsub!(pattern) -> an_enumerator
5969 *
5970 * Performs the specified substring replacement(s) on +self+;
5971 * returns +self+ if any replacement occurred, +nil+ otherwise.
5972 *
5973 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5974 *
5975 * Returns an Enumerator if no +replacement+ and no block given.
5976 *
5977 * Related: String#sub, String#gsub, String#sub!.
5978 *
5979 */
5980
5981static VALUE
5982rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5983{
5984 str_modify_keep_cr(str);
5985 return str_gsub(argc, argv, str, 1);
5986}
5987
5988
5989/*
5990 * call-seq:
5991 * gsub(pattern, replacement) -> new_string
5992 * gsub(pattern) {|match| ... } -> new_string
5993 * gsub(pattern) -> enumerator
5994 *
5995 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
5996 *
5997 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5998 *
5999 * Returns an Enumerator if no +replacement+ and no block given.
6000 *
6001 * Related: String#sub, String#sub!, String#gsub!.
6002 *
6003 */
6004
6005static VALUE
6006rb_str_gsub(int argc, VALUE *argv, VALUE str)
6007{
6008 return str_gsub(argc, argv, str, 0);
6009}
6010
6011
6012/*
6013 * call-seq:
6014 * replace(other_string) -> self
6015 *
6016 * Replaces the contents of +self+ with the contents of +other_string+:
6017 *
6018 * s = 'foo' # => "foo"
6019 * s.replace('bar') # => "bar"
6020 *
6021 */
6022
6023VALUE
    /* NOTE(review): listing line 6024 is absent here; it presumably held
     * the function name and parameter list.  The body uses `str` and
     * `str2`, and the preceding call-seq documents String#replace --
     * confirm against the upstream source. */
6025{
6026 str_modifiable(str);
    /* replacing a string with itself is a no-op */
6027 if (str == str2) return str;
6028
6029 StringValue(str2);
    /* discard the old contents, then take over those of str2 */
6030 str_discard(str);
6031 return str_replace(str, str2);
6032}
6033
6034/*
6035 * call-seq:
6036 * clear -> self
6037 *
6038 * Removes the contents of +self+:
6039 *
6040 * s = 'foo' # => "foo"
6041 * s.clear # => ""
6042 *
6043 */
6044
6045static VALUE
6046rb_str_clear(VALUE str)
6047{
    /* discard the current contents and switch to an empty embedded string */
6048 str_discard(str);
6049 STR_SET_EMBED(str);
6050 STR_SET_EMBED_LEN(str, 0);
6051 RSTRING_PTR(str)[0] = 0;
    /* NOTE(review): listing lines 6053 and 6055 are absent, leaving both
     * branches of this if/else without a statement; presumably each set a
     * value depending on whether the encoding is ASCII compatible --
     * restore from the upstream source. */
6052 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6054 else
6056 return str;
6057}
6058
6059/*
6060 * call-seq:
6061 * chr -> string
6062 *
6063 * Returns a string containing the first character of +self+:
6064 *
6065 * s = 'foo' # => "foo"
6066 * s.chr # => "f"
6067 *
6068 */
6069
6070static VALUE
6071rb_str_chr(VALUE str)
6072{
6073 return rb_str_substr(str, 0, 1);
6074}
6075
6076/*
6077 * call-seq:
6078 * getbyte(index) -> integer or nil
6079 *
6080 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6081 *
6082 * s = 'abcde' # => "abcde"
6083 * s.getbyte(0) # => 97
6084 * s.getbyte(-1) # => 101
6085 * s.getbyte(5) # => nil
6086 *
6087 * Related: String#setbyte.
6088 */
6089static VALUE
6090rb_str_getbyte(VALUE str, VALUE index)
6091{
6092 long pos = NUM2LONG(index);
6093
6094 if (pos < 0)
6095 pos += RSTRING_LEN(str);
6096 if (pos < 0 || RSTRING_LEN(str) <= pos)
6097 return Qnil;
6098
6099 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6100}
6101
6102/*
6103 * call-seq:
6104 * setbyte(index, integer) -> integer
6105 *
6106 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6107 *
6108 * s = 'abcde' # => "abcde"
6109 * s.setbyte(0, 98) # => 98
6110 * s # => "bbcde"
6111 *
6112 * Related: String#getbyte.
6113 */
6114static VALUE
6115rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6116{
    /*
     * Stores the low 8 bits of +value+ at byte offset +index+ of +str+ and
     * returns +value+.  Raises IndexError when the index is outside the
     * string; a negative index counts from the end.
     */
6117 long pos = NUM2LONG(index);
6118 long len = RSTRING_LEN(str);
6119 char *ptr, *head, *left = 0;
6120 rb_encoding *enc;
6121 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6122
6123 if (pos < -len || len <= pos)
6124 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6125 if (pos < 0)
6126 pos += len;
6127
    /* mask with 0xff through Integer#& before narrowing to char */
6128 VALUE v = rb_to_int(value);
6129 VALUE w = rb_int_and(v, INT2FIX(0xff));
6130 char byte = (char)(NUM2INT(w) & 0xFF);
6131
6132 if (!str_independent(str))
6133 str_make_independent(str);
6134 enc = STR_ENC_GET(str);
6135 head = RSTRING_PTR(str);
6136 ptr = &head[pos];
6137 if (!STR_EMBED_P(str)) {
    /* for non-embedded strings, inspect the current code range to see
     * how writing the byte affects it */
6138 cr = ENC_CODERANGE(str);
6139 switch (cr) {
6140 case ENC_CODERANGE_7BIT:
6141 left = ptr;
6142 *ptr = byte;
6143 if (ISASCII(byte)) goto end;
6144 nlen = rb_enc_precise_mbclen(left, head+len, enc);
    /* NOTE(review): listing lines 6146, 6148 and 6150 are absent: both
     * branches of the if/else below lack a statement, and a line
     * (presumably another case label) is missing before the
     * rb_enc_left_char_head call -- restore from upstream. */
6145 if (!MBCLEN_CHARFOUND_P(nlen))
6147 else
6149 goto end;
6151 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6152 width = rb_enc_precise_mbclen(left, head+len, enc);
6153 *ptr = byte;
6154 nlen = rb_enc_precise_mbclen(left, head+len, enc);
    /* NOTE(review): listing lines 6156 and 6158 are absent; the if and
     * the else-if below have lost their statements. */
6155 if (!MBCLEN_CHARFOUND_P(nlen))
6157 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6159 goto end;
6160 }
6161 }
    /* NOTE(review): listing line 6162 is absent here; a statement may be
     * missing before the store below. */
6163 *ptr = byte;
6164
6165 end:
6166 return value;
6167}
6168
/*
 * Returns the +len+ bytes of +str+ starting at byte offset +beg+ as a new
 * string carrying the encoding of +str+, or nil when the request cannot be
 * satisfied: +beg+ beyond the end, negative +len+, or a negative +beg+ that
 * reaches before the start.  +len+ is clamped to the bytes available.
 * An empty result is returned only when +empty+ is non-zero; otherwise nil.
 */
6169static VALUE
6170str_byte_substr(VALUE str, long beg, long len, int empty)
6171{
6172 long n = RSTRING_LEN(str);
6173
6174 if (beg > n || len < 0) return Qnil;
6175 if (beg < 0) {
    /* a negative offset counts from the end */
6176 beg += n;
6177 if (beg < 0) return Qnil;
6178 }
6179 if (len > n - beg)
6180 len = n - beg;
6181 if (len <= 0) {
6182 if (!empty) return Qnil;
6183 len = 0;
6184 }
6185
6186 VALUE str2 = str_subseq(str, beg, len);
6187
6188 str_enc_copy(str2, str);
6189
    /* NOTE(review): listing lines 6192, 6194, 6199 and 6202 are absent.
     * Every branch below has lost its statement; judging by the
     * conditions they presumably set something on str2 that depends on
     * the encoding / code range of str -- restore from upstream. */
6190 if (RSTRING_LEN(str2) == 0) {
6191 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6193 else
6195 }
6196 else {
6197 switch (ENC_CODERANGE(str)) {
6198 case ENC_CODERANGE_7BIT:
6200 break;
6201 default:
6203 break;
6204 }
6205 }
6206
6207 return str2;
6208}
6209
6210static VALUE
6211str_byte_aref(VALUE str, VALUE indx)
6212{
6213 long idx;
6214 if (FIXNUM_P(indx)) {
6215 idx = FIX2LONG(indx);
6216 }
6217 else {
6218 /* check if indx is Range */
6219 long beg, len = RSTRING_LEN(str);
6220
6221 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6222 case Qfalse:
6223 break;
6224 case Qnil:
6225 return Qnil;
6226 default:
6227 return str_byte_substr(str, beg, len, TRUE);
6228 }
6229
6230 idx = NUM2LONG(indx);
6231 }
6232 return str_byte_substr(str, idx, 1, FALSE);
6233}
6234
6235/*
6236 * call-seq:
6237 * byteslice(index, length = 1) -> string or nil
6238 * byteslice(range) -> string or nil
6239 *
6240 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6241 *
6242 * With integer arguments +index+ and +length+ given,
6243 * returns the substring beginning at the given +index+
6244 * of the given +length+ (if possible),
6245 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6246 *
6247 * s = '0123456789' # => "0123456789"
6248 * s.byteslice(2) # => "2"
6249 * s.byteslice(200) # => nil
6250 * s.byteslice(4, 3) # => "456"
6251 * s.byteslice(4, 30) # => "456789"
6252 * s.byteslice(4, -1) # => nil
6253 * s.byteslice(40, 2) # => nil
6254 *
6255 * In either case above, counts backwards from the end of +self+
6256 * if +index+ is negative:
6257 *
6258 * s = '0123456789' # => "0123456789"
6259 * s.byteslice(-4) # => "6"
6260 * s.byteslice(-4, 3) # => "678"
6261 *
6262 * With Range argument +range+ given, returns
6263 * <tt>byteslice(range.begin, range.size)</tt>:
6264 *
6265 * s = '0123456789' # => "0123456789"
6266 * s.byteslice(4..6) # => "456"
6267 * s.byteslice(-6..-4) # => "456"
6268 * s.byteslice(5..2) # => "" # range.size is zero.
6269 * s.byteslice(40..42) # => nil
6270 *
6271 * In all cases, a returned string has the same encoding as +self+:
6272 *
6273 * s.encoding # => #<Encoding:UTF-8>
6274 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6275 *
6276 */
6277
6278static VALUE
6279rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6280{
6281 if (argc == 2) {
6282 long beg = NUM2LONG(argv[0]);
6283 long len = NUM2LONG(argv[1]);
6284 return str_byte_substr(str, beg, len, TRUE);
6285 }
6286 rb_check_arity(argc, 1, 2);
6287 return str_byte_aref(str, argv[0]);
6288}
6289
6290/*
6291 * call-seq:
6292 * bytesplice(index, length, str) -> string
6293 * bytesplice(range, str) -> string
6294 *
6295 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6296 * The portion of the string affected is determined using
6297 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6298 * If the replacement string is not the same length as the text it is replacing,
6299 * the string will be adjusted accordingly.
6300 * The form that takes an Integer will raise an IndexError if the value is out
6301 * of range; the Range form will raise a RangeError.
6302 * If the beginning or ending offset does not land on character (codepoint)
6303 * boundary, an IndexError will be raised.
6304 */
6305
6306static VALUE
6307rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6308{
6309 long beg, end, len, slen;
6310 VALUE val;
6311 rb_encoding *enc;
6312 int cr;
6313
6314 rb_check_arity(argc, 2, 3);
6315 if (argc == 2) {
6316 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6317 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6318 rb_builtin_class_name(argv[0]));
6319 }
6320 val = argv[1];
6321 }
6322 else {
6323 beg = NUM2LONG(argv[0]);
6324 len = NUM2LONG(argv[1]);
6325 val = argv[2];
6326 }
6327 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
6328 slen = RSTRING_LEN(str);
6329 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
6330 rb_raise(rb_eIndexError, "index %ld out of string", beg);
6331 }
6332 if (beg < 0) {
6333 beg += slen;
6334 }
6335 assert(beg >= 0);
6336 assert(beg <= slen);
6337 if (len > slen - beg) {
6338 len = slen - beg;
6339 }
6340 end = beg + len;
6341 if (!str_check_byte_pos(str, beg)) {
6343 "offset %ld does not land on character boundary", beg);
6344 }
6345 if (!str_check_byte_pos(str, end)) {
6347 "offset %ld does not land on character boundary", end);
6348 }
6349 StringValue(val);
6350 enc = rb_enc_check(str, val);
6351 str_modify_keep_cr(str);
6352 rb_str_splice_0(str, beg, len, val);
6353 rb_enc_associate(str, enc);
6355 if (cr != ENC_CODERANGE_BROKEN)
6356 ENC_CODERANGE_SET(str, cr);
6357 return str;
6358}
6359
6360/*
6361 * call-seq:
6362 * reverse -> string
6363 *
6364 * Returns a new string with the characters from +self+ in reverse order.
6365 *
6366 * 'stressed'.reverse # => "desserts"
6367 *
6368 */
6369
6370static VALUE
6371rb_str_reverse(VALUE str)
6372{
6373 rb_encoding *enc;
6374 VALUE rev;
6375 char *s, *e, *p;
6376 int cr;
6377
6378 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6379 enc = STR_ENC_GET(str);
6380 rev = rb_str_new(0, RSTRING_LEN(str));
6381 s = RSTRING_PTR(str); e = RSTRING_END(str);
6382 p = RSTRING_END(rev);
6383 cr = ENC_CODERANGE(str);
6384
6385 if (RSTRING_LEN(str) > 1) {
6386 if (single_byte_optimizable(str)) {
6387 while (s < e) {
6388 *--p = *s++;
6389 }
6390 }
6391 else if (cr == ENC_CODERANGE_VALID) {
6392 while (s < e) {
6393 int clen = rb_enc_fast_mbclen(s, e, enc);
6394
6395 p -= clen;
6396 memcpy(p, s, clen);
6397 s += clen;
6398 }
6399 }
6400 else {
6401 cr = rb_enc_asciicompat(enc) ?
6403 while (s < e) {
6404 int clen = rb_enc_mbclen(s, e, enc);
6405
6406 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6407 p -= clen;
6408 memcpy(p, s, clen);
6409 s += clen;
6410 }
6411 }
6412 }
6413 STR_SET_LEN(rev, RSTRING_LEN(str));
6414 str_enc_copy(rev, str);
6415 ENC_CODERANGE_SET(rev, cr);
6416
6417 return rev;
6418}
6419
6420
6421/*
6422 * call-seq:
6423 * reverse! -> self
6424 *
6425 * Returns +self+ with its characters reversed:
6426 *
6427 * s = 'stressed'
6428 * s.reverse! # => "desserts"
6429 * s # => "desserts"
6430 *
6431 */
6432
6433static VALUE
6434rb_str_reverse_bang(VALUE str)
6435{
6436 if (RSTRING_LEN(str) > 1) {
6437 if (single_byte_optimizable(str)) {
6438 char *s, *e, c;
6439
6440 str_modify_keep_cr(str);
6441 s = RSTRING_PTR(str);
6442 e = RSTRING_END(str) - 1;
6443 while (s < e) {
6444 c = *s;
6445 *s++ = *e;
6446 *e-- = c;
6447 }
6448 }
6449 else {
6450 str_shared_replace(str, rb_str_reverse(str));
6451 }
6452 }
6453 else {
6454 str_modify_keep_cr(str);
6455 }
6456 return str;
6457}
6458
6459
6460/*
6461 * call-seq:
6462 * include? other_string -> true or false
6463 *
6464 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6465 *
6466 * s = 'foo'
6467 * s.include?('f') # => true
6468 * s.include?('fo') # => true
6469 * s.include?('food') # => false
6470 *
6471 */
6472
6473VALUE
6474rb_str_include(VALUE str, VALUE arg)
6475{
6476 long i;
6477
6478 StringValue(arg);
6479 i = rb_str_index(str, arg, 0);
6480
6481 return RBOOL(i != -1);
6482}
6483
6484
6485/*
6486 * call-seq:
6487 * to_i(base = 10) -> integer
6488 *
6489 * Returns the result of interpreting leading characters in +self+
6490 * as an integer in the given +base+ (which must be in (0, 2..36)):
6491 *
6492 * '123456'.to_i # => 123456
6493 * '123def'.to_i(16) # => 1195503
6494 *
6495 * With +base+ zero, string +object+ may contain leading characters
6496 * to specify the actual base:
6497 *
6498 * '123def'.to_i(0) # => 123
6499 * '0123def'.to_i(0) # => 83
6500 * '0b123def'.to_i(0) # => 1
6501 * '0o123def'.to_i(0) # => 83
6502 * '0d123def'.to_i(0) # => 123
6503 * '0x123def'.to_i(0) # => 1195503
6504 *
6505 * Characters past a leading valid number (in the given +base+) are ignored:
6506 *
6507 * '12.345'.to_i # => 12
6508 * '12345'.to_i(2) # => 1
6509 *
6510 * Returns zero if there is no leading valid number:
6511 *
6512 * 'abcdef'.to_i # => 0
6513 * '2'.to_i(2) # => 0
6514 *
6515 */
6516
6517static VALUE
6518rb_str_to_i(int argc, VALUE *argv, VALUE str)
6519{
6520 int base = 10;
6521
6522 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6523 rb_raise(rb_eArgError, "invalid radix %d", base);
6524 }
6525 return rb_str_to_inum(str, base, FALSE);
6526}
6527
6528
6529/*
6530 * call-seq:
6531 * to_f -> float
6532 *
6533 * Returns the result of interpreting leading characters in +self+ as a Float:
6534 *
6535 * '3.14159'.to_f # => 3.14159
6536 * '1.234e-2'.to_f # => 0.01234
6537 *
6538 * Characters past a leading valid number are ignored:
6539 *
6540 * '3.14 (pi to two places)'.to_f # => 3.14
6541 *
6542 * Returns zero if there is no leading valid number:
6543 *
6544 * 'abcdef'.to_f # => 0.0
6545 *
6546 */
6547
6548static VALUE
6549rb_str_to_f(VALUE str)
6550{
6551 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6552}
6553
6554
6555/*
6556 * call-seq:
6557 * to_s -> self or string
6558 *
6559 * Returns +self+ if +self+ is a \String,
6560 * or +self+ converted to a \String if +self+ is a subclass of \String.
6561 *
6562 * String#to_str is an alias for String#to_s.
6563 *
6564 */
6565
6566static VALUE
6567rb_str_to_s(VALUE str)
6568{
6569 if (rb_obj_class(str) != rb_cString) {
6570 return str_duplicate(rb_cString, str);
6571 }
6572 return str;
6573}
6574
#if 0
/* Disabled helper: encodes code point c in enc and appends it to str. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
6586
6587#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6588
6589int
6590rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6591{
6592 char buf[CHAR_ESC_LEN + 1];
6593 int l;
6594
6595#if SIZEOF_INT > 4
6596 c &= 0xffffffff;
6597#endif
6598 if (unicode_p) {
6599 if (c < 0x7F && ISPRINT(c)) {
6600 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6601 }
6602 else if (c < 0x10000) {
6603 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6604 }
6605 else {
6606 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6607 }
6608 }
6609 else {
6610 if (c < 0x100) {
6611 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6612 }
6613 else {
6614 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6615 }
6616 }
6617 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6618 rb_str_buf_cat(result, buf, l);
6619 return l;
6620}
6621
/*
 * Maps a control character to its backslash escape as a static string
 * (for example '\n' to the two characters backslash and 'n').
 * Returns NULL when +c+ has no entry in the table below.
 */
const char *
ruby_escaped_char(int c)
{
    switch (c) {
      case '\0':   return "\\0";
      case '\n':   return "\\n";
      case '\r':   return "\\r";
      case '\t':   return "\\t";
      case '\f':   return "\\f";
      case '\013': return "\\v";
      case '\010': return "\\b";
      case '\007': return "\\a";
      case '\033': return "\\e";
      case '\x7f': return "\\c?";
      default:     return NULL;
    }
}
6639
6640VALUE
6641rb_str_escape(VALUE str)
6642{
6643 int encidx = ENCODING_GET(str);
6644 rb_encoding *enc = rb_enc_from_index(encidx);
6645 const char *p = RSTRING_PTR(str);
6646 const char *pend = RSTRING_END(str);
6647 const char *prev = p;
6648 char buf[CHAR_ESC_LEN + 1];
6649 VALUE result = rb_str_buf_new(0);
6650 int unicode_p = rb_enc_unicode_p(enc);
6651 int asciicompat = rb_enc_asciicompat(enc);
6652
6653 while (p < pend) {
6654 unsigned int c;
6655 const char *cc;
6656 int n = rb_enc_precise_mbclen(p, pend, enc);
6657 if (!MBCLEN_CHARFOUND_P(n)) {
6658 if (p > prev) str_buf_cat(result, prev, p - prev);
6659 n = rb_enc_mbminlen(enc);
6660 if (pend < p + n)
6661 n = (int)(pend - p);
6662 while (n--) {
6663 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6664 str_buf_cat(result, buf, strlen(buf));
6665 prev = ++p;
6666 }
6667 continue;
6668 }
6669 n = MBCLEN_CHARFOUND_LEN(n);
6670 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6671 p += n;
6672 cc = ruby_escaped_char(c);
6673 if (cc) {
6674 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6675 str_buf_cat(result, cc, strlen(cc));
6676 prev = p;
6677 }
6678 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6679 }
6680 else {
6681 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6682 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6683 prev = p;
6684 }
6685 }
6686 if (p > prev) str_buf_cat(result, prev, p - prev);
6687 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6688
6689 return result;
6690}
6691
6692/*
6693 * call-seq:
6694 * inspect -> string
6695 *
6696 * Returns a printable version of +self+, enclosed in double-quotes,
6697 * and with special characters escaped:
6698 *
6699 * s = "foo\tbar\tbaz\n"
6700 * s.inspect
6701 * # => "\"foo\\tbar\\tbaz\\n\""
6702 *
6703 */
6704
6705VALUE
6707{
6708 int encidx = ENCODING_GET(str);
6709 rb_encoding *enc = rb_enc_from_index(encidx);
6710 const char *p, *pend, *prev;
6711 char buf[CHAR_ESC_LEN + 1];
6712 VALUE result = rb_str_buf_new(0);
6713 rb_encoding *resenc = rb_default_internal_encoding();
6714 int unicode_p = rb_enc_unicode_p(enc);
6715 int asciicompat = rb_enc_asciicompat(enc);
6716
6717 if (resenc == NULL) resenc = rb_default_external_encoding();
6718 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6719 rb_enc_associate(result, resenc);
6720 str_buf_cat2(result, "\"");
6721
6722 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6723 prev = p;
6724 while (p < pend) {
6725 unsigned int c, cc;
6726 int n;
6727
6728 n = rb_enc_precise_mbclen(p, pend, enc);
6729 if (!MBCLEN_CHARFOUND_P(n)) {
6730 if (p > prev) str_buf_cat(result, prev, p - prev);
6731 n = rb_enc_mbminlen(enc);
6732 if (pend < p + n)
6733 n = (int)(pend - p);
6734 while (n--) {
6735 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6736 str_buf_cat(result, buf, strlen(buf));
6737 prev = ++p;
6738 }
6739 continue;
6740 }
6741 n = MBCLEN_CHARFOUND_LEN(n);
6742 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6743 p += n;
6744 if ((asciicompat || unicode_p) &&
6745 (c == '"'|| c == '\\' ||
6746 (c == '#' &&
6747 p < pend &&
6748 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6749 (cc = rb_enc_codepoint(p,pend,enc),
6750 (cc == '$' || cc == '@' || cc == '{'))))) {
6751 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6752 str_buf_cat2(result, "\\");
6753 if (asciicompat || enc == resenc) {
6754 prev = p - n;
6755 continue;
6756 }
6757 }
6758 switch (c) {
6759 case '\n': cc = 'n'; break;
6760 case '\r': cc = 'r'; break;
6761 case '\t': cc = 't'; break;
6762 case '\f': cc = 'f'; break;
6763 case '\013': cc = 'v'; break;
6764 case '\010': cc = 'b'; break;
6765 case '\007': cc = 'a'; break;
6766 case 033: cc = 'e'; break;
6767 default: cc = 0; break;
6768 }
6769 if (cc) {
6770 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6771 buf[0] = '\\';
6772 buf[1] = (char)cc;
6773 str_buf_cat(result, buf, 2);
6774 prev = p;
6775 continue;
6776 }
6777 /* The special casing of 0x85 (NEXT_LINE) here is because
6778 * Oniguruma historically treats it as printable, but it
6779 * doesn't match the print POSIX bracket class or character
6780 * property in regexps.
6781 *
6782 * See Ruby Bug #16842 for details:
6783 * https://bugs.ruby-lang.org/issues/16842
6784 */
6785 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
6786 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6787 continue;
6788 }
6789 else {
6790 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6791 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6792 prev = p;
6793 continue;
6794 }
6795 }
6796 if (p > prev) str_buf_cat(result, prev, p - prev);
6797 str_buf_cat2(result, "\"");
6798
6799 return result;
6800}
6801
6802#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6803
6804/*
6805 * call-seq:
6806 * dump -> string
6807 *
6808 * Returns a printable version of +self+, enclosed in double-quotes,
6809 * with special characters escaped, and with non-printing characters
6810 * replaced by hexadecimal notation:
6811 *
6812 * "hello \n ''".dump # => "\"hello \\n ''\""
6813 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6814 *
6815 * Related: String#undump (inverse of String#dump).
6816 *
6817 */
6818
6819VALUE
6821{
6822 int encidx = rb_enc_get_index(str);
6823 rb_encoding *enc = rb_enc_from_index(encidx);
6824 long len;
6825 const char *p, *pend;
6826 char *q, *qend;
6827 VALUE result;
6828 int u8 = (encidx == rb_utf8_encindex());
6829 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6830
6831 len = 2; /* "" */
6832 if (!rb_enc_asciicompat(enc)) {
6833 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6834 len += strlen(enc->name);
6835 }
6836
6837 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6838 while (p < pend) {
6839 int clen;
6840 unsigned char c = *p++;
6841
6842 switch (c) {
6843 case '"': case '\\':
6844 case '\n': case '\r':
6845 case '\t': case '\f':
6846 case '\013': case '\010': case '\007': case '\033':
6847 clen = 2;
6848 break;
6849
6850 case '#':
6851 clen = IS_EVSTR(p, pend) ? 2 : 1;
6852 break;
6853
6854 default:
6855 if (ISPRINT(c)) {
6856 clen = 1;
6857 }
6858 else {
6859 if (u8 && c > 0x7F) { /* \u notation */
6860 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6861 if (MBCLEN_CHARFOUND_P(n)) {
6862 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6863 if (cc <= 0xFFFF)
6864 clen = 6; /* \uXXXX */
6865 else if (cc <= 0xFFFFF)
6866 clen = 9; /* \u{XXXXX} */
6867 else
6868 clen = 10; /* \u{XXXXXX} */
6869 p += MBCLEN_CHARFOUND_LEN(n)-1;
6870 break;
6871 }
6872 }
6873 clen = 4; /* \xNN */
6874 }
6875 break;
6876 }
6877
6878 if (clen > LONG_MAX - len) {
6879 rb_raise(rb_eRuntimeError, "string size too big");
6880 }
6881 len += clen;
6882 }
6883
6884 result = rb_str_new(0, len);
6885 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6886 q = RSTRING_PTR(result); qend = q + len + 1;
6887
6888 *q++ = '"';
6889 while (p < pend) {
6890 unsigned char c = *p++;
6891
6892 if (c == '"' || c == '\\') {
6893 *q++ = '\\';
6894 *q++ = c;
6895 }
6896 else if (c == '#') {
6897 if (IS_EVSTR(p, pend)) *q++ = '\\';
6898 *q++ = '#';
6899 }
6900 else if (c == '\n') {
6901 *q++ = '\\';
6902 *q++ = 'n';
6903 }
6904 else if (c == '\r') {
6905 *q++ = '\\';
6906 *q++ = 'r';
6907 }
6908 else if (c == '\t') {
6909 *q++ = '\\';
6910 *q++ = 't';
6911 }
6912 else if (c == '\f') {
6913 *q++ = '\\';
6914 *q++ = 'f';
6915 }
6916 else if (c == '\013') {
6917 *q++ = '\\';
6918 *q++ = 'v';
6919 }
6920 else if (c == '\010') {
6921 *q++ = '\\';
6922 *q++ = 'b';
6923 }
6924 else if (c == '\007') {
6925 *q++ = '\\';
6926 *q++ = 'a';
6927 }
6928 else if (c == '\033') {
6929 *q++ = '\\';
6930 *q++ = 'e';
6931 }
6932 else if (ISPRINT(c)) {
6933 *q++ = c;
6934 }
6935 else {
6936 *q++ = '\\';
6937 if (u8) {
6938 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6939 if (MBCLEN_CHARFOUND_P(n)) {
6940 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6941 p += n;
6942 if (cc <= 0xFFFF)
6943 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6944 else
6945 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6946 q += strlen(q);
6947 continue;
6948 }
6949 }
6950 snprintf(q, qend-q, "x%02X", c);
6951 q += 3;
6952 }
6953 }
6954 *q++ = '"';
6955 *q = '\0';
6956 if (!rb_enc_asciicompat(enc)) {
6957 snprintf(q, qend-q, nonascii_suffix, enc->name);
6958 encidx = rb_ascii8bit_encindex();
6959 }
6960 /* result from dump is ASCII */
6961 rb_enc_associate_index(result, encidx);
6963 return result;
6964}
6965
/*
 * Translates the letter that follows a backslash in a dumped string
 * (one of n r t f v b a e) into the control character it denotes.
 * Any other value yields -1; the caller in this file only passes the
 * letters listed here.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n': return '\n';
      case 'r': return '\r';
      case 't': return '\t';
      case 'f': return '\f';
      case 'v': return '\13';
      case 'b': return '\010';
      case 'a': return '\007';
      case 'e': return 033;
      default:  return -1;      /* not a known escape letter */
    }
}
6989
6990static void
6991undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
6992{
6993 const char *s = *ss;
6994 unsigned int c;
6995 int codelen;
6996 size_t hexlen;
6997 unsigned char buf[6];
6998 static rb_encoding *enc_utf8 = NULL;
6999
7000 switch (*s) {
7001 case '\\':
7002 case '"':
7003 case '#':
7004 rb_str_cat(undumped, s, 1); /* cat itself */
7005 s++;
7006 break;
7007 case 'n':
7008 case 'r':
7009 case 't':
7010 case 'f':
7011 case 'v':
7012 case 'b':
7013 case 'a':
7014 case 'e':
7015 *buf = unescape_ascii(*s);
7016 rb_str_cat(undumped, (char *)buf, 1);
7017 s++;
7018 break;
7019 case 'u':
7020 if (*binary) {
7021 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7022 }
7023 *utf8 = true;
7024 if (++s >= s_end) {
7025 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7026 }
7027 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7028 if (*penc != enc_utf8) {
7029 *penc = enc_utf8;
7030 rb_enc_associate(undumped, enc_utf8);
7031 }
7032 if (*s == '{') { /* handle \u{...} form */
7033 s++;
7034 for (;;) {
7035 if (s >= s_end) {
7036 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7037 }
7038 if (*s == '}') {
7039 s++;
7040 break;
7041 }
7042 if (ISSPACE(*s)) {
7043 s++;
7044 continue;
7045 }
7046 c = scan_hex(s, s_end-s, &hexlen);
7047 if (hexlen == 0 || hexlen > 6) {
7048 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7049 }
7050 if (c > 0x10ffff) {
7051 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7052 }
7053 if (0xd800 <= c && c <= 0xdfff) {
7054 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7055 }
7056 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7057 rb_str_cat(undumped, (char *)buf, codelen);
7058 s += hexlen;
7059 }
7060 }
7061 else { /* handle \uXXXX form */
7062 c = scan_hex(s, 4, &hexlen);
7063 if (hexlen != 4) {
7064 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7065 }
7066 if (0xd800 <= c && c <= 0xdfff) {
7067 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7068 }
7069 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7070 rb_str_cat(undumped, (char *)buf, codelen);
7071 s += hexlen;
7072 }
7073 break;
7074 case 'x':
7075 if (*utf8) {
7076 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7077 }
7078 *binary = true;
7079 if (++s >= s_end) {
7080 rb_raise(rb_eRuntimeError, "invalid hex escape");
7081 }
7082 *buf = scan_hex(s, 2, &hexlen);
7083 if (hexlen != 2) {
7084 rb_raise(rb_eRuntimeError, "invalid hex escape");
7085 }
7086 rb_str_cat(undumped, (char *)buf, 1);
7087 s += hexlen;
7088 break;
7089 default:
7090 rb_str_cat(undumped, s-1, 2);
7091 s++;
7092 }
7093
7094 *ss = s;
7095}
7096
7097static VALUE rb_str_is_ascii_only_p(VALUE str);
7098
7099/*
7100 * call-seq:
7101 * undump -> string
7102 *
7103 * Returns an unescaped version of +self+:
7104 *
7105 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7106 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7107 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7108 * s_undumped == s_orig # => true
7109 *
7110 * Related: String#dump (inverse of String#undump).
7111 *
7112 */
7113
7114static VALUE
7115str_undump(VALUE str)
7116{
7117 const char *s = RSTRING_PTR(str);
7118 const char *s_end = RSTRING_END(str);
7119 rb_encoding *enc = rb_enc_get(str);
7120 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7121 bool utf8 = false;
7122 bool binary = false;
7123 int w;
7124
7126 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7127 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7128 }
7129 if (!str_null_check(str, &w)) {
7130 rb_raise(rb_eRuntimeError, "string contains null byte");
7131 }
7132 if (RSTRING_LEN(str) < 2) goto invalid_format;
7133 if (*s != '"') goto invalid_format;
7134
7135 /* strip '"' at the start */
7136 s++;
7137
7138 for (;;) {
7139 if (s >= s_end) {
7140 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7141 }
7142
7143 if (*s == '"') {
7144 /* epilogue */
7145 s++;
7146 if (s == s_end) {
7147 /* ascii compatible dumped string */
7148 break;
7149 }
7150 else {
7151 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7152 static const char dup_suffix[] = ".dup";
7153 const char *encname;
7154 int encidx;
7155 ptrdiff_t size;
7156
7157 /* check separately for strings dumped by older versions */
7158 size = sizeof(dup_suffix) - 1;
7159 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7160
7161 size = sizeof(force_encoding_suffix) - 1;
7162 if (s_end - s <= size) goto invalid_format;
7163 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7164 s += size;
7165
7166 if (utf8) {
7167 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7168 }
7169
7170 encname = s;
7171 s = memchr(s, '"', s_end-s);
7172 size = s - encname;
7173 if (!s) goto invalid_format;
7174 if (s_end - s != 2) goto invalid_format;
7175 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7176
7177 encidx = rb_enc_find_index2(encname, (long)size);
7178 if (encidx < 0) {
7179 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7180 }
7181 rb_enc_associate_index(undumped, encidx);
7182 }
7183 break;
7184 }
7185
7186 if (*s == '\\') {
7187 s++;
7188 if (s >= s_end) {
7189 rb_raise(rb_eRuntimeError, "invalid escape");
7190 }
7191 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7192 }
7193 else {
7194 rb_str_cat(undumped, s++, 1);
7195 }
7196 }
7197
7198 return undumped;
7199invalid_format:
7200 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7201}
7202
7203static void
7204rb_str_check_dummy_enc(rb_encoding *enc)
7205{
7206 if (rb_enc_dummy_p(enc)) {
7207 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7208 rb_enc_name(enc));
7209 }
7210}
7211
7212static rb_encoding *
7213str_true_enc(VALUE str)
7214{
7215 rb_encoding *enc = STR_ENC_GET(str);
7216 rb_str_check_dummy_enc(enc);
7217 return enc;
7218}
7219
7220static OnigCaseFoldType
7221check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7222{
7223 if (argc==0)
7224 return flags;
7225 if (argc>2)
7226 rb_raise(rb_eArgError, "too many options");
7227 if (argv[0]==sym_turkic) {
7228 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7229 if (argc==2) {
7230 if (argv[1]==sym_lithuanian)
7231 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7232 else
7233 rb_raise(rb_eArgError, "invalid second option");
7234 }
7235 }
7236 else if (argv[0]==sym_lithuanian) {
7237 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7238 if (argc==2) {
7239 if (argv[1]==sym_turkic)
7240 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7241 else
7242 rb_raise(rb_eArgError, "invalid second option");
7243 }
7244 }
7245 else if (argc>1)
7246 rb_raise(rb_eArgError, "too many options");
7247 else if (argv[0]==sym_ascii)
7248 flags |= ONIGENC_CASE_ASCII_ONLY;
7249 else if (argv[0]==sym_fold) {
7250 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7251 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7252 else
7253 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7254 }
7255 else
7256 rb_raise(rb_eArgError, "invalid option");
7257 return flags;
7258}
7259
7260static inline bool
7261case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7262{
7263 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7264 return true;
7265 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7266}
7267
7268/* 16 should be long enough to absorb any kind of single character length increase */
7269#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7270#ifndef CASEMAP_DEBUG
7271# define CASEMAP_DEBUG 0
7272#endif
7273
7274struct mapping_buffer;
7275typedef struct mapping_buffer {
7276 size_t capa;
7277 size_t used;
7278 struct mapping_buffer *next;
7279 OnigUChar space[FLEX_ARY_LEN];
7281
7282static void
7283mapping_buffer_free(void *p)
7284{
7285 mapping_buffer *previous_buffer;
7286 mapping_buffer *current_buffer = p;
7287 while (current_buffer) {
7288 previous_buffer = current_buffer;
7289 current_buffer = current_buffer->next;
7290 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7291 }
7292}
7293
7294static const rb_data_type_t mapping_buffer_type = {
7295 "mapping_buffer",
7296 {0, mapping_buffer_free,}
7297};
7298
7299static VALUE
7300rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7301{
7302 VALUE target;
7303
7304 const OnigUChar *source_current, *source_end;
7305 int target_length = 0;
7306 VALUE buffer_anchor;
7307 mapping_buffer *current_buffer = 0;
7308 mapping_buffer **pre_buffer;
7309 size_t buffer_count = 0;
7310 int buffer_length_or_invalid;
7311
7312 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7313
7314 source_current = (OnigUChar*)RSTRING_PTR(source);
7315 source_end = (OnigUChar*)RSTRING_END(source);
7316
7317 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7318 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7319 while (source_current < source_end) {
7320 /* increase multiplier using buffer count to converge quickly */
7321 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7322 if (CASEMAP_DEBUG) {
7323 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7324 }
7325 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7326 *pre_buffer = current_buffer;
7327 pre_buffer = &current_buffer->next;
7328 current_buffer->next = NULL;
7329 current_buffer->capa = capa;
7330 buffer_length_or_invalid = enc->case_map(flags,
7331 &source_current, source_end,
7332 current_buffer->space,
7333 current_buffer->space+current_buffer->capa,
7334 enc);
7335 if (buffer_length_or_invalid < 0) {
7336 current_buffer = DATA_PTR(buffer_anchor);
7337 DATA_PTR(buffer_anchor) = 0;
7338 mapping_buffer_free(current_buffer);
7339 rb_raise(rb_eArgError, "input string invalid");
7340 }
7341 target_length += current_buffer->used = buffer_length_or_invalid;
7342 }
7343 if (CASEMAP_DEBUG) {
7344 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7345 }
7346
7347 if (buffer_count==1) {
7348 target = rb_str_new((const char*)current_buffer->space, target_length);
7349 }
7350 else {
7351 char *target_current;
7352
7353 target = rb_str_new(0, target_length);
7354 target_current = RSTRING_PTR(target);
7355 current_buffer = DATA_PTR(buffer_anchor);
7356 while (current_buffer) {
7357 memcpy(target_current, current_buffer->space, current_buffer->used);
7358 target_current += current_buffer->used;
7359 current_buffer = current_buffer->next;
7360 }
7361 }
7362 current_buffer = DATA_PTR(buffer_anchor);
7363 DATA_PTR(buffer_anchor) = 0;
7364 mapping_buffer_free(current_buffer);
7365
7366 RB_GC_GUARD(buffer_anchor);
7367
7368 /* TODO: check about string terminator character */
7369 str_enc_copy(target, source);
7370 /*ENC_CODERANGE_SET(mapped, cr);*/
7371
7372 return target;
7373}
7374
7375static VALUE
7376rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7377{
7378 const OnigUChar *source_current, *source_end;
7379 OnigUChar *target_current, *target_end;
7380 long old_length = RSTRING_LEN(source);
7381 int length_or_invalid;
7382
7383 if (old_length == 0) return Qnil;
7384
7385 source_current = (OnigUChar*)RSTRING_PTR(source);
7386 source_end = (OnigUChar*)RSTRING_END(source);
7387 if (source == target) {
7388 target_current = (OnigUChar*)source_current;
7389 target_end = (OnigUChar*)source_end;
7390 }
7391 else {
7392 target_current = (OnigUChar*)RSTRING_PTR(target);
7393 target_end = (OnigUChar*)RSTRING_END(target);
7394 }
7395
7396 length_or_invalid = onigenc_ascii_only_case_map(flags,
7397 &source_current, source_end,
7398 target_current, target_end, enc);
7399 if (length_or_invalid < 0)
7400 rb_raise(rb_eArgError, "input string invalid");
7401 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7402 fprintf(stderr, "problem with rb_str_ascii_casemap"
7403 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7404 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7405 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7406 }
7407
7408 str_enc_copy(target, source);
7409
7410 return target;
7411}
7412
7413static bool
7414upcase_single(VALUE str)
7415{
7416 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7417 bool modified = false;
7418
7419 while (s < send) {
7420 unsigned int c = *(unsigned char*)s;
7421
7422 if ('a' <= c && c <= 'z') {
7423 *s = 'A' + (c - 'a');
7424 modified = true;
7425 }
7426 s++;
7427 }
7428 return modified;
7429}
7430
7431/*
7432 * call-seq:
7433 * upcase!(*options) -> self or nil
7434 *
7435 * Upcases the characters in +self+;
7436 * returns +self+ if any changes were made, +nil+ otherwise:
7437 *
7438 * s = 'Hello World!' # => "Hello World!"
7439 * s.upcase! # => "HELLO WORLD!"
7440 * s # => "HELLO WORLD!"
7441 * s.upcase! # => nil
7442 *
7443 * The casing may be affected by the given +options+;
7444 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7445 *
7446 * Related: String#upcase, String#downcase, String#downcase!.
7447 *
7448 */
7449
7450static VALUE
7451rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7452{
7453 rb_encoding *enc;
7454 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7455
7456 flags = check_case_options(argc, argv, flags);
7457 str_modify_keep_cr(str);
7458 enc = str_true_enc(str);
7459 if (case_option_single_p(flags, enc, str)) {
7460 if (upcase_single(str))
7461 flags |= ONIGENC_CASE_MODIFIED;
7462 }
7463 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7464 rb_str_ascii_casemap(str, str, &flags, enc);
7465 else
7466 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7467
7468 if (ONIGENC_CASE_MODIFIED&flags) return str;
7469 return Qnil;
7470}
7471
7472
7473/*
7474 * call-seq:
7475 * upcase(*options) -> string
7476 *
7477 * Returns a string containing the upcased characters in +self+:
7478 *
7479 * s = 'Hello World!' # => "Hello World!"
7480 * s.upcase # => "HELLO WORLD!"
7481 *
7482 * The casing may be affected by the given +options+;
7483 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7484 *
7485 * Related: String#upcase!, String#downcase, String#downcase!.
7486 *
7487 */
7488
7489static VALUE
7490rb_str_upcase(int argc, VALUE *argv, VALUE str)
7491{
7492 rb_encoding *enc;
7493 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7494 VALUE ret;
7495
7496 flags = check_case_options(argc, argv, flags);
7497 enc = str_true_enc(str);
7498 if (case_option_single_p(flags, enc, str)) {
7499 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7500 str_enc_copy(ret, str);
7501 upcase_single(ret);
7502 }
7503 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7504 ret = rb_str_new(0, RSTRING_LEN(str));
7505 rb_str_ascii_casemap(str, ret, &flags, enc);
7506 }
7507 else {
7508 ret = rb_str_casemap(str, &flags, enc);
7509 }
7510
7511 return ret;
7512}
7513
7514static bool
7515downcase_single(VALUE str)
7516{
7517 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7518 bool modified = false;
7519
7520 while (s < send) {
7521 unsigned int c = *(unsigned char*)s;
7522
7523 if ('A' <= c && c <= 'Z') {
7524 *s = 'a' + (c - 'A');
7525 modified = true;
7526 }
7527 s++;
7528 }
7529
7530 return modified;
7531}
7532
7533/*
7534 * call-seq:
7535 * downcase!(*options) -> self or nil
7536 *
7537 * Downcases the characters in +self+;
7538 * returns +self+ if any changes were made, +nil+ otherwise:
7539 *
7540 * s = 'Hello World!' # => "Hello World!"
7541 * s.downcase! # => "hello world!"
7542 * s # => "hello world!"
7543 * s.downcase! # => nil
7544 *
7545 * The casing may be affected by the given +options+;
7546 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7547 *
7548 * Related: String#downcase, String#upcase, String#upcase!.
7549 *
7550 */
7551
7552static VALUE
7553rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7554{
7555 rb_encoding *enc;
7556 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7557
7558 flags = check_case_options(argc, argv, flags);
7559 str_modify_keep_cr(str);
7560 enc = str_true_enc(str);
7561 if (case_option_single_p(flags, enc, str)) {
7562 if (downcase_single(str))
7563 flags |= ONIGENC_CASE_MODIFIED;
7564 }
7565 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7566 rb_str_ascii_casemap(str, str, &flags, enc);
7567 else
7568 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7569
7570 if (ONIGENC_CASE_MODIFIED&flags) return str;
7571 return Qnil;
7572}
7573
7574
7575/*
7576 * call-seq:
7577 * downcase(*options) -> string
7578 *
7579 * Returns a string containing the downcased characters in +self+:
7580 *
7581 * s = 'Hello World!' # => "Hello World!"
7582 * s.downcase # => "hello world!"
7583 *
7584 * The casing may be affected by the given +options+;
7585 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7586 *
7587 * Related: String#downcase!, String#upcase, String#upcase!.
7588 *
7589 */
7590
7591static VALUE
7592rb_str_downcase(int argc, VALUE *argv, VALUE str)
7593{
7594 rb_encoding *enc;
7595 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7596 VALUE ret;
7597
7598 flags = check_case_options(argc, argv, flags);
7599 enc = str_true_enc(str);
7600 if (case_option_single_p(flags, enc, str)) {
7601 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7602 str_enc_copy(ret, str);
7603 downcase_single(ret);
7604 }
7605 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7606 ret = rb_str_new(0, RSTRING_LEN(str));
7607 rb_str_ascii_casemap(str, ret, &flags, enc);
7608 }
7609 else {
7610 ret = rb_str_casemap(str, &flags, enc);
7611 }
7612
7613 return ret;
7614}
7615
7616
7617/*
7618 * call-seq:
7619 * capitalize!(*options) -> self or nil
7620 *
7621 * Upcases the first character in +self+;
7622 * downcases the remaining characters;
7623 * returns +self+ if any changes were made, +nil+ otherwise:
7624 *
7625 * s = 'hello World!' # => "hello World!"
7626 * s.capitalize! # => "Hello world!"
7627 * s # => "Hello world!"
7628 * s.capitalize! # => nil
7629 *
7630 * The casing may be affected by the given +options+;
7631 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7632 *
7633 * Related: String#capitalize.
7634 *
7635 */
7636
7637static VALUE
7638rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7639{
7640 rb_encoding *enc;
7641 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7642
7643 flags = check_case_options(argc, argv, flags);
7644 str_modify_keep_cr(str);
7645 enc = str_true_enc(str);
7646 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7647 if (flags&ONIGENC_CASE_ASCII_ONLY)
7648 rb_str_ascii_casemap(str, str, &flags, enc);
7649 else
7650 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7651
7652 if (ONIGENC_CASE_MODIFIED&flags) return str;
7653 return Qnil;
7654}
7655
7656
7657/*
7658 * call-seq:
7659 * capitalize(*options) -> string
7660 *
7661 * Returns a string containing the characters in +self+;
7662 * the first character is upcased;
7663 * the remaining characters are downcased:
7664 *
7665 * s = 'hello World!' # => "hello World!"
7666 * s.capitalize # => "Hello world!"
7667 *
7668 * The casing may be affected by the given +options+;
7669 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7670 *
7671 * Related: String#capitalize!.
7672 *
7673 */
7674
7675static VALUE
7676rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7677{
7678 rb_encoding *enc;
7679 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7680 VALUE ret;
7681
7682 flags = check_case_options(argc, argv, flags);
7683 enc = str_true_enc(str);
7684 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7685 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7686 ret = rb_str_new(0, RSTRING_LEN(str));
7687 rb_str_ascii_casemap(str, ret, &flags, enc);
7688 }
7689 else {
7690 ret = rb_str_casemap(str, &flags, enc);
7691 }
7692 return ret;
7693}
7694
7695
7696/*
7697 * call-seq:
7698 * swapcase!(*options) -> self or nil
7699 *
7700 * Upcases each lowercase character in +self+;
7701 * downcases each uppercase character;
7702 * returns +self+ if any changes were made, +nil+ otherwise:
7703 *
7704 * s = 'Hello World!' # => "Hello World!"
7705 * s.swapcase! # => "hELLO wORLD!"
7706 * s # => "hELLO wORLD!"
7707 * ''.swapcase! # => nil
7708 *
7709 * The casing may be affected by the given +options+;
7710 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7711 *
7712 * Related: String#swapcase.
7713 *
7714 */
7715
7716static VALUE
7717rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7718{
7719 rb_encoding *enc;
7720 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7721
7722 flags = check_case_options(argc, argv, flags);
7723 str_modify_keep_cr(str);
7724 enc = str_true_enc(str);
7725 if (flags&ONIGENC_CASE_ASCII_ONLY)
7726 rb_str_ascii_casemap(str, str, &flags, enc);
7727 else
7728 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7729
7730 if (ONIGENC_CASE_MODIFIED&flags) return str;
7731 return Qnil;
7732}
7733
7734
7735/*
7736 * call-seq:
7737 * swapcase(*options) -> string
7738 *
7739 * Returns a string containing the characters in +self+, with cases reversed;
7740 * each uppercase character is downcased;
7741 * each lowercase character is upcased:
7742 *
7743 * s = 'Hello World!' # => "Hello World!"
7744 * s.swapcase # => "hELLO wORLD!"
7745 *
7746 * The casing may be affected by the given +options+;
7747 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7748 *
7749 * Related: String#swapcase!.
7750 *
7751 */
7752
7753static VALUE
7754rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7755{
7756 rb_encoding *enc;
7757 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7758 VALUE ret;
7759
7760 flags = check_case_options(argc, argv, flags);
7761 enc = str_true_enc(str);
7762 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7763 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7764 ret = rb_str_new(0, RSTRING_LEN(str));
7765 rb_str_ascii_casemap(str, ret, &flags, enc);
7766 }
7767 else {
7768 ret = rb_str_casemap(str, &flags, enc);
7769 }
7770 return ret;
7771}
7772
typedef unsigned char *USTR;

/* Cursor over a tr-style character selector; advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a range ("a-z") is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* end of the selector */
};
7780
7781static unsigned int
7782trnext(struct tr *t, rb_encoding *enc)
7783{
7784 int n;
7785
7786 for (;;) {
7787 nextpart:
7788 if (!t->gen) {
7789 if (t->p == t->pend) return -1;
7790 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7791 t->p += n;
7792 }
7793 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7794 t->p += n;
7795 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7796 t->p += n;
7797 if (t->p < t->pend) {
7798 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7799 t->p += n;
7800 if (t->now > c) {
7801 if (t->now < 0x80 && c < 0x80) {
7803 "invalid range \"%c-%c\" in string transliteration",
7804 t->now, c);
7805 }
7806 else {
7807 rb_raise(rb_eArgError, "invalid range in string transliteration");
7808 }
7809 continue; /* not reached */
7810 }
7811 t->gen = 1;
7812 t->max = c;
7813 }
7814 }
7815 return t->now;
7816 }
7817 else {
7818 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7819 if (t->now == t->max) {
7820 t->gen = 0;
7821 goto nextpart;
7822 }
7823 }
7824 if (t->now < t->max) {
7825 return t->now;
7826 }
7827 else {
7828 t->gen = 0;
7829 return t->max;
7830 }
7831 }
7832 }
7833}
7834
7835static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7836
7837static VALUE
7838tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7839{
7840 const unsigned int errc = -1;
7841 unsigned int trans[256];
7842 rb_encoding *enc, *e1, *e2;
7843 struct tr trsrc, trrepl;
7844 int cflag = 0;
7845 unsigned int c, c0, last = 0;
7846 int modify = 0, i, l;
7847 unsigned char *s, *send;
7848 VALUE hash = 0;
7849 int singlebyte = single_byte_optimizable(str);
7850 int termlen;
7851 int cr;
7852
7853#define CHECK_IF_ASCII(c) \
7854 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7855 (cr = ENC_CODERANGE_VALID) : 0)
7856
7857 StringValue(src);
7858 StringValue(repl);
7859 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7860 if (RSTRING_LEN(repl) == 0) {
7861 return rb_str_delete_bang(1, &src, str);
7862 }
7863
7864 cr = ENC_CODERANGE(str);
7865 e1 = rb_enc_check(str, src);
7866 e2 = rb_enc_check(str, repl);
7867 if (e1 == e2) {
7868 enc = e1;
7869 }
7870 else {
7871 enc = rb_enc_check(src, repl);
7872 }
7873 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7874 if (RSTRING_LEN(src) > 1 &&
7875 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7876 trsrc.p + l < trsrc.pend) {
7877 cflag = 1;
7878 trsrc.p += l;
7879 }
7880 trrepl.p = RSTRING_PTR(repl);
7881 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7882 trsrc.gen = trrepl.gen = 0;
7883 trsrc.now = trrepl.now = 0;
7884 trsrc.max = trrepl.max = 0;
7885
7886 if (cflag) {
7887 for (i=0; i<256; i++) {
7888 trans[i] = 1;
7889 }
7890 while ((c = trnext(&trsrc, enc)) != errc) {
7891 if (c < 256) {
7892 trans[c] = errc;
7893 }
7894 else {
7895 if (!hash) hash = rb_hash_new();
7896 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7897 }
7898 }
7899 while ((c = trnext(&trrepl, enc)) != errc)
7900 /* retrieve last replacer */;
7901 last = trrepl.now;
7902 for (i=0; i<256; i++) {
7903 if (trans[i] != errc) {
7904 trans[i] = last;
7905 }
7906 }
7907 }
7908 else {
7909 unsigned int r;
7910
7911 for (i=0; i<256; i++) {
7912 trans[i] = errc;
7913 }
7914 while ((c = trnext(&trsrc, enc)) != errc) {
7915 r = trnext(&trrepl, enc);
7916 if (r == errc) r = trrepl.now;
7917 if (c < 256) {
7918 trans[c] = r;
7919 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7920 }
7921 else {
7922 if (!hash) hash = rb_hash_new();
7923 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7924 }
7925 }
7926 }
7927
7928 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7929 cr = ENC_CODERANGE_7BIT;
7930 str_modify_keep_cr(str);
7931 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7932 termlen = rb_enc_mbminlen(enc);
7933 if (sflag) {
7934 int clen, tlen;
7935 long offset, max = RSTRING_LEN(str);
7936 unsigned int save = -1;
7937 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7938
7939 while (s < send) {
7940 int may_modify = 0;
7941
7942 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7943 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7944
7945 s += clen;
7946 if (c < 256) {
7947 c = trans[c];
7948 }
7949 else if (hash) {
7950 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7951 if (NIL_P(tmp)) {
7952 if (cflag) c = last;
7953 else c = errc;
7954 }
7955 else if (cflag) c = errc;
7956 else c = NUM2INT(tmp);
7957 }
7958 else {
7959 c = errc;
7960 }
7961 if (c != (unsigned int)-1) {
7962 if (save == c) {
7963 CHECK_IF_ASCII(c);
7964 continue;
7965 }
7966 save = c;
7967 tlen = rb_enc_codelen(c, enc);
7968 modify = 1;
7969 }
7970 else {
7971 save = -1;
7972 c = c0;
7973 if (enc != e1) may_modify = 1;
7974 }
7975 if ((offset = t - buf) + tlen > max) {
7976 size_t MAYBE_UNUSED(old) = max + termlen;
7977 max = offset + tlen + (send - s);
7978 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
7979 t = buf + offset;
7980 }
7981 rb_enc_mbcput(c, t, enc);
7982 if (may_modify && memcmp(s, t, tlen) != 0) {
7983 modify = 1;
7984 }
7985 CHECK_IF_ASCII(c);
7986 t += tlen;
7987 }
7988 if (!STR_EMBED_P(str)) {
7989 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7990 }
7991 TERM_FILL((char *)t, termlen);
7992 RSTRING(str)->as.heap.ptr = (char *)buf;
7993 RSTRING(str)->as.heap.len = t - buf;
7994 STR_SET_NOEMBED(str);
7995 RSTRING(str)->as.heap.aux.capa = max;
7996 }
7997 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
7998 while (s < send) {
7999 c = (unsigned char)*s;
8000 if (trans[c] != errc) {
8001 if (!cflag) {
8002 c = trans[c];
8003 *s = c;
8004 modify = 1;
8005 }
8006 else {
8007 *s = last;
8008 modify = 1;
8009 }
8010 }
8011 CHECK_IF_ASCII(c);
8012 s++;
8013 }
8014 }
8015 else {
8016 int clen, tlen;
8017 long offset, max = (long)((send - s) * 1.2);
8018 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8019
8020 while (s < send) {
8021 int may_modify = 0;
8022 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
8023 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8024
8025 if (c < 256) {
8026 c = trans[c];
8027 }
8028 else if (hash) {
8029 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8030 if (NIL_P(tmp)) {
8031 if (cflag) c = last;
8032 else c = errc;
8033 }
8034 else if (cflag) c = errc;
8035 else c = NUM2INT(tmp);
8036 }
8037 else {
8038 c = cflag ? last : errc;
8039 }
8040 if (c != errc) {
8041 tlen = rb_enc_codelen(c, enc);
8042 modify = 1;
8043 }
8044 else {
8045 c = c0;
8046 if (enc != e1) may_modify = 1;
8047 }
8048 if ((offset = t - buf) + tlen > max) {
8049 size_t MAYBE_UNUSED(old) = max + termlen;
8050 max = offset + tlen + (long)((send - s) * 1.2);
8051 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8052 t = buf + offset;
8053 }
8054 if (s != t) {
8055 rb_enc_mbcput(c, t, enc);
8056 if (may_modify && memcmp(s, t, tlen) != 0) {
8057 modify = 1;
8058 }
8059 }
8060 CHECK_IF_ASCII(c);
8061 s += clen;
8062 t += tlen;
8063 }
8064 if (!STR_EMBED_P(str)) {
8065 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8066 }
8067 TERM_FILL((char *)t, termlen);
8068 RSTRING(str)->as.heap.ptr = (char *)buf;
8069 RSTRING(str)->as.heap.len = t - buf;
8070 STR_SET_NOEMBED(str);
8071 RSTRING(str)->as.heap.aux.capa = max;
8072 }
8073
8074 if (modify) {
8075 if (cr != ENC_CODERANGE_BROKEN)
8076 ENC_CODERANGE_SET(str, cr);
8077 rb_enc_associate(str, enc);
8078 return str;
8079 }
8080 return Qnil;
8081}
8082
8083
8084/*
8085 * call-seq:
8086 * tr!(selector, replacements) -> self or nil
8087 *
8088 * Like String#tr, but modifies +self+ in place.
8089 * Returns +self+ if any changes were made, +nil+ otherwise.
8090 *
8091 */
8092
8093static VALUE
8094rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8095{
8096 return tr_trans(str, src, repl, 0);
8097}
8098
8099
8100/*
8101 * call-seq:
8102 * tr(selector, replacements) -> new_string
8103 *
8104 * Returns a copy of +self+ with each character specified by string +selector+
8105 * translated to the corresponding character in string +replacements+.
8106 * The correspondence is _positional_:
8107 *
8108 * - Each occurrence of the first character specified by +selector+
8109 * is translated to the first character in +replacements+.
8110 * - Each occurrence of the second character specified by +selector+
8111 * is translated to the second character in +replacements+.
8112 * - And so on.
8113 *
8114 * Example:
8115 *
8116 * 'hello'.tr('el', 'ip') #=> "hippo"
8117 *
8118 * If +replacements+ is shorter than +selector+,
8119 * it is implicitly padded with its own last character:
8120 *
8121 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8122 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8123 *
8124 * Arguments +selector+ and +replacements+ must be valid character selectors
8125 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8126 * and may use any of its valid forms, including negation, ranges, and escaping:
8127 *
8128 * # Negation.
8129 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8130 * # Ranges.
8131 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8132 * # Escapes.
8133 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8134 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8135 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8136 *
8137 */
8138
8139static VALUE
8140rb_str_tr(VALUE str, VALUE src, VALUE repl)
8141{
8142 str = str_duplicate(rb_cString, str);
8143 tr_trans(str, src, repl, 0);
8144 return str;
8145}
8146
8147#define TR_TABLE_MAX (UCHAR_MAX+1)
8148#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8149static void
8150tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8151 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8152{
8153 const unsigned int errc = -1;
8154 char buf[TR_TABLE_MAX];
8155 struct tr tr;
8156 unsigned int c;
8157 VALUE table = 0, ptable = 0;
8158 int i, l, cflag = 0;
8159
8160 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8161 tr.gen = tr.now = tr.max = 0;
8162
8163 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8164 cflag = 1;
8165 tr.p += l;
8166 }
8167 if (first) {
8168 for (i=0; i<TR_TABLE_MAX; i++) {
8169 stable[i] = 1;
8170 }
8171 stable[TR_TABLE_MAX] = cflag;
8172 }
8173 else if (stable[TR_TABLE_MAX] && !cflag) {
8174 stable[TR_TABLE_MAX] = 0;
8175 }
8176 for (i=0; i<TR_TABLE_MAX; i++) {
8177 buf[i] = cflag;
8178 }
8179
8180 while ((c = trnext(&tr, enc)) != errc) {
8181 if (c < TR_TABLE_MAX) {
8182 buf[(unsigned char)c] = !cflag;
8183 }
8184 else {
8185 VALUE key = UINT2NUM(c);
8186
8187 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8188 if (cflag) {
8189 ptable = *ctablep;
8190 table = ptable ? ptable : rb_hash_new();
8191 *ctablep = table;
8192 }
8193 else {
8194 table = rb_hash_new();
8195 ptable = *tablep;
8196 *tablep = table;
8197 }
8198 }
8199 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8200 rb_hash_aset(table, key, Qtrue);
8201 }
8202 }
8203 }
8204 for (i=0; i<TR_TABLE_MAX; i++) {
8205 stable[i] = stable[i] && buf[i];
8206 }
8207 if (!table && !cflag) {
8208 *tablep = 0;
8209 }
8210}
8211
8212
8213static int
8214tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8215{
8216 if (c < TR_TABLE_MAX) {
8217 return table[c] != 0;
8218 }
8219 else {
8220 VALUE v = UINT2NUM(c);
8221
8222 if (del) {
8223 if (!NIL_P(rb_hash_lookup(del, v)) &&
8224 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8225 return TRUE;
8226 }
8227 }
8228 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8229 return FALSE;
8230 }
8231 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8232 }
8233}
8234
8235/*
8236 * call-seq:
8237 * delete!(*selectors) -> self or nil
8238 *
8239 * Like String#delete, but modifies +self+ in place.
8240 * Returns +self+ if any changes were made, +nil+ otherwise.
8241 *
8242 */
8243
8244static VALUE
8245rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8246{
8247 char squeez[TR_TABLE_SIZE];
8248 rb_encoding *enc = 0;
8249 char *s, *send, *t;
8250 VALUE del = 0, nodel = 0;
8251 int modify = 0;
8252 int i, ascompat, cr;
8253
8254 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8256 for (i=0; i<argc; i++) {
8257 VALUE s = argv[i];
8258
8259 StringValue(s);
8260 enc = rb_enc_check(str, s);
8261 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8262 }
8263
8264 str_modify_keep_cr(str);
8265 ascompat = rb_enc_asciicompat(enc);
8266 s = t = RSTRING_PTR(str);
8267 send = RSTRING_END(str);
8268 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8269 while (s < send) {
8270 unsigned int c;
8271 int clen;
8272
8273 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8274 if (squeez[c]) {
8275 modify = 1;
8276 }
8277 else {
8278 if (t != s) *t = c;
8279 t++;
8280 }
8281 s++;
8282 }
8283 else {
8284 c = rb_enc_codepoint_len(s, send, &clen, enc);
8285
8286 if (tr_find(c, squeez, del, nodel)) {
8287 modify = 1;
8288 }
8289 else {
8290 if (t != s) rb_enc_mbcput(c, t, enc);
8291 t += clen;
8293 }
8294 s += clen;
8295 }
8296 }
8297 TERM_FILL(t, TERM_LEN(str));
8298 STR_SET_LEN(str, t - RSTRING_PTR(str));
8299 ENC_CODERANGE_SET(str, cr);
8300
8301 if (modify) return str;
8302 return Qnil;
8303}
8304
8305
8306/*
8307 * call-seq:
8308 * delete(*selectors) -> new_string
8309 *
8310 * Returns a copy of +self+ with characters specified by +selectors+ removed
8311 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8312 *
8313 * "hello".delete "l","lo" #=> "heo"
8314 * "hello".delete "lo" #=> "he"
8315 * "hello".delete "aeiou", "^e" #=> "hell"
8316 * "hello".delete "ej-m" #=> "ho"
8317 *
8318 */
8319
8320static VALUE
8321rb_str_delete(int argc, VALUE *argv, VALUE str)
8322{
8323 str = str_duplicate(rb_cString, str);
8324 rb_str_delete_bang(argc, argv, str);
8325 return str;
8326}
8327
8328
8329/*
8330 * call-seq:
8331 * squeeze!(*selectors) -> self or nil
8332 *
8333 * Like String#squeeze, but modifies +self+ in place.
8334 * Returns +self+ if any changes were made, +nil+ otherwise.
8335 */
8336
8337static VALUE
8338rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8339{
8340 char squeez[TR_TABLE_SIZE];
8341 rb_encoding *enc = 0;
8342 VALUE del = 0, nodel = 0;
8343 unsigned char *s, *send, *t;
8344 int i, modify = 0;
8345 int ascompat, singlebyte = single_byte_optimizable(str);
8346 unsigned int save;
8347
8348 if (argc == 0) {
8349 enc = STR_ENC_GET(str);
8350 }
8351 else {
8352 for (i=0; i<argc; i++) {
8353 VALUE s = argv[i];
8354
8355 StringValue(s);
8356 enc = rb_enc_check(str, s);
8357 if (singlebyte && !single_byte_optimizable(s))
8358 singlebyte = 0;
8359 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8360 }
8361 }
8362
8363 str_modify_keep_cr(str);
8364 s = t = (unsigned char *)RSTRING_PTR(str);
8365 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8366 send = (unsigned char *)RSTRING_END(str);
8367 save = -1;
8368 ascompat = rb_enc_asciicompat(enc);
8369
8370 if (singlebyte) {
8371 while (s < send) {
8372 unsigned int c = *s++;
8373 if (c != save || (argc > 0 && !squeez[c])) {
8374 *t++ = save = c;
8375 }
8376 }
8377 }
8378 else {
8379 while (s < send) {
8380 unsigned int c;
8381 int clen;
8382
8383 if (ascompat && (c = *s) < 0x80) {
8384 if (c != save || (argc > 0 && !squeez[c])) {
8385 *t++ = save = c;
8386 }
8387 s++;
8388 }
8389 else {
8390 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8391
8392 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8393 if (t != s) rb_enc_mbcput(c, t, enc);
8394 save = c;
8395 t += clen;
8396 }
8397 s += clen;
8398 }
8399 }
8400 }
8401
8402 TERM_FILL((char *)t, TERM_LEN(str));
8403 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8404 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8405 modify = 1;
8406 }
8407
8408 if (modify) return str;
8409 return Qnil;
8410}
8411
8412
8413/*
8414 * call-seq:
8415 * squeeze(*selectors) -> new_string
8416 *
8417 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8418 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8419 *
8420 * "Squeezed" means that each multiple-character run of a selected character
8421 * is squeezed down to a single character;
8422 * with no arguments given, squeezes all characters:
8423 *
8424 * "yellow moon".squeeze #=> "yelow mon"
8425 * " now is the".squeeze(" ") #=> " now is the"
8426 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8427 *
8428 */
8429
8430static VALUE
8431rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8432{
8433 str = str_duplicate(rb_cString, str);
8434 rb_str_squeeze_bang(argc, argv, str);
8435 return str;
8436}
8437
8438
8439/*
8440 * call-seq:
8441 * tr_s!(selector, replacements) -> self or nil
8442 *
8443 * Like String#tr_s, but modifies +self+ in place.
8444 * Returns +self+ if any changes were made, +nil+ otherwise.
8445 *
8446 * Related: String#squeeze!.
8447 */
8448
8449static VALUE
8450rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8451{
8452 return tr_trans(str, src, repl, 1);
8453}
8454
8455
8456/*
8457 * call-seq:
8458 * tr_s(selector, replacements) -> string
8459 *
8460 * Like String#tr, but also squeezes the modified portions of the translated string;
8461 * returns a new string (translated and squeezed).
8462 *
8463 * 'hello'.tr_s('l', 'r') #=> "hero"
8464 * 'hello'.tr_s('el', '-') #=> "h-o"
8465 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8466 *
8467 * Related: String#squeeze.
8468 *
8469 */
8470
8471static VALUE
8472rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8473{
8474 str = str_duplicate(rb_cString, str);
8475 tr_trans(str, src, repl, 1);
8476 return str;
8477}
8478
8479
8480/*
8481 * call-seq:
8482 * count(*selectors) -> integer
8483 *
8484 * Returns the total number of characters in +self+
8485 * that are specified by the given +selectors+
8486 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8487 *
8488 * a = "hello world"
8489 * a.count "lo" #=> 5
8490 * a.count "lo", "o" #=> 2
8491 * a.count "hello", "^l" #=> 4
8492 * a.count "ej-m" #=> 4
8493 *
8494 * "hello^world".count "\\^aeiou" #=> 4
8495 * "hello-world".count "a\\-eo" #=> 4
8496 *
8497 * c = "hello world\\r\\n"
8498 * c.count "\\" #=> 2
8499 * c.count "\\A" #=> 0
8500 * c.count "X-\\w" #=> 3
8501 */
8502
8503static VALUE
8504rb_str_count(int argc, VALUE *argv, VALUE str)
8505{
8506 char table[TR_TABLE_SIZE];
8507 rb_encoding *enc = 0;
8508 VALUE del = 0, nodel = 0, tstr;
8509 char *s, *send;
8510 int i;
8511 int ascompat;
8512 size_t n = 0;
8513
8515
8516 tstr = argv[0];
8517 StringValue(tstr);
8518 enc = rb_enc_check(str, tstr);
8519 if (argc == 1) {
8520 const char *ptstr;
8521 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8522 (ptstr = RSTRING_PTR(tstr),
8523 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8524 !is_broken_string(str)) {
8525 int clen;
8526 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8527
8528 s = RSTRING_PTR(str);
8529 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8530 send = RSTRING_END(str);
8531 while (s < send) {
8532 if (*(unsigned char*)s++ == c) n++;
8533 }
8534 return SIZET2NUM(n);
8535 }
8536 }
8537
8538 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8539 for (i=1; i<argc; i++) {
8540 tstr = argv[i];
8541 StringValue(tstr);
8542 enc = rb_enc_check(str, tstr);
8543 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8544 }
8545
8546 s = RSTRING_PTR(str);
8547 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8548 send = RSTRING_END(str);
8549 ascompat = rb_enc_asciicompat(enc);
8550 while (s < send) {
8551 unsigned int c;
8552
8553 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8554 if (table[c]) {
8555 n++;
8556 }
8557 s++;
8558 }
8559 else {
8560 int clen;
8561 c = rb_enc_codepoint_len(s, send, &clen, enc);
8562 if (tr_find(c, table, del, nodel)) {
8563 n++;
8564 }
8565 s += clen;
8566 }
8567 }
8568
8569 return SIZET2NUM(n);
8570}
8571
8572static VALUE
8573rb_fs_check(VALUE val)
8574{
8575 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8576 val = rb_check_string_type(val);
8577 if (NIL_P(val)) return 0;
8578 }
8579 return val;
8580}
8581
/*
 * Byte-indexed lookup table: 1 for TAB, LF, VT, FF, CR and SPACE, 0 for
 * every other byte value (entries not listed are zero-initialised).
 */
static const char isspacetable[256] = {
    ['\t'] = 1,
    ['\n'] = 1,
    ['\v'] = 1,
    ['\f'] = 1,
    ['\r'] = 1,
    [' ']  = 1
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8602
8603static long
8604split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8605{
8606 if (empty_count >= 0 && len == 0) {
8607 return empty_count + 1;
8608 }
8609 if (empty_count > 0) {
8610 /* make different substrings */
8611 if (result) {
8612 do {
8613 rb_ary_push(result, str_new_empty_String(str));
8614 } while (--empty_count > 0);
8615 }
8616 else {
8617 do {
8618 rb_yield(str_new_empty_String(str));
8619 } while (--empty_count > 0);
8620 }
8621 }
8622 str = rb_str_subseq(str, beg, len);
8623 if (result) {
8624 rb_ary_push(result, str);
8625 }
8626 else {
8627 rb_yield(str);
8628 }
8629 return empty_count;
8630}
8631
/* How a split pattern is to be interpreted. */
typedef enum {
    SPLIT_TYPE_AWK,     /* chosen for a single-space pattern or a nil $; */
    SPLIT_TYPE_STRING,
    SPLIT_TYPE_REGEXP,
    SPLIT_TYPE_CHARS    /* chosen for an empty pattern: split into chars */
} split_type_t;
8635
8636static split_type_t
8637literal_split_pattern(VALUE spat, split_type_t default_type)
8638{
8639 rb_encoding *enc = STR_ENC_GET(spat);
8640 const char *ptr;
8641 long len;
8642 RSTRING_GETMEM(spat, ptr, len);
8643 if (len == 0) {
8644 /* Special case - split into chars */
8645 return SPLIT_TYPE_CHARS;
8646 }
8647 else if (rb_enc_asciicompat(enc)) {
8648 if (len == 1 && ptr[0] == ' ') {
8649 return SPLIT_TYPE_AWK;
8650 }
8651 }
8652 else {
8653 int l;
8654 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8655 return SPLIT_TYPE_AWK;
8656 }
8657 }
8658 return default_type;
8659}
8660
8661/*
8662 * call-seq:
8663 * split(field_sep = $;, limit = nil) -> array
8664 * split(field_sep = $;, limit = nil) {|substring| ... } -> self
8665 *
8666 * :include: doc/string/split.rdoc
8667 *
8668 */
8669
8670static VALUE
8671rb_str_split_m(int argc, VALUE *argv, VALUE str)
8672{
8673 rb_encoding *enc;
8674 VALUE spat;
8675 VALUE limit;
8676 split_type_t split_type;
8677 long beg, end, i = 0, empty_count = -1;
8678 int lim = 0;
8679 VALUE result, tmp;
8680
8681 result = rb_block_given_p() ? Qfalse : Qnil;
8682 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8683 lim = NUM2INT(limit);
8684 if (lim <= 0) limit = Qnil;
8685 else if (lim == 1) {
8686 if (RSTRING_LEN(str) == 0)
8687 return result ? rb_ary_new2(0) : str;
8688 tmp = str_duplicate(rb_cString, str);
8689 if (!result) {
8690 rb_yield(tmp);
8691 return str;
8692 }
8693 return rb_ary_new3(1, tmp);
8694 }
8695 i = 1;
8696 }
8697 if (NIL_P(limit) && !lim) empty_count = 0;
8698
8699 enc = STR_ENC_GET(str);
8700 split_type = SPLIT_TYPE_REGEXP;
8701 if (!NIL_P(spat)) {
8702 spat = get_pat_quoted(spat, 0);
8703 }
8704 else if (NIL_P(spat = rb_fs)) {
8705 split_type = SPLIT_TYPE_AWK;
8706 }
8707 else if (!(spat = rb_fs_check(spat))) {
8708 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8709 }
8710 else {
8711 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8712 }
8713 if (split_type != SPLIT_TYPE_AWK) {
8714 switch (BUILTIN_TYPE(spat)) {
8715 case T_REGEXP:
8716 rb_reg_options(spat); /* check if uninitialized */
8717 tmp = RREGEXP_SRC(spat);
8718 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8719 if (split_type == SPLIT_TYPE_AWK) {
8720 spat = tmp;
8721 split_type = SPLIT_TYPE_STRING;
8722 }
8723 break;
8724
8725 case T_STRING:
8726 mustnot_broken(spat);
8727 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8728 break;
8729
8730 default:
8732 }
8733 }
8734
8735#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8736
8737 if (result) result = rb_ary_new();
8738 beg = 0;
8739 char *ptr = RSTRING_PTR(str);
8740 char *eptr = RSTRING_END(str);
8741 if (split_type == SPLIT_TYPE_AWK) {
8742 char *bptr = ptr;
8743 int skip = 1;
8744 unsigned int c;
8745
8746 end = beg;
8747 if (is_ascii_string(str)) {
8748 while (ptr < eptr) {
8749 c = (unsigned char)*ptr++;
8750 if (skip) {
8751 if (ascii_isspace(c)) {
8752 beg = ptr - bptr;
8753 }
8754 else {
8755 end = ptr - bptr;
8756 skip = 0;
8757 if (!NIL_P(limit) && lim <= i) break;
8758 }
8759 }
8760 else if (ascii_isspace(c)) {
8761 SPLIT_STR(beg, end-beg);
8762 skip = 1;
8763 beg = ptr - bptr;
8764 if (!NIL_P(limit)) ++i;
8765 }
8766 else {
8767 end = ptr - bptr;
8768 }
8769 }
8770 }
8771 else {
8772 while (ptr < eptr) {
8773 int n;
8774
8775 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8776 ptr += n;
8777 if (skip) {
8778 if (rb_isspace(c)) {
8779 beg = ptr - bptr;
8780 }
8781 else {
8782 end = ptr - bptr;
8783 skip = 0;
8784 if (!NIL_P(limit) && lim <= i) break;
8785 }
8786 }
8787 else if (rb_isspace(c)) {
8788 SPLIT_STR(beg, end-beg);
8789 skip = 1;
8790 beg = ptr - bptr;
8791 if (!NIL_P(limit)) ++i;
8792 }
8793 else {
8794 end = ptr - bptr;
8795 }
8796 }
8797 }
8798 }
8799 else if (split_type == SPLIT_TYPE_STRING) {
8800 char *str_start = ptr;
8801 char *substr_start = ptr;
8802 char *sptr = RSTRING_PTR(spat);
8803 long slen = RSTRING_LEN(spat);
8804
8805 mustnot_broken(str);
8806 enc = rb_enc_check(str, spat);
8807 while (ptr < eptr &&
8808 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8809 /* Check we are at the start of a char */
8810 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8811 if (t != ptr + end) {
8812 ptr = t;
8813 continue;
8814 }
8815 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8816 ptr += end + slen;
8817 substr_start = ptr;
8818 if (!NIL_P(limit) && lim <= ++i) break;
8819 }
8820 beg = ptr - str_start;
8821 }
8822 else if (split_type == SPLIT_TYPE_CHARS) {
8823 char *str_start = ptr;
8824 int n;
8825
8826 mustnot_broken(str);
8827 enc = rb_enc_get(str);
8828 while (ptr < eptr &&
8829 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8830 SPLIT_STR(ptr - str_start, n);
8831 ptr += n;
8832 if (!NIL_P(limit) && lim <= ++i) break;
8833 }
8834 beg = ptr - str_start;
8835 }
8836 else {
8837 long len = RSTRING_LEN(str);
8838 long start = beg;
8839 long idx;
8840 int last_null = 0;
8841 struct re_registers *regs;
8842 VALUE match = 0;
8843
8844 for (; rb_reg_search(spat, str, start, 0) >= 0;
8845 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8846 match = rb_backref_get();
8847 if (!result) rb_match_busy(match);
8848 regs = RMATCH_REGS(match);
8849 end = BEG(0);
8850 if (start == end && BEG(0) == END(0)) {
8851 if (!ptr) {
8852 SPLIT_STR(0, 0);
8853 break;
8854 }
8855 else if (last_null == 1) {
8856 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8857 beg = start;
8858 }
8859 else {
8860 if (start == len)
8861 start++;
8862 else
8863 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8864 last_null = 1;
8865 continue;
8866 }
8867 }
8868 else {
8869 SPLIT_STR(beg, end-beg);
8870 beg = start = END(0);
8871 }
8872 last_null = 0;
8873
8874 for (idx=1; idx < regs->num_regs; idx++) {
8875 if (BEG(idx) == -1) continue;
8876 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8877 }
8878 if (!NIL_P(limit) && lim <= ++i) break;
8879 }
8880 if (match) rb_match_unbusy(match);
8881 }
8882 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8883 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8884 }
8885
8886 return result ? result : str;
8887}
8888
8889VALUE
8890rb_str_split(VALUE str, const char *sep0)
8891{
8892 VALUE sep;
8893
8894 StringValue(str);
8895 sep = rb_str_new_cstr(sep0);
8896 return rb_str_split_m(1, &sep, str);
8897}
8898
8899#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8900
8901static inline int
8902enumerator_element(VALUE ary, VALUE e)
8903{
8904 if (ary) {
8905 rb_ary_push(ary, e);
8906 return 0;
8907 }
8908 else {
8909 rb_yield(e);
8910 return 1;
8911 }
8912}
8913
8914#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8915
8916static const char *
8917chomp_newline(const char *p, const char *e, rb_encoding *enc)
8918{
8919 const char *prev = rb_enc_prev_char(p, e, e, enc);
8920 if (rb_enc_is_newline(prev, e, enc)) {
8921 e = prev;
8922 prev = rb_enc_prev_char(p, e, e, enc);
8923 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8924 e = prev;
8925 }
8926 return e;
8927}
8928
8929static VALUE
8930get_rs(void)
8931{
8932 VALUE rs = rb_rs;
8933 if (!NIL_P(rs) &&
8934 (!RB_TYPE_P(rs, T_STRING) ||
8935 RSTRING_LEN(rs) != 1 ||
8936 RSTRING_PTR(rs)[0] != '\n')) {
8937 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8938 }
8939 return rs;
8940}
8941
8942#define rb_rs get_rs()
8943
/*
 * Shared worker for String#each_line and String#lines.
 *
 * argv may hold a record separator and a keyword hash with +chomp+.
 * Each line is delivered through ENUM_ELEM: pushed onto +ary+ when it is
 * non-zero, yielded otherwise.  Returns +ary+ in the former case and the
 * original receiver in the latter.
 */
static VALUE
rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
{
    rb_encoding *enc;
    VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
    const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
    long pos, len, rslen;
    int rsnewline = 0;

    /* no separator argument: use rb_rs, which at this point expands to get_rs() */
    if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
        rs = rb_rs;
    if (!NIL_P(opts)) {
        static ID keywords[1];
        if (!keywords[0]) {
            keywords[0] = rb_intern_const("chomp");
        }
        rb_get_kwargs(opts, keywords, 0, 1, &chomp);
        /* reduce to a C truth value; an absent keyword counts as false */
        chomp = (!UNDEF_P(chomp) && RTEST(chomp));
    }

    /* nil separator: the whole string is the single "line" */
    if (NIL_P(rs)) {
        if (!ENUM_ELEM(ary, str)) {
            return ary;
        }
        else {
            return orig;
        }
    }

    if (!RSTRING_LEN(str)) goto end;
    /* lines are cut from the string returned by rb_str_new_frozen; orig stays the return value */
    str = rb_str_new_frozen(str);
    ptr = subptr = RSTRING_PTR(str);
    pend = RSTRING_END(str);
    len = RSTRING_LEN(str);
    StringValue(rs);
    rslen = RSTRING_LEN(rs);

    if (rs == rb_default_rs)
        enc = rb_enc_get(str);
    else
        enc = rb_enc_check(str, rs);

    if (rslen == 0) {
        /* paragraph mode */
        int n;
        const char *eol = NULL;
        subend = subptr;
        while (subend < pend) {
            long chomp_rslen = 0;
            do {
                /* n: byte length of a '\r' at subend, or 0 when there is none */
                if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
                    n = 0;
                rslen = n + rb_enc_mbclen(subend + n, pend, enc);
                if (rb_enc_is_newline(subend + n, pend, enc)) {
                    /* a newline starting exactly where the previous one ended closes the paragraph */
                    if (eol == subend) break;
                    subend += rslen;
                    if (subptr) {
                        eol = subend;
                        chomp_rslen = -rslen;
                    }
                }
                else {
                    /* first non-newline character starts the paragraph */
                    if (!subptr) subptr = subend;
                    subend += rslen;
                }
                rslen = 0;
            } while (subend < pend);
            /* only newlines were left: nothing more to emit */
            if (!subptr) break;
            /* rslen is 0 here unless the loop was left through the break above */
            if (rslen == 0) chomp_rslen = 0;
            line = rb_str_subseq(str, subptr - ptr,
                                 subend - subptr + (chomp ? chomp_rslen : rslen));
            /* ENUM_ELEM returns 1 after yielding; then verify str still has ptr/len */
            if (ENUM_ELEM(ary, line)) {
                str_mod_check(str, ptr, len);
            }
            subptr = eol = NULL;
        }
        goto end;
    }
    else {
        rsptr = RSTRING_PTR(rs);
        /* separator is a single minimum-width newline character: chomp uses chomp_newline */
        if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
            rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
            rsnewline = 1;
        }
    }

    /* default separator with a non-ASCII-compatible receiver: re-encode the separator */
    if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
        rs = rb_str_new(rsptr, rslen);
        rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
        rsptr = RSTRING_PTR(rs);
        rslen = RSTRING_LEN(rs);
    }

    while (subptr < pend) {
        pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
        if (pos < 0) break;
        hit = subptr + pos;
        adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
        if (hit != adjusted) {
            /* match does not begin on a character boundary: continue from the next head */
            subptr = adjusted;
            continue;
        }
        /* hit now points just past the separator; subend is the end of the emitted line */
        subend = hit += rslen;
        if (chomp) {
            if (rsnewline) {
                subend = chomp_newline(subptr, subend, enc);
            }
            else {
                subend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, subend - subptr);
        if (ENUM_ELEM(ary, line)) {
            str_mod_check(str, ptr, len);
        }
        subptr = hit;
    }

    /* trailing text after the last separator */
    if (subptr != pend) {
        if (chomp) {
            if (rsnewline) {
                pend = chomp_newline(subptr, pend, enc);
            }
            else if (pend - subptr >= rslen &&
                     memcmp(pend - rslen, rsptr, rslen) == 0) {
                pend -= rslen;
            }
        }
        line = rb_str_subseq(str, subptr - ptr, pend - subptr);
        ENUM_ELEM(ary, line);
        RB_GC_GUARD(str);
    }

  end:
    if (ary)
        return ary;
    else
        return orig;
}
9083
9084/*
9085 * call-seq:
9086 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9087 * each_line(line_sep = $/, chomp: false) -> enumerator
9088 *
9089 * :include: doc/string/each_line.rdoc
9090 *
9091 */
9092
9093static VALUE
9094rb_str_each_line(int argc, VALUE *argv, VALUE str)
9095{
9096 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9097 return rb_str_enumerate_lines(argc, argv, str, 0);
9098}
9099
9100/*
9101 * call-seq:
 *    lines(line_sep = $/, chomp: false) -> array_of_strings
9103 *
9104 * Forms substrings ("lines") of +self+ according to the given arguments
9105 * (see String#each_line for details); returns the lines in an array.
9106 *
9107 */
9108
9109static VALUE
9110rb_str_lines(int argc, VALUE *argv, VALUE str)
9111{
9112 VALUE ary = WANTARRAY("lines", 0);
9113 return rb_str_enumerate_lines(argc, argv, str, ary);
9114}
9115
9116static VALUE
9117rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9118{
9119 return LONG2FIX(RSTRING_LEN(str));
9120}
9121
9122static VALUE
9123rb_str_enumerate_bytes(VALUE str, VALUE ary)
9124{
9125 long i;
9126
9127 for (i=0; i<RSTRING_LEN(str); i++) {
9128 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9129 }
9130 if (ary)
9131 return ary;
9132 else
9133 return str;
9134}
9135
9136/*
9137 * call-seq:
9138 * each_byte {|byte| ... } -> self
9139 * each_byte -> enumerator
9140 *
9141 * :include: doc/string/each_byte.rdoc
9142 *
9143 */
9144
9145static VALUE
9146rb_str_each_byte(VALUE str)
9147{
9148 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9149 return rb_str_enumerate_bytes(str, 0);
9150}
9151
9152/*
9153 * call-seq:
9154 * bytes -> array_of_bytes
9155 *
9156 * :include: doc/string/bytes.rdoc
9157 *
9158 */
9159
9160static VALUE
9161rb_str_bytes(VALUE str)
9162{
9163 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9164 return rb_str_enumerate_bytes(str, ary);
9165}
9166
9167static VALUE
9168rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9169{
9170 return rb_str_length(str);
9171}
9172
9173static VALUE
9174rb_str_enumerate_chars(VALUE str, VALUE ary)
9175{
9176 VALUE orig = str;
9177 long i, len, n;
9178 const char *ptr;
9179 rb_encoding *enc;
9180
9181 str = rb_str_new_frozen(str);
9182 ptr = RSTRING_PTR(str);
9183 len = RSTRING_LEN(str);
9184 enc = rb_enc_get(str);
9185
9187 for (i = 0; i < len; i += n) {
9188 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9189 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9190 }
9191 }
9192 else {
9193 for (i = 0; i < len; i += n) {
9194 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9195 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9196 }
9197 }
9198 RB_GC_GUARD(str);
9199 if (ary)
9200 return ary;
9201 else
9202 return orig;
9203}
9204
9205/*
9206 * call-seq:
9207 * each_char {|c| ... } -> self
9208 * each_char -> enumerator
9209 *
9210 * :include: doc/string/each_char.rdoc
9211 *
9212 */
9213
9214static VALUE
9215rb_str_each_char(VALUE str)
9216{
9217 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9218 return rb_str_enumerate_chars(str, 0);
9219}
9220
9221/*
9222 * call-seq:
9223 * chars -> array_of_characters
9224 *
9225 * :include: doc/string/chars.rdoc
9226 *
9227 */
9228
9229static VALUE
9230rb_str_chars(VALUE str)
9231{
9232 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9233 return rb_str_enumerate_chars(str, ary);
9234}
9235
9236static VALUE
9237rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9238{
9239 VALUE orig = str;
9240 int n;
9241 unsigned int c;
9242 const char *ptr, *end;
9243 rb_encoding *enc;
9244
9245 if (single_byte_optimizable(str))
9246 return rb_str_enumerate_bytes(str, ary);
9247
9248 str = rb_str_new_frozen(str);
9249 ptr = RSTRING_PTR(str);
9250 end = RSTRING_END(str);
9251 enc = STR_ENC_GET(str);
9252
9253 while (ptr < end) {
9254 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9255 ENUM_ELEM(ary, UINT2NUM(c));
9256 ptr += n;
9257 }
9258 RB_GC_GUARD(str);
9259 if (ary)
9260 return ary;
9261 else
9262 return orig;
9263}
9264
9265/*
9266 * call-seq:
9267 * each_codepoint {|integer| ... } -> self
9268 * each_codepoint -> enumerator
9269 *
9270 * :include: doc/string/each_codepoint.rdoc
9271 *
9272 */
9273
9274static VALUE
9275rb_str_each_codepoint(VALUE str)
9276{
9277 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9278 return rb_str_enumerate_codepoints(str, 0);
9279}
9280
9281/*
9282 * call-seq:
9283 * codepoints -> array_of_integers
9284 *
9285 * :include: doc/string/codepoints.rdoc
9286 *
9287 */
9288
9289static VALUE
9290rb_str_codepoints(VALUE str)
9291{
9292 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9293 return rb_str_enumerate_codepoints(str, ary);
9294}
9295
9296static regex_t *
9297get_reg_grapheme_cluster(rb_encoding *enc)
9298{
9299 int encidx = rb_enc_to_index(enc);
9300
9301 const OnigUChar source_ascii[] = "\\X";
9302 const OnigUChar *source = source_ascii;
9303 size_t source_len = sizeof(source_ascii) - 1;
9304
9305 switch (encidx) {
9306#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9307#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9308#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9309#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9310#define CASE_UTF(e) \
9311 case ENCINDEX_UTF_##e: { \
9312 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9313 source = source_UTF_##e; \
9314 source_len = sizeof(source_UTF_##e); \
9315 break; \
9316 }
9317 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9318#undef CASE_UTF
9319#undef CHARS_16BE
9320#undef CHARS_16LE
9321#undef CHARS_32BE
9322#undef CHARS_32LE
9323 }
9324
9325 regex_t *reg_grapheme_cluster;
9326 OnigErrorInfo einfo;
9327 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9328 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9329 if (r) {
9330 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9331 onig_error_code_to_str(message, r, &einfo);
9332 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9333 }
9334
9335 return reg_grapheme_cluster;
9336}
9337
9338static regex_t *
9339get_cached_reg_grapheme_cluster(rb_encoding *enc)
9340{
9341 int encidx = rb_enc_to_index(enc);
9342 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9343
9344 if (encidx == rb_utf8_encindex()) {
9345 if (!reg_grapheme_cluster_utf8) {
9346 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9347 }
9348
9349 return reg_grapheme_cluster_utf8;
9350 }
9351
9352 return NULL;
9353}
9354
9355static VALUE
9356rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9357{
9358 size_t grapheme_cluster_count = 0;
9359 rb_encoding *enc = get_encoding(str);
9360 const char *ptr, *end;
9361
9362 if (!rb_enc_unicode_p(enc)) {
9363 return rb_str_length(str);
9364 }
9365
9366 bool cached_reg_grapheme_cluster = true;
9367 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9368 if (!reg_grapheme_cluster) {
9369 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9370 cached_reg_grapheme_cluster = false;
9371 }
9372
9373 ptr = RSTRING_PTR(str);
9374 end = RSTRING_END(str);
9375
9376 while (ptr < end) {
9377 OnigPosition len = onig_match(reg_grapheme_cluster,
9378 (const OnigUChar *)ptr, (const OnigUChar *)end,
9379 (const OnigUChar *)ptr, NULL, 0);
9380 if (len <= 0) break;
9381 grapheme_cluster_count++;
9382 ptr += len;
9383 }
9384
9385 if (!cached_reg_grapheme_cluster) {
9386 onig_free(reg_grapheme_cluster);
9387 }
9388
9389 return SIZET2NUM(grapheme_cluster_count);
9390}
9391
9392static VALUE
9393rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9394{
9395 VALUE orig = str;
9396 rb_encoding *enc = get_encoding(str);
9397 const char *ptr0, *ptr, *end;
9398
9399 if (!rb_enc_unicode_p(enc)) {
9400 return rb_str_enumerate_chars(str, ary);
9401 }
9402
9403 if (!ary) str = rb_str_new_frozen(str);
9404
9405 bool cached_reg_grapheme_cluster = true;
9406 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9407 if (!reg_grapheme_cluster) {
9408 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9409 cached_reg_grapheme_cluster = false;
9410 }
9411
9412 ptr0 = ptr = RSTRING_PTR(str);
9413 end = RSTRING_END(str);
9414
9415 while (ptr < end) {
9416 OnigPosition len = onig_match(reg_grapheme_cluster,
9417 (const OnigUChar *)ptr, (const OnigUChar *)end,
9418 (const OnigUChar *)ptr, NULL, 0);
9419 if (len <= 0) break;
9420 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9421 ptr += len;
9422 }
9423
9424 if (!cached_reg_grapheme_cluster) {
9425 onig_free(reg_grapheme_cluster);
9426 }
9427
9428 RB_GC_GUARD(str);
9429 if (ary)
9430 return ary;
9431 else
9432 return orig;
9433}
9434
9435/*
9436 * call-seq:
9437 * each_grapheme_cluster {|gc| ... } -> self
9438 * each_grapheme_cluster -> enumerator
9439 *
9440 * :include: doc/string/each_grapheme_cluster.rdoc
9441 *
9442 */
9443
9444static VALUE
9445rb_str_each_grapheme_cluster(VALUE str)
9446{
9447 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9448 return rb_str_enumerate_grapheme_clusters(str, 0);
9449}
9450
9451/*
9452 * call-seq:
9453 * grapheme_clusters -> array_of_grapheme_clusters
9454 *
9455 * :include: doc/string/grapheme_clusters.rdoc
9456 *
9457 */
9458
9459static VALUE
9460rb_str_grapheme_clusters(VALUE str)
9461{
9462 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9463 return rb_str_enumerate_grapheme_clusters(str, ary);
9464}
9465
9466static long
9467chopped_length(VALUE str)
9468{
9469 rb_encoding *enc = STR_ENC_GET(str);
9470 const char *p, *p2, *beg, *end;
9471
9472 beg = RSTRING_PTR(str);
9473 end = beg + RSTRING_LEN(str);
9474 if (beg >= end) return 0;
9475 p = rb_enc_prev_char(beg, end, end, enc);
9476 if (!p) return 0;
9477 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9478 p2 = rb_enc_prev_char(beg, p, end, enc);
9479 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9480 }
9481 return p - beg;
9482}
9483
9484/*
9485 * call-seq:
9486 * chop! -> self or nil
9487 *
9488 * Like String#chop, but modifies +self+ in place;
9489 * returns +nil+ if +self+ is empty, +self+ otherwise.
9490 *
9491 * Related: String#chomp!.
9492 */
9493
9494static VALUE
9495rb_str_chop_bang(VALUE str)
9496{
9497 str_modify_keep_cr(str);
9498 if (RSTRING_LEN(str) > 0) {
9499 long len;
9500 len = chopped_length(str);
9501 STR_SET_LEN(str, len);
9502 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9503 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9505 }
9506 return str;
9507 }
9508 return Qnil;
9509}
9510
9511
9512/*
9513 * call-seq:
9514 * chop -> new_string
9515 *
9516 * :include: doc/string/chop.rdoc
9517 *
9518 */
9519
9520static VALUE
9521rb_str_chop(VALUE str)
9522{
9523 return rb_str_subseq(str, 0, chopped_length(str));
9524}
9525
9526static long
9527smart_chomp(VALUE str, const char *e, const char *p)
9528{
9529 rb_encoding *enc = rb_enc_get(str);
9530 if (rb_enc_mbminlen(enc) > 1) {
9531 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9532 if (rb_enc_is_newline(pp, e, enc)) {
9533 e = pp;
9534 }
9535 pp = e - rb_enc_mbminlen(enc);
9536 if (pp >= p) {
9537 pp = rb_enc_left_char_head(p, pp, e, enc);
9538 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9539 e = pp;
9540 }
9541 }
9542 }
9543 else {
9544 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9545 case '\n':
9546 if (--e > p && *(e-1) == '\r') {
9547 --e;
9548 }
9549 break;
9550 case '\r':
9551 --e;
9552 break;
9553 }
9554 }
9555 return e - p;
9556}
9557
/*
 * Byte length +str+ would have after chomping with separator +rs+.
 *
 * - rb_default_rs, or a separator that is a single newline character:
 *   delegated to smart_chomp.
 * - empty separator: all trailing newlines (each optionally preceded by
 *   "\r") are removed.
 * - otherwise: +rs+ is removed once if +str+ ends with it on a character
 *   boundary.
 *
 * Returns the unchanged length when nothing is to be removed.
 */
static long
chompped_length(VALUE str, VALUE rs)
{
    rb_encoding *enc;
    int newline;
    char *pp, *e, *rsptr;
    long rslen;
    char *const p = RSTRING_PTR(str);
    long len = RSTRING_LEN(str);

    if (len == 0) return 0;
    e = p + len;
    if (rs == rb_default_rs) {
        return smart_chomp(str, e, p);
    }

    enc = rb_enc_get(str);
    RSTRING_GETMEM(rs, rsptr, rslen);
    if (rslen == 0) {
        /* empty separator: strip every trailing newline */
        if (rb_enc_mbminlen(enc) > 1) {
            while (e > p) {
                pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
                if (!rb_enc_is_newline(pp, e, enc)) break;
                e = pp;
                /* also drop a "\r" directly before the newline */
                pp -= rb_enc_mbminlen(enc);
                if (pp >= p) {
                    pp = rb_enc_left_char_head(p, pp, e, enc);
                    if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
                        e = pp;
                    }
                }
            }
        }
        else {
            while (e > p && *(e-1) == '\n') {
                --e;
                if (e > p && *(e-1) == '\r')
                    --e;
            }
        }
        return e - p;
    }
    /* separator longer than the string can never match */
    if (rslen > len) return len;

    enc = rb_enc_get(rs);
    /* last byte of the separator, used as a cheap pre-check below */
    newline = rsptr[rslen-1];
    if (rslen == rb_enc_mbminlen(enc)) {
        /* single minimum-width character: treat a newline like the default separator */
        if (rslen == 1) {
            if (newline == '\n')
                return smart_chomp(str, e, p);
        }
        else {
            if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
                return smart_chomp(str, e, p);
        }
    }

    enc = rb_enc_check(str, rs);
    if (is_broken_string(rs)) {
        return len;
    }
    pp = e - rslen;
    if (p[len-1] == newline &&
        (rslen <= 1 ||
         memcmp(rsptr, pp, rslen) == 0)) {
        /* the suffix must begin at a character head to count as a match */
        if (rb_enc_left_char_head(p, pp, e, enc) == pp)
            return len - rslen;
        RB_GC_GUARD(rs);
    }
    return len;
}
9629
9635static VALUE
9636chomp_rs(int argc, const VALUE *argv)
9637{
9638 rb_check_arity(argc, 0, 1);
9639 if (argc > 0) {
9640 VALUE rs = argv[0];
9641 if (!NIL_P(rs)) StringValue(rs);
9642 return rs;
9643 }
9644 else {
9645 return rb_rs;
9646 }
9647}
9648
9649VALUE
9650rb_str_chomp_string(VALUE str, VALUE rs)
9651{
9652 long olen = RSTRING_LEN(str);
9653 long len = chompped_length(str, rs);
9654 if (len >= olen) return Qnil;
9655 str_modify_keep_cr(str);
9656 STR_SET_LEN(str, len);
9657 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9658 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9660 }
9661 return str;
9662}
9663
9664/*
9665 * call-seq:
9666 * chomp!(line_sep = $/) -> self or nil
9667 *
9668 * Like String#chomp, but modifies +self+ in place;
9669 * returns +nil+ if no modification made, +self+ otherwise.
9670 *
9671 */
9672
9673static VALUE
9674rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9675{
9676 VALUE rs;
9677 str_modifiable(str);
9678 if (RSTRING_LEN(str) == 0) return Qnil;
9679 rs = chomp_rs(argc, argv);
9680 if (NIL_P(rs)) return Qnil;
9681 return rb_str_chomp_string(str, rs);
9682}
9683
9684
9685/*
9686 * call-seq:
9687 * chomp(line_sep = $/) -> new_string
9688 *
9689 * :include: doc/string/chomp.rdoc
9690 *
9691 */
9692
9693static VALUE
9694rb_str_chomp(int argc, VALUE *argv, VALUE str)
9695{
9696 VALUE rs = chomp_rs(argc, argv);
9697 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9698 return rb_str_subseq(str, 0, chompped_length(str, rs));
9699}
9700
9701static long
9702lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9703{
9704 const char *const start = s;
9705
9706 if (!s || s >= e) return 0;
9707
9708 /* remove spaces at head */
9709 if (single_byte_optimizable(str)) {
9710 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9711 }
9712 else {
9713 while (s < e) {
9714 int n;
9715 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9716
9717 if (cc && !rb_isspace(cc)) break;
9718 s += n;
9719 }
9720 }
9721 return s - start;
9722}
9723
9724/*
9725 * call-seq:
9726 * lstrip! -> self or nil
9727 *
9728 * Like String#lstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
9730 *
9731 * Related: String#rstrip!, String#strip!.
9732 */
9733
9734static VALUE
9735rb_str_lstrip_bang(VALUE str)
9736{
9737 rb_encoding *enc;
9738 char *start, *s;
9739 long olen, loffset;
9740
9741 str_modify_keep_cr(str);
9742 enc = STR_ENC_GET(str);
9743 RSTRING_GETMEM(str, start, olen);
9744 loffset = lstrip_offset(str, start, start+olen, enc);
9745 if (loffset > 0) {
9746 long len = olen-loffset;
9747 s = start + loffset;
9748 memmove(start, s, len);
9749 STR_SET_LEN(str, len);
9750 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9751 return str;
9752 }
9753 return Qnil;
9754}
9755
9756
9757/*
9758 * call-seq:
9759 * lstrip -> new_string
9760 *
9761 * Returns a copy of +self+ with leading whitespace removed;
9762 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9763 *
9764 * whitespace = "\x00\t\n\v\f\r "
9765 * s = whitespace + 'abc' + whitespace
9766 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9767 * s.lstrip # => "abc\u0000\t\n\v\f\r "
9768 *
9769 * Related: String#rstrip, String#strip.
9770 */
9771
9772static VALUE
9773rb_str_lstrip(VALUE str)
9774{
9775 char *start;
9776 long len, loffset;
9777 RSTRING_GETMEM(str, start, len);
9778 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9779 if (loffset <= 0) return str_duplicate(rb_cString, str);
9780 return rb_str_subseq(str, loffset, len - loffset);
9781}
9782
9783static long
9784rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9785{
9786 const char *t;
9787
9788 rb_str_check_dummy_enc(enc);
9790 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
9791 }
9792 if (!s || s >= e) return 0;
9793 t = e;
9794
9795 /* remove trailing spaces or '\0's */
9796 if (single_byte_optimizable(str)) {
9797 unsigned char c;
9798 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9799 }
9800 else {
9801 char *tp;
9802
9803 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9804 unsigned int c = rb_enc_codepoint(tp, e, enc);
9805 if (c && !rb_isspace(c)) break;
9806 t = tp;
9807 }
9808 }
9809 return e - t;
9810}
9811
9812/*
9813 * call-seq:
9814 * rstrip! -> self or nil
9815 *
9816 * Like String#rstrip, except that any modifications are made in +self+;
9817 * returns +self+ if any modification are made, +nil+ otherwise.
9818 *
9819 * Related: String#lstrip!, String#strip!.
9820 */
9821
9822static VALUE
9823rb_str_rstrip_bang(VALUE str)
9824{
9825 rb_encoding *enc;
9826 char *start;
9827 long olen, roffset;
9828
9829 str_modify_keep_cr(str);
9830 enc = STR_ENC_GET(str);
9831 RSTRING_GETMEM(str, start, olen);
9832 roffset = rstrip_offset(str, start, start+olen, enc);
9833 if (roffset > 0) {
9834 long len = olen - roffset;
9835
9836 STR_SET_LEN(str, len);
9837 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9838 return str;
9839 }
9840 return Qnil;
9841}
9842
9843
9844/*
9845 * call-seq:
9846 * rstrip -> new_string
9847 *
9848 * Returns a copy of the receiver with trailing whitespace removed;
9849 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9850 *
9851 * whitespace = "\x00\t\n\v\f\r "
9852 * s = whitespace + 'abc' + whitespace
9853 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9854 * s.rstrip # => "\u0000\t\n\v\f\r abc"
9855 *
9856 * Related: String#lstrip, String#strip.
9857 */
9858
9859static VALUE
9860rb_str_rstrip(VALUE str)
9861{
9862 rb_encoding *enc;
9863 char *start;
9864 long olen, roffset;
9865
9866 enc = STR_ENC_GET(str);
9867 RSTRING_GETMEM(str, start, olen);
9868 roffset = rstrip_offset(str, start, start+olen, enc);
9869
9870 if (roffset <= 0) return str_duplicate(rb_cString, str);
9871 return rb_str_subseq(str, 0, olen-roffset);
9872}
9873
9874
9875/*
9876 * call-seq:
9877 * strip! -> self or nil
9878 *
9879 * Like String#strip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
9881 *
 *  Related: String#lstrip!, String#rstrip!.
9883 */
9884
9885static VALUE
9886rb_str_strip_bang(VALUE str)
9887{
9888 char *start;
9889 long olen, loffset, roffset;
9890 rb_encoding *enc;
9891
9892 str_modify_keep_cr(str);
9893 enc = STR_ENC_GET(str);
9894 RSTRING_GETMEM(str, start, olen);
9895 loffset = lstrip_offset(str, start, start+olen, enc);
9896 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9897
9898 if (loffset > 0 || roffset > 0) {
9899 long len = olen-roffset;
9900 if (loffset > 0) {
9901 len -= loffset;
9902 memmove(start, start + loffset, len);
9903 }
9904 STR_SET_LEN(str, len);
9905 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9906 return str;
9907 }
9908 return Qnil;
9909}
9910
9911
9912/*
9913 * call-seq:
9914 * strip -> new_string
9915 *
9916 * Returns a copy of the receiver with leading and trailing whitespace removed;
9917 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9918 *
9919 * whitespace = "\x00\t\n\v\f\r "
9920 * s = whitespace + 'abc' + whitespace
9921 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9922 * s.strip # => "abc"
9923 *
9924 * Related: String#lstrip, String#rstrip.
9925 */
9926
9927static VALUE
9928rb_str_strip(VALUE str)
9929{
9930 char *start;
9931 long olen, loffset, roffset;
9932 rb_encoding *enc = STR_ENC_GET(str);
9933
9934 RSTRING_GETMEM(str, start, olen);
9935 loffset = lstrip_offset(str, start, start+olen, enc);
9936 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9937
9938 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9939 return rb_str_subseq(str, loffset, olen-loffset-roffset);
9940}
9941
9942static VALUE
9943scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9944{
9945 VALUE result, match;
9946 struct re_registers *regs;
9947 int i;
9948 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9949 if (pos >= 0) {
9950 if (BUILTIN_TYPE(pat) == T_STRING) {
9951 regs = NULL;
9952 end = pos + RSTRING_LEN(pat);
9953 }
9954 else {
9955 match = rb_backref_get();
9956 regs = RMATCH_REGS(match);
9957 pos = BEG(0);
9958 end = END(0);
9959 }
9960 if (pos == end) {
9961 rb_encoding *enc = STR_ENC_GET(str);
9962 /*
9963 * Always consume at least one character of the input string
9964 */
9965 if (RSTRING_LEN(str) > end)
9966 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9967 RSTRING_END(str), enc);
9968 else
9969 *start = end + 1;
9970 }
9971 else {
9972 *start = end;
9973 }
9974 if (!regs || regs->num_regs == 1) {
9975 result = rb_str_subseq(str, pos, end - pos);
9976 return result;
9977 }
9978 result = rb_ary_new2(regs->num_regs);
9979 for (i=1; i < regs->num_regs; i++) {
9980 VALUE s = Qnil;
9981 if (BEG(i) >= 0) {
9982 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
9983 }
9984 rb_ary_push(result, s);
9985 }
9986
9987 return result;
9988 }
9989 return Qnil;
9990}
9991
9992
9993/*
9994 * call-seq:
9995 * scan(string_or_regexp) -> array
9996 * scan(string_or_regexp) {|matches| ... } -> self
9997 *
9998 * Matches a pattern against +self+; the pattern is:
9999 *
10000 * - +string_or_regexp+ itself, if it is a Regexp.
10001 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10002 *
10003 * Iterates through +self+, generating a collection of matching results:
10004 *
10005 * - If the pattern contains no groups, each result is the
10006 * matched string, <code>$&</code>.
10007 * - If the pattern contains groups, each result is an array
10008 * containing one entry per group.
10009 *
10010 * With no block given, returns an array of the results:
10011 *
10012 * s = 'cruel world'
10013 * s.scan(/\w+/) # => ["cruel", "world"]
10014 * s.scan(/.../) # => ["cru", "el ", "wor"]
10015 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10016 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10017 *
10018 * With a block given, calls the block with each result; returns +self+:
10019 *
10020 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10021 * print "\n"
10022 * s.scan(/(.)(.)/) {|x,y| print y, x }
10023 * print "\n"
10024 *
10025 * Output:
10026 *
10027 * <<cruel>> <<world>>
10028 * rceu lowlr
10029 *
10030 */
10031
10032static VALUE
10033rb_str_scan(VALUE str, VALUE pat)
10034{
10035 VALUE result;
10036 long start = 0;
10037 long last = -1, prev = 0;
10038 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10039
10040 pat = get_pat_quoted(pat, 1);
10041 mustnot_broken(str);
10042 if (!rb_block_given_p()) {
10043 VALUE ary = rb_ary_new();
10044
10045 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10046 last = prev;
10047 prev = start;
10048 rb_ary_push(ary, result);
10049 }
10050 if (last >= 0) rb_pat_search(pat, str, last, 1);
10051 else rb_backref_set(Qnil);
10052 return ary;
10053 }
10054
10055 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10056 last = prev;
10057 prev = start;
10058 rb_yield(result);
10059 str_mod_check(str, p, len);
10060 }
10061 if (last >= 0) rb_pat_search(pat, str, last, 1);
10062 return str;
10063}
10064
10065
10066/*
10067 * call-seq:
10068 * hex -> integer
10069 *
10070 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10071 * (with an optional sign and an optional <code>0x</code>) and returns the
10072 * corresponding number;
10073 * returns zero if there is no such leading substring:
10074 *
10075 * '0x0a'.hex # => 10
10076 * '-1234'.hex # => -4660
10077 * '0'.hex # => 0
10078 * 'non-numeric'.hex # => 0
10079 *
10080 * Related: String#oct.
10081 *
10082 */
10083
10084static VALUE
10085rb_str_hex(VALUE str)
10086{
10087 return rb_str_to_inum(str, 16, FALSE);
10088}
10089
10090
10091/*
10092 * call-seq:
10093 * oct -> integer
10094 *
10095 * Interprets the leading substring of +self+ as a string of octal digits
10096 * (with an optional sign) and returns the corresponding number;
10097 * returns zero if there is no such leading substring:
10098 *
10099 * '123'.oct # => 83
10100 * '-377'.oct # => -255
10101 * '0377non-numeric'.oct # => 255
10102 * 'non-numeric'.oct # => 0
10103 *
10104 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10105 * see Kernel#Integer.
10106 *
10107 * Related: String#hex.
10108 *
10109 */
10110
10111static VALUE
10112rb_str_oct(VALUE str)
10113{
10114 return rb_str_to_inum(str, -8, FALSE);
10115}
10116
10117#ifndef HAVE_CRYPT_R
10118# include "ruby/thread_native.h"
10119# include "ruby/atomic.h"
10120
10121static struct {
10122 rb_nativethread_lock_t lock;
10123} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10124
10125static void
10126crypt_mutex_initialize(void)
10127{
10128}
10129#endif
10130
10131/*
10132 * call-seq:
10133 * crypt(salt_str) -> new_string
10134 *
10135 * Returns the string generated by calling <code>crypt(3)</code>
10136 * standard library function with <code>str</code> and
10137 * <code>salt_str</code>, in this order, as its arguments. Please do
10138 * not use this method any longer. It is legacy; provided only for
10139 * backward compatibility with ruby scripts in earlier days. It is
10140 * bad to use in contemporary programs for several reasons:
10141 *
10142 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10143 * run. The generated string lacks data portability.
10144 *
10145 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10146 * (i.e. silently ends up in unexpected results).
10147 *
10148 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10149 * thread safe.
10150 *
10151 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10152 * very very weak. According to its manpage, Linux's traditional
10153 * <code>crypt(3)</code> output has only 2**56 variations; too
10154 * easy to brute force today. And this is the default behaviour.
10155 *
10156 * * In order to make things robust some OSes implement so-called
10157 * "modular" usage. To go through, you have to do a complex
10158 * build-up of the <code>salt_str</code> parameter, by hand.
10159 * Failure in generation of a proper salt string tends not to
10160 * yield any errors; typos in parameters are normally not
10161 * detectable.
10162 *
10163 * * For instance, in the following example, the second invocation
10164 * of String#crypt is wrong; it has a typo in "round=" (lacks
10165 * "s"). However the call does not fail and something unexpected
10166 * is generated.
10167 *
10168 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10169 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10170 *
10171 * * Even in the "modular" mode, some hash functions are considered
10172 * archaic and no longer recommended at all; for instance module
10173 * <code>$1$</code> is officially abandoned by its author: see
10174 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10175 * instance module <code>$3$</code> is considered completely
10176 * broken: see the manpage of FreeBSD.
10177 *
10178 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10179 * written above, <code>crypt(3)</code> on Mac OS never fails.
10180 * This means even if you build up a proper salt string it
10181 * generates a traditional DES hash anyways, and there is no way
10182 * for you to be aware of.
10183 *
10184 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10185 *
10186 * If for some reason you cannot migrate to other secure contemporary
10187 * password hashing algorithms, install the string-crypt gem and
10188 * <code>require 'string/crypt'</code> to continue using it.
10189 */
10190
10191static VALUE
10192rb_str_crypt(VALUE str, VALUE salt)
10193{
10194#ifdef HAVE_CRYPT_R
10195 VALUE databuf;
10196 struct crypt_data *data;
10197# define CRYPT_END() ALLOCV_END(databuf)
10198#else
10199 extern char *crypt(const char *, const char *);
10200# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10201#endif
10202 VALUE result;
10203 const char *s, *saltp;
10204 char *res;
10205#ifdef BROKEN_CRYPT
10206 char salt_8bit_clean[3];
10207#endif
10208
10209 StringValue(salt);
10210 mustnot_wchar(str);
10211 mustnot_wchar(salt);
10212 s = StringValueCStr(str);
10213 saltp = RSTRING_PTR(salt);
10214 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10215 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10216 }
10217
10218#ifdef BROKEN_CRYPT
10219 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10220 salt_8bit_clean[0] = saltp[0] & 0x7f;
10221 salt_8bit_clean[1] = saltp[1] & 0x7f;
10222 salt_8bit_clean[2] = '\0';
10223 saltp = salt_8bit_clean;
10224 }
10225#endif
10226#ifdef HAVE_CRYPT_R
10227 data = ALLOCV(databuf, sizeof(struct crypt_data));
10228# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10229 data->initialized = 0;
10230# endif
10231 res = crypt_r(s, saltp, data);
10232#else
10233 crypt_mutex_initialize();
10234 rb_nativethread_lock_lock(&crypt_mutex.lock);
10235 res = crypt(s, saltp);
10236#endif
10237 if (!res) {
10238 int err = errno;
10239 CRYPT_END();
10240 rb_syserr_fail(err, "crypt");
10241 }
10242 result = rb_str_new_cstr(res);
10243 CRYPT_END();
10244 return result;
10245}
10246
10247
10248/*
10249 * call-seq:
10250 * ord -> integer
10251 *
10252 * :include: doc/string/ord.rdoc
10253 *
10254 */
10255
10256static VALUE
10257rb_str_ord(VALUE s)
10258{
10259 unsigned int c;
10260
10261 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10262 return UINT2NUM(c);
10263}
10264/*
10265 * call-seq:
10266 * sum(n = 16) -> integer
10267 *
10268 * :include: doc/string/sum.rdoc
10269 *
10270 */
10271
10272static VALUE
10273rb_str_sum(int argc, VALUE *argv, VALUE str)
10274{
10275 int bits = 16;
10276 char *ptr, *p, *pend;
10277 long len;
10278 VALUE sum = INT2FIX(0);
10279 unsigned long sum0 = 0;
10280
10281 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10282 bits = 0;
10283 }
10284 ptr = p = RSTRING_PTR(str);
10285 len = RSTRING_LEN(str);
10286 pend = p + len;
10287
10288 while (p < pend) {
10289 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10290 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10291 str_mod_check(str, ptr, len);
10292 sum0 = 0;
10293 }
10294 sum0 += (unsigned char)*p;
10295 p++;
10296 }
10297
10298 if (bits == 0) {
10299 if (sum0) {
10300 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10301 }
10302 }
10303 else {
10304 if (sum == INT2FIX(0)) {
10305 if (bits < (int)sizeof(long)*CHAR_BIT) {
10306 sum0 &= (((unsigned long)1)<<bits)-1;
10307 }
10308 sum = LONG2FIX(sum0);
10309 }
10310 else {
10311 VALUE mod;
10312
10313 if (sum0) {
10314 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10315 }
10316
10317 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10318 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10319 sum = rb_funcall(sum, '&', 1, mod);
10320 }
10321 }
10322 return sum;
10323}
10324
10325static VALUE
10326rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10327{
10328 rb_encoding *enc;
10329 VALUE w;
10330 long width, len, flen = 1, fclen = 1;
10331 VALUE res;
10332 char *p;
10333 const char *f = " ";
10334 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10335 VALUE pad;
10336 int singlebyte = 1, cr;
10337 int termlen;
10338
10339 rb_scan_args(argc, argv, "11", &w, &pad);
10340 enc = STR_ENC_GET(str);
10341 termlen = rb_enc_mbminlen(enc);
10342 width = NUM2LONG(w);
10343 if (argc == 2) {
10344 StringValue(pad);
10345 enc = rb_enc_check(str, pad);
10346 f = RSTRING_PTR(pad);
10347 flen = RSTRING_LEN(pad);
10348 fclen = str_strlen(pad, enc); /* rb_enc_check */
10349 singlebyte = single_byte_optimizable(pad);
10350 if (flen == 0 || fclen == 0) {
10351 rb_raise(rb_eArgError, "zero width padding");
10352 }
10353 }
10354 len = str_strlen(str, enc); /* rb_enc_check */
10355 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10356 n = width - len;
10357 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10358 rlen = n - llen;
10359 cr = ENC_CODERANGE(str);
10360 if (flen > 1) {
10361 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10362 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10363 }
10364 size = RSTRING_LEN(str);
10365 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10366 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10367 (len += llen2 + rlen2) >= LONG_MAX - size) {
10368 rb_raise(rb_eArgError, "argument too big");
10369 }
10370 len += size;
10371 res = str_new0(rb_cString, 0, len, termlen);
10372 p = RSTRING_PTR(res);
10373 if (flen <= 1) {
10374 memset(p, *f, llen);
10375 p += llen;
10376 }
10377 else {
10378 while (llen >= fclen) {
10379 memcpy(p,f,flen);
10380 p += flen;
10381 llen -= fclen;
10382 }
10383 if (llen > 0) {
10384 memcpy(p, f, llen2);
10385 p += llen2;
10386 }
10387 }
10388 memcpy(p, RSTRING_PTR(str), size);
10389 p += size;
10390 if (flen <= 1) {
10391 memset(p, *f, rlen);
10392 p += rlen;
10393 }
10394 else {
10395 while (rlen >= fclen) {
10396 memcpy(p,f,flen);
10397 p += flen;
10398 rlen -= fclen;
10399 }
10400 if (rlen > 0) {
10401 memcpy(p, f, rlen2);
10402 p += rlen2;
10403 }
10404 }
10405 TERM_FILL(p, termlen);
10406 STR_SET_LEN(res, p-RSTRING_PTR(res));
10407 rb_enc_associate(res, enc);
10408 if (argc == 2)
10409 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10410 if (cr != ENC_CODERANGE_BROKEN)
10411 ENC_CODERANGE_SET(res, cr);
10412
10413 RB_GC_GUARD(pad);
10414 return res;
10415}
10416
10417
10418/*
10419 * call-seq:
10420 * ljust(size, pad_string = ' ') -> new_string
10421 *
10422 * :include: doc/string/ljust.rdoc
10423 *
10424 * Related: String#rjust, String#center.
10425 *
10426 */
10427
10428static VALUE
10429rb_str_ljust(int argc, VALUE *argv, VALUE str)
10430{
10431 return rb_str_justify(argc, argv, str, 'l');
10432}
10433
10434/*
10435 * call-seq:
10436 * rjust(size, pad_string = ' ') -> new_string
10437 *
10438 * :include: doc/string/rjust.rdoc
10439 *
10440 * Related: String#ljust, String#center.
10441 *
10442 */
10443
10444static VALUE
10445rb_str_rjust(int argc, VALUE *argv, VALUE str)
10446{
10447 return rb_str_justify(argc, argv, str, 'r');
10448}
10449
10450
10451/*
10452 * call-seq:
10453 * center(size, pad_string = ' ') -> new_string
10454 *
10455 * :include: doc/string/center.rdoc
10456 *
10457 * Related: String#ljust, String#rjust.
10458 *
10459 */
10460
10461static VALUE
10462rb_str_center(int argc, VALUE *argv, VALUE str)
10463{
10464 return rb_str_justify(argc, argv, str, 'c');
10465}
10466
10467/*
10468 * call-seq:
10469 * partition(string_or_regexp) -> [head, match, tail]
10470 *
10471 * :include: doc/string/partition.rdoc
10472 *
10473 */
10474
10475static VALUE
10476rb_str_partition(VALUE str, VALUE sep)
10477{
10478 long pos;
10479
10480 sep = get_pat_quoted(sep, 0);
10481 if (RB_TYPE_P(sep, T_REGEXP)) {
10482 if (rb_reg_search(sep, str, 0, 0) < 0) {
10483 goto failed;
10484 }
10485 VALUE match = rb_backref_get();
10486 struct re_registers *regs = RMATCH_REGS(match);
10487
10488 pos = BEG(0);
10489 sep = rb_str_subseq(str, pos, END(0) - pos);
10490 }
10491 else {
10492 pos = rb_str_index(str, sep, 0);
10493 if (pos < 0) goto failed;
10494 }
10495 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10496 sep,
10497 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10498 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10499
10500 failed:
10501 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10502}
10503
10504/*
10505 * call-seq:
10506 * rpartition(sep) -> [head, match, tail]
10507 *
10508 * :include: doc/string/rpartition.rdoc
10509 *
10510 */
10511
10512static VALUE
10513rb_str_rpartition(VALUE str, VALUE sep)
10514{
10515 long pos = RSTRING_LEN(str);
10516
10517 sep = get_pat_quoted(sep, 0);
10518 if (RB_TYPE_P(sep, T_REGEXP)) {
10519 if (rb_reg_search(sep, str, pos, 1) < 0) {
10520 goto failed;
10521 }
10522 VALUE match = rb_backref_get();
10523 struct re_registers *regs = RMATCH_REGS(match);
10524
10525 pos = BEG(0);
10526 sep = rb_str_subseq(str, pos, END(0) - pos);
10527 }
10528 else {
10529 pos = rb_str_sublen(str, pos);
10530 pos = rb_str_rindex(str, sep, pos);
10531 if (pos < 0) {
10532 goto failed;
10533 }
10534 pos = rb_str_offset(str, pos);
10535 }
10536
10537 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10538 sep,
10539 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10540 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10541 failed:
10542 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10543}
10544
10545/*
10546 * call-seq:
10547 * start_with?(*string_or_regexp) -> true or false
10548 *
10549 * :include: doc/string/start_with_p.rdoc
10550 *
10551 */
10552
10553static VALUE
10554rb_str_start_with(int argc, VALUE *argv, VALUE str)
10555{
10556 int i;
10557
10558 for (i=0; i<argc; i++) {
10559 VALUE tmp = argv[i];
10560 if (RB_TYPE_P(tmp, T_REGEXP)) {
10561 if (rb_reg_start_with_p(tmp, str))
10562 return Qtrue;
10563 }
10564 else {
10565 StringValue(tmp);
10566 rb_enc_check(str, tmp);
10567 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
10568 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10569 return Qtrue;
10570 }
10571 }
10572 return Qfalse;
10573}
10574
10575/*
10576 * call-seq:
10577 * end_with?(*strings) -> true or false
10578 *
10579 * :include: doc/string/end_with_p.rdoc
10580 *
10581 */
10582
10583static VALUE
10584rb_str_end_with(int argc, VALUE *argv, VALUE str)
10585{
10586 int i;
10587 char *p, *s, *e;
10588 rb_encoding *enc;
10589
10590 for (i=0; i<argc; i++) {
10591 VALUE tmp = argv[i];
10592 long slen, tlen;
10593 StringValue(tmp);
10594 enc = rb_enc_check(str, tmp);
10595 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10596 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10597 p = RSTRING_PTR(str);
10598 e = p + slen;
10599 s = e - tlen;
10600 if (rb_enc_left_char_head(p, s, e, enc) != s)
10601 continue;
10602 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
10603 return Qtrue;
10604 }
10605 return Qfalse;
10606}
10607
10617static long
10618deleted_prefix_length(VALUE str, VALUE prefix)
10619{
10620 char *strptr, *prefixptr;
10621 long olen, prefixlen;
10622
10623 StringValue(prefix);
10624 if (is_broken_string(prefix)) return 0;
10625 rb_enc_check(str, prefix);
10626
10627 /* return 0 if not start with prefix */
10628 prefixlen = RSTRING_LEN(prefix);
10629 if (prefixlen <= 0) return 0;
10630 olen = RSTRING_LEN(str);
10631 if (olen < prefixlen) return 0;
10632 strptr = RSTRING_PTR(str);
10633 prefixptr = RSTRING_PTR(prefix);
10634 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10635
10636 return prefixlen;
10637}
10638
10639/*
10640 * call-seq:
10641 * delete_prefix!(prefix) -> self or nil
10642 *
10643 * Like String#delete_prefix, except that +self+ is modified in place.
10644 * Returns +self+ if the prefix is removed, +nil+ otherwise.
10645 *
10646 */
10647
10648static VALUE
10649rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10650{
10651 long prefixlen;
10652 str_modify_keep_cr(str);
10653
10654 prefixlen = deleted_prefix_length(str, prefix);
10655 if (prefixlen <= 0) return Qnil;
10656
10657 return rb_str_drop_bytes(str, prefixlen);
10658}
10659
10660/*
10661 * call-seq:
10662 * delete_prefix(prefix) -> new_string
10663 *
10664 * :include: doc/string/delete_prefix.rdoc
10665 *
10666 */
10667
10668static VALUE
10669rb_str_delete_prefix(VALUE str, VALUE prefix)
10670{
10671 long prefixlen;
10672
10673 prefixlen = deleted_prefix_length(str, prefix);
10674 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10675
10676 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10677}
10678
10688static long
10689deleted_suffix_length(VALUE str, VALUE suffix)
10690{
10691 char *strptr, *suffixptr, *s;
10692 long olen, suffixlen;
10693 rb_encoding *enc;
10694
10695 StringValue(suffix);
10696 if (is_broken_string(suffix)) return 0;
10697 enc = rb_enc_check(str, suffix);
10698
10699 /* return 0 if not start with suffix */
10700 suffixlen = RSTRING_LEN(suffix);
10701 if (suffixlen <= 0) return 0;
10702 olen = RSTRING_LEN(str);
10703 if (olen < suffixlen) return 0;
10704 strptr = RSTRING_PTR(str);
10705 suffixptr = RSTRING_PTR(suffix);
10706 s = strptr + olen - suffixlen;
10707 if (memcmp(s, suffixptr, suffixlen) != 0) return 0;
10708 if (rb_enc_left_char_head(strptr, s, strptr + olen, enc) != s) return 0;
10709
10710 return suffixlen;
10711}
10712
10713/*
10714 * call-seq:
10715 * delete_suffix!(suffix) -> self or nil
10716 *
10717 * Like String#delete_suffix, except that +self+ is modified in place.
10718 * Returns +self+ if the suffix is removed, +nil+ otherwise.
10719 *
10720 */
10721
10722static VALUE
10723rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10724{
10725 long olen, suffixlen, len;
10726 str_modifiable(str);
10727
10728 suffixlen = deleted_suffix_length(str, suffix);
10729 if (suffixlen <= 0) return Qnil;
10730
10731 olen = RSTRING_LEN(str);
10732 str_modify_keep_cr(str);
10733 len = olen - suffixlen;
10734 STR_SET_LEN(str, len);
10735 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10736 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10738 }
10739 return str;
10740}
10741
10742/*
10743 * call-seq:
10744 * delete_suffix(suffix) -> new_string
10745 *
10746 * :include: doc/string/delete_suffix.rdoc
10747 *
10748 */
10749
10750static VALUE
10751rb_str_delete_suffix(VALUE str, VALUE suffix)
10752{
10753 long suffixlen;
10754
10755 suffixlen = deleted_suffix_length(str, suffix);
10756 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10757
10758 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10759}
10760
10761void
10762rb_str_setter(VALUE val, ID id, VALUE *var)
10763{
10764 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10765 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10766 }
10767 *var = val;
10768}
10769
10770static void
10771rb_fs_setter(VALUE val, ID id, VALUE *var)
10772{
10773 val = rb_fs_check(val);
10774 if (!val) {
10776 "value of %"PRIsVALUE" must be String or Regexp",
10777 rb_id2str(id));
10778 }
10779 if (!NIL_P(val)) {
10780 rb_warn_deprecated("`$;'", NULL);
10781 }
10782 *var = val;
10783}
10784
10785
10786/*
10787 * call-seq:
10788 * force_encoding(encoding) -> self
10789 *
10790 * :include: doc/string/force_encoding.rdoc
10791 *
10792 */
10793
10794static VALUE
10795rb_str_force_encoding(VALUE str, VALUE enc)
10796{
10797 str_modifiable(str);
10798 rb_enc_associate(str, rb_to_encoding(enc));
10800 return str;
10801}
10802
10803/*
10804 * call-seq:
10805 * b -> string
10806 *
10807 * :include: doc/string/b.rdoc
10808 *
10809 */
10810
10811static VALUE
10812rb_str_b(VALUE str)
10813{
10814 VALUE str2;
10815 if (FL_TEST(str, STR_NOEMBED)) {
10816 str2 = str_alloc_heap(rb_cString);
10817 }
10818 else {
10819 str2 = str_alloc_embed(rb_cString, RSTRING_EMBED_LEN(str) + TERM_LEN(str));
10820 }
10821 str_replace_shared_without_enc(str2, str);
10822
10823 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
10824 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
10825 // If we know the receiver's code range then we know the result's code range.
10826 int cr = ENC_CODERANGE(str);
10827 switch (cr) {
10828 case ENC_CODERANGE_7BIT:
10830 break;
10834 break;
10835 default:
10836 ENC_CODERANGE_CLEAR(str2);
10837 break;
10838 }
10839 }
10840
10841 return str2;
10842}
10843
10844/*
10845 * call-seq:
10846 * valid_encoding? -> true or false
10847 *
10848 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
10849 *
10850 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
10851 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
10852 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
10853 */
10854
10855static VALUE
10856rb_str_valid_encoding_p(VALUE str)
10857{
10858 int cr = rb_enc_str_coderange(str);
10859
10860 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10861}
10862
10863/*
10864 * call-seq:
10865 * ascii_only? -> true or false
10866 *
10867 * Returns +true+ if +self+ contains only ASCII characters,
10868 * +false+ otherwise:
10869 *
10870 * 'abc'.ascii_only? # => true
10871 * "abc\u{6666}".ascii_only? # => false
10872 *
10873 */
10874
10875static VALUE
10876rb_str_is_ascii_only_p(VALUE str)
10877{
10878 int cr = rb_enc_str_coderange(str);
10879
10880 return RBOOL(cr == ENC_CODERANGE_7BIT);
10881}
10882
10883VALUE
10885{
10886 static const char ellipsis[] = "...";
10887 const long ellipsislen = sizeof(ellipsis) - 1;
10888 rb_encoding *const enc = rb_enc_get(str);
10889 const long blen = RSTRING_LEN(str);
10890 const char *const p = RSTRING_PTR(str), *e = p + blen;
10891 VALUE estr, ret = 0;
10892
10893 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10894 if (len * rb_enc_mbminlen(enc) >= blen ||
10895 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10896 ret = str;
10897 }
10898 else if (len <= ellipsislen ||
10899 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10900 if (rb_enc_asciicompat(enc)) {
10901 ret = rb_str_new(ellipsis, len);
10902 rb_enc_associate(ret, enc);
10903 }
10904 else {
10905 estr = rb_usascii_str_new(ellipsis, len);
10906 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10907 }
10908 }
10909 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10910 rb_str_cat(ret, ellipsis, ellipsislen);
10911 }
10912 else {
10913 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10914 rb_enc_from_encoding(enc), 0, Qnil);
10915 rb_str_append(ret, estr);
10916 }
10917 return ret;
10918}
10919
10920static VALUE
10921str_compat_and_valid(VALUE str, rb_encoding *enc)
10922{
10923 int cr;
10924 str = StringValue(str);
10925 cr = rb_enc_str_coderange(str);
10926 if (cr == ENC_CODERANGE_BROKEN) {
10927 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10928 }
10929 else {
10930 rb_encoding *e = STR_ENC_GET(str);
10931 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
10932 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
10933 rb_enc_name(enc), rb_enc_name(e));
10934 }
10935 }
10936 return str;
10937}
10938
10939static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
10940
10941VALUE
10943{
10944 rb_encoding *enc = STR_ENC_GET(str);
10945 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
10946}
10947
10948VALUE
10949rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
10950{
10951 int cr = ENC_CODERANGE_UNKNOWN;
10952 if (enc == STR_ENC_GET(str)) {
10953 /* cached coderange makes sense only when enc equals the
10954 * actual encoding of str */
10955 cr = ENC_CODERANGE(str);
10956 }
10957 return enc_str_scrub(enc, str, repl, cr);
10958}
10959
10960static VALUE
10961enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
10962{
10963 int encidx;
10964 VALUE buf = Qnil;
10965 const char *rep, *p, *e, *p1, *sp;
10966 long replen = -1;
10967 long slen;
10968
10969 if (rb_block_given_p()) {
10970 if (!NIL_P(repl))
10971 rb_raise(rb_eArgError, "both of block and replacement given");
10972 replen = 0;
10973 }
10974
10975 if (ENC_CODERANGE_CLEAN_P(cr))
10976 return Qnil;
10977
10978 if (!NIL_P(repl)) {
10979 repl = str_compat_and_valid(repl, enc);
10980 }
10981
10982 if (rb_enc_dummy_p(enc)) {
10983 return Qnil;
10984 }
10985 encidx = rb_enc_to_index(enc);
10986
10987#define DEFAULT_REPLACE_CHAR(str) do { \
10988 static const char replace[sizeof(str)-1] = str; \
10989 rep = replace; replen = (int)sizeof(replace); \
10990 } while (0)
10991
10992 slen = RSTRING_LEN(str);
10993 p = RSTRING_PTR(str);
10994 e = RSTRING_END(str);
10995 p1 = p;
10996 sp = p;
10997
10998 if (rb_enc_asciicompat(enc)) {
10999 int rep7bit_p;
11000 if (!replen) {
11001 rep = NULL;
11002 rep7bit_p = FALSE;
11003 }
11004 else if (!NIL_P(repl)) {
11005 rep = RSTRING_PTR(repl);
11006 replen = RSTRING_LEN(repl);
11007 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11008 }
11009 else if (encidx == rb_utf8_encindex()) {
11010 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11011 rep7bit_p = FALSE;
11012 }
11013 else {
11014 DEFAULT_REPLACE_CHAR("?");
11015 rep7bit_p = TRUE;
11016 }
11017 cr = ENC_CODERANGE_7BIT;
11018
11019 p = search_nonascii(p, e);
11020 if (!p) {
11021 p = e;
11022 }
11023 while (p < e) {
11024 int ret = rb_enc_precise_mbclen(p, e, enc);
11025 if (MBCLEN_NEEDMORE_P(ret)) {
11026 break;
11027 }
11028 else if (MBCLEN_CHARFOUND_P(ret)) {
11030 p += MBCLEN_CHARFOUND_LEN(ret);
11031 }
11032 else if (MBCLEN_INVALID_P(ret)) {
11033 /*
11034 * p1~p: valid ascii/multibyte chars
11035 * p ~e: invalid bytes + unknown bytes
11036 */
11037 long clen = rb_enc_mbmaxlen(enc);
11038 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11039 if (p > p1) {
11040 rb_str_buf_cat(buf, p1, p - p1);
11041 }
11042
11043 if (e - p < clen) clen = e - p;
11044 if (clen <= 2) {
11045 clen = 1;
11046 }
11047 else {
11048 const char *q = p;
11049 clen--;
11050 for (; clen > 1; clen--) {
11051 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11052 if (MBCLEN_NEEDMORE_P(ret)) break;
11053 if (MBCLEN_INVALID_P(ret)) continue;
11055 }
11056 }
11057 if (rep) {
11058 rb_str_buf_cat(buf, rep, replen);
11059 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11060 }
11061 else {
11062 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11063 str_mod_check(str, sp, slen);
11064 repl = str_compat_and_valid(repl, enc);
11065 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11068 }
11069 p += clen;
11070 p1 = p;
11071 p = search_nonascii(p, e);
11072 if (!p) {
11073 p = e;
11074 break;
11075 }
11076 }
11077 else {
11079 }
11080 }
11081 if (NIL_P(buf)) {
11082 if (p == e) {
11083 ENC_CODERANGE_SET(str, cr);
11084 return Qnil;
11085 }
11086 buf = rb_str_buf_new(RSTRING_LEN(str));
11087 }
11088 if (p1 < p) {
11089 rb_str_buf_cat(buf, p1, p - p1);
11090 }
11091 if (p < e) {
11092 if (rep) {
11093 rb_str_buf_cat(buf, rep, replen);
11094 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11095 }
11096 else {
11097 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11098 str_mod_check(str, sp, slen);
11099 repl = str_compat_and_valid(repl, enc);
11100 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11103 }
11104 }
11105 }
11106 else {
11107 /* ASCII incompatible */
11108 long mbminlen = rb_enc_mbminlen(enc);
11109 if (!replen) {
11110 rep = NULL;
11111 }
11112 else if (!NIL_P(repl)) {
11113 rep = RSTRING_PTR(repl);
11114 replen = RSTRING_LEN(repl);
11115 }
11116 else if (encidx == ENCINDEX_UTF_16BE) {
11117 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11118 }
11119 else if (encidx == ENCINDEX_UTF_16LE) {
11120 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11121 }
11122 else if (encidx == ENCINDEX_UTF_32BE) {
11123 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11124 }
11125 else if (encidx == ENCINDEX_UTF_32LE) {
11126 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11127 }
11128 else {
11129 DEFAULT_REPLACE_CHAR("?");
11130 }
11131
11132 while (p < e) {
11133 int ret = rb_enc_precise_mbclen(p, e, enc);
11134 if (MBCLEN_NEEDMORE_P(ret)) {
11135 break;
11136 }
11137 else if (MBCLEN_CHARFOUND_P(ret)) {
11138 p += MBCLEN_CHARFOUND_LEN(ret);
11139 }
11140 else if (MBCLEN_INVALID_P(ret)) {
11141 const char *q = p;
11142 long clen = rb_enc_mbmaxlen(enc);
11143 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11144 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11145
11146 if (e - p < clen) clen = e - p;
11147 if (clen <= mbminlen * 2) {
11148 clen = mbminlen;
11149 }
11150 else {
11151 clen -= mbminlen;
11152 for (; clen > mbminlen; clen-=mbminlen) {
11153 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11154 if (MBCLEN_NEEDMORE_P(ret)) break;
11155 if (MBCLEN_INVALID_P(ret)) continue;
11157 }
11158 }
11159 if (rep) {
11160 rb_str_buf_cat(buf, rep, replen);
11161 }
11162 else {
11163 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11164 str_mod_check(str, sp, slen);
11165 repl = str_compat_and_valid(repl, enc);
11166 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11167 }
11168 p += clen;
11169 p1 = p;
11170 }
11171 else {
11173 }
11174 }
11175 if (NIL_P(buf)) {
11176 if (p == e) {
11178 return Qnil;
11179 }
11180 buf = rb_str_buf_new(RSTRING_LEN(str));
11181 }
11182 if (p1 < p) {
11183 rb_str_buf_cat(buf, p1, p - p1);
11184 }
11185 if (p < e) {
11186 if (rep) {
11187 rb_str_buf_cat(buf, rep, replen);
11188 }
11189 else {
11190 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11191 str_mod_check(str, sp, slen);
11192 repl = str_compat_and_valid(repl, enc);
11193 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11194 }
11195 }
11197 }
11198 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11199 return buf;
11200}
11201
11202/*
11203 * call-seq:
11204 * scrub(replacement_string = default_replacement) -> new_string
11205 * scrub{|bytes| ... } -> new_string
11206 *
11207 * :include: doc/string/scrub.rdoc
11208 *
11209 */
11210static VALUE
11211str_scrub(int argc, VALUE *argv, VALUE str)
11212{
11213 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11214 VALUE new = rb_str_scrub(str, repl);
11215 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11216}
11217
11218/*
11219 * call-seq:
11220 * scrub! -> self
11221 * scrub!(replacement_string = default_replacement) -> self
11222 * scrub!{|bytes| ... } -> self
11223 *
11224 * Like String#scrub, except that any replacements are made in +self+.
11225 *
11226 */
11227static VALUE
11228str_scrub_bang(int argc, VALUE *argv, VALUE str)
11229{
11230 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11231 VALUE new = rb_str_scrub(str, repl);
11232 if (!NIL_P(new)) rb_str_replace(str, new);
11233 return str;
11234}
11235
11236static ID id_normalize;
11237static ID id_normalized_p;
11238static VALUE mUnicodeNormalize;
11239
11240static VALUE
11241unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11242{
11243 static int UnicodeNormalizeRequired = 0;
11244 VALUE argv2[2];
11245
11246 if (!UnicodeNormalizeRequired) {
11247 rb_require("unicode_normalize/normalize.rb");
11248 UnicodeNormalizeRequired = 1;
11249 }
11250 argv2[0] = str;
11251 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11252 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11253}
11254
11255/*
11256 * call-seq:
11257 * unicode_normalize(form = :nfc) -> string
11258 *
11259 * Returns a copy of +self+ with
11260 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11261 *
11262 * Argument +form+ must be one of the following symbols
11263 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11264 *
11265 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11266 * - +:nfd+: Canonical decomposition.
11267 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11268 * - +:nfkd+: Compatibility decomposition.
11269 *
11270 * The encoding of +self+ must be one of:
11271 *
11272 * - Encoding::UTF_8
11273 * - Encoding::UTF_16BE
11274 * - Encoding::UTF_16LE
11275 * - Encoding::UTF_32BE
11276 * - Encoding::UTF_32LE
11277 * - Encoding::GB18030
11278 * - Encoding::UCS_2BE
11279 * - Encoding::UCS_4BE
11280 *
11281 * Examples:
11282 *
11283 * "a\u0300".unicode_normalize # => "à" (U+00E0)
11284 * "\u00E0".unicode_normalize(:nfd) # => "à" (U+0061 U+0300)
11285 *
11286 * Related: String#unicode_normalize!, String#unicode_normalized?.
11287 */
11288static VALUE
11289rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11290{
11291 return unicode_normalize_common(argc, argv, str, id_normalize);
11292}
11293
11294/*
11295 * call-seq:
11296 * unicode_normalize!(form = :nfc) -> self
11297 *
11298 * Like String#unicode_normalize, except that the normalization
11299 * is performed on +self+.
11300 *
11301 * Related: String#unicode_normalized?.
11302 *
11303 */
11304static VALUE
11305rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11306{
11307 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11308}
11309
11310/* call-seq:
11311 * unicode_normalized?(form = :nfc) -> true or false
11312 *
11313 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11314 * +false+ otherwise.
11315 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11316 *
11317 * Examples:
11318 *
11319 * "a\u0300".unicode_normalized? # => false
11320 * "a\u0300".unicode_normalized?(:nfd) # => true
11321 * "\u00E0".unicode_normalized? # => true
11322 * "\u00E0".unicode_normalized?(:nfd) # => false
11323 *
11324 *
11325 * Raises an exception if +self+ is not in a Unicode encoding:
11326 *
11327 * s = "\xE0".force_encoding('ISO-8859-1')
11328 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11329 *
11330 * Related: String#unicode_normalize, String#unicode_normalize!.
11331 *
11332 */
11333static VALUE
11334rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11335{
11336 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11337}
11338
11339/**********************************************************************
11340 * Document-class: Symbol
11341 *
11342 * Symbol objects represent named identifiers inside the Ruby interpreter.
11343 *
11344 * You can create a \Symbol object explicitly with:
11345 *
11346 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11347 *
11348 * The same Symbol object will be
11349 * created for a given name or string for the duration of a program's
11350 * execution, regardless of the context or meaning of that name. Thus
11351 * if <code>Fred</code> is a constant in one context, a method in
11352 * another, and a class in a third, the Symbol <code>:Fred</code>
11353 * will be the same object in all three contexts.
11354 *
11355 * module One
11356 * class Fred
11357 * end
11358 * $f1 = :Fred
11359 * end
11360 * module Two
11361 * Fred = 1
11362 * $f2 = :Fred
11363 * end
11364 * def Fred()
11365 * end
11366 * $f3 = :Fred
11367 * $f1.object_id #=> 2514190
11368 * $f2.object_id #=> 2514190
11369 * $f3.object_id #=> 2514190
11370 *
11371 * Constant, method, and variable names are returned as symbols:
11372 *
11373 * module One
11374 * Two = 2
11375 * def three; 3 end
11376 * @four = 4
11377 * @@five = 5
11378 * $six = 6
11379 * end
11380 * seven = 7
11381 *
11382 * One.constants
11383 * # => [:Two]
11384 * One.instance_methods(true)
11385 * # => [:three]
11386 * One.instance_variables
11387 * # => [:@four]
11388 * One.class_variables
11389 * # => [:@@five]
11390 * global_variables.grep(/six/)
11391 * # => [:$six]
11392 * local_variables
11393 * # => [:seven]
11394 *
11395 * Symbol objects are different from String objects in that
11396 * Symbol objects represent identifiers, while String objects
11397 * represent text or data.
11398 *
11399 * == What's Here
11400 *
11401 * First, what's elsewhere. \Class \Symbol:
11402 *
11403 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11404 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11405 *
11406 * Here, class \Symbol provides methods that are useful for:
11407 *
11408 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11409 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11410 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11411 *
11412 * === Methods for Querying
11413 *
11414 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11415 * - #=~: Returns the index of the first substring in symbol that matches a
11416 * given Regexp or other object; returns +nil+ if no match is found.
11417 * - #[], #slice : Returns a substring of symbol
11418 * determined by a given index, start/length, or range, or string.
11419 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11420 * - #encoding: Returns the Encoding object that represents the encoding
11421 * of symbol.
11422 * - #end_with?: Returns +true+ if symbol ends with
11423 * any of the given strings.
11424 * - #match: Returns a MatchData object if symbol
11425 * matches a given Regexp; +nil+ otherwise.
11426 * - #match?: Returns +true+ if symbol
11427 * matches a given Regexp; +false+ otherwise.
11428 * - #length, #size: Returns the number of characters in symbol.
11429 * - #start_with?: Returns +true+ if symbol starts with
11430 * any of the given strings.
11431 *
11432 * === Methods for Comparing
11433 *
11434 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11435 * or larger than symbol.
11436 * - #==, #===: Returns +true+ if a given symbol has the same content and
11437 * encoding.
11438 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
11439 * symbol is smaller than, equal to, or larger than symbol.
11440 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
11441 * after Unicode case folding; +false+ otherwise.
11442 *
11443 * === Methods for Converting
11444 *
11445 * - #capitalize: Returns symbol with the first character upcased
11446 * and all other characters downcased.
11447 * - #downcase: Returns symbol with all characters downcased.
11448 * - #inspect: Returns the string representation of +self+ as a symbol literal.
11449 * - #name: Returns the frozen string corresponding to symbol.
11450 * - #succ, #next: Returns the symbol that is the successor to symbol.
11451 * - #swapcase: Returns symbol with all upcase characters downcased
11452 * and all downcase characters upcased.
11453 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
11454 * - #to_s, #id2name: Returns the string corresponding to +self+.
11455 * - #to_sym, #intern: Returns +self+.
11456 * - #upcase: Returns symbol with all characters upcased.
11457 *
11458 */
11459
11460
11461/*
11462 * call-seq:
11463 * symbol == object -> true or false
11464 *
11465 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
11466 *
11467 * Symbol#=== is an alias for Symbol#==.
11468 *
11469 */
11470
11471#define sym_equal rb_obj_equal /* Symbol#== and Symbol#=== are registered with this name in Init_String */
11472
11473static int
11474sym_printable(const char *s, const char *send, rb_encoding *enc)
11475{
11476 while (s < send) {
11477 int n;
11478 int c = rb_enc_precise_mbclen(s, send, enc);
11479
11480 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11481 n = MBCLEN_CHARFOUND_LEN(c);
11482 c = rb_enc_mbc_to_codepoint(s, send, enc);
11483 if (!rb_enc_isprint(c, enc)) return FALSE;
11484 s += n;
11485 }
11486 return TRUE;
11487}
11488
11489int
11490rb_str_symname_p(VALUE sym)
11491{
11492 rb_encoding *enc;
11493 const char *ptr;
11494 long len;
11495 rb_encoding *resenc = rb_default_internal_encoding();
11496
11497 if (resenc == NULL) resenc = rb_default_external_encoding();
11498 enc = STR_ENC_GET(sym);
11499 ptr = RSTRING_PTR(sym);
11500 len = RSTRING_LEN(sym);
11501 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11502 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11503 return FALSE;
11504 }
11505 return TRUE;
11506}
11507
11508VALUE
11509rb_str_quote_unprintable(VALUE str)
11510{
11511 rb_encoding *enc;
11512 const char *ptr;
11513 long len;
11514 rb_encoding *resenc;
11515
11516 Check_Type(str, T_STRING);
11517 resenc = rb_default_internal_encoding();
11518 if (resenc == NULL) resenc = rb_default_external_encoding();
11519 enc = STR_ENC_GET(str);
11520 ptr = RSTRING_PTR(str);
11521 len = RSTRING_LEN(str);
11522 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11523 !sym_printable(ptr, ptr + len, enc)) {
11524 return rb_str_escape(str);
11525 }
11526 return str;
11527}
11528
11529MJIT_FUNC_EXPORTED VALUE
11530rb_id_quote_unprintable(ID id)
11531{
11532 VALUE str = rb_id2str(id);
11533 if (!rb_str_symname_p(str)) {
11534 return rb_str_escape(str);
11535 }
11536 return str;
11537}
11538
11539/*
11540 * call-seq:
11541 * inspect -> string
11542 *
11543 * Returns a string representation of +self+ (including the leading colon):
11544 *
11545 * :foo.inspect # => ":foo"
11546 *
11547 * Related: Symbol#to_s, Symbol#name.
11548 *
11549 */
11550
11551static VALUE
11552sym_inspect(VALUE sym)
11553{
11554 VALUE str = rb_sym2str(sym);
11555 const char *ptr;
11556 long len;
11557 char *dest;
11558
11559 if (!rb_str_symname_p(str)) {
11560 str = rb_str_inspect(str);
11561 len = RSTRING_LEN(str);
11562 rb_str_resize(str, len + 1);
11563 dest = RSTRING_PTR(str);
11564 memmove(dest + 1, dest, len);
11565 }
11566 else {
11567 rb_encoding *enc = STR_ENC_GET(str);
11568 RSTRING_GETMEM(str, ptr, len);
11569 str = rb_enc_str_new(0, len + 1, enc);
11570 dest = RSTRING_PTR(str);
11571 memcpy(dest + 1, ptr, len);
11572 }
11573 dest[0] = ':';
11574 return str;
11575}
11576
11577/*
11578 * call-seq:
11579 * to_s -> string
11580 *
11581 * Returns a string representation of +self+ (not including the leading colon):
11582 *
11583 * :foo.to_s # => "foo"
11584 *
11585 * Symbol#id2name is an alias for Symbol#to_s.
11586 *
11587 * Related: Symbol#inspect, Symbol#name.
11588 */
11589
11590VALUE
11592{
11593 return str_new_shared(rb_cString, rb_sym2str(sym));
11594}
11595
11596MJIT_FUNC_EXPORTED VALUE
11597rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11598{
11599 VALUE obj;
11600
11601 if (argc < 1) {
11602 rb_raise(rb_eArgError, "no receiver given");
11603 }
11604 obj = argv[0];
11605 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11606}
11607
11608/*
11609 * call-seq:
11610 * succ
11611 *
11612 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
11613 *
11614 * :foo.succ # => :fop
11615 *
11616 * Symbol#next is an alias for Symbol#succ.
11617 *
11618 * Related: String#succ.
11619 */
11620
11621static VALUE
11622sym_succ(VALUE sym)
11623{
11624 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11625}
11626
11627/*
11628 * call-seq:
11629 * symbol <=> object -> -1, 0, +1, or nil
11630 *
11631 * If +object+ is a symbol,
11632 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
11633 *
11634 * :bar <=> :foo # => -1
11635 * :foo <=> :foo # => 0
11636 * :foo <=> :bar # => 1
11637 *
11638 * Otherwise, returns +nil+:
11639 *
11640 * :foo <=> 'bar' # => nil
11641 *
11642 * Related: String#<=>.
11643 */
11644
11645static VALUE
11646sym_cmp(VALUE sym, VALUE other)
11647{
11648 if (!SYMBOL_P(other)) {
11649 return Qnil;
11650 }
11651 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11652}
11653
11654/*
11655 * call-seq:
11656 * casecmp(object) -> -1, 0, 1, or nil
11657 *
11658 * :include: doc/symbol/casecmp.rdoc
11659 *
11660 */
11661
11662static VALUE
11663sym_casecmp(VALUE sym, VALUE other)
11664{
11665 if (!SYMBOL_P(other)) {
11666 return Qnil;
11667 }
11668 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11669}
11670
11671/*
11672 * call-seq:
11673 * casecmp?(object) -> true, false, or nil
11674 *
11675 * :include: doc/symbol/casecmp_p.rdoc
11676 *
11677 */
11678
11679static VALUE
11680sym_casecmp_p(VALUE sym, VALUE other)
11681{
11682 if (!SYMBOL_P(other)) {
11683 return Qnil;
11684 }
11685 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11686}
11687
11688/*
11689 * call-seq:
11690 * symbol =~ object -> integer or nil
11691 *
11692 * Equivalent to <tt>symbol.to_s =~ object</tt>,
11693 * including possible updates to global variables;
11694 * see String#=~.
11695 *
11696 */
11697
11698static VALUE
11699sym_match(VALUE sym, VALUE other)
11700{
11701 return rb_str_match(rb_sym2str(sym), other);
11702}
11703
11704/*
11705 * call-seq:
11706 * match(pattern, offset = 0) -> matchdata or nil
11707 * match(pattern, offset = 0) {|matchdata| } -> object
11708 *
11709 * Equivalent to <tt>self.to_s.match</tt>,
11710 * including possible updates to global variables;
11711 * see String#match.
11712 *
11713 */
11714
11715static VALUE
11716sym_match_m(int argc, VALUE *argv, VALUE sym)
11717{
11718 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11719}
11720
11721/*
11722 * call-seq:
11723 * match?(pattern, offset = 0) -> true or false
11724 *
11725 * Equivalent to <tt>sym.to_s.match?</tt>;
11726 * see String#match?.
11727 *
11728 */
11729
11730static VALUE
11731sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11732{
11733 return rb_str_match_m_p(argc, argv, sym);
11734}
11735
11736/*
11737 * call-seq:
11738 * symbol[index] -> string or nil
11739 * symbol[start, length] -> string or nil
11740 * symbol[range] -> string or nil
11741 * symbol[regexp, capture = 0] -> string or nil
11742 * symbol[substring] -> string or nil
11743 *
11744 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
11745 *
11746 */
11747
11748static VALUE
11749sym_aref(int argc, VALUE *argv, VALUE sym)
11750{
11751 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11752}
11753
11754/*
11755 * call-seq:
11756 * length -> integer
11757 *
11758 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
11759 *
11760 * Symbol#size is an alias for Symbol#length.
11761 *
11762 */
11763
11764static VALUE
11765sym_length(VALUE sym)
11766{
11767 return rb_str_length(rb_sym2str(sym));
11768}
11769
11770/*
11771 * call-seq:
11772 * empty? -> true or false
11773 *
11774 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
11775 *
11776 */
11777
11778static VALUE
11779sym_empty(VALUE sym)
11780{
11781 return rb_str_empty(rb_sym2str(sym));
11782}
11783
11784/*
11785 * call-seq:
11786 * upcase(*options) -> symbol
11787 *
11788 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11789 *
11790 * See String#upcase.
11791 *
11792 */
11793
11794static VALUE
11795sym_upcase(int argc, VALUE *argv, VALUE sym)
11796{
11797 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11798}
11799
11800/*
11801 * call-seq:
11802 * downcase(*options) -> symbol
11803 *
11804 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11805 *
11806 * See String#downcase.
11807 *
11808 * Related: Symbol#upcase.
11809 *
11810 */
11811
11812static VALUE
11813sym_downcase(int argc, VALUE *argv, VALUE sym)
11814{
11815 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11816}
11817
11818/*
11819 * call-seq:
11820 * capitalize(*options) -> symbol
11821 *
11822 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11823 *
11824 * See String#capitalize.
11825 *
11826 */
11827
11828static VALUE
11829sym_capitalize(int argc, VALUE *argv, VALUE sym)
11830{
11831 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11832}
11833
11834/*
11835 * call-seq:
11836 * swapcase(*options) -> symbol
11837 *
11838 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11839 *
11840 * See String#swapcase.
11841 *
11842 */
11843
11844static VALUE
11845sym_swapcase(int argc, VALUE *argv, VALUE sym)
11846{
11847 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11848}
11849
11850/*
11851 * call-seq:
11852 * start_with?(*string_or_regexp) -> true or false
11853 *
11854 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
11855 *
11856 */
11857
11858static VALUE
11859sym_start_with(int argc, VALUE *argv, VALUE sym)
11860{
11861 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11862}
11863
11864/*
11865 * call-seq:
11866 * end_with?(*string_or_regexp) -> true or false
11867 *
11868 *
11869 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
11870 *
11871 */
11872
11873static VALUE
11874sym_end_with(int argc, VALUE *argv, VALUE sym)
11875{
11876 return rb_str_end_with(argc, argv, rb_sym2str(sym));
11877}
11878
11879/*
11880 * call-seq:
11881 * encoding -> encoding
11882 *
11883 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
11884 *
11885 */
11886
11887static VALUE
11888sym_encoding(VALUE sym)
11889{
11890 return rb_obj_encoding(rb_sym2str(sym));
11891}
11892
11893static VALUE
11894string_for_symbol(VALUE name)
11895{
11896 if (!RB_TYPE_P(name, T_STRING)) {
11897 VALUE tmp = rb_check_string_type(name);
11898 if (NIL_P(tmp)) {
11899 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11900 name);
11901 }
11902 name = tmp;
11903 }
11904 return name;
11905}
11906
11907ID
11909{
11910 if (SYMBOL_P(name)) {
11911 return SYM2ID(name);
11912 }
11913 name = string_for_symbol(name);
11914 return rb_intern_str(name);
11915}
11916
11917VALUE
11919{
11920 if (SYMBOL_P(name)) {
11921 return name;
11922 }
11923 name = string_for_symbol(name);
11924 return rb_str_intern(name);
11925}
11926
11927/*
11928 * call-seq:
11929 * Symbol.all_symbols -> array_of_symbols
11930 *
11931 * Returns an array of all symbols currently in Ruby's symbol table:
11932 *
11933 * Symbol.all_symbols.size # => 9334
11934 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
11935 *
11936 */
11937
11938static VALUE
11939sym_all_symbols(VALUE _)
11940{
11941 return rb_sym_all_symbols();
11942}
11943
11944VALUE
11946{
11947 return rb_fstring(str);
11948}
11949
11950VALUE
11951rb_interned_str(const char *ptr, long len)
11952{
11953 struct RString fake_str;
11954 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
11955}
11956
11957VALUE
11959{
11960 return rb_interned_str(ptr, strlen(ptr));
11961}
11962
11963VALUE
11964rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
11965{
11966 if (UNLIKELY(rb_enc_autoload_p(enc))) {
11967 rb_enc_autoload(enc);
11968 }
11969
11970 struct RString fake_str;
11971 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
11972}
11973
11974VALUE
11976{
11977 return rb_enc_interned_str(ptr, strlen(ptr), enc);
11978}
11979
/*
 * Defines the String and Symbol classes and registers their methods, the
 * UnicodeNormalize module handle, and the $; / $-F hooked variables.
 *
 * NOTE(review): the listing numbers fused to each line skip values in several
 * places (e.g. 11985 -> 11987, 11991 -> 11994, 12150 -> 12154), so some
 * statements of the original function appear to be missing from this copy;
 * confirm against the upstream file before relying on it.
 */
11980void
11981Init_String(void)
11982{
11983 rb_cString = rb_define_class("String", rb_cObject);
11984 assert(rb_vm_fstring_table());
    /* Visit every entry of the VM fstring table with fstring_set_class_i, passing rb_cString. */
11985 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
11987 rb_define_alloc_func(rb_cString, empty_str_alloc);
11988 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
11989 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
11990 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
11991 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
11994 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
11995 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
11996 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
11997 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12000 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12001 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12002 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12003 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12006 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12007 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12008 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12009 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12010 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12012 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12014 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12015 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12016 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12017 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12018 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12019 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12021 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12022 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12023 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12024 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12025 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12026 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12027 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12028 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12030 rb_define_method(rb_cString, "+@", str_uplus, 0);
12031 rb_define_method(rb_cString, "-@", str_uminus, 0);
12032 rb_define_alias(rb_cString, "dedup", "-@");
12033
12034 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12035 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12036 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12037 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12040 rb_define_method(rb_cString, "undump", str_undump, 0);
12041
    /* NOTE(review): presumably the option symbols accepted by the case-mapping
     * methods registered below; confirm where sym_ascii etc. are consumed. */
12042 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12043 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12044 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12045 sym_fold = ID2SYM(rb_intern_const("fold"));
12046
12047 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12048 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12049 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12050 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12051
12052 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12053 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12054 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12055 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12056
12057 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12058 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12059 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12060 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12061 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12062 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12063 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12064 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12065 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12066 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12067 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12069 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12070 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12071 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12072 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12073 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12074
12075 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12076 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12077 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12078
12079 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12080
12081 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12082 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12083 rb_define_method(rb_cString, "center", rb_str_center, -1);
12084
12085 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12086 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12087 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12088 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12089 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12090 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12091 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12092 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12093 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12094
12095 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12096 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12097 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12098 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12099 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12100 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12101 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12102 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12103 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12104
12105 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12106 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12107 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12108 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12109 rb_define_method(rb_cString, "count", rb_str_count, -1);
12110
12111 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12112 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12113 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12114 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12115
12116 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12117 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12118 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12119 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12120 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12121
12122 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12123
12124 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12125 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12126
12127 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12128 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12129
12130 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12131 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12132 rb_define_method(rb_cString, "b", rb_str_b, 0);
12133 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12134 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12135
12136 /* define UnicodeNormalize module here so that we don't have to look it up */
12137 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12138 id_normalize = rb_intern_const("normalize");
12139 id_normalized_p = rb_intern_const("normalized?");
12140
12141 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12142 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12143 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12144
    /* $; and $-F are both backed by rb_fs, start out nil, and share
     * rb_fs_setter as their setter hook; rb_fs is also registered with the GC. */
12145 rb_fs = Qnil;
12146 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12147 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12148 rb_gc_register_address(&rb_fs);
12149
    /* Symbol class and its methods. */
12150 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12154 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12155
12156 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12157 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12158 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12160 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12161 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */
12162 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12163 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12164 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12165
12166 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12167 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12168 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12169 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12170
12171 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12172 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12173 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12174 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12175 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12176 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12177 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12178
12179 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12180 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12181 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12182 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12183
12184 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12185 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12186
12187 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12188}
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:177
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:167
Atomic operations.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition ctype.h:82
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1200
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:906
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:356
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1125
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:923
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1033
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2284
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2108
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv to given VALUE references according to the format string.
Definition class.c:2574
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:868
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2363
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:107
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:105
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:142
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:67
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:398
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:145
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
Definition fl_type.h:144
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:143
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:108
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:395
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:533
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:140
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:137
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:652
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:66
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:534
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:535
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:97
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:532
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:67
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:139
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:68
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:141
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:109
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:138
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:146
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:68
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports always regardless of runtime -W flag.
Definition error.c:421
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition error.c:3150
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:688
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3262
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition error.c:794
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1095
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1091
void rb_fatal(const char *fmt,...)
Raises the unsung "fatal" exception.
Definition error.c:3201
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1098
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1089
VALUE rb_eArgError
ArgumentError exception.
Definition error.c:1092
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1093
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:589
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:1939
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1194
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
Definition object.c:3416
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:190
VALUE rb_cSymbol
Symbol class.
Definition string.c:80
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:122
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
Definition object.c:1182
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:79
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3026
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition rgengc.h:220
Encoding related APIs.
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
Definition encoding.h:433
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:699
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
Definition encoding.h:678
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:720
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
Definition encoding.h:784
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:587
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
Definition encoding.h:659
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:463
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition encoding.h:607
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:448
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:635
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:742
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1208
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:821
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1074
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2716
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1093
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:11964
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:249
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2060
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
Definition string.c:3288
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:1021
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
Definition string.c:981
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1313
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1214
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:833
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:11975
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:719
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:407
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1453
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2630
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2884
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1709
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1102
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1189
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
Definition error.h:264
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:280
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:604
VALUE rb_default_rs
This is the default value of rb_rs, i.e. "\n".
Definition io.c:200
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1662
VALUE rb_sym_all_symbols(void)
Collects every single bit of symbols that have ever been interned in the entire history of the current pr...
Definition symbol.c:1009
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1668
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1578
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1229
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4114
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3597
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1435
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1861
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby's string instead of C's.
Definition string.c:11945
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1571
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
Definition string.c:1376
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2211
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3353
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1289
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:11591
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2283
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1265
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1565
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2744
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:4822
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:3577
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
Definition string.c:2826
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:10884
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1741
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1618
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:1056
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:871
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1382
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1834
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition string.c:2437
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:3567
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
Definition string.c:3177
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2149
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1840
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6024
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
Definition string.c:2834
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
Definition string.c:11958
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
Definition string.c:1295
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
Definition string.c:3319
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:2791
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:3679
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3020
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:6706
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2489
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:11951
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:3633
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3453
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable each other or not.
Definition string.c:3608
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3295
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:2942
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5326
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:10942
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1513
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2640
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:2921
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3003
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3064
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1068
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2445
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:6820
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1277
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1532
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2163
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5252
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:8890
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1062
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
Definition symbol.c:844
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
Definition string.c:1682
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2805
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1142
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:276
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
Definition symbol.c:789
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition symbol.c:942
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:11918
ID rb_to_id(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition string.c:11908
ID rb_intern_str(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
Definition symbol.c:795
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1765
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3376
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4358
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:214
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1357
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:366
#define ALLOCA_N(type, n)
Definition memory.h:286
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:354
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:161
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:343
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:69
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:152
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:71
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition rgengc.h:107
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:139
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
@ RSTRING_EMBED_LEN_MAX
Max possible number of characters that can be embedded.
Definition rstring.h:215
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:72
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1307
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2617
static long RSTRING_EMBED_LEN(VALUE str)
Queries the length of the string.
Definition rstring.h:423
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except that the return type differs.
Definition rstring.h:554
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
Definition rstring.h:528
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:574
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2501
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
Definition rstring.h:484
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into the default external encoding.
Definition string.c:1301
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2512
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1609
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
Definition rstring.h:498
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks the contents for viability as a C string.
Definition rstring.h:95
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:441
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1330
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:77
Ruby's String.
Definition rstring.h:231
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use a separate memory region for their contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:234
long capa
Capacity of *ptr.
Definition rstring.h:268
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:250
char ary[RSTRING_EMBED_LEN_MAX+1]
When a string is short enough, it uses this area to store the contents themselves.
Definition rstring.h:298
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition rstring.h:276
char * ptr
Pointer to the contents of the string.
Definition rstring.h:258
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:190
Definition st.h:79
Definition string.c:7775
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:299
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predicate failure.
Definition value_type.h:432
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.
Definition value_type.h:375