14#include "ruby/internal/config.h"
24#include "debug_counter.h"
29#include "internal/array.h"
30#include "internal/compar.h"
31#include "internal/compilers.h"
32#include "internal/encoding.h"
33#include "internal/error.h"
34#include "internal/gc.h"
35#include "internal/numeric.h"
36#include "internal/object.h"
37#include "internal/proc.h"
38#include "internal/re.h"
39#include "internal/sanitizers.h"
40#include "internal/string.h"
41#include "internal/transcode.h"
46#include "ruby_assert.h"
49#if defined HAVE_CRYPT_R
50# if defined HAVE_CRYPT_H
53#elif !defined HAVE_CRYPT
54# include "missing/crypt.h"
55# define HAVE_CRYPT_R 1
/* Expands to regs->beg[no]. Relies on a variable named `regs` being in scope
 * at the use site (presumably regexp match registers -- TODO confirm). */
58#define BEG(no) (regs->beg[(no)])
/* Expands to regs->end[no]; same in-scope `regs` requirement as BEG. */
59#define END(no) (regs->end[(no)])
62#undef rb_usascii_str_new
66#undef rb_usascii_str_new_cstr
67#undef rb_utf8_str_new_cstr
68#undef rb_enc_str_new_cstr
69#undef rb_external_str_new_cstr
70#undef rb_locale_str_new_cstr
71#undef rb_str_dup_frozen
72#undef rb_str_buf_new_cstr
/* NOTE(review): not used in the visible excerpt; presumably an upper bound on
 * the byte length of one character -- confirm against its users. */
103#define RUBY_MAX_CHAR_LEN 16
/* Flag that STR_SET_SHARED sets on the string another string shares from. */
104#define STR_SHARED_ROOT FL_USER5
/* Flag that STR_SET_SHARED additionally sets on the shared root when that
 * root's RBASIC_CLASS is 0. */
105#define STR_BORROWED FL_USER6
/* Flag tested by rb_check_lockedtmp(), which str_modifiable() calls. */
106#define STR_TMPLOCK FL_USER7
/* Flag set by str_new_static(). str_discard() only frees the heap buffer when
 * neither STR_SHARED nor this flag is set. */
107#define STR_NOFREE FL_USER18
/* Flag tested by STR_SET_SHARED before it links a string to a shared root. */
108#define STR_FAKESTR FL_USER19
110#define STR_SET_NOEMBED(str) do {\
111 FL_SET((str), STR_NOEMBED);\
113 FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
116 STR_SET_EMBED_LEN((str), 0);\
/* Clear both the STR_NOEMBED and STR_NOFREE flags on str. */
119#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))
121# define STR_SET_EMBED_LEN(str, n) do { \
122 assert(str_embed_capa(str) > (n));\
123 RSTRING(str)->as.embed.len = (n);\
126# define STR_SET_EMBED_LEN(str, n) do { \
128 RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
129 RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
133#define STR_SET_LEN(str, n) do { \
134 if (STR_EMBED_P(str)) {\
135 STR_SET_EMBED_LEN((str), (n));\
138 RSTRING(str)->as.heap.len = (n);\
142#define STR_DEC_LEN(str) do {\
143 if (STR_EMBED_P(str)) {\
144 long n = RSTRING_LEN(str);\
146 STR_SET_EMBED_LEN((str), n);\
149 RSTRING(str)->as.heap.len--;\
154str_enc_fastpath(
VALUE str)
158 case ENCINDEX_ASCII_8BIT:
160 case ENCINDEX_US_ASCII:
/* Terminator length for str: 1 when str_enc_fastpath(str) is true, otherwise
 * rb_enc_mbminlen() of the encoding selected by ENCODING_GET(str).
 * Evaluates `str` more than once. */
167#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))
168#define TERM_FILL(ptr, termlen) do {\
169 char *const term_fill_ptr = (ptr);\
170 const int term_fill_len = (termlen);\
171 *term_fill_ptr = '\0';\
172 if (UNLIKELY(term_fill_len > 1))\
173 memset(term_fill_ptr, 0, term_fill_len);\
176#define RESIZE_CAPA(str,capacity) do {\
177 const int termlen = TERM_LEN(str);\
178 RESIZE_CAPA_TERM(str,capacity,termlen);\
180#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
181 if (STR_EMBED_P(str)) {\
182 if (str_embed_capa(str) < capacity + termlen) {\
183 char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
184 const long tlen = RSTRING_LEN(str);\
185 memcpy(tmp, RSTRING_PTR(str), tlen);\
186 RSTRING(str)->as.heap.ptr = tmp;\
187 RSTRING(str)->as.heap.len = tlen;\
188 STR_SET_NOEMBED(str);\
189 RSTRING(str)->as.heap.aux.capa = (capacity);\
193 assert(!FL_TEST((str), STR_SHARED)); \
194 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
195 (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
196 RSTRING(str)->as.heap.aux.capa = (capacity);\
200#define STR_SET_SHARED(str, shared_str) do { \
201 if (!FL_TEST(str, STR_FAKESTR)) { \
202 assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
203 assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
204 RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
205 FL_SET((str), STR_SHARED); \
206 FL_SET((shared_str), STR_SHARED_ROOT); \
207 if (RBASIC_CLASS((shared_str)) == 0) \
208 FL_SET_RAW((shared_str), STR_BORROWED); \
/* The as.heap.ptr field of str. No embed check is done here
 * (NOTE(review): callers presumably ensure str is not embedded). */
212#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
/* as.heap.aux.capa plus TERM_LEN(str), as size_t. This is the value passed as
 * the old size to SIZED_REALLOC_N and ruby_sized_xfree elsewhere in the file. */
213#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* Shorthand for get_encoding(str). */
216#define STR_ENC_GET(str) get_encoding(str)
218#if !defined SHARABLE_MIDDLE_SUBSTRING
219# define SHARABLE_MIDDLE_SUBSTRING 0
221#if !SHARABLE_MIDDLE_SUBSTRING
222#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
224#define SHARABLE_SUBSTRING_P(beg, len, end) 1
229str_embed_capa(
VALUE str)
232 return rb_gc_obj_slot_size(str) - offsetof(
struct RString, as.
embed.
ary);
239rb_str_reembeddable_p(
VALUE str)
241 return !
FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
245rb_str_embed_size(
long capa)
251rb_str_size_as_embedded(
VALUE str)
255 if (STR_EMBED_P(str)) {
256 real_size = rb_str_embed_size(
RSTRING(str)->as.embed.len) + TERM_LEN(str);
260 else if (rb_str_reembeddable_p(str)) {
261 real_size = rb_str_embed_size(
RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
265 real_size =
sizeof(
struct RString);
273STR_EMBEDDABLE_P(
long len,
long termlen)
276 return rb_gc_size_allocatable_p(rb_str_embed_size(
len + termlen));
284static VALUE str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding);
285static VALUE str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex);
287static void str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen);
288static inline void str_modifiable(
VALUE str);
292str_make_independent(
VALUE str)
295 int termlen = TERM_LEN(str);
296 str_make_independent_expand((str),
len, 0L, termlen);
299static inline int str_dependent_p(
VALUE str);
302rb_str_make_independent(
VALUE str)
304 if (str_dependent_p(str)) {
305 str_make_independent(str);
310rb_str_make_embedded(
VALUE str)
315 char *buf =
RSTRING(str)->as.heap.ptr;
319 STR_SET_EMBED_LEN(str,
len);
326 TERM_FILL(
RSTRING(str)->
as.embed.ary +
len, TERM_LEN(str));
333 if (new_root == old_root) {
339 if (!STR_EMBED_P(new_root)) {
343 size_t offset = (size_t)((uintptr_t)
RSTRING(str)->as.heap.ptr - (uintptr_t)
RSTRING(old_root)->as.embed.ary);
346 RSTRING(str)->as.heap.ptr =
RSTRING(new_root)->as.embed.ary + offset;
350rb_debug_rstring_null_ptr(
const char *func)
352 fprintf(stderr,
"%s is returning NULL!! "
353 "SIGSEGV is highly expected to follow immediately.\n"
354 "If you could reproduce, attach your debugger here, "
355 "and look at the passed string.\n",
360static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
363get_encoding(
VALUE str)
369mustnot_broken(
VALUE str)
371 if (is_broken_string(str)) {
377mustnot_wchar(
VALUE str)
387static VALUE register_fstring(
VALUE str,
bool copy);
/* True when str does not have the FL_EXIVAR flag set and its class is exactly
 * rb_cString. */
394#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
402fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data,
int existing)
412 if (rb_objspace_garbage_object_p(str)) {
424 rb_enc_copy(new_str, str);
437 if (STR_SHARED_P(str)) {
439 str_make_independent(str);
442 if (!BARE_STRING_P(str)) {
446 RBASIC(str)->flags |= RSTRING_FSTR;
448 *key = *value = arg->fstr = str;
462 if (
FL_TEST(str, RSTRING_FSTR))
465 bare = BARE_STRING_P(str);
467 if (STR_EMBED_P(str)) {
471 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_SHARED_ROOT|STR_SHARED) == (STR_NOEMBED|STR_SHARED_ROOT)) {
480 fstr = register_fstring(str, FALSE);
483 str_replace_shared_without_enc(str, fstr);
491register_fstring(
VALUE str,
bool copy)
498 st_table *frozen_strings = rb_vm_fstring_table();
501 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
502 }
while (UNDEF_P(args.fstr));
514setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
int encidx)
530 return (
VALUE)fake_str;
537rb_setup_fake_str(
struct RString *fake_str,
const char *name,
long len,
rb_encoding *enc)
539 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
547MJIT_FUNC_EXPORTED
VALUE
548rb_fstring_new(
const char *ptr,
long len)
551 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII), FALSE);
558 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc), FALSE);
562rb_fstring_cstr(
const char *
ptr)
564 return rb_fstring_new(
ptr, strlen(
ptr));
568fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
578 const char *aptr, *bptr;
581 return (alen != blen ||
583 memcmp(aptr, bptr, alen) != 0);
587single_byte_optimizable(
VALUE str)
595 enc = STR_ENC_GET(str);
606static inline const char *
607search_nonascii(
const char *p,
const char *e)
609 const uintptr_t *s, *t;
611#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
612# if SIZEOF_UINTPTR_T == 8
613# define NONASCII_MASK UINT64_C(0x8080808080808080)
614# elif SIZEOF_UINTPTR_T == 4
615# define NONASCII_MASK UINT32_C(0x80808080)
617# error "don't know what to do."
620# if SIZEOF_UINTPTR_T == 8
621# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
622# elif SIZEOF_UINTPTR_T == 4
623# define NONASCII_MASK 0x80808080UL
625# error "don't know what to do."
629 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
630#if !UNALIGNED_WORD_ACCESS
631 if ((uintptr_t)p % SIZEOF_VOIDP) {
632 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
637 case 7:
if (p[-7]&0x80)
return p-7;
638 case 6:
if (p[-6]&0x80)
return p-6;
639 case 5:
if (p[-5]&0x80)
return p-5;
640 case 4:
if (p[-4]&0x80)
return p-4;
642 case 3:
if (p[-3]&0x80)
return p-3;
643 case 2:
if (p[-2]&0x80)
return p-2;
644 case 1:
if (p[-1]&0x80)
return p-1;
649#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
650#define aligned_ptr(value) \
651 __builtin_assume_aligned((value), sizeof(uintptr_t))
653#define aligned_ptr(value) (uintptr_t *)(value)
656 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
659 if (*s & NONASCII_MASK) {
660#ifdef WORDS_BIGENDIAN
661 return (
const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
663 return (
const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
673 case 7:
if (e[-7]&0x80)
return e-7;
674 case 6:
if (e[-6]&0x80)
return e-6;
675 case 5:
if (e[-5]&0x80)
return e-5;
676 case 4:
if (e[-4]&0x80)
return e-4;
678 case 3:
if (e[-3]&0x80)
return e-3;
679 case 2:
if (e[-2]&0x80)
return e-2;
680 case 1:
if (e[-1]&0x80)
return e-1;
688 const char *e = p +
len;
690 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
692 p = search_nonascii(p, e);
697 p = search_nonascii(p, e);
700 int ret = rb_enc_precise_mbclen(p, e, enc);
704 p = search_nonascii(p, e);
710 int ret = rb_enc_precise_mbclen(p, e, enc);
726 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
729 p = search_nonascii(p, e);
734 p = search_nonascii(p, e);
740 int ret = rb_enc_precise_mbclen(p, e, enc);
747 p = search_nonascii(p, e);
753 int ret = rb_enc_precise_mbclen(p, e, enc);
772rb_enc_cr_str_copy_for_substr(
VALUE dest,
VALUE src)
777 str_enc_copy(dest, src);
802rb_enc_cr_str_exact_copy(
VALUE dest,
VALUE src)
804 str_enc_copy(dest, src);
817 return enc_coderange_scan(str, enc);
826 cr = enc_coderange_scan(str, get_encoding(str));
839 else if (is_ascii_string(str))
845str_mod_check(
VALUE s,
const char *p,
long len)
853str_capacity(
VALUE str,
const int termlen)
855 if (STR_EMBED_P(str)) {
857 return str_embed_capa(str) - termlen;
862 else if (
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
863 return RSTRING(str)->as.heap.len;
866 return RSTRING(str)->as.heap.aux.capa;
873 return str_capacity(str, TERM_LEN(str));
877must_not_null(
const char *
ptr)
887 size_t size = rb_str_embed_size(
capa);
889 assert(rb_gc_size_allocatable_p(size));
891 assert(size <=
sizeof(
struct RString));
894 RVARGC_NEWOBJ_OF(str,
struct RString, klass,
901str_alloc_heap(
VALUE klass)
903 RVARGC_NEWOBJ_OF(str,
struct RString, klass,
910empty_str_alloc(
VALUE klass)
912 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
913 VALUE str = str_alloc_embed(klass, 0);
914 memset(
RSTRING(str)->
as.embed.ary, 0, str_embed_capa(str));
919str_new0(
VALUE klass,
const char *
ptr,
long len,
int termlen)
927 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
929 if (STR_EMBEDDABLE_P(
len, termlen)) {
930 str = str_alloc_embed(klass,
len + termlen);
936 str = str_alloc_heap(klass);
942 rb_xmalloc_mul_add_mul(
sizeof(
char),
len,
sizeof(
char), termlen);
947 STR_SET_LEN(str,
len);
955 return str_new0(klass,
ptr,
len, 1);
976 rb_enc_associate_index(str, rb_utf8_encindex());
988 rb_enc_associate(str, enc);
1000 __msan_unpoison_string(
ptr);
1016 rb_enc_associate_index(str, rb_utf8_encindex());
1031str_new_static(
VALUE klass,
const char *
ptr,
long len,
int encindex)
1040 rb_encoding *enc = rb_enc_get_from_index(encindex);
1044 RUBY_DTRACE_CREATE_HOOK(STRING,
len);
1045 str = str_alloc_heap(klass);
1049 RBASIC(str)->flags |= STR_NOFREE;
1051 rb_enc_associate_index(str, encindex);
1079static VALUE str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1081 int ecflags,
VALUE ecopts);
1086 int encidx = rb_enc_to_index(enc);
1087 if (rb_enc_get_index(str) == encidx)
1088 return is_ascii_string(str);
1099 if (!to)
return str;
1100 if (!from) from = rb_enc_get(str);
1101 if (from == to)
return str;
1103 rb_is_ascii8bit_enc(to)) {
1104 if (STR_ENC_GET(str) != to) {
1106 rb_enc_associate(str, to);
1113 from, to, ecflags, ecopts);
1114 if (
NIL_P(newstr)) {
1122rb_str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1128 if (ofs < -olen || olen < ofs)
1130 if (ofs < 0) ofs += olen;
1132 STR_SET_LEN(newstr, ofs);
1137 return str_cat_conv_enc_opts(newstr, ofs,
ptr,
len, from,
1145 STR_SET_LEN(str, 0);
1146 rb_enc_associate(str, enc);
1152str_cat_conv_enc_opts(
VALUE newstr,
long ofs,
const char *
ptr,
long len,
1154 int ecflags,
VALUE ecopts)
1159 VALUE econv_wrapper;
1160 const unsigned char *start, *sp;
1161 unsigned char *dest, *dp;
1162 size_t converted_output = (size_t)ofs;
1167 RBASIC_CLEAR_CLASS(econv_wrapper);
1169 if (!ec)
return Qnil;
1172 sp = (
unsigned char*)
ptr;
1174 while ((dest = (
unsigned char*)
RSTRING_PTR(newstr)),
1175 (dp = dest + converted_output),
1179 size_t converted_input = sp - start;
1180 size_t rest =
len - converted_input;
1181 converted_output = dp - dest;
1183 if (converted_input && converted_output &&
1184 rest < (LONG_MAX / converted_output)) {
1185 rest = (rest * converted_output) / converted_input;
1190 olen += rest < 2 ? 2 : rest;
1199 rb_enc_associate(newstr, to);
1218 const int eidx = rb_enc_to_index(eenc);
1225 if ((eidx == rb_ascii8bit_encindex()) ||
1226 (eidx == rb_usascii_encindex() && search_nonascii(
ptr,
ptr +
len))) {
1230 ienc = rb_default_internal_encoding();
1231 if (!ienc || eenc == ienc) {
1236 if ((eidx == rb_ascii8bit_encindex()) ||
1237 (eidx == rb_usascii_encindex()) ||
1245 if (
NIL_P(rb_str_cat_conv_enc_opts(str, 0,
ptr,
len, eenc, 0,
Qnil))) {
1246 rb_str_initialize(str,
ptr,
len, eenc);
1254 int eidx = rb_enc_to_index(eenc);
1255 if (eidx == rb_usascii_encindex() &&
1256 !is_ascii_string(str)) {
1257 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1260 rb_enc_associate_index(str, eidx);
1319str_replace_shared_without_enc(
VALUE str2,
VALUE str)
1321 const int termlen = TERM_LEN(str);
1326 if (str_embed_capa(str2) >=
len + termlen) {
1327 char *ptr2 =
RSTRING(str2)->as.embed.ary;
1328 STR_SET_EMBED(str2);
1330 STR_SET_EMBED_LEN(str2,
len);
1331 TERM_FILL(ptr2+
len, termlen);
1335 if (STR_SHARED_P(str)) {
1336 root =
RSTRING(str)->as.heap.aux.shared;
1344 if (!STR_EMBED_P(str2) && !
FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1346 rb_fatal(
"about to free a possible shared root");
1348 char *ptr2 = STR_HEAP_PTR(str2);
1350 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1353 FL_SET(str2, STR_NOEMBED);
1356 STR_SET_SHARED(str2, root);
1364 str_replace_shared_without_enc(str2, str);
1365 rb_enc_cr_str_exact_copy(str2, str);
1372 return str_replace_shared(str_alloc_heap(klass), str);
1389rb_str_new_frozen_String(
VALUE orig)
1396rb_str_tmp_frozen_acquire(
VALUE orig)
1399 return str_new_frozen_buffer(0, orig, FALSE);
1403rb_str_tmp_frozen_release(
VALUE orig,
VALUE tmp)
1408 if (STR_EMBED_P(tmp)) {
1421 RSTRING(orig)->as.heap.aux.capa =
RSTRING(tmp)->as.heap.aux.capa;
1422 RBASIC(orig)->flags |=
RBASIC(tmp)->flags & STR_NOFREE;
1427 STR_SET_EMBED_LEN(tmp, 0);
1435 return str_new_frozen_buffer(klass, orig, TRUE);
1441 assert(!STR_EMBED_P(orig));
1442 assert(!STR_SHARED_P(orig));
1444 VALUE str = str_alloc_heap(klass);
1447 RSTRING(str)->as.heap.aux.capa =
RSTRING(orig)->as.heap.aux.capa;
1448 RBASIC(str)->flags |=
RBASIC(orig)->flags & STR_NOFREE;
1449 RBASIC(orig)->flags &= ~STR_NOFREE;
1450 STR_SET_SHARED(orig, str);
1457str_new_frozen_buffer(
VALUE klass,
VALUE orig,
int copy_encoding)
1462 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1464 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(
len, termlen)) {
1466 assert(STR_EMBED_P(str));
1477 assert(!STR_EMBED_P(
shared));
1481 if ((ofs > 0) || (rest > 0) ||
1484 str = str_new_shared(klass,
shared);
1485 assert(!STR_EMBED_P(str));
1486 RSTRING(str)->as.heap.ptr += ofs;
1487 RSTRING(str)->as.heap.len -= ofs + rest;
1495 else if (STR_EMBEDDABLE_P(
RSTRING_LEN(orig), TERM_LEN(orig))) {
1496 str = str_alloc_embed(klass,
RSTRING_LEN(orig) + TERM_LEN(orig));
1503 str = heap_str_make_shared(klass, orig);
1507 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1519str_new_empty_String(
VALUE str)
1522 rb_enc_copy(v, str);
/* Floor for a requested capacity: the `capa < STR_BUF_MIN_SIZE` checks in this
 * file raise smaller values to this. */
1526#define STR_BUF_MIN_SIZE 63
1534 if (STR_EMBEDDABLE_P(
capa, 1)) {
1541 if (
capa < STR_BUF_MIN_SIZE) {
1542 capa = STR_BUF_MIN_SIZE;
1547 RSTRING(str)->as.heap.ptr[0] =
'\0';
1567 return str_new(0, 0,
len);
1573 if (
FL_TEST(str, RSTRING_FSTR)) {
1574 st_data_t fstr = (st_data_t)str;
1578 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1579 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1584 if (STR_EMBED_P(str)) {
1585 RB_DEBUG_COUNTER_INC(obj_str_embed);
1587 else if (
FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1588 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_SHARED));
1589 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared,
FL_TEST(str, STR_NOFREE));
1592 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1593 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1597RUBY_FUNC_EXPORTED
size_t
1598rb_str_memsize(
VALUE str)
1600 if (
FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1601 return STR_HEAP_SIZE(str);
1611 return rb_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
1614static inline void str_discard(
VALUE str);
1615static void str_shared_replace(
VALUE str,
VALUE str2);
1620 if (str != str2) str_shared_replace(str, str2);
1631 enc = STR_ENC_GET(str2);
1636 if (str_embed_capa(str) >=
RSTRING_LEN(str2) + termlen) {
1640 rb_enc_associate(str, enc);
1645 if (STR_EMBED_P(str2)) {
1646 assert(!
FL_TEST(str2, STR_SHARED));
1648 assert(
len + termlen <= str_embed_capa(str2));
1650 char *new_ptr =
ALLOC_N(
char,
len + termlen);
1651 memcpy(new_ptr,
RSTRING(str2)->
as.embed.ary,
len + termlen);
1652 RSTRING(str2)->as.heap.ptr = new_ptr;
1655 STR_SET_NOEMBED(str2);
1659 STR_SET_NOEMBED(str);
1664 if (
FL_TEST(str2, STR_SHARED)) {
1666 STR_SET_SHARED(str,
shared);
1669 RSTRING(str)->as.heap.aux.capa =
RSTRING(str2)->as.heap.aux.capa;
1673 STR_SET_EMBED(str2);
1675 STR_SET_EMBED_LEN(str2, 0);
1676 rb_enc_associate(str, enc);
1690 return rb_obj_as_string_result(str, obj);
1693MJIT_FUNC_EXPORTED
VALUE
1707 if (STR_SHARED_P(str2)) {
1710 STR_SET_NOEMBED(str);
1713 STR_SET_SHARED(str,
shared);
1714 rb_enc_cr_str_exact_copy(str, str2);
1717 str_replace_shared(str, str2);
1726 size_t size = rb_str_embed_size(
capa);
1728 assert(rb_gc_size_allocatable_p(size));
1730 assert(size <=
sizeof(
struct RString));
1733 RB_RVARGC_EC_NEWOBJ_OF(ec, str,
struct RString, klass,
1742 RB_RVARGC_EC_NEWOBJ_OF(ec, str,
struct RString, klass,
1751 const VALUE flag_mask =
1753 RSTRING_NOEMBED | RSTRING_EMBED_LEN_MASK |
1760 if (STR_EMBED_P(str)) {
1763 assert(STR_EMBED_P(dup));
1764 assert(str_embed_capa(dup) >=
len + 1);
1765 STR_SET_EMBED_LEN(dup,
len);
1771 root =
RSTRING(str)->as.heap.aux.shared;
1773 else if (UNLIKELY(!(flags &
FL_FREEZE))) {
1774 root = str = str_new_frozen(klass, str);
1777 assert(!STR_SHARED_P(root));
1781 else if (STR_EMBED_P(root)) {
1790 FL_SET(root, STR_SHARED_ROOT);
1792 flags |= RSTRING_NOEMBED | STR_SHARED;
1797 encidx = rb_enc_get_index(str);
1798 flags &= ~ENCODING_MASK;
1801 if (encidx) rb_enc_associate_index(dup, encidx);
1809 if (
FL_TEST(str, STR_NOEMBED)) {
1810 dup = ec_str_alloc_heap(ec, klass);
1816 return str_duplicate_setup(klass, str, dup);
1823 if (
FL_TEST(str, STR_NOEMBED)) {
1824 dup = str_alloc_heap(klass);
1830 return str_duplicate_setup(klass, str, dup);
1842 RUBY_DTRACE_CREATE_HOOK(STRING,
RSTRING_LEN(str));
1849 RUBY_DTRACE_CREATE_HOOK(STRING,
RSTRING_LEN(str));
1850 return ec_str_duplicate(ec,
rb_cString, str);
1865 static ID keyword_ids[2];
1866 VALUE orig, opt, venc, vcapa;
1871 if (!keyword_ids[0]) {
1872 keyword_ids[0] = rb_id_encoding();
1873 CONST_ID(keyword_ids[1],
"capacity");
1881 if (!UNDEF_P(venc) && !
NIL_P(venc)) {
1882 enc = rb_to_encoding(venc);
1884 if (!UNDEF_P(vcapa) && !
NIL_P(vcapa)) {
1889 if (
capa < STR_BUF_MIN_SIZE) {
1890 capa = STR_BUF_MIN_SIZE;
1898 if (orig == str) n = 0;
1900 str_modifiable(str);
1901 if (STR_EMBED_P(str)) {
1902 char *new_ptr =
ALLOC_N(
char, (
size_t)
capa + termlen);
1904 assert(
RSTRING(str)->
as.embed.len + 1 <= str_embed_capa(str));
1909 RSTRING(str)->as.heap.ptr = new_ptr;
1911 else if (
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1912 const size_t size = (size_t)
capa + termlen;
1914 const size_t osize =
RSTRING(str)->as.heap.len + TERM_LEN(str);
1915 char *new_ptr =
ALLOC_N(
char, (
size_t)
capa + termlen);
1916 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1918 RSTRING(str)->as.heap.ptr = new_ptr;
1920 else if (STR_HEAP_SIZE(str) != (
size_t)
capa + termlen) {
1921 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
1922 (
size_t)
capa + termlen, STR_HEAP_SIZE(str));
1928 rb_enc_cr_str_exact_copy(str, orig);
1930 FL_SET(str, STR_NOEMBED);
1937 rb_enc_associate(str, enc);
/* Nonzero unless the top two bits of c are 10, i.e. unless c has the form of
 * a UTF-8 continuation byte (10xxxxxx). ASCII bytes therefore count as lead
 * bytes too. */
1948#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1963static inline uintptr_t
1964count_utf8_lead_bytes_with_word(
const uintptr_t *s)
1969 d = (d>>6) | (~d>>7);
1970 d &= NONASCII_MASK >> 7;
1973#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
1975 return rb_popcount_intptr(d);
1979# if SIZEOF_VOIDP == 8
1988enc_strlen(
const char *p,
const char *e,
rb_encoding *enc,
int cr)
1994 long diff = (long)(e - p);
2000 if ((
int)
sizeof(uintptr_t) * 2 < e - p) {
2001 const uintptr_t *s, *t;
2002 const uintptr_t lowbits =
sizeof(uintptr_t) - 1;
2003 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2004 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2005 while (p < (
const char *)s) {
2006 if (is_utf8_lead_byte(*p))
len++;
2010 len += count_utf8_lead_bytes_with_word(s);
2013 p = (
const char *)s;
2016 if (is_utf8_lead_byte(*p))
len++;
2027 q = search_nonascii(p, e);
2033 p += rb_enc_fast_mbclen(p, e, enc);
2040 q = search_nonascii(p, e);
2046 p += rb_enc_mbclen(p, e, enc);
2053 for (c=0; p<e; c++) {
2054 p += rb_enc_mbclen(p, e, enc);
2069rb_enc_strlen_cr(
const char *p,
const char *e,
rb_encoding *enc,
int *cr)
2077 long diff = (long)(e - p);
2084 q = search_nonascii(p, e);
2092 ret = rb_enc_precise_mbclen(p, e, enc);
2107 for (c=0; p<e; c++) {
2108 ret = rb_enc_precise_mbclen(p, e, enc);
2132 if (single_byte_optimizable(str))
return RSTRING_LEN(str);
2133 if (!enc) enc = STR_ENC_GET(str);
2139 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2144 return enc_strlen(p, e, enc, cr);
2151 return str_strlen(str, NULL);
2165 return LONG2NUM(str_strlen(str, NULL));
2177rb_str_bytesize(
VALUE str)
2195rb_str_empty(
VALUE str)
2215 char *ptr1, *ptr2, *ptr3;
2220 enc = rb_enc_check_str(str1, str2);
2224 if (len1 > LONG_MAX - len2) {
2227 str3 = str_new0(
rb_cString, 0, len1+len2, termlen);
2229 memcpy(ptr3, ptr1, len1);
2230 memcpy(ptr3+len1, ptr2, len2);
2231 TERM_FILL(&ptr3[len1+len2], termlen);
2241MJIT_FUNC_EXPORTED
VALUE
2247 MAYBE_UNUSED(
char) *ptr1, *ptr2;
2250 int enc1 = rb_enc_get_index(str1);
2251 int enc2 = rb_enc_get_index(str2);
2256 else if (enc2 < 0) {
2259 else if (enc1 != enc2) {
2262 else if (len1 > LONG_MAX - len2) {
2295 rb_enc_copy(str2, str);
2303 if (STR_EMBEDDABLE_P(
len, 1)) {
2312 STR_SET_LEN(str2,
len);
2313 rb_enc_copy(str2, str);
2321 termlen = TERM_LEN(str);
2327 while (n <=
len/2) {
2328 memcpy(ptr2 + n, ptr2, n);
2331 memcpy(ptr2 + n, ptr2,
len-n);
2333 STR_SET_LEN(str2,
len);
2334 TERM_FILL(&ptr2[
len], termlen);
2335 rb_enc_cr_str_copy_for_substr(str2, str);
2361 VALUE tmp = rb_check_array_type(arg);
2370rb_check_lockedtmp(
VALUE str)
2372 if (
FL_TEST(str, STR_TMPLOCK)) {
2378str_modifiable(
VALUE str)
2380 rb_check_lockedtmp(str);
2385str_dependent_p(
VALUE str)
2387 if (STR_EMBED_P(str) || !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2396str_independent(
VALUE str)
2398 str_modifiable(str);
2399 return !str_dependent_p(str);
2403str_make_independent_expand(
VALUE str,
long len,
long expand,
const int termlen)
2411 if (!STR_EMBED_P(str) && str_embed_capa(str) >=
capa + termlen) {
2416 STR_SET_EMBED_LEN(str,
len);
2423 memcpy(
ptr, oldptr,
len);
2425 if (
FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2428 STR_SET_NOEMBED(str);
2429 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2430 TERM_FILL(
ptr +
len, termlen);
2439 if (!str_independent(str))
2440 str_make_independent(str);
2447 int termlen = TERM_LEN(str);
2453 if (expand >= LONG_MAX -
len) {
2457 if (!str_independent(str)) {
2458 str_make_independent_expand(str,
len, expand, termlen);
2460 else if (expand > 0) {
2461 RESIZE_CAPA_TERM(str,
len + expand, termlen);
2468str_modify_keep_cr(
VALUE str)
2470 if (!str_independent(str))
2471 str_make_independent(str);
2478str_discard(
VALUE str)
2480 str_modifiable(str);
2481 if (!STR_EMBED_P(str) && !
FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2482 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2483 RSTRING(str)->as.heap.ptr = 0;
2484 RSTRING(str)->as.heap.len = 0;
2519zero_filled(
const char *s,
int n)
2521 for (; n > 0; --n) {
2528str_null_char(
const char *s,
long len,
const int minlen,
rb_encoding *enc)
2530 const char *e = s +
len;
2532 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2533 if (zero_filled(s, minlen))
return s;
2539str_fill_term(
VALUE str,
char *s,
long len,
int termlen)
2544 if (str_dependent_p(str)) {
2545 if (!zero_filled(s +
len, termlen))
2546 str_make_independent_expand(str,
len, 0L, termlen);
2549 TERM_FILL(s +
len, termlen);
2556rb_str_change_terminator_length(
VALUE str,
const int oldtermlen,
const int termlen)
2558 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2563 rb_check_lockedtmp(str);
2564 str_make_independent_expand(str,
len, 0L, termlen);
2566 else if (str_dependent_p(str)) {
2567 if (termlen > oldtermlen)
2568 str_make_independent_expand(str,
len, 0L, termlen);
2571 if (!STR_EMBED_P(str)) {
2573 assert(!
FL_TEST((str), STR_SHARED));
2576 if (termlen > oldtermlen) {
2585str_null_check(
VALUE str,
int *w)
2594 if (str_null_char(s,
len, minlen, enc)) {
2597 return str_fill_term(str, s,
len, minlen);
2600 if (!s || memchr(s, 0,
len)) {
2604 s = str_fill_term(str, s,
len, minlen);
2610rb_str_to_cstr(
VALUE str)
2613 return str_null_check(str, &w);
2621 char *s = str_null_check(str, &w);
2632rb_str_fill_terminator(
VALUE str,
const int newminlen)
2636 return str_fill_term(str, s,
len, newminlen);
2642 str = rb_check_convert_type_with_id(str,
T_STRING,
"String", idTo_str);
2666str_nth_len(
const char *p,
const char *e,
long *nthp,
rb_encoding *enc)
2676 const char *p2, *e2;
2679 while (p < e && 0 < nth) {
2686 p2 = search_nonascii(p, e2);
2695 n = rb_enc_mbclen(p, e, enc);
2706 while (p < e && nth--) {
2707 p += rb_enc_mbclen(p, e, enc);
2718 return str_nth_len(p, e, &nth, enc);
2722str_nth(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
2727 p = str_nth_len(p, e, &nth, enc);
2736str_offset(
const char *p,
const char *e,
long nth,
rb_encoding *enc,
int singlebyte)
2738 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2739 if (!pp)
return e - p;
2747 STR_ENC_GET(str), single_byte_optimizable(str));
2752str_utf8_nth(
const char *p,
const char *e,
long *nthp)
2755 if ((
int)SIZEOF_VOIDP * 2 < e - p && (
int)SIZEOF_VOIDP * 2 < nth) {
2756 const uintptr_t *s, *t;
2757 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2758 s = (
const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2759 t = (
const uintptr_t*)(~lowbits & (uintptr_t)e);
2760 while (p < (
const char *)s) {
2761 if (is_utf8_lead_byte(*p)) nth--;
2765 nth -= count_utf8_lead_bytes_with_word(s);
2767 }
while (s < t && (
int)SIZEOF_VOIDP <= nth);
2771 if (is_utf8_lead_byte(*p)) {
2772 if (nth == 0)
break;
2782str_utf8_offset(
const char *p,
const char *e,
long nth)
2784 const char *pp = str_utf8_nth(p, e, &nth);
2793 if (single_byte_optimizable(str) || pos < 0)
2797 return enc_strlen(p, p + pos, STR_ENC_GET(str),
ENC_CODERANGE(str));
2802str_subseq(
VALUE str,
long beg,
long len)
2806 const long rstring_embed_capa_max = ((
sizeof(
struct RString) - offsetof(struct
RString,
as.
embed.
ary)) / sizeof(char)) - 1;
2809 len <= rstring_embed_capa_max) {
2816 RSTRING(str2)->as.heap.ptr += beg;
2828 VALUE str2 = str_subseq(str, beg,
len);
2829 rb_enc_cr_str_copy_for_substr(str2, str);
2842 if (
len < 0)
return 0;
2846 if (single_byte_optimizable(str)) {
2847 if (beg > blen)
return 0;
2850 if (beg < 0)
return 0;
2852 if (
len > blen - beg)
2854 if (
len < 0)
return 0;
2859 if (
len > -beg)
len = -beg;
2871 slen = str_strlen(str, enc);
2873 if (beg < 0)
return 0;
2875 if (
len == 0)
goto end;
2882 if (beg > str_strlen(str, enc))
return 0;
2887 enc == rb_utf8_encoding()) {
2888 p = str_utf8_nth(s, e, &beg);
2889 if (beg > 0)
return 0;
2890 len = str_utf8_offset(p, e,
len);
2896 p = s + beg * char_sz;
2900 else if (
len * char_sz > e - p)
2905 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2906 if (beg > 0)
return 0;
2910 len = str_offset(p, e,
len, enc, 0);
2918static VALUE str_substr(
VALUE str,
long beg,
long len,
int empty);
2923 return str_substr(str, beg,
len, TRUE);
2927str_substr(
VALUE str,
long beg,
long len,
int empty)
2931 if (!p)
return Qnil;
2932 if (!
len && !empty)
return Qnil;
2936 VALUE str2 = str_subseq(str, beg,
len);
2937 rb_enc_cr_str_copy_for_substr(str2, str);
2981str_uminus(
VALUE str)
2986 return rb_fstring(str);
/* From this point on, rb_str_dup_frozen expands to rb_str_new_frozen. Any
 * earlier macro of this name was removed by the #undef near the top of the
 * file. */
2990#define rb_str_dup_frozen rb_str_new_frozen
2995 if (
FL_TEST(str, STR_TMPLOCK)) {
2998 FL_SET(str, STR_TMPLOCK);
3005 if (!
FL_TEST(str, STR_TMPLOCK)) {
3012RUBY_FUNC_EXPORTED
VALUE
3023 const int termlen = TERM_LEN(str);
3025 str_modifiable(str);
3026 if (STR_SHARED_P(str)) {
3029 if (
len > (
capa = (
long)str_capacity(str, termlen)) ||
len < 0) {
3059 STR_SET_LEN(str,
len);
3070 int independent = str_independent(str);
3079 const int termlen = TERM_LEN(str);
3080 if (STR_EMBED_P(str)) {
3081 if (
len == slen)
return str;
3082 if (str_embed_capa(str) >=
len + termlen) {
3083 STR_SET_EMBED_LEN(str,
len);
3087 str_make_independent_expand(str, slen,
len - slen, termlen);
3089 else if (str_embed_capa(str) >=
len + termlen) {
3090 char *
ptr = STR_HEAP_PTR(str);
3092 if (slen >
len) slen =
len;
3095 STR_SET_EMBED_LEN(str,
len);
3096 if (independent) ruby_xfree(
ptr);
3099 else if (!independent) {
3100 if (
len == slen)
return str;
3101 str_make_independent_expand(str, slen,
len - slen, termlen);
3105 SIZED_REALLOC_N(
RSTRING(str)->
as.heap.ptr,
char,
3106 (
size_t)
len + termlen, STR_HEAP_SIZE(str));
3109 else if (
len == slen)
return str;
3117str_buf_cat4(
VALUE str,
const char *
ptr,
long len,
bool keep_cr)
3120 str_modify_keep_cr(str);
3125 if (
len == 0)
return 0;
3127 long capa, total, olen, off = -1;
3129 const int termlen = TERM_LEN(str);
3135 if (
ptr >= sptr &&
ptr <= sptr + olen) {
3139 if (STR_EMBED_P(str)) {
3140 capa = str_embed_capa(str) - termlen;
3141 sptr =
RSTRING(str)->as.embed.ary;
3146 sptr =
RSTRING(str)->as.heap.ptr;
3147 olen =
RSTRING(str)->as.heap.len;
3149 if (olen > LONG_MAX -
len) {
3154 if (total >= LONG_MAX / 2) {
3157 while (total >
capa) {
3160 RESIZE_CAPA_TERM(str,
capa, termlen);
3166 memcpy(sptr + olen,
ptr,
len);
3167 STR_SET_LEN(str, total);
3168 TERM_FILL(sptr + total, termlen);
/* Call str_buf_cat4() for (ptr, len) with keep_cr = false. */
3173#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), len, false)
/* Same as str_buf_cat, with the length computed by rb_strlen_lit(ptr)
 * (presumably ptr must be a string literal -- confirm against rb_strlen_lit). */
3174#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3179 if (
len == 0)
return str;
3183 return str_buf_cat(str,
ptr,
len);
3198rb_enc_cr_str_buf_cat(
VALUE str,
const char *
ptr,
long len,
3199 int ptr_encindex,
int ptr_cr,
int *ptr_cr_ret)
3208 if (str_encindex == ptr_encindex) {
3210 ptr_cr = coderange_scan(
ptr,
len, rb_enc_from_index(ptr_encindex));
3214 str_enc = rb_enc_from_index(str_encindex);
3215 ptr_enc = rb_enc_from_index(ptr_encindex);
3228 ptr_cr = coderange_scan(
ptr,
len, ptr_enc);
3237 *ptr_cr_ret = ptr_cr;
3239 if (str_encindex != ptr_encindex &&
3242 str_enc = rb_enc_from_index(str_encindex);
3243 ptr_enc = rb_enc_from_index(ptr_encindex);
3248 res_encindex = str_encindex;
3253 res_encindex = str_encindex;
3257 res_encindex = ptr_encindex;
3262 res_encindex = str_encindex;
3269 res_encindex = str_encindex;
3277 str_buf_cat(str,
ptr,
len);
3290 return rb_enc_cr_str_buf_cat(str,
ptr,
len,
3301 return rb_enc_cr_str_buf_cat(str,
ptr, strlen(
ptr),
3307 unsigned int c = (
unsigned char)*
ptr;
3308 int len = rb_enc_codelen(c, enc);
3310 rb_enc_cr_str_buf_cat(str, buf,
len,
3323 if (str_enc_fastpath(str)) {
/*
 * Length threshold tested in rb_str_concat_literals() as
 * LIKELY(len < MIN_PRE_ALLOC_SIZE).  Presumably totals below it skip an
 * up-front buffer allocation -- the branch bodies are not visible here, so
 * confirm before relying on that.
 */
3359#define MIN_PRE_ALLOC_SIZE 48
3361MJIT_FUNC_EXPORTED
VALUE
3362rb_str_concat_literals(
size_t num,
const VALUE *strary)
3372 if (LIKELY(
len < MIN_PRE_ALLOC_SIZE)) {
3378 rb_enc_copy(str, strary[0]);
3382 for (i = s; i < num; ++i) {
3383 const VALUE v = strary[i];
3387 if (encidx != ENCINDEX_US_ASCII) {
3389 rb_enc_set_index(str, encidx);
3414rb_str_concat_multi(
int argc,
VALUE *argv,
VALUE str)
3416 str_modifiable(str);
3421 else if (argc > 1) {
3424 rb_enc_copy(arg_str, str);
3425 for (i = 0; i < argc; i++) {
3460 if (rb_num_to_uint(str2, &code) == 0) {
3473 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3476 buf[0] = (char)code;
3478 if (encidx != rb_enc_to_index(enc)) {
3479 rb_enc_associate_index(str1, encidx);
3489 switch (
len = rb_enc_codelen(code, enc)) {
3490 case ONIGERR_INVALID_CODE_POINT_VALUE:
3493 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3500 if (rb_enc_precise_mbclen(buf, buf +
len + 1, enc) !=
len) {
3513rb_ascii8bit_appendable_encoding_index(
rb_encoding *enc,
unsigned int code)
3515 int encidx = rb_enc_to_index(enc);
3517 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3522 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3523 return ENCINDEX_ASCII_8BIT;
3546rb_str_prepend_multi(
int argc,
VALUE *argv,
VALUE str)
3548 str_modifiable(str);
3553 else if (argc > 1) {
3556 rb_enc_copy(arg_str, str);
3557 for (i = 0; i < argc; i++) {
3570 if (e && is_ascii_string(str)) {
3580 const char *ptr1, *ptr2;
3583 return (len1 != len2 ||
3585 memcmp(ptr1, ptr2, len1) != 0);
3599rb_str_hash_m(
VALUE str)
/*
 * Yields the smaller of a and b (a when they compare equal).
 * Each argument appears twice in the expansion, so it is evaluated more than
 * once: do not pass expressions with side effects.
 */
3605#define lesser(a,b) (((a)>(b))?(b):(a))
3617 if (idx1 == idx2)
return TRUE;
3636 const char *ptr1, *ptr2;
3639 if (str1 == str2)
return 0;
3642 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3651 if (len1 > len2)
return 1;
3654 if (retval > 0)
return 1;
3681 if (str1 == str2)
return Qtrue;
3688 return rb_str_eql_internal(str1, str2);
3709MJIT_FUNC_EXPORTED
VALUE
3712 if (str1 == str2)
return Qtrue;
3714 return rb_str_eql_internal(str1, str2);
3745 return rb_invcmp(str1, str2);
3787 return str_casecmp(str1, s);
3795 const char *p1, *p1end, *p2, *p2end;
3797 enc = rb_enc_compatible(str1, str2);
3804 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3805 while (p1 < p1end && p2 < p2end) {
3807 unsigned int c1 =
TOLOWER(*p1 & 0xff);
3808 unsigned int c2 =
TOLOWER(*p2 & 0xff);
3810 return INT2FIX(c1 < c2 ? -1 : 1);
3817 while (p1 < p1end && p2 < p2end) {
3818 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3819 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3821 if (0 <= c1 && 0 <= c2) {
3825 return INT2FIX(c1 < c2 ? -1 : 1);
3829 l1 = rb_enc_mbclen(p1, p1end, enc);
3830 l2 = rb_enc_mbclen(p2, p2end, enc);
3831 len = l1 < l2 ? l1 : l2;
3832 r = memcmp(p1, p2,
len);
3834 return INT2FIX(r < 0 ? -1 : 1);
3836 return INT2FIX(l1 < l2 ? -1 : 1);
3877 return str_casecmp_p(str1, s);
3884 VALUE folded_str1, folded_str2;
3885 VALUE fold_opt = sym_fold;
3887 enc = rb_enc_compatible(str1, str2);
3892 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3893 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3895 return rb_str_eql(folded_str1, folded_str2);
3899strseq_core(
const char *str_ptr,
const char *str_ptr_end,
long str_len,
3900 const char *sub_ptr,
long sub_len,
long offset,
rb_encoding *enc)
3902 const char *search_start = str_ptr;
3903 long pos, search_len = str_len - offset;
3907 pos =
rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3908 if (pos < 0)
return pos;
3910 if (t == search_start + pos)
break;
3911 search_len -= t - search_start;
3912 if (search_len <= 0)
return -1;
3913 offset += t - search_start;
3916 return pos + offset;
/*
 * Shorthand for rb_strseq_index() with in_byte == 0.  With in_byte clear,
 * rb_strseq_index() measures lengths via str_strlen() (unless the string is
 * single-byte optimizable) and converts the offset with str_offset().
 */
3919#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3922rb_strseq_index(
VALUE str,
VALUE sub,
long offset,
int in_byte)
3924 const char *str_ptr, *str_ptr_end, *sub_ptr;
3925 long str_len, sub_len;
3928 enc = rb_enc_check(str, sub);
3929 if (is_broken_string(sub))
return -1;
3937 if (str_len < sub_len)
return -1;
3940 long str_len_char, sub_len_char;
3941 int single_byte = single_byte_optimizable(str);
3942 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3943 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3945 offset += str_len_char;
3946 if (offset < 0)
return -1;
3948 if (str_len_char - offset < sub_len_char)
return -1;
3949 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3952 if (sub_len == 0)
return offset;
3955 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3969rb_str_index_m(
int argc,
VALUE *argv,
VALUE str)
3975 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
3982 pos += str_strlen(str, NULL);
3992 if (pos > str_strlen(str, NULL))
3995 rb_enc_check(str, sub), single_byte_optimizable(str));
4009 pos = rb_str_index(str, sub, pos);
4013 if (pos == -1)
return Qnil;
4022str_check_byte_pos(
VALUE str,
long pos)
4026 const char *p = s + pos;
4073rb_str_byteindex_m(
int argc,
VALUE *argv,
VALUE str)
4079 if (
rb_scan_args(argc, argv,
"11", &sub, &initpos) == 2) {
4085 if (pos < 0 || pos > slen) {
4096 if (!str_check_byte_pos(str, pos)) {
4098 "offset %ld does not land on character boundary", pos);
4114 pos = rb_strseq_index(str, sub, pos, 1);
4117 if (pos == -1)
return Qnil;
4125 char *hit, *adjusted;
4127 long slen, searchlen;
4132 if (slen == 0)
return s - sbeg;
4136 searchlen = s - sbeg + 1;
4139 hit = memrchr(sbeg, c, searchlen);
4142 if (hit != adjusted) {
4143 searchlen = adjusted - sbeg;
4146 if (memcmp(hit, t, slen) == 0)
4148 searchlen = adjusted - sbeg;
4149 }
while (searchlen > 0);
4166 if (memcmp(s, t, slen) == 0) {
4169 if (s <= sbeg)
break;
4185 enc = rb_enc_check(str, sub);
4186 if (is_broken_string(sub))
return -1;
4187 singlebyte = single_byte_optimizable(str);
4188 len = singlebyte ?
RSTRING_LEN(str) : str_strlen(str, enc);
4189 slen = str_strlen(sub, enc);
4192 if (len < slen)
return -1;
4193 if (len - pos < slen) pos = len - slen;
4194 if (len == 0)
return pos;
4205 s = str_nth(sbeg,
RSTRING_END(str), pos, enc, singlebyte);
4267rb_str_rindex_m(
int argc,
VALUE *argv,
VALUE str)
4272 long pos, len = str_strlen(str, enc);
4274 if (
rb_scan_args(argc, argv,
"11", &sub, &vpos) == 2) {
4285 if (pos > len) pos = len;
4294 enc, single_byte_optimizable(str));
4305 pos = rb_str_rindex(str, sub, pos);
4306 if (pos >= 0)
return LONG2NUM(pos);
4312rb_str_byterindex(
VALUE str,
VALUE sub,
long pos)
4318 enc = rb_enc_check(str, sub);
4319 if (is_broken_string(sub))
return -1;
4324 if (len < slen)
return -1;
4325 if (len - pos < slen) pos = len - slen;
4326 if (len == 0)
return pos;
4338 return str_rindex(str, sub, s, enc);
4403rb_str_byterindex_m(
int argc,
VALUE *argv,
VALUE str)
4409 if (
rb_scan_args(argc, argv,
"11", &sub, &vpos) == 2) {
4420 if (pos > len) pos = len;
4426 if (!str_check_byte_pos(str, pos)) {
4428 "offset %ld does not land on character boundary", pos);
4441 pos = rb_str_byterindex(str, sub, pos);
4442 if (pos >= 0)
return LONG2NUM(pos);
4478 switch (OBJ_BUILTIN_TYPE(y)) {
4530rb_str_match_m(
int argc,
VALUE *argv,
VALUE str)
4537 result = rb_funcallv(get_pat(re),
rb_intern(
"match"), argc, argv);
4569rb_str_match_m_p(
int argc,
VALUE *argv,
VALUE str)
4573 re = get_pat(argv[0]);
4574 return rb_reg_match_p(re, str, argc > 1 ?
NUM2LONG(argv[1]) : 0);
4583static enum neighbor_char
4591 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4593 return NEIGHBOR_NOT_CHAR;
4597 if (!l)
return NEIGHBOR_NOT_CHAR;
4598 if (l != len)
return NEIGHBOR_WRAPPED;
4600 r = rb_enc_precise_mbclen(p, p + len, enc);
4602 return NEIGHBOR_NOT_CHAR;
4604 return NEIGHBOR_FOUND;
4607 for (i = len-1; 0 <= i && (
unsigned char)p[i] == 0xff; i--)
4610 return NEIGHBOR_WRAPPED;
4611 ++((
unsigned char*)p)[i];
4612 l = rb_enc_precise_mbclen(p, p+len, enc);
4616 return NEIGHBOR_FOUND;
4619 memset(p+l, 0xff, len-l);
4625 for (len2 = len-1; 0 < len2; len2--) {
4626 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4630 memset(p+len2+1, 0xff, len-(len2+1));
4635static enum neighbor_char
4642 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4644 return NEIGHBOR_NOT_CHAR;
4647 if (!c)
return NEIGHBOR_NOT_CHAR;
4650 if (!l)
return NEIGHBOR_NOT_CHAR;
4651 if (l != len)
return NEIGHBOR_WRAPPED;
4653 r = rb_enc_precise_mbclen(p, p + len, enc);
4655 return NEIGHBOR_NOT_CHAR;
4657 return NEIGHBOR_FOUND;
4660 for (i = len-1; 0 <= i && (
unsigned char)p[i] == 0; i--)
4663 return NEIGHBOR_WRAPPED;
4664 --((
unsigned char*)p)[i];
4665 l = rb_enc_precise_mbclen(p, p+len, enc);
4669 return NEIGHBOR_FOUND;
4672 memset(p+l, 0, len-l);
4678 for (len2 = len-1; 0 < len2; len2--) {
4679 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4683 memset(p+len2+1, 0, len-(len2+1));
4697static enum neighbor_char
4698enc_succ_alnum_char(
char *p,
long len,
rb_encoding *enc,
char *carry)
4700 enum neighbor_char ret;
4704 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4708 const int max_gaps = 1;
4712 ctype = ONIGENC_CTYPE_DIGIT;
4714 ctype = ONIGENC_CTYPE_ALPHA;
4716 return NEIGHBOR_NOT_CHAR;
4718 MEMCPY(save, p,
char, len);
4719 for (
try = 0;
try <= max_gaps; ++
try) {
4720 ret = enc_succ_char(p, len, enc);
4721 if (ret == NEIGHBOR_FOUND) {
4724 return NEIGHBOR_FOUND;
4727 MEMCPY(p, save,
char, len);
4730 MEMCPY(save, p,
char, len);
4731 ret = enc_pred_char(p, len, enc);
4732 if (ret == NEIGHBOR_FOUND) {
4735 MEMCPY(p, save,
char, len);
4740 MEMCPY(p, save,
char, len);
4746 return NEIGHBOR_NOT_CHAR;
4749 if (ctype != ONIGENC_CTYPE_DIGIT) {
4750 MEMCPY(carry, p,
char, len);
4751 return NEIGHBOR_WRAPPED;
4754 MEMCPY(carry, p,
char, len);
4755 enc_succ_char(carry, len, enc);
4756 return NEIGHBOR_WRAPPED;
4826 rb_enc_cr_str_copy_for_substr(str, orig);
4827 return str_succ(str);
4834 char *sbeg, *s, *e, *last_alnum = 0;
4835 int found_alnum = 0;
4837 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] =
"\1";
4838 long carry_pos = 0, carry_len = 1;
4839 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4842 if (slen == 0)
return str;
4844 enc = STR_ENC_GET(str);
4846 s = e = sbeg + slen;
4849 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4855 l = rb_enc_precise_mbclen(s, e, enc);
4856 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
4857 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4858 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4860 case NEIGHBOR_NOT_CHAR:
4862 case NEIGHBOR_FOUND:
4864 case NEIGHBOR_WRAPPED:
4869 carry_pos = s - sbeg;
4875 enum neighbor_char neighbor;
4876 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4877 l = rb_enc_precise_mbclen(s, e, enc);
4878 if (!ONIGENC_MBCLEN_CHARFOUND_P(l))
continue;
4879 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4881 neighbor = enc_succ_char(tmp, l, enc);
4883 case NEIGHBOR_FOUND:
4887 case NEIGHBOR_WRAPPED:
4890 case NEIGHBOR_NOT_CHAR:
4893 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4895 enc_succ_char(s, l, enc);
4898 MEMCPY(carry, s,
char, l);
4901 carry_pos = s - sbeg;
4905 RESIZE_CAPA(str, slen + carry_len);
4907 s = sbeg + carry_pos;
4908 memmove(s + carry_len, s, slen - carry_pos);
4909 memmove(s, carry, carry_len);
4911 STR_SET_LEN(str, slen);
4928rb_str_succ_bang(
VALUE str)
4936all_digits_p(
const char *s,
long len)
4990 VALUE end, exclusive;
4994 return rb_str_upto_each(beg, end,
RTEST(exclusive), str_upto_i,
Qnil);
5000 VALUE current, after_end;
5007 enc = rb_enc_check(beg, end);
5008 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5014 if (c > e || (excl && c == e))
return beg;
5017 if (!excl && c == e)
break;
5019 if (excl && c == e)
break;
5031 b = rb_str_to_inum(beg, 10, FALSE);
5032 e = rb_str_to_inum(end, 10, FALSE);
5039 if (excl && bi == ei)
break;
5040 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5045 ID op = excl ?
'<' : idLE;
5046 VALUE args[2], fmt = rb_fstring_lit(
"%.*d");
5051 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5052 b = rb_funcallv(b, succ, 0, 0);
5059 if (n > 0 || (excl && n == 0))
return beg;
5061 after_end = rb_funcallv(end, succ, 0, 0);
5066 next = rb_funcallv(current, succ, 0, 0);
5067 if ((*each)(current, arg))
break;
5068 if (
NIL_P(next))
break;
5089 VALUE b, args[2], fmt = rb_fstring_lit(
"%.*d");
5091 b = rb_str_to_inum(beg, 10, FALSE);
5097 if ((*each)(
rb_enc_sprintf(usascii,
"%.*ld", width, bi), arg))
break;
5105 if ((*each)(
rb_str_format(numberof(args), args, fmt), arg))
break;
5106 b = rb_funcallv(b, succ, 0, 0);
5112 VALUE next = rb_funcallv(current, succ, 0, 0);
5113 if ((*each)(current, arg))
break;
5127 if (!
rb_equal(str, *argp))
return 0;
5156 if (b <= v && v < e)
return Qtrue;
5157 return RBOOL(!
RTEST(exclusive) && v == e);
5170 rb_str_upto_each(beg, end,
RTEST(exclusive), include_range_i, (
VALUE)&val);
5172 return RBOOL(
NIL_P(val));
5195 return rb_str_subpat(str, indx,
INT2FIX(0));
5198 if (rb_str_index(str, indx, 0) != -1)
5204 long beg, len = str_strlen(str, NULL);
5216 return str_substr(str, idx, 1, FALSE);
5235rb_str_aref_m(
int argc,
VALUE *argv,
VALUE str)
5239 return rb_str_subpat(str, argv[0], argv[1]);
5248 return rb_str_aref(str, argv[0]);
5257 str_modifiable(str);
5258 if (len > olen) len = olen;
5260 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5262 int fl = (int)(
RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5264 STR_SET_EMBED_LEN(str, nlen);
5265 ptr =
RSTRING(str)->as.embed.ary;
5266 memmove(ptr, oldptr + len, nlen);
5267 if (fl == STR_NOEMBED)
xfree(oldptr);
5270 if (!STR_SHARED_P(str)) {
5272 rb_enc_cr_str_exact_copy(shared, str);
5275 ptr =
RSTRING(str)->as.heap.ptr += len;
5276 RSTRING(str)->as.heap.len = nlen;
5284rb_str_splice_0(
VALUE str,
long beg,
long len,
VALUE val)
5290 if (beg == 0 && vlen == 0) {
5295 str_modify_keep_cr(str);
5299 RESIZE_CAPA(str, slen + vlen - len);
5309 memmove(sptr + beg + vlen,
5311 slen - (beg + len));
5313 if (vlen < beg && len < 0) {
5314 MEMZERO(sptr + slen,
char, -len);
5320 STR_SET_LEN(str, slen);
5321 TERM_FILL(&sptr[slen], TERM_LEN(str));
5331 int singlebyte = single_byte_optimizable(str);
5337 enc = rb_enc_check(str, val);
5338 slen = str_strlen(str, enc);
5340 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5347 assert(beg <= slen);
5348 if (len > slen - beg) {
5353 e = str_nth(p,
RSTRING_END(str), len, enc, singlebyte);
5358 rb_str_splice_0(str, beg, len, val);
5359 rb_enc_associate(str, enc);
/* Alias: expands to rb_str_update() with the same four arguments. */
5365#define rb_str_splice(str, beg, len, val) rb_str_update(str, beg, len, val)
5372 long start, end, len;
5382 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5386 nth += regs->num_regs;
5396 enc = rb_enc_check_str(str, val);
5397 rb_str_splice_0(str, start, len, val);
5398 rb_enc_associate(str, enc);
5406 switch (
TYPE(indx)) {
5408 rb_str_subpat_set(str, indx,
INT2FIX(0), val);
5412 beg = rb_str_index(str, indx, 0);
5417 rb_str_splice(str, beg, str_strlen(indx, NULL), val);
5425 rb_str_splice(str, beg, len, val);
5433 rb_str_splice(str, idx, 1, val);
5468rb_str_aset_m(
int argc,
VALUE *argv,
VALUE str)
5472 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5480 return rb_str_aset(str, argv[0], argv[1]);
5512 rb_str_splice(str, pos, 0, str2);
5540rb_str_slice_bang(
int argc,
VALUE *argv,
VALUE str)
5548 str_modify_keep_cr(str);
5556 if ((nth += regs->num_regs) <= 0)
return Qnil;
5558 else if (nth >= regs->num_regs)
return Qnil;
5560 len = END(nth) - beg;
5563 else if (argc == 2) {
5571 if (!len)
return Qnil;
5576 beg = rb_str_index(str, indx, 0);
5577 if (beg == -1)
return Qnil;
5589 if (!len)
return Qnil;
5603 rb_enc_cr_str_copy_for_substr(result, str);
5613 if (beg + len > slen)
5617 slen - (beg + len));
5619 STR_SET_LEN(str, slen);
5620 TERM_FILL(&sptr[slen], TERM_LEN(str));
5631 switch (OBJ_BUILTIN_TYPE(pat)) {
5650get_pat_quoted(
VALUE pat,
int check)
5654 switch (OBJ_BUILTIN_TYPE(pat)) {
5668 if (check && is_broken_string(pat)) {
5675rb_pat_search(
VALUE pat,
VALUE str,
long pos,
int set_backref_str)
5678 pos = rb_strseq_index(str, pat, pos, 1);
5679 if (set_backref_str) {
5681 str = rb_str_new_frozen_String(str);
5682 rb_backref_set_string(str, pos,
RSTRING_LEN(pat));
5691 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5711rb_str_sub_bang(
int argc,
VALUE *argv,
VALUE str)
5725 hash = rb_check_hash_type(argv[1]);
5731 pat = get_pat_quoted(argv[0], 1);
5733 str_modifiable(str);
5734 beg = rb_pat_search(pat, str, 0, 1);
5757 if (iter || !
NIL_P(hash)) {
5764 repl = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
5767 str_mod_check(str, p, len);
5774 enc = rb_enc_compatible(str, repl);
5784 enc = STR_ENC_GET(repl);
5787 rb_enc_associate(str, enc);
5800 RESIZE_CAPA(str, len + rlen - plen);
5804 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5807 memmove(p + beg0, rp, rlen);
5809 STR_SET_LEN(str, len);
5837 rb_str_sub_bang(argc, argv, str);
5842str_gsub(
int argc,
VALUE *argv,
VALUE str,
int bang)
5846 long beg, beg0, end0;
5847 long offset, blen, slen, len, last;
5848 enum {STR, ITER, MAP} mode = STR;
5850 int need_backref = -1;
5860 hash = rb_check_hash_type(argv[1]);
5869 rb_error_arity(argc, 1, 2);
5872 pat = get_pat_quoted(argv[0], 1);
5873 beg = rb_pat_search(pat, str, 0, need_backref);
5875 if (bang)
return Qnil;
5885 str_enc = STR_ENC_GET(str);
5886 rb_enc_associate(dest, str_enc);
5908 val = rb_hash_aref(hash,
rb_str_subseq(str, beg0, end0 - beg0));
5911 str_mod_check(str, sp, slen);
5916 else if (need_backref) {
5918 if (need_backref < 0) {
5919 need_backref = val != repl;
5926 len = beg0 - offset;
5943 offset = end0 + len;
5947 beg = rb_pat_search(pat, str, offset, need_backref);
5952 rb_pat_search(pat, str, last, 1);
5954 str_shared_replace(str, dest);
5982rb_str_gsub_bang(
int argc,
VALUE *argv,
VALUE str)
5984 str_modify_keep_cr(str);
5985 return str_gsub(argc, argv, str, 1);
6008 return str_gsub(argc, argv, str, 0);
6026 str_modifiable(str);
6027 if (str == str2)
return str;
6031 return str_replace(str, str2);
6046rb_str_clear(
VALUE str)
6050 STR_SET_EMBED_LEN(str, 0);
6071rb_str_chr(
VALUE str)
6119 char *ptr, *head, *left = 0;
6123 if (pos < -len || len <= pos)
6130 char byte = (char)(
NUM2INT(w) & 0xFF);
6132 if (!str_independent(str))
6133 str_make_independent(str);
6134 enc = STR_ENC_GET(str);
6137 if (!STR_EMBED_P(str)) {
6144 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6152 width = rb_enc_precise_mbclen(left, head+len, enc);
6154 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6170str_byte_substr(
VALUE str,
long beg,
long len,
int empty)
6174 if (beg > n || len < 0)
return Qnil;
6177 if (beg < 0)
return Qnil;
6182 if (!empty)
return Qnil;
6186 VALUE str2 = str_subseq(str, beg, len);
6188 str_enc_copy(str2, str);
6227 return str_byte_substr(str, beg, len, TRUE);
6232 return str_byte_substr(str, idx, 1, FALSE);
6279rb_str_byteslice(
int argc,
VALUE *argv,
VALUE str)
6284 return str_byte_substr(str, beg, len, TRUE);
6287 return str_byte_aref(str, argv[0]);
6307rb_str_bytesplice(
int argc,
VALUE *argv,
VALUE str)
6309 long beg, end, len, slen;
6318 rb_builtin_class_name(argv[0]));
6329 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
6336 assert(beg <= slen);
6337 if (len > slen - beg) {
6341 if (!str_check_byte_pos(str, beg)) {
6343 "offset %ld does not land on character boundary", beg);
6345 if (!str_check_byte_pos(str, end)) {
6347 "offset %ld does not land on character boundary", end);
6350 enc = rb_enc_check(str, val);
6351 str_modify_keep_cr(str);
6352 rb_str_splice_0(str, beg, len, val);
6353 rb_enc_associate(str, enc);
6371rb_str_reverse(
VALUE str)
6379 enc = STR_ENC_GET(str);
6386 if (single_byte_optimizable(str)) {
6393 int clen = rb_enc_fast_mbclen(s, e, enc);
6404 int clen = rb_enc_mbclen(s, e, enc);
6414 str_enc_copy(rev, str);
6434rb_str_reverse_bang(
VALUE str)
6437 if (single_byte_optimizable(str)) {
6440 str_modify_keep_cr(str);
6450 str_shared_replace(str, rb_str_reverse(str));
6454 str_modify_keep_cr(str);
6479 i = rb_str_index(str, arg, 0);
6481 return RBOOL(i != -1);
6525 return rb_str_to_inum(str, base, FALSE);
6549rb_str_to_f(
VALUE str)
6567rb_str_to_s(
VALUE str)
6579 char s[RUBY_MAX_CHAR_LEN];
6580 int n = rb_enc_codelen(c, enc);
/*
 * Size limit handed to snprintf() when formatting a single escaped
 * character ("\\x%02X", "\\u%04X", "\\u{%X}", "\\x{%X}").  The destination
 * buffers in this file are declared as char buf[CHAR_ESC_LEN + 1].
 */
6587#define CHAR_ESC_LEN 13
6590rb_str_buf_cat_escaped_char(
VALUE result,
unsigned int c,
int unicode_p)
6592 char buf[CHAR_ESC_LEN + 1];
6600 snprintf(buf, CHAR_ESC_LEN,
"%c", c);
6602 else if (c < 0x10000) {
6603 snprintf(buf, CHAR_ESC_LEN,
"\\u%04X", c);
6606 snprintf(buf, CHAR_ESC_LEN,
"\\u{%X}", c);
6611 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", c);
6614 snprintf(buf, CHAR_ESC_LEN,
"\\x{%X}", c);
6617 l = (int)strlen(buf);
/*
 * Maps a character code to the C string spelling of its backslash escape.
 *
 * Visible mappings:
 *   '\0'   -> "\\0"      '\n'   -> "\\n"      '\r'   -> "\\r"
 *   '\t'   -> "\\t"      '\f'   -> "\\f"      '\013' -> "\\v"
 *   '\010' -> "\\b"      '\007' -> "\\a"      '\033' -> "\\e"
 *   '\x7f' -> "\\c?"
 *
 * The returned pointers are string literals.
 * NOTE(review): the return type, the switch header and the result for any
 * other value of c are not visible in this excerpt -- confirm against the
 * full definition.
 */
6623ruby_escaped_char(
int c)
6626 case '\0':
return "\\0";
6627 case '\n':
return "\\n";
6628 case '\r':
return "\\r";
6629 case '\t':
return "\\t";
6630 case '\f':
return "\\f";
6631 case '\013':
return "\\v";
6632 case '\010':
return "\\b";
6633 case '\007':
return "\\a";
6634 case '\033':
return "\\e";
6635 case '\x7f':
return "\\c?";
6641rb_str_escape(
VALUE str)
6647 const char *prev = p;
6648 char buf[CHAR_ESC_LEN + 1];
6650 int unicode_p = rb_enc_unicode_p(enc);
6656 int n = rb_enc_precise_mbclen(p, pend, enc);
6658 if (p > prev) str_buf_cat(result, prev, p - prev);
6661 n = (int)(pend - p);
6663 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
6664 str_buf_cat(result, buf, strlen(buf));
6672 cc = ruby_escaped_char(c);
6674 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6675 str_buf_cat(result, cc, strlen(cc));
6681 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6682 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6686 if (p > prev) str_buf_cat(result, prev, p - prev);
6710 const char *p, *pend, *prev;
6711 char buf[CHAR_ESC_LEN + 1];
6713 rb_encoding *resenc = rb_default_internal_encoding();
6714 int unicode_p = rb_enc_unicode_p(enc);
6717 if (resenc == NULL) resenc = rb_default_external_encoding();
6719 rb_enc_associate(result, resenc);
6720 str_buf_cat2(result,
"\"");
6728 n = rb_enc_precise_mbclen(p, pend, enc);
6730 if (p > prev) str_buf_cat(result, prev, p - prev);
6733 n = (int)(pend - p);
6735 snprintf(buf, CHAR_ESC_LEN,
"\\x%02X", *p & 0377);
6736 str_buf_cat(result, buf, strlen(buf));
6744 if ((asciicompat || unicode_p) &&
6745 (c ==
'"'|| c ==
'\\' ||
6750 (cc ==
'$' || cc ==
'@' || cc ==
'{'))))) {
6751 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6752 str_buf_cat2(result,
"\\");
6753 if (asciicompat || enc == resenc) {
6759 case '\n': cc =
'n';
break;
6760 case '\r': cc =
'r';
break;
6761 case '\t': cc =
't';
break;
6762 case '\f': cc =
'f';
break;
6763 case '\013': cc =
'v';
break;
6764 case '\010': cc =
'b';
break;
6765 case '\007': cc =
'a';
break;
6766 case 033: cc =
'e';
break;
6767 default: cc = 0;
break;
6770 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6773 str_buf_cat(result, buf, 2);
6790 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6791 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6796 if (p > prev) str_buf_cat(result, prev, p - prev);
6797 str_buf_cat2(result,
"\"");
/*
 * True when p has not reached e and *p is one of '$', '@' or '{'.
 * The dump code tests it on the byte following a '#' and emits an extra
 * backslash when it holds; presumably these are the characters that would
 * make "#" begin an interpolation -- confirm against the language grammar.
 * p is evaluated up to four times.
 */
6802#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6822 int encidx = rb_enc_get_index(str);
6825 const char *p, *pend;
6828 int u8 = (encidx == rb_utf8_encindex());
6829 static const char nonascii_suffix[] =
".dup.force_encoding(\"%s\")";
6834 len += strlen(enc->name);
6840 unsigned char c = *p++;
6843 case '"':
case '\\':
6844 case '\n':
case '\r':
6845 case '\t':
case '\f':
6846 case '\013':
case '\010':
case '\007':
case '\033':
6851 clen = IS_EVSTR(p, pend) ? 2 : 1;
6859 if (u8 && c > 0x7F) {
6860 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6865 else if (cc <= 0xFFFFF)
6878 if (clen > LONG_MAX - len) {
6890 unsigned char c = *p++;
6892 if (c ==
'"' || c ==
'\\') {
6896 else if (c ==
'#') {
6897 if (IS_EVSTR(p, pend)) *q++ =
'\\';
6900 else if (c ==
'\n') {
6904 else if (c ==
'\r') {
6908 else if (c ==
'\t') {
6912 else if (c ==
'\f') {
6916 else if (c ==
'\013') {
6920 else if (c ==
'\010') {
6924 else if (c ==
'\007') {
6928 else if (c ==
'\033') {
6938 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6943 snprintf(q, qend-q,
"u%04X", cc);
6945 snprintf(q, qend-q,
"u{%X}", cc);
6950 snprintf(q, qend-q,
"x%02X", c);
6957 snprintf(q, qend-q, nonascii_suffix, enc->name);
6958 encidx = rb_ascii8bit_encindex();
6961 rb_enc_associate_index(result, encidx);
6967unescape_ascii(
unsigned int c)
6991undump_after_backslash(
VALUE undumped,
const char **ss,
const char *s_end,
rb_encoding **penc,
bool *utf8,
bool *binary)
6993 const char *s = *ss;
6997 unsigned char buf[6];
7015 *buf = unescape_ascii(*s);
7027 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7028 if (*penc != enc_utf8) {
7030 rb_enc_associate(undumped, enc_utf8);
7047 if (hexlen == 0 || hexlen > 6) {
7053 if (0xd800 <= c && c <= 0xdfff) {
7066 if (0xd800 <= c && c <= 0xdfff) {
7097static VALUE rb_str_is_ascii_only_p(
VALUE str);
7115str_undump(
VALUE str)
7122 bool binary =
false;
7126 if (rb_str_is_ascii_only_p(str) ==
Qfalse) {
7129 if (!str_null_check(str, &w)) {
7133 if (*s !=
'"')
goto invalid_format;
7151 static const char force_encoding_suffix[] =
".force_encoding(\"";
7152 static const char dup_suffix[] =
".dup";
7153 const char *encname;
7158 size =
sizeof(dup_suffix) - 1;
7159 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7161 size =
sizeof(force_encoding_suffix) - 1;
7162 if (s_end - s <= size)
goto invalid_format;
7163 if (memcmp(s, force_encoding_suffix, size) != 0)
goto invalid_format;
7171 s = memchr(s,
'"', s_end-s);
7173 if (!s)
goto invalid_format;
7174 if (s_end - s != 2)
goto invalid_format;
7175 if (s[0] !=
'"' || s[1] !=
')')
goto invalid_format;
7177 encidx = rb_enc_find_index2(encname, (
long)size);
7181 rb_enc_associate_index(undumped, encidx);
7191 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7200 rb_raise(
rb_eRuntimeError,
"invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7206 if (rb_enc_dummy_p(enc)) {
7213str_true_enc(
VALUE str)
7216 rb_str_check_dummy_enc(enc);
7220static OnigCaseFoldType
7221check_case_options(
int argc,
VALUE *argv, OnigCaseFoldType flags)
7227 if (argv[0]==sym_turkic) {
7228 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7230 if (argv[1]==sym_lithuanian)
7231 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7236 else if (argv[0]==sym_lithuanian) {
7237 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7239 if (argv[1]==sym_turkic)
7240 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7247 else if (argv[0]==sym_ascii)
7248 flags |= ONIGENC_CASE_ASCII_ONLY;
7249 else if (argv[0]==sym_fold) {
7250 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7251 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7263 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() ||
rb_enc_mbmaxlen(enc) == 1))
/*
 * CASE_MAPPING_ADDITIONAL_LENGTH: constant slack added to the capacity of
 * every case-mapping buffer (see the `capa` computation in the mapping loop).
 *
 * CASEMAP_DEBUG: defaults to 0 unless already defined; when non-zero the
 * case-mapping code prints buffer diagnostics to stderr.
 */
7269#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7270#ifndef CASEMAP_DEBUG
7271# define CASEMAP_DEBUG 0
7279 OnigUChar space[FLEX_ARY_LEN];
7283mapping_buffer_free(
void *p)
7287 while (current_buffer) {
7288 previous_buffer = current_buffer;
7289 current_buffer = current_buffer->next;
7290 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7296 {0, mapping_buffer_free,}
/*
 * Case-maps the source string through enc->case_map into a chain of
 * temporary buffers, then assembles the result string.
 * (Excerpt: the function header and several statements are not visible here;
 * the comments below describe only what the visible lines do.)
 */
7304 const OnigUChar *source_current, *source_end;
7305 int target_length = 0;
7306 VALUE buffer_anchor;
7309 size_t buffer_count = 0;
7310 int buffer_length_or_invalid;
/* One buffer per iteration until the whole source has been consumed. */
7319 while (source_current < source_end) {
/* Capacity = remaining source bytes * (1-based buffer index) + fixed slack. */
7321 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7322 if (CASEMAP_DEBUG) {
7323 fprintf(stderr,
"Buffer allocation, capa is %"PRIuSIZE
"\n", capa);
/* Append the new buffer to the tail of the singly linked list. */
7326 *pre_buffer = current_buffer;
/* Fixed: this line was mis-encoded ("&curren" had been turned into U+00A4);
 * pre_buffer must point at the new tail's `next` slot. */
7327 pre_buffer = &current_buffer->next;
7328 current_buffer->next = NULL;
7329 current_buffer->capa = capa;
/* case_map advances source_current and writes into [space, space + capa). */
7330 buffer_length_or_invalid = enc->case_map(flags,
7331 &source_current, source_end,
7332 current_buffer->space,
7333 current_buffer->space+current_buffer->capa,
/* A negative result is the failure path: release the whole chain,
 * starting from the head stored in buffer_anchor. */
7335 if (buffer_length_or_invalid < 0) {
7336 current_buffer =
DATA_PTR(buffer_anchor);
7338 mapping_buffer_free(current_buffer);
/* Record bytes produced in this buffer and accumulate the total. */
7341 target_length += current_buffer->used = buffer_length_or_invalid;
7343 if (CASEMAP_DEBUG) {
7344 fprintf(stderr,
"Buffer count is %"PRIuSIZE
"\n", buffer_count);
/* Single buffer: build the result straight from it. */
7347 if (buffer_count==1) {
7348 target =
rb_str_new((
const char*)current_buffer->space, target_length);
/* Several buffers: copy each one's used bytes, in list order. */
7351 char *target_current;
7355 current_buffer =
DATA_PTR(buffer_anchor);
7356 while (current_buffer) {
7357 memcpy(target_current, current_buffer->space, current_buffer->used);
7358 target_current += current_buffer->used;
7359 current_buffer = current_buffer->next;
/* Release the temporary chain, then copy the source's encoding to the result. */
7362 current_buffer =
DATA_PTR(buffer_anchor);
7364 mapping_buffer_free(current_buffer);
7369 str_enc_copy(target, source);
7378 const OnigUChar *source_current, *source_end;
7379 OnigUChar *target_current, *target_end;
7381 int length_or_invalid;
7383 if (old_length == 0)
return Qnil;
7387 if (source == target) {
7388 target_current = (OnigUChar*)source_current;
7389 target_end = (OnigUChar*)source_end;
7396 length_or_invalid = onigenc_ascii_only_case_map(flags,
7397 &source_current, source_end,
7398 target_current, target_end, enc);
7399 if (length_or_invalid < 0)
7401 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7402 fprintf(stderr,
"problem with rb_str_ascii_casemap"
7403 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7405 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7408 str_enc_copy(target, source);
7414upcase_single(
VALUE str)
7417 bool modified =
false;
7420 unsigned int c = *(
unsigned char*)s;
7422 if (
'a' <= c && c <=
'z') {
7423 *s =
'A' + (c -
'a');
7451rb_str_upcase_bang(
int argc,
VALUE *argv,
VALUE str)
7454 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7456 flags = check_case_options(argc, argv, flags);
7457 str_modify_keep_cr(str);
7458 enc = str_true_enc(str);
7459 if (case_option_single_p(flags, enc, str)) {
7460 if (upcase_single(str))
7461 flags |= ONIGENC_CASE_MODIFIED;
7463 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7464 rb_str_ascii_casemap(str, str, &flags, enc);
7466 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7468 if (ONIGENC_CASE_MODIFIED&flags)
return str;
7490rb_str_upcase(
int argc,
VALUE *argv,
VALUE str)
7493 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7496 flags = check_case_options(argc, argv, flags);
7497 enc = str_true_enc(str);
7498 if (case_option_single_p(flags, enc, str)) {
7500 str_enc_copy(ret, str);
7503 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7505 rb_str_ascii_casemap(str, ret, &flags, enc);
7508 ret = rb_str_casemap(str, &flags, enc);
7515downcase_single(
VALUE str)
7518 bool modified =
false;
7521 unsigned int c = *(
unsigned char*)s;
7523 if (
'A' <= c && c <=
'Z') {
7524 *s =
'a' + (c -
'A');
7553rb_str_downcase_bang(
int argc,
VALUE *argv,
VALUE str)
7556 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7558 flags = check_case_options(argc, argv, flags);
7559 str_modify_keep_cr(str);
7560 enc = str_true_enc(str);
7561 if (case_option_single_p(flags, enc, str)) {
7562 if (downcase_single(str))
7563 flags |= ONIGENC_CASE_MODIFIED;
7565 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7566 rb_str_ascii_casemap(str, str, &flags, enc);
7568 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7570 if (ONIGENC_CASE_MODIFIED&flags)
return str;
7592rb_str_downcase(
int argc,
VALUE *argv,
VALUE str)
7595 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7598 flags = check_case_options(argc, argv, flags);
7599 enc = str_true_enc(str);
7600 if (case_option_single_p(flags, enc, str)) {
7602 str_enc_copy(ret, str);
7603 downcase_single(ret);
7605 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7607 rb_str_ascii_casemap(str, ret, &flags, enc);
7610 ret = rb_str_casemap(str, &flags, enc);
7638rb_str_capitalize_bang(
int argc,
VALUE *argv,
VALUE str)
7641 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7643 flags = check_case_options(argc, argv, flags);
7644 str_modify_keep_cr(str);
7645 enc = str_true_enc(str);
7647 if (flags&ONIGENC_CASE_ASCII_ONLY)
7648 rb_str_ascii_casemap(str, str, &flags, enc);
7650 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7652 if (ONIGENC_CASE_MODIFIED&flags)
return str;
7676rb_str_capitalize(
int argc,
VALUE *argv,
VALUE str)
7679 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7682 flags = check_case_options(argc, argv, flags);
7683 enc = str_true_enc(str);
7685 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7687 rb_str_ascii_casemap(str, ret, &flags, enc);
7690 ret = rb_str_casemap(str, &flags, enc);
7717rb_str_swapcase_bang(
int argc,
VALUE *argv,
VALUE str)
7720 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7722 flags = check_case_options(argc, argv, flags);
7723 str_modify_keep_cr(str);
7724 enc = str_true_enc(str);
7725 if (flags&ONIGENC_CASE_ASCII_ONLY)
7726 rb_str_ascii_casemap(str, str, &flags, enc);
7728 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7730 if (ONIGENC_CASE_MODIFIED&flags)
return str;
7754rb_str_swapcase(
int argc,
VALUE *argv,
VALUE str)
7757 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7760 flags = check_case_options(argc, argv, flags);
7761 enc = str_true_enc(str);
7763 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7765 rb_str_ascii_casemap(str, ret, &flags, enc);
7768 ret = rb_str_casemap(str, &flags, enc);
7773typedef unsigned char *USTR;
7777 unsigned int now, max;
7789 if (t->p == t->pend)
return -1;
7790 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'\\' && t->p + n < t->pend) {
7793 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7795 if (rb_enc_ascget(t->p, t->pend, &n, enc) ==
'-' && t->p + n < t->pend) {
7797 if (t->p < t->pend) {
7798 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7801 if (t->now < 0x80 && c < 0x80) {
7803 "invalid range \"%c-%c\" in string transliteration",
7818 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7819 if (t->now == t->max) {
7824 if (t->now < t->max) {
7840 const unsigned int errc = -1;
7841 unsigned int trans[256];
7843 struct tr trsrc, trrepl;
7845 unsigned int c, c0, last = 0;
7846 int modify = 0, i, l;
7847 unsigned char *s, *send;
7849 int singlebyte = single_byte_optimizable(str);
7853#define CHECK_IF_ASCII(c) \
7854 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7855 (cr = ENC_CODERANGE_VALID) : 0)
7861 return rb_str_delete_bang(1, &src, str);
7865 e1 = rb_enc_check(str, src);
7866 e2 = rb_enc_check(str, repl);
7871 enc = rb_enc_check(src, repl);
7875 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) ==
'^' &&
7876 trsrc.p + l < trsrc.pend) {
7882 trsrc.gen = trrepl.gen = 0;
7883 trsrc.now = trrepl.now = 0;
7884 trsrc.max = trrepl.max = 0;
7887 for (i=0; i<256; i++) {
7890 while ((c = trnext(&trsrc, enc)) != errc) {
7895 if (!hash) hash = rb_hash_new();
7899 while ((c = trnext(&trrepl, enc)) != errc)
7902 for (i=0; i<256; i++) {
7903 if (trans[i] != errc) {
7911 for (i=0; i<256; i++) {
7914 while ((c = trnext(&trsrc, enc)) != errc) {
7915 r = trnext(&trrepl, enc);
7916 if (r == errc) r = trrepl.now;
7919 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7922 if (!hash) hash = rb_hash_new();
7930 str_modify_keep_cr(str);
7936 unsigned int save = -1;
7937 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
7942 c0 = c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, e1);
7943 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7952 if (cflag) c = last;
7955 else if (cflag) c = errc;
7961 if (c != (
unsigned int)-1) {
7967 tlen = rb_enc_codelen(c, enc);
7973 if (enc != e1) may_modify = 1;
7975 if ((offset = t - buf) + tlen > max) {
7976 size_t MAYBE_UNUSED(old) = max + termlen;
7977 max = offset + tlen + (send - s);
7978 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
7982 if (may_modify && memcmp(s, t, tlen) != 0) {
7988 if (!STR_EMBED_P(str)) {
7989 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
7991 TERM_FILL((
char *)t, termlen);
7992 RSTRING(str)->as.heap.ptr = (
char *)buf;
7993 RSTRING(str)->as.heap.len = t - buf;
7994 STR_SET_NOEMBED(str);
7995 RSTRING(str)->as.heap.aux.capa = max;
7999 c = (
unsigned char)*s;
8000 if (trans[c] != errc) {
8017 long offset, max = (long)((send - s) * 1.2);
8018 unsigned char *buf =
ALLOC_N(
unsigned char, max + termlen), *t = buf;
8022 c0 = c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, e1);
8023 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8031 if (cflag) c = last;
8034 else if (cflag) c = errc;
8038 c = cflag ? last : errc;
8041 tlen = rb_enc_codelen(c, enc);
8046 if (enc != e1) may_modify = 1;
8048 if ((offset = t - buf) + tlen > max) {
8049 size_t MAYBE_UNUSED(old) = max + termlen;
8050 max = offset + tlen + (long)((send - s) * 1.2);
8051 SIZED_REALLOC_N(buf,
unsigned char, max + termlen, old);
8056 if (may_modify && memcmp(s, t, tlen) != 0) {
8064 if (!STR_EMBED_P(str)) {
8065 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8067 TERM_FILL((
char *)t, termlen);
8068 RSTRING(str)->as.heap.ptr = (
char *)buf;
8069 RSTRING(str)->as.heap.len = t - buf;
8070 STR_SET_NOEMBED(str);
8071 RSTRING(str)->as.heap.aux.capa = max;
8077 rb_enc_associate(str, enc);
8096 return tr_trans(str, src, repl, 0);
8143 tr_trans(str, src, repl, 0);
8147#define TR_TABLE_MAX (UCHAR_MAX+1)
8148#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8150tr_setup_table(
VALUE str,
char stable[TR_TABLE_SIZE],
int first,
8153 const unsigned int errc = -1;
8154 char buf[TR_TABLE_MAX];
8157 VALUE table = 0, ptable = 0;
8158 int i, l, cflag = 0;
8161 tr.gen =
tr.now =
tr.max = 0;
8163 if (
RSTRING_LEN(str) > 1 && rb_enc_ascget(
tr.p,
tr.pend, &l, enc) ==
'^') {
8168 for (i=0; i<TR_TABLE_MAX; i++) {
8171 stable[TR_TABLE_MAX] = cflag;
8173 else if (stable[TR_TABLE_MAX] && !cflag) {
8174 stable[TR_TABLE_MAX] = 0;
8176 for (i=0; i<TR_TABLE_MAX; i++) {
8180 while ((c = trnext(&
tr, enc)) != errc) {
8181 if (c < TR_TABLE_MAX) {
8182 buf[(
unsigned char)c] = !cflag;
8187 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8190 table = ptable ? ptable : rb_hash_new();
8194 table = rb_hash_new();
8199 if (table && (!ptable || (cflag ^ !
NIL_P(rb_hash_aref(ptable, key))))) {
8200 rb_hash_aset(table, key,
Qtrue);
8204 for (i=0; i<TR_TABLE_MAX; i++) {
8205 stable[i] = stable[i] && buf[i];
8207 if (!table && !cflag) {
8214tr_find(
unsigned int c,
const char table[TR_TABLE_SIZE],
VALUE del,
VALUE nodel)
8216 if (c < TR_TABLE_MAX) {
8217 return table[c] != 0;
8223 if (!
NIL_P(rb_hash_lookup(del, v)) &&
8224 (!nodel ||
NIL_P(rb_hash_lookup(nodel, v)))) {
8228 else if (nodel && !
NIL_P(rb_hash_lookup(nodel, v))) {
8231 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8245rb_str_delete_bang(
int argc,
VALUE *argv,
VALUE str)
8247 char squeez[TR_TABLE_SIZE];
8250 VALUE del = 0, nodel = 0;
8252 int i, ascompat, cr;
8256 for (i=0; i<argc; i++) {
8260 enc = rb_enc_check(str, s);
8261 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8264 str_modify_keep_cr(str);
8273 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8284 c = rb_enc_codepoint_len(s, send, &clen, enc);
8286 if (tr_find(c, squeez, del, nodel)) {
8297 TERM_FILL(t, TERM_LEN(str));
8301 if (modify)
return str;
8321rb_str_delete(
int argc,
VALUE *argv,
VALUE str)
8324 rb_str_delete_bang(argc, argv, str);
8338rb_str_squeeze_bang(
int argc,
VALUE *argv,
VALUE str)
8340 char squeez[TR_TABLE_SIZE];
8342 VALUE del = 0, nodel = 0;
8343 unsigned char *s, *send, *t;
8345 int ascompat, singlebyte = single_byte_optimizable(str);
8349 enc = STR_ENC_GET(str);
8352 for (i=0; i<argc; i++) {
8356 enc = rb_enc_check(str, s);
8357 if (singlebyte && !single_byte_optimizable(s))
8359 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8363 str_modify_keep_cr(str);
8372 unsigned int c = *s++;
8373 if (c != save || (argc > 0 && !squeez[c])) {
8383 if (ascompat && (c = *s) < 0x80) {
8384 if (c != save || (argc > 0 && !squeez[c])) {
8390 c = rb_enc_codepoint_len((
char *)s, (
char *)send, &clen, enc);
8392 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8402 TERM_FILL((
char *)t, TERM_LEN(str));
8408 if (modify)
return str;
8431rb_str_squeeze(
int argc,
VALUE *argv,
VALUE str)
8434 rb_str_squeeze_bang(argc, argv, str);
8452 return tr_trans(str, src, repl, 1);
8475 tr_trans(str, src, repl, 1);
8504rb_str_count(
int argc,
VALUE *argv,
VALUE str)
8506 char table[TR_TABLE_SIZE];
8508 VALUE del = 0, nodel = 0, tstr;
8518 enc = rb_enc_check(str, tstr);
8523 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (
const unsigned char *)ptstr, (
const unsigned char *)ptstr+1)) &&
8524 !is_broken_string(str)) {
8526 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8532 if (*(
unsigned char*)s++ == c) n++;
8538 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8539 for (i=1; i<argc; i++) {
8542 enc = rb_enc_check(str, tstr);
8543 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8553 if (ascompat && (c = *(
unsigned char*)s) < 0x80) {
8561 c = rb_enc_codepoint_len(s, send, &clen, enc);
8562 if (tr_find(c, table, del, nodel)) {
8573rb_fs_check(
VALUE val)
8577 if (
NIL_P(val))
return 0;
8582static const char isspacetable[256] = {
8583 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
8584 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8585 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8586 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8587 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8588 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8589 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8590 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8591 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8592 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8593 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8594 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8595 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8596 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8597 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
8598 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
8601#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8604split_string(
VALUE result,
VALUE str,
long beg,
long len,
long empty_count)
8606 if (empty_count >= 0 && len == 0) {
8607 return empty_count + 1;
8609 if (empty_count > 0) {
8613 rb_ary_push(result, str_new_empty_String(str));
8614 }
while (--empty_count > 0);
8618 rb_yield(str_new_empty_String(str));
8619 }
while (--empty_count > 0);
8624 rb_ary_push(result, str);
8633 SPLIT_TYPE_AWK, SPLIT_TYPE_STRING, SPLIT_TYPE_REGEXP, SPLIT_TYPE_CHARS
8637literal_split_pattern(
VALUE spat, split_type_t default_type)
8645 return SPLIT_TYPE_CHARS;
8648 if (len == 1 && ptr[0] ==
' ') {
8649 return SPLIT_TYPE_AWK;
8654 if (rb_enc_ascget(ptr, ptr + len, &l, enc) ==
' ' && len == l) {
8655 return SPLIT_TYPE_AWK;
8658 return default_type;
8671rb_str_split_m(
int argc,
VALUE *argv,
VALUE str)
8676 split_type_t split_type;
8677 long beg, end, i = 0, empty_count = -1;
8682 if (
rb_scan_args(argc, argv,
"02", &spat, &limit) == 2) {
8684 if (lim <= 0) limit =
Qnil;
8685 else if (lim == 1) {
8697 if (
NIL_P(limit) && !lim) empty_count = 0;
8699 enc = STR_ENC_GET(str);
8700 split_type = SPLIT_TYPE_REGEXP;
8702 spat = get_pat_quoted(spat, 0);
8704 else if (
NIL_P(spat = rb_fs)) {
8705 split_type = SPLIT_TYPE_AWK;
8707 else if (!(spat = rb_fs_check(spat))) {
8713 if (split_type != SPLIT_TYPE_AWK) {
8718 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8719 if (split_type == SPLIT_TYPE_AWK) {
8721 split_type = SPLIT_TYPE_STRING;
8726 mustnot_broken(spat);
8727 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8735#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8737 if (result) result = rb_ary_new();
8741 if (split_type == SPLIT_TYPE_AWK) {
8747 if (is_ascii_string(str)) {
8748 while (ptr < eptr) {
8749 c = (
unsigned char)*ptr++;
8751 if (ascii_isspace(c)) {
8757 if (!
NIL_P(limit) && lim <= i)
break;
8760 else if (ascii_isspace(c)) {
8761 SPLIT_STR(beg, end-beg);
8764 if (!
NIL_P(limit)) ++i;
8772 while (ptr < eptr) {
8775 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8784 if (!
NIL_P(limit) && lim <= i)
break;
8788 SPLIT_STR(beg, end-beg);
8791 if (!
NIL_P(limit)) ++i;
8799 else if (split_type == SPLIT_TYPE_STRING) {
8800 char *str_start = ptr;
8801 char *substr_start = ptr;
8805 mustnot_broken(str);
8806 enc = rb_enc_check(str, spat);
8807 while (ptr < eptr &&
8808 (end =
rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8811 if (t != ptr + end) {
8815 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8818 if (!
NIL_P(limit) && lim <= ++i)
break;
8820 beg = ptr - str_start;
8822 else if (split_type == SPLIT_TYPE_CHARS) {
8823 char *str_start = ptr;
8826 mustnot_broken(str);
8827 enc = rb_enc_get(str);
8828 while (ptr < eptr &&
8829 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8830 SPLIT_STR(ptr - str_start, n);
8832 if (!
NIL_P(limit) && lim <= ++i)
break;
8834 beg = ptr - str_start;
8845 (match ? (rb_match_unbusy(match),
rb_backref_set(match)) : (void)0)) {
8850 if (start == end && BEG(0) == END(0)) {
8855 else if (last_null == 1) {
8856 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8863 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8869 SPLIT_STR(beg, end-beg);
8870 beg = start = END(0);
8874 for (idx=1; idx < regs->num_regs; idx++) {
8875 if (BEG(idx) == -1)
continue;
8876 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8878 if (!
NIL_P(limit) && lim <= ++i)
break;
8880 if (match) rb_match_unbusy(match);
8886 return result ? result : str;
8896 return rb_str_split_m(1, &sep, str);
8899#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8905 rb_ary_push(ary, e);
8914#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8917chomp_newline(
const char *p,
const char *e,
rb_encoding *enc)
8923 if (prev && rb_enc_ascget(prev, e, NULL, enc) ==
'\r')
8942#define rb_rs get_rs()
8949 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8950 long pos, len, rslen;
8956 static ID keywords[1];
8961 chomp = (!UNDEF_P(chomp) &&
RTEST(chomp));
8965 if (!ENUM_ELEM(ary, str)) {
8982 enc = rb_enc_get(str);
8984 enc = rb_enc_check(str, rs);
8989 const char *eol = NULL;
8991 while (subend < pend) {
8992 long chomp_rslen = 0;
8994 if (rb_enc_ascget(subend, pend, &n, enc) !=
'\r')
8996 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
8998 if (eol == subend)
break;
9002 chomp_rslen = -rslen;
9006 if (!subptr) subptr = subend;
9010 }
while (subend < pend);
9012 if (rslen == 0) chomp_rslen = 0;
9014 subend - subptr + (chomp ? chomp_rslen : rslen));
9015 if (ENUM_ELEM(ary, line)) {
9016 str_mod_check(str, ptr, len);
9018 subptr = eol = NULL;
9037 while (subptr < pend) {
9038 pos =
rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9042 if (hit != adjusted) {
9046 subend = hit += rslen;
9049 subend = chomp_newline(subptr, subend, enc);
9056 if (ENUM_ELEM(ary, line)) {
9057 str_mod_check(str, ptr, len);
9062 if (subptr != pend) {
9065 pend = chomp_newline(subptr, pend, enc);
9067 else if (pend - subptr >= rslen &&
9068 memcmp(pend - rslen, rsptr, rslen) == 0) {
9073 ENUM_ELEM(ary, line);
9094rb_str_each_line(
int argc,
VALUE *argv,
VALUE str)
9097 return rb_str_enumerate_lines(argc, argv, str, 0);
9110rb_str_lines(
int argc,
VALUE *argv,
VALUE str)
9112 VALUE ary = WANTARRAY(
"lines", 0);
9113 return rb_str_enumerate_lines(argc, argv, str, ary);
9146rb_str_each_byte(
VALUE str)
9149 return rb_str_enumerate_bytes(str, 0);
9161rb_str_bytes(
VALUE str)
9164 return rb_str_enumerate_bytes(str, ary);
9184 enc = rb_enc_get(str);
9187 for (i = 0; i < len; i += n) {
9188 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9193 for (i = 0; i < len; i += n) {
9194 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9215rb_str_each_char(
VALUE str)
9218 return rb_str_enumerate_chars(str, 0);
9230rb_str_chars(
VALUE str)
9233 return rb_str_enumerate_chars(str, ary);
9237rb_str_enumerate_codepoints(
VALUE str,
VALUE ary)
9242 const char *ptr, *end;
9245 if (single_byte_optimizable(str))
9246 return rb_str_enumerate_bytes(str, ary);
9251 enc = STR_ENC_GET(str);
9254 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9275rb_str_each_codepoint(
VALUE str)
9278 return rb_str_enumerate_codepoints(str, 0);
9290rb_str_codepoints(
VALUE str)
9293 return rb_str_enumerate_codepoints(str, ary);
9299 int encidx = rb_enc_to_index(enc);
9301 const OnigUChar source_ascii[] =
"\\X";
9302 const OnigUChar *source = source_ascii;
9303 size_t source_len =
sizeof(source_ascii) - 1;
9306#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9307#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9308#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9309#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9310#define CASE_UTF(e) \
9311 case ENCINDEX_UTF_##e: { \
9312 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9313 source = source_UTF_##e; \
9314 source_len = sizeof(source_UTF_##e); \
9317 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9325 regex_t *reg_grapheme_cluster;
9327 int r = onig_new(®_grapheme_cluster, source, source + source_len,
9328 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9330 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9331 onig_error_code_to_str(message, r, &einfo);
9332 rb_fatal(
"cannot compile grapheme cluster regexp: %s", (
char *)message);
9335 return reg_grapheme_cluster;
9341 int encidx = rb_enc_to_index(enc);
9342 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9344 if (encidx == rb_utf8_encindex()) {
9345 if (!reg_grapheme_cluster_utf8) {
9346 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9349 return reg_grapheme_cluster_utf8;
9358 size_t grapheme_cluster_count = 0;
9360 const char *ptr, *end;
9362 if (!rb_enc_unicode_p(enc)) {
9366 bool cached_reg_grapheme_cluster =
true;
9367 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9368 if (!reg_grapheme_cluster) {
9369 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9370 cached_reg_grapheme_cluster =
false;
9377 OnigPosition len = onig_match(reg_grapheme_cluster,
9378 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9379 (
const OnigUChar *)ptr, NULL, 0);
9380 if (len <= 0)
break;
9381 grapheme_cluster_count++;
9385 if (!cached_reg_grapheme_cluster) {
9386 onig_free(reg_grapheme_cluster);
9389 return SIZET2NUM(grapheme_cluster_count);
9393rb_str_enumerate_grapheme_clusters(
VALUE str,
VALUE ary)
9397 const char *ptr0, *ptr, *end;
9399 if (!rb_enc_unicode_p(enc)) {
9400 return rb_str_enumerate_chars(str, ary);
9405 bool cached_reg_grapheme_cluster =
true;
9406 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9407 if (!reg_grapheme_cluster) {
9408 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9409 cached_reg_grapheme_cluster =
false;
9416 OnigPosition len = onig_match(reg_grapheme_cluster,
9417 (
const OnigUChar *)ptr, (
const OnigUChar *)end,
9418 (
const OnigUChar *)ptr, NULL, 0);
9419 if (len <= 0)
break;
9424 if (!cached_reg_grapheme_cluster) {
9425 onig_free(reg_grapheme_cluster);
9445rb_str_each_grapheme_cluster(
VALUE str)
9448 return rb_str_enumerate_grapheme_clusters(str, 0);
9460rb_str_grapheme_clusters(
VALUE str)
9463 return rb_str_enumerate_grapheme_clusters(str, ary);
9467chopped_length(
VALUE str)
9470 const char *p, *p2, *beg, *end;
9474 if (beg >= end)
return 0;
9477 if (p > beg && rb_enc_ascget(p, end, 0, enc) ==
'\n') {
9479 if (p2 && rb_enc_ascget(p2, end, 0, enc) ==
'\r') p = p2;
9495rb_str_chop_bang(
VALUE str)
9497 str_modify_keep_cr(str);
9500 len = chopped_length(str);
9501 STR_SET_LEN(str, len);
9521rb_str_chop(
VALUE str)
9527smart_chomp(
VALUE str,
const char *e,
const char *p)
9538 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
9546 if (--e > p && *(e-1) ==
'\r') {
9563 char *pp, *e, *rsptr;
9568 if (len == 0)
return 0;
9571 return smart_chomp(str, e, p);
9574 enc = rb_enc_get(str);
9585 if (rb_enc_ascget(pp, e, 0, enc) ==
'\r') {
9592 while (e > p && *(e-1) ==
'\n') {
9594 if (e > p && *(e-1) ==
'\r')
9600 if (rslen > len)
return len;
9602 enc = rb_enc_get(rs);
9603 newline = rsptr[rslen-1];
9606 if (newline ==
'\n')
9607 return smart_chomp(str, e, p);
9611 return smart_chomp(str, e, p);
9615 enc = rb_enc_check(str, rs);
9616 if (is_broken_string(rs)) {
9620 if (p[len-1] == newline &&
9622 memcmp(rsptr, pp, rslen) == 0)) {
9636chomp_rs(
int argc,
const VALUE *argv)
9653 long len = chompped_length(str, rs);
9654 if (len >= olen)
return Qnil;
9655 str_modify_keep_cr(str);
9656 STR_SET_LEN(str, len);
9674rb_str_chomp_bang(
int argc,
VALUE *argv,
VALUE str)
9677 str_modifiable(str);
9679 rs = chomp_rs(argc, argv);
9681 return rb_str_chomp_string(str, rs);
9694rb_str_chomp(
int argc,
VALUE *argv,
VALUE str)
9696 VALUE rs = chomp_rs(argc, argv);
9704 const char *
const start = s;
9706 if (!s || s >= e)
return 0;
9709 if (single_byte_optimizable(str)) {
9710 while (s < e && (*s ==
'\0' || ascii_isspace(*s))) s++;
9715 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9735rb_str_lstrip_bang(
VALUE str)
9741 str_modify_keep_cr(str);
9742 enc = STR_ENC_GET(str);
9744 loffset = lstrip_offset(str, start, start+olen, enc);
9746 long len = olen-loffset;
9747 s = start + loffset;
9748 memmove(start, s, len);
9749 STR_SET_LEN(str, len);
9773rb_str_lstrip(
VALUE str)
9778 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9779 if (loffset <= 0)
return str_duplicate(
rb_cString, str);
9788 rb_str_check_dummy_enc(enc);
9792 if (!s || s >= e)
return 0;
9796 if (single_byte_optimizable(str)) {
9798 while (s < t && ((c = *(t-1)) ==
'\0' || ascii_isspace(c))) t--;
9823rb_str_rstrip_bang(
VALUE str)
9829 str_modify_keep_cr(str);
9830 enc = STR_ENC_GET(str);
9832 roffset = rstrip_offset(str, start, start+olen, enc);
9834 long len = olen - roffset;
9836 STR_SET_LEN(str, len);
9860rb_str_rstrip(
VALUE str)
9866 enc = STR_ENC_GET(str);
9868 roffset = rstrip_offset(str, start, start+olen, enc);
9870 if (roffset <= 0)
return str_duplicate(
rb_cString, str);
9886rb_str_strip_bang(
VALUE str)
9889 long olen, loffset, roffset;
9892 str_modify_keep_cr(str);
9893 enc = STR_ENC_GET(str);
9895 loffset = lstrip_offset(str, start, start+olen, enc);
9896 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9898 if (loffset > 0 || roffset > 0) {
9899 long len = olen-roffset;
9902 memmove(start, start + loffset, len);
9904 STR_SET_LEN(str, len);
9928rb_str_strip(
VALUE str)
9931 long olen, loffset, roffset;
9935 loffset = lstrip_offset(str, start, start+olen, enc);
9936 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9938 if (loffset <= 0 && roffset <= 0)
return str_duplicate(
rb_cString, str);
9943scan_once(
VALUE str,
VALUE pat,
long *start,
int set_backref_str)
9945 VALUE result, match;
9948 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9966 *start = end + rb_enc_fast_mbclen(
RSTRING_PTR(str) + end,
9974 if (!regs || regs->num_regs == 1) {
9979 for (i=1; i < regs->num_regs; i++) {
9984 rb_ary_push(result, s);
10037 long last = -1, prev = 0;
10040 pat = get_pat_quoted(pat, 1);
10041 mustnot_broken(str);
10043 VALUE ary = rb_ary_new();
10045 while (!
NIL_P(result = scan_once(str, pat, &start, 0))) {
10048 rb_ary_push(ary, result);
10050 if (last >= 0) rb_pat_search(pat, str, last, 1);
10055 while (!
NIL_P(result = scan_once(str, pat, &start, 1))) {
10059 str_mod_check(str, p, len);
10061 if (last >= 0) rb_pat_search(pat, str, last, 1);
10085rb_str_hex(
VALUE str)
10087 return rb_str_to_inum(str, 16, FALSE);
10112rb_str_oct(
VALUE str)
10114 return rb_str_to_inum(str, -8, FALSE);
10117#ifndef HAVE_CRYPT_R
10122 rb_nativethread_lock_t lock;
10123} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10126crypt_mutex_initialize(
void)
10197# define CRYPT_END() ALLOCV_END(databuf)
10199 extern char *crypt(
const char *,
const char *);
10200# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10203 const char *s, *saltp;
10206 char salt_8bit_clean[3];
10210 mustnot_wchar(str);
10211 mustnot_wchar(salt);
10214 if (
RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10219 if (!
ISASCII((
unsigned char)saltp[0]) || !
ISASCII((
unsigned char)saltp[1])) {
10220 salt_8bit_clean[0] = saltp[0] & 0x7f;
10221 salt_8bit_clean[1] = saltp[1] & 0x7f;
10222 salt_8bit_clean[2] =
'\0';
10223 saltp = salt_8bit_clean;
10228# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10229 data->initialized = 0;
10231 res = crypt_r(s, saltp, data);
10233 crypt_mutex_initialize();
10235 res = crypt(s, saltp);
10276 char *ptr, *p, *pend;
10279 unsigned long sum0 = 0;
10291 str_mod_check(str, ptr, len);
10294 sum0 += (
unsigned char)*p;
10305 if (bits < (
int)
sizeof(
long)*CHAR_BIT) {
10306 sum0 &= (((
unsigned long)1)<<bits)-1;
10326rb_str_justify(
int argc,
VALUE *argv,
VALUE str,
char jflag)
10330 long width, len, flen = 1, fclen = 1;
10333 const char *f =
" ";
10334 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10336 int singlebyte = 1, cr;
10340 enc = STR_ENC_GET(str);
10345 enc = rb_enc_check(str, pad);
10348 fclen = str_strlen(pad, enc);
10349 singlebyte = single_byte_optimizable(pad);
10350 if (flen == 0 || fclen == 0) {
10354 len = str_strlen(str, enc);
10355 if (width < 0 || len >= width)
return str_duplicate(
rb_cString, str);
10357 llen = (jflag ==
'l') ? 0 : ((jflag ==
'r') ? n : n/2);
10361 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10362 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10365 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10366 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10367 (len += llen2 + rlen2) >= LONG_MAX - size) {
10371 res = str_new0(
rb_cString, 0, len, termlen);
10374 memset(p, *f, llen);
10378 while (llen >= fclen) {
10384 memcpy(p, f, llen2);
10391 memset(p, *f, rlen);
10395 while (rlen >= fclen) {
10401 memcpy(p, f, rlen2);
10405 TERM_FILL(p, termlen);
10407 rb_enc_associate(res, enc);
10429rb_str_ljust(
int argc,
VALUE *argv,
VALUE str)
10431 return rb_str_justify(argc, argv, str,
'l');
10445rb_str_rjust(
int argc,
VALUE *argv,
VALUE str)
10447 return rb_str_justify(argc, argv, str,
'r');
10462rb_str_center(
int argc,
VALUE *argv,
VALUE str)
10464 return rb_str_justify(argc, argv, str,
'c');
10480 sep = get_pat_quoted(sep, 0);
10492 pos = rb_str_index(str, sep, 0);
10493 if (pos < 0)
goto failed;
10501 return rb_ary_new3(3, str_duplicate(
rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10517 sep = get_pat_quoted(sep, 0);
10530 pos = rb_str_rindex(str, sep, pos);
10542 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(
rb_cString, str));
10554rb_str_start_with(
int argc,
VALUE *argv,
VALUE str)
10558 for (i=0; i<argc; i++) {
10559 VALUE tmp = argv[i];
10561 if (rb_reg_start_with_p(tmp, str))
10566 rb_enc_check(str, tmp);
10584rb_str_end_with(
int argc,
VALUE *argv,
VALUE str)
10590 for (i=0; i<argc; i++) {
10591 VALUE tmp = argv[i];
10594 enc = rb_enc_check(str, tmp);
10618deleted_prefix_length(
VALUE str,
VALUE prefix)
10620 char *strptr, *prefixptr;
10621 long olen, prefixlen;
10624 if (is_broken_string(prefix))
return 0;
10625 rb_enc_check(str, prefix);
10629 if (prefixlen <= 0)
return 0;
10631 if (olen < prefixlen)
return 0;
10634 if (memcmp(strptr, prefixptr, prefixlen) != 0)
return 0;
10649rb_str_delete_prefix_bang(
VALUE str,
VALUE prefix)
10652 str_modify_keep_cr(str);
10654 prefixlen = deleted_prefix_length(str, prefix);
10655 if (prefixlen <= 0)
return Qnil;
10669rb_str_delete_prefix(
VALUE str,
VALUE prefix)
10673 prefixlen = deleted_prefix_length(str, prefix);
10674 if (prefixlen <= 0)
return str_duplicate(
rb_cString, str);
10689deleted_suffix_length(
VALUE str,
VALUE suffix)
10691 char *strptr, *suffixptr, *s;
10692 long olen, suffixlen;
10696 if (is_broken_string(suffix))
return 0;
10697 enc = rb_enc_check(str, suffix);
10701 if (suffixlen <= 0)
return 0;
10703 if (olen < suffixlen)
return 0;
10706 s = strptr + olen - suffixlen;
10707 if (memcmp(s, suffixptr, suffixlen) != 0)
return 0;
10723rb_str_delete_suffix_bang(
VALUE str,
VALUE suffix)
10725 long olen, suffixlen, len;
10726 str_modifiable(str);
10728 suffixlen = deleted_suffix_length(str, suffix);
10729 if (suffixlen <= 0)
return Qnil;
10732 str_modify_keep_cr(str);
10733 len = olen - suffixlen;
10734 STR_SET_LEN(str, len);
10735 TERM_FILL(&
RSTRING_PTR(str)[len], TERM_LEN(str));
10751rb_str_delete_suffix(
VALUE str,
VALUE suffix)
10755 suffixlen = deleted_suffix_length(str, suffix);
10756 if (suffixlen <= 0)
return str_duplicate(
rb_cString, str);
10773 val = rb_fs_check(val);
10776 "value of %"PRIsVALUE
" must be String or Regexp",
10780 rb_warn_deprecated(
"`$;'", NULL);
10797 str_modifiable(str);
10798 rb_enc_associate(str, rb_to_encoding(enc));
10815 if (
FL_TEST(str, STR_NOEMBED)) {
10821 str_replace_shared_without_enc(str2, str);
10856rb_str_valid_encoding_p(
VALUE str)
10876rb_str_is_ascii_only_p(
VALUE str)
10886 static const char ellipsis[] =
"...";
10887 const long ellipsislen =
sizeof(ellipsis) - 1;
10890 const char *
const p =
RSTRING_PTR(str), *e = p + blen;
10891 VALUE estr, ret = 0;
10895 (e =
rb_enc_nth(p, e, len, enc)) - p == blen) {
10898 else if (len <= ellipsislen ||
10902 rb_enc_associate(ret, enc);
10914 rb_enc_from_encoding(enc), 0,
Qnil);
10952 if (enc == STR_ENC_GET(str)) {
10957 return enc_str_scrub(enc, str, repl, cr);
10965 const char *rep, *p, *e, *p1, *sp;
10978 if (!
NIL_P(repl)) {
10979 repl = str_compat_and_valid(repl, enc);
10982 if (rb_enc_dummy_p(enc)) {
10985 encidx = rb_enc_to_index(enc);
10987#define DEFAULT_REPLACE_CHAR(str) do { \
10988 static const char replace[sizeof(str)-1] = str; \
10989 rep = replace; replen = (int)sizeof(replace); \
11004 else if (!
NIL_P(repl)) {
11009 else if (encidx == rb_utf8_encindex()) {
11010 DEFAULT_REPLACE_CHAR(
"\xEF\xBF\xBD");
11014 DEFAULT_REPLACE_CHAR(
"?");
11019 p = search_nonascii(p, e);
11024 int ret = rb_enc_precise_mbclen(p, e, enc);
11043 if (e - p < clen) clen = e - p;
11050 for (; clen > 1; clen--) {
11051 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11063 str_mod_check(str, sp, slen);
11064 repl = str_compat_and_valid(repl, enc);
11071 p = search_nonascii(p, e);
11098 str_mod_check(str, sp, slen);
11099 repl = str_compat_and_valid(repl, enc);
11112 else if (!
NIL_P(repl)) {
11116 else if (encidx == ENCINDEX_UTF_16BE) {
11117 DEFAULT_REPLACE_CHAR(
"\xFF\xFD");
11119 else if (encidx == ENCINDEX_UTF_16LE) {
11120 DEFAULT_REPLACE_CHAR(
"\xFD\xFF");
11122 else if (encidx == ENCINDEX_UTF_32BE) {
11123 DEFAULT_REPLACE_CHAR(
"\x00\x00\xFF\xFD");
11125 else if (encidx == ENCINDEX_UTF_32LE) {
11126 DEFAULT_REPLACE_CHAR(
"\xFD\xFF\x00\x00");
11129 DEFAULT_REPLACE_CHAR(
"?");
11133 int ret = rb_enc_precise_mbclen(p, e, enc);
11146 if (e - p < clen) clen = e - p;
11147 if (clen <= mbminlen * 2) {
11152 for (; clen > mbminlen; clen-=mbminlen) {
11153 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11164 str_mod_check(str, sp, slen);
11165 repl = str_compat_and_valid(repl, enc);
11191 str_mod_check(str, sp, slen);
11192 repl = str_compat_and_valid(repl, enc);
11228str_scrub_bang(
int argc,
VALUE *argv,
VALUE str)
11236static ID id_normalize;
11237static ID id_normalized_p;
11238static VALUE mUnicodeNormalize;
11241unicode_normalize_common(
int argc,
VALUE *argv,
VALUE str,
ID id)
11243 static int UnicodeNormalizeRequired = 0;
11246 if (!UnicodeNormalizeRequired) {
11247 rb_require(
"unicode_normalize/normalize.rb");
11248 UnicodeNormalizeRequired = 1;
11252 return rb_funcallv(mUnicodeNormalize,
id, argc+1, argv2);
11289rb_str_unicode_normalize(
int argc,
VALUE *argv,
VALUE str)
11291 return unicode_normalize_common(argc, argv, str, id_normalize);
11305rb_str_unicode_normalize_bang(
int argc,
VALUE *argv,
VALUE str)
11307 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11334rb_str_unicode_normalized_p(
int argc,
VALUE *argv,
VALUE str)
11336 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11471#define sym_equal rb_obj_equal
11474sym_printable(
const char *s,
const char *send,
rb_encoding *enc)
11478 int c = rb_enc_precise_mbclen(s, send, enc);
11490rb_str_symname_p(
VALUE sym)
11495 rb_encoding *resenc = rb_default_internal_encoding();
11497 if (resenc == NULL) resenc = rb_default_external_encoding();
11498 enc = STR_ENC_GET(sym);
11501 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (
long)strlen(ptr) ||
11509rb_str_quote_unprintable(
VALUE str)
11517 resenc = rb_default_internal_encoding();
11518 if (resenc == NULL) resenc = rb_default_external_encoding();
11519 enc = STR_ENC_GET(str);
11522 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11523 !sym_printable(ptr, ptr + len, enc)) {
11524 return rb_str_escape(str);
11529MJIT_FUNC_EXPORTED
VALUE
11530rb_id_quote_unprintable(
ID id)
11532 VALUE str = rb_id2str(
id);
11533 if (!rb_str_symname_p(str)) {
11534 return rb_str_escape(str);
11552sym_inspect(
VALUE sym)
11559 if (!rb_str_symname_p(str)) {
11564 memmove(dest + 1, dest, len);
11571 memcpy(dest + 1, ptr, len);
11596MJIT_FUNC_EXPORTED
VALUE
11597rb_sym_proc_call(
ID mid,
int argc,
const VALUE *argv,
int kw_splat,
VALUE passed_proc)
11701 return rb_str_match(
rb_sym2str(sym), other);
11716sym_match_m(
int argc,
VALUE *argv,
VALUE sym)
11718 return rb_str_match_m(argc, argv,
rb_sym2str(sym));
11731sym_match_m_p(
int argc,
VALUE *argv,
VALUE sym)
11733 return rb_str_match_m_p(argc, argv, sym);
11751 return rb_str_aref_m(argc, argv,
rb_sym2str(sym));
11765sym_length(
VALUE sym)
11779sym_empty(
VALUE sym)
11813sym_downcase(
int argc,
VALUE *argv,
VALUE sym)
11829sym_capitalize(
int argc,
VALUE *argv,
VALUE sym)
11845sym_swapcase(
int argc,
VALUE *argv,
VALUE sym)
11859sym_start_with(
int argc,
VALUE *argv,
VALUE sym)
11861 return rb_str_start_with(argc, argv,
rb_sym2str(sym));
11874sym_end_with(
int argc,
VALUE *argv,
VALUE sym)
11876 return rb_str_end_with(argc, argv,
rb_sym2str(sym));
11888sym_encoding(
VALUE sym)
11894string_for_symbol(
VALUE name)
11913 name = string_for_symbol(name);
11923 name = string_for_symbol(name);
11947 return rb_fstring(str);
11954 return register_fstring(setup_fake_str(&fake_str,
ptr,
len, ENCINDEX_US_ASCII), TRUE);
11966 if (UNLIKELY(rb_enc_autoload_p(enc))) {
11967 rb_enc_autoload(enc);
11971 return register_fstring(rb_setup_fake_str(&fake_str,
ptr,
len, enc), TRUE);
11984 assert(rb_vm_fstring_table());
11985 st_foreach(rb_vm_fstring_table(), fstring_set_class_i,
rb_cString);
12148 rb_gc_register_address(&
rb_fs);
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
VALUE rb_define_module(const char *name)
Defines a top-level module.
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv to the given VALUE references according to the format string.
int rb_block_given_p(void)
Determines if the current method is given a block.
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
#define TYPE(_)
Old name of rb_type.
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
#define ALLOCV
Old name of RB_ALLOCV.
#define ISSPACE
Old name of rb_isspace.
#define T_STRING
Old name of RUBY_T_STRING.
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
#define xfree
Old name of ruby_xfree.
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
#define rb_str_cat2
Old name of rb_str_cat_cstr.
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
#define ID2SYM
Old name of RB_ID2SYM.
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
#define SYM2ID
Old name of RB_SYM2ID.
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
#define CLASS_OF
Old name of rb_class_of.
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
#define SIZET2NUM
Old name of RB_SIZE2NUM.
#define FIXABLE
Old name of RB_FIXABLE.
#define xmalloc
Old name of ruby_xmalloc.
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
#define LONG2FIX
Old name of RB_INT2FIX.
#define ISDIGIT
Old name of rb_isdigit.
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
#define ZALLOC_N
Old name of RB_ZALLOC_N.
#define ALLOC_N
Old name of RB_ALLOC_N.
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
#define FL_SET
Old name of RB_FL_SET.
#define rb_ary_new3
Old name of rb_ary_new_from_args.
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
#define LONG2NUM
Old name of RB_LONG2NUM.
#define ISALPHA
Old name of rb_isalpha.
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
#define ISASCII
Old name of rb_isascii.
#define TOLOWER
Old name of rb_tolower.
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
#define NUM2INT
Old name of RB_NUM2INT.
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
#define DBL2NUM
Old name of rb_float_new.
#define ISPRINT
Old name of rb_isprint.
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
#define FL_TEST
Old name of RB_FL_TEST.
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
#define NUM2LONG
Old name of RB_NUM2LONG.
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
#define FL_UNSET
Old name of RB_FL_UNSET.
#define UINT2NUM
Old name of RB_UINT2NUM.
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
#define rb_ary_new2
Old name of rb_ary_new_capa.
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
#define SYMBOL_P
Old name of RB_SYMBOL_P.
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
#define T_REGEXP
Old name of RUBY_T_REGEXP.
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it always reports, regardless of the runtime -W flag.
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
void rb_bug(const char *fmt,...)
Interpreter panic switch.
VALUE rb_eRangeError
RangeError exception.
VALUE rb_eTypeError
TypeError exception.
void rb_fatal(const char *fmt,...)
Raises the unsung "fatal" exception.
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
VALUE rb_eRuntimeError
RuntimeError exception.
VALUE rb_eArgError
ArgumentError exception.
VALUE rb_eIndexError
IndexError exception.
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby's string instead of C's.
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
VALUE rb_cSymbol
Symbol class.
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
VALUE rb_obj_freeze(VALUE obj)
Just calls rb_obj_freeze_inline() inside.
VALUE rb_mComparable
Comparable module.
VALUE rb_cString
String class.
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
static const char * rb_enc_name(rb_encoding *enc)
Queries the (canonical) name of the passed encoding.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
static char * rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the previous (left) character.
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
static bool rb_enc_asciicompat(rb_encoding *enc)
Queries if the passed encoding is in some sense compatible with ASCII.
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
static int rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc)
Identical to rb_enc_uint_chr(), except it writes back to the passed buffer instead of allocating one.
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_cat(), except it additionally takes an encoding.
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
Identical to rb_str_new(), except it additionally takes an encoding.
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
rb_econv_result_t
return value of rb_econv_convert()
@ econv_finished
The conversion stopped after converting everything.
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
VALUE rb_fs
The field separator character for inputs, or the $;.
VALUE rb_default_rs
This is the default value of rb_rs, i.e.
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
VALUE rb_sym_all_symbols(void)
Collects every single bits of symbols that have ever interned in the entire history of the current pr...
void rb_backref_set(VALUE md)
Updates $~.
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby's string instead of C's.
void rb_str_free(VALUE str)
Destroys the given string for no reason.
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby's string instead of C's.
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
VALUE rb_str_subseq(VALUE str, long beg, long len)
Identical to rb_str_substr(), except the numbers are interpreted as byte offsets instead of character...
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
#define rb_str_buf_cat
Just another name of rb_str_cat.
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
VALUE rb_str_dup(VALUE str)
Duplicates a string.
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
VALUE rb_str_cat(VALUE dst, const char *src, long srclen)
Destructively appends the passed contents to the string.
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C's string instead of Ruby's.
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C's string.
VALUE rb_filesystem_str_new_cstr(const char *ptr)
Identical to rb_filesystem_str_new(), except it assumes the passed pointer is a pointer to a C string...
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
VALUE rb_str_buf_append(VALUE dst, VALUE src)
Identical to rb_str_cat_cstr(), except it takes Ruby's string instead of C's.
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable with each other or not.
#define rb_strlen_lit(str)
Length of a string literal.
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
VALUE rb_str_intern(VALUE str)
Identical to rb_to_symbol(), except it assumes the receiver being an instance of RString.
VALUE rb_obj_as_string(VALUE obj)
Try converting an object to its stringised representation using its to_s method, if any.
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
ID rb_intern(const char *name)
Finds or creates a symbol of the given name.
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
ID rb_to_id(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
ID rb_intern_str(VALUE str)
Identical to rb_intern(), except it takes an instance of rb_cString.
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
VALUE rb_yield(VALUE val)
Yields the block.
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
#define ALLOCA_N(type, n)
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
#define RBASIC(obj)
Convenient casting macro.
#define DATA_PTR(obj)
Convenient getter macro.
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
@ RSTRING_EMBED_LEN_MAX
Max possible number of characters that can be embedded.
#define StringValue(v)
Ensures that the parameter object is a String.
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
static long RSTRING_EMBED_LEN(VALUE str)
Queries the length of the string.
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
static char * RSTRING_END(VALUE str)
Queries the end of the contents pointer of the string.
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
static long RSTRING_LEN(VALUE str)
Queries the length of the string.
#define RSTRING(obj)
Convenient casting macro.
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
static char * RSTRING_PTR(VALUE str)
Queries the contents pointer of the string.
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
VALUE flags
Per-object flags.
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
long capa
Capacity of *ptr.
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
char ary[RSTRING_EMBED_LEN_MAX+1]
When a string is short enough, it uses this area to store the contents themselves.
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
char * ptr
Pointer to the contents of the string.
This is the struct that holds necessary info for a struct.
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
uintptr_t VALUE
Type that represents a Ruby object.
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
static bool RB_TYPE_P(VALUE obj, enum ruby_value_type t)
Queries if the given object is of given type.