Ruby 3.2.1p31 (2023-02-08 revision 31819e82c88c6f8ecfaeb162519bfa26a14b21fd)
transcode.c
1/**********************************************************************
2
3 transcode.c -
4
5 $Author$
6 created at: Tue Oct 30 16:10:22 JST 2007
7
8 Copyright (C) 2007 Martin Duerst
9
10**********************************************************************/
11
12#include "ruby/internal/config.h"
13
14#include <ctype.h>
15
16#include "internal.h"
17#include "internal/array.h"
18#include "internal/inits.h"
19#include "internal/object.h"
20#include "internal/string.h"
21#include "internal/transcode.h"
22#include "ruby/encoding.h"
23
24#include "transcode_data.h"
25#include "id.h"
26
27#define ENABLE_ECONV_NEWLINE_OPTION 1
28
29/* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
30static VALUE rb_eUndefinedConversionError;
31static VALUE rb_eInvalidByteSequenceError;
32static VALUE rb_eConverterNotFoundError;
33
34VALUE rb_cEncodingConverter;
35
36static ID id_destination_encoding;
37static ID id_destination_encoding_name;
38static ID id_error_bytes;
39static ID id_error_char;
40static ID id_incomplete_input;
41static ID id_readagain_bytes;
42static ID id_source_encoding;
43static ID id_source_encoding_name;
44
45static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback;
46static VALUE sym_xml, sym_text, sym_attr;
47static VALUE sym_universal_newline;
48static VALUE sym_crlf_newline;
49static VALUE sym_cr_newline;
50static VALUE sym_lf_newline;
51#ifdef ENABLE_ECONV_NEWLINE_OPTION
52static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
53#endif
54static VALUE sym_partial_input;
55
56static VALUE sym_invalid_byte_sequence;
57static VALUE sym_undefined_conversion;
58static VALUE sym_destination_buffer_full;
59static VALUE sym_source_buffer_empty;
60static VALUE sym_finished;
61static VALUE sym_after_output;
62static VALUE sym_incomplete_input;
63
64static unsigned char *
65allocate_converted_string(const char *sname, const char *dname,
66 const unsigned char *str, size_t len,
67 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
68 size_t *dst_len_ptr);
69
70/* dynamic structure, one per conversion (similar to iconv_t) */
71/* may carry conversion state (e.g. for iso-2022-jp) */
72typedef struct rb_transcoding {
73 const rb_transcoder *transcoder;
74
75 int flags;
76
77 int resume_position;
78 unsigned int next_table;
79 VALUE next_info;
80 unsigned char next_byte;
81 unsigned int output_index;
82
83 ssize_t recognized_len; /* already interpreted */
84 ssize_t readagain_len; /* not yet interpreted */
85 union {
86 unsigned char ary[8]; /* max_input <= sizeof(ary) */
87 unsigned char *ptr; /* length: max_input */
88 } readbuf; /* recognized_len + readagain_len used */
89
90 ssize_t writebuf_off;
91 ssize_t writebuf_len;
92 union {
93 unsigned char ary[8]; /* max_output <= sizeof(ary) */
94 unsigned char *ptr; /* length: max_output */
95 } writebuf;
96
97 union rb_transcoding_state_t { /* opaque data for stateful encoding */
98 void *ptr;
99 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
100 double dummy_for_alignment;
101 } state;
103#define TRANSCODING_READBUF(tc) \
104 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
105 (tc)->readbuf.ary : \
106 (tc)->readbuf.ptr)
107#define TRANSCODING_WRITEBUF(tc) \
108 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
109 (tc)->writebuf.ary : \
110 (tc)->writebuf.ptr)
111#define TRANSCODING_WRITEBUF_SIZE(tc) \
112 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
113 sizeof((tc)->writebuf.ary) : \
114 (size_t)(tc)->transcoder->max_output)
115#define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
116#define TRANSCODING_STATE(tc) \
117 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
118 (tc)->state.ary : \
119 (tc)->state.ptr)
120
121typedef struct {
122 struct rb_transcoding *tc;
123 unsigned char *out_buf_start;
124 unsigned char *out_data_start;
125 unsigned char *out_data_end;
126 unsigned char *out_buf_end;
127 rb_econv_result_t last_result;
129
131 int flags;
132 int started; /* bool */
133
134 const char *source_encoding_name;
135 const char *destination_encoding_name;
136
137 const unsigned char *replacement_str;
138 size_t replacement_len;
139 const char *replacement_enc;
140
141 unsigned char *in_buf_start;
142 unsigned char *in_data_start;
143 unsigned char *in_data_end;
144 unsigned char *in_buf_end;
145 rb_econv_elem_t *elems;
146 int replacement_allocated; /* bool */
147 int num_allocated;
148 int num_trans;
149 int num_finished;
150 struct rb_transcoding *last_tc;
151
152 /* last error */
153 struct {
154 rb_econv_result_t result;
155 struct rb_transcoding *error_tc;
156 const char *source_encoding;
157 const char *destination_encoding;
158 const unsigned char *error_bytes_start;
159 size_t error_bytes_len;
160 size_t readagain_len;
161 } last_error;
162
163 /* The following fields are only for Encoding::Converter.
164 * rb_econv_open set them NULL. */
165 rb_encoding *source_encoding;
166 rb_encoding *destination_encoding;
167};
168
169/*
170 * Dispatch data and logic
171 */
172
173#define DECORATOR_P(sname, dname) (*(sname) == '\0')
174
175typedef struct {
176 const char *sname;
177 const char *dname;
178 const char *lib; /* null means no need to load a library */
179 const rb_transcoder *transcoder;
181
182static st_table *transcoder_table;
183
184static transcoder_entry_t *
185make_transcoder_entry(const char *sname, const char *dname)
186{
187 st_data_t val;
188 st_table *table2;
189
190 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
191 val = (st_data_t)st_init_strcasetable();
192 st_add_direct(transcoder_table, (st_data_t)sname, val);
193 }
194 table2 = (st_table *)val;
195 if (!st_lookup(table2, (st_data_t)dname, &val)) {
197 entry->sname = sname;
198 entry->dname = dname;
199 entry->lib = NULL;
200 entry->transcoder = NULL;
201 val = (st_data_t)entry;
202 st_add_direct(table2, (st_data_t)dname, val);
203 }
204 return (transcoder_entry_t *)val;
205}
206
207static transcoder_entry_t *
208get_transcoder_entry(const char *sname, const char *dname)
209{
210 st_data_t val;
211 st_table *table2;
212
213 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
214 return NULL;
215 }
216 table2 = (st_table *)val;
217 if (!st_lookup(table2, (st_data_t)dname, &val)) {
218 return NULL;
219 }
220 return (transcoder_entry_t *)val;
221}
222
223void
224rb_register_transcoder(const rb_transcoder *tr)
225{
226 const char *const sname = tr->src_encoding;
227 const char *const dname = tr->dst_encoding;
228
229 transcoder_entry_t *entry;
230
231 entry = make_transcoder_entry(sname, dname);
232 if (entry->transcoder) {
233 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
234 sname, dname);
235 }
236
237 entry->transcoder = tr;
238}
239
240static void
241declare_transcoder(const char *sname, const char *dname, const char *lib)
242{
243 transcoder_entry_t *entry;
244
245 entry = make_transcoder_entry(sname, dname);
246 entry->lib = lib;
247}
248
249static const char transcoder_lib_prefix[] = "enc/trans/";
250
251void
252rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
253{
254 if (!lib) {
255 rb_raise(rb_eArgError, "invalid library name - (null)");
256 }
257 declare_transcoder(enc1, enc2, lib);
258}
259
260#define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
261
/* Node of the singly linked work queue used by the conversion path search. */
typedef struct search_path_queue_tag {
    struct search_path_queue_tag *next;
    const char *enc;    /* encoding name still to be expanded */
} search_path_queue_t;
266
267typedef struct {
268 st_table *visited;
269 search_path_queue_t *queue;
270 search_path_queue_t **queue_last_ptr;
271 const char *base_enc;
273
274static int
275transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
276{
277 const char *dname = (const char *)key;
280
281 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
282 return ST_CONTINUE;
283 }
284
286 q->enc = dname;
287 q->next = NULL;
288 *bfs->queue_last_ptr = q;
289 bfs->queue_last_ptr = &q->next;
290
291 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
292 return ST_CONTINUE;
293}
294
295static int
296transcode_search_path(const char *sname, const char *dname,
297 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
298 void *arg)
299{
302 st_data_t val;
303 st_table *table2;
304 int found;
305 int pathlen = -1;
306
307 if (encoding_equal(sname, dname))
308 return -1;
309
311 q->enc = sname;
312 q->next = NULL;
313 bfs.queue_last_ptr = &q->next;
314 bfs.queue = q;
315
316 bfs.visited = st_init_strcasetable();
317 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
318
319 while (bfs.queue) {
320 q = bfs.queue;
321 bfs.queue = q->next;
322 if (!bfs.queue)
323 bfs.queue_last_ptr = &bfs.queue;
324
325 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
326 xfree(q);
327 continue;
328 }
329 table2 = (st_table *)val;
330
331 if (st_lookup(table2, (st_data_t)dname, &val)) {
332 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
333 xfree(q);
334 found = 1;
335 goto cleanup;
336 }
337
338 bfs.base_enc = q->enc;
339 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
340 bfs.base_enc = NULL;
341
342 xfree(q);
343 }
344 found = 0;
345
346 cleanup:
347 while (bfs.queue) {
348 q = bfs.queue;
349 bfs.queue = q->next;
350 xfree(q);
351 }
352
353 if (found) {
354 const char *enc = dname;
355 int depth;
356 pathlen = 0;
357 while (1) {
358 st_lookup(bfs.visited, (st_data_t)enc, &val);
359 if (!val)
360 break;
361 pathlen++;
362 enc = (const char *)val;
363 }
364 depth = pathlen;
365 enc = dname;
366 while (1) {
367 st_lookup(bfs.visited, (st_data_t)enc, &val);
368 if (!val)
369 break;
370 callback((const char *)val, enc, --depth, arg);
371 enc = (const char *)val;
372 }
373 }
374
375 st_free_table(bfs.visited);
376
377 return pathlen; /* is -1 if not found */
378}
379
380int rb_require_internal_silent(VALUE fname);
381
382static const rb_transcoder *
383load_transcoder_entry(transcoder_entry_t *entry)
384{
385 if (entry->transcoder)
386 return entry->transcoder;
387
388 if (entry->lib) {
389 const char *const lib = entry->lib;
390 const size_t len = strlen(lib);
391 const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
392 const VALUE fn = rb_str_new(0, total_len);
393 char *const path = RSTRING_PTR(fn);
394
395 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
396 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
397 rb_str_set_len(fn, total_len);
398 OBJ_FREEZE(fn);
399 rb_require_internal_silent(fn);
400 }
401
402 if (entry->transcoder)
403 return entry->transcoder;
404
405 return NULL;
406}
407
/*
 * Return the default replacement byte sequence for the encoding encname.
 *
 * For UTF-8 (compared case-insensitively) this is the three bytes
 * EF BF BD, expressed in "UTF-8"; for every other encoding it is "?",
 * expressed in "US-ASCII".  The length is stored in *len_ret and the name
 * of the encoding the bytes are written in is stored in *repl_encname_ptr.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
422
423/*
424 * Transcoding engine logic
425 */
426
427static const unsigned char *
428transcode_char_start(rb_transcoding *tc,
429 const unsigned char *in_start,
430 const unsigned char *inchar_start,
431 const unsigned char *in_p,
432 size_t *char_len_ptr)
433{
434 const unsigned char *ptr;
435 if (inchar_start - in_start < tc->recognized_len) {
436 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
437 inchar_start, unsigned char, in_p - inchar_start);
438 ptr = TRANSCODING_READBUF(tc);
439 }
440 else {
441 ptr = inchar_start - tc->recognized_len;
442 }
443 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
444 return ptr;
445}
446
448transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
449 const unsigned char *in_stop, unsigned char *out_stop,
450 rb_transcoding *tc,
451 const int opt)
452{
453 const rb_transcoder *tr = tc->transcoder;
454 int unitlen = tr->input_unit_length;
455 ssize_t readagain_len = 0;
456
457 const unsigned char *inchar_start;
458 const unsigned char *in_p;
459
460 unsigned char *out_p;
461
462 in_p = inchar_start = *in_pos;
463
464 out_p = *out_pos;
465
466#define SUSPEND(ret, num) \
467 do { \
468 tc->resume_position = (num); \
469 if (0 < in_p - inchar_start) \
470 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
471 inchar_start, unsigned char, in_p - inchar_start); \
472 *in_pos = in_p; \
473 *out_pos = out_p; \
474 tc->recognized_len += in_p - inchar_start; \
475 if (readagain_len) { \
476 tc->recognized_len -= readagain_len; \
477 tc->readagain_len = readagain_len; \
478 } \
479 return (ret); \
480 resume_label ## num:; \
481 } while (0)
482#define SUSPEND_OBUF(num) \
483 do { \
484 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
485 } while (0)
486
487#define SUSPEND_AFTER_OUTPUT(num) \
488 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
489 SUSPEND(econv_after_output, num); \
490 }
491
492#define next_table (tc->next_table)
493#define next_info (tc->next_info)
494#define next_byte (tc->next_byte)
495#define writebuf_len (tc->writebuf_len)
496#define writebuf_off (tc->writebuf_off)
497
498 switch (tc->resume_position) {
499 case 0: break;
500 case 1: goto resume_label1;
501 case 2: goto resume_label2;
502 case 3: goto resume_label3;
503 case 4: goto resume_label4;
504 case 5: goto resume_label5;
505 case 6: goto resume_label6;
506 case 7: goto resume_label7;
507 case 8: goto resume_label8;
508 case 9: goto resume_label9;
509 case 10: goto resume_label10;
510 case 11: goto resume_label11;
511 case 12: goto resume_label12;
512 case 13: goto resume_label13;
513 case 14: goto resume_label14;
514 case 15: goto resume_label15;
515 case 16: goto resume_label16;
516 case 17: goto resume_label17;
517 case 18: goto resume_label18;
518 case 19: goto resume_label19;
519 case 20: goto resume_label20;
520 case 21: goto resume_label21;
521 case 22: goto resume_label22;
522 case 23: goto resume_label23;
523 case 24: goto resume_label24;
524 case 25: goto resume_label25;
525 case 26: goto resume_label26;
526 case 27: goto resume_label27;
527 case 28: goto resume_label28;
528 case 29: goto resume_label29;
529 case 30: goto resume_label30;
530 case 31: goto resume_label31;
531 case 32: goto resume_label32;
532 case 33: goto resume_label33;
533 case 34: goto resume_label34;
534 }
535
536 while (1) {
537 inchar_start = in_p;
538 tc->recognized_len = 0;
539 next_table = tr->conv_tree_start;
540
541 SUSPEND_AFTER_OUTPUT(24);
542
543 if (in_stop <= in_p) {
544 if (!(opt & ECONV_PARTIAL_INPUT))
545 break;
546 SUSPEND(econv_source_buffer_empty, 7);
547 continue;
548 }
549
550#define BYTE_ADDR(index) (tr->byte_array + (index))
551#define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
552#define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
553#define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
554#define BL_MIN_BYTE (BL_BASE[0])
555#define BL_MAX_BYTE (BL_BASE[1])
556#define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
557#define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
558
559 next_byte = (unsigned char)*in_p++;
560 follow_byte:
561 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
562 next_info = INVALID;
563 else {
564 next_info = (VALUE)BL_ACTION(next_byte);
565 }
566 follow_info:
567 switch (next_info & 0x1F) {
568 case NOMAP:
569 {
570 const unsigned char *p = inchar_start;
571 writebuf_off = 0;
572 while (p < in_p) {
573 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
574 }
575 writebuf_len = writebuf_off;
576 writebuf_off = 0;
577 while (writebuf_off < writebuf_len) {
578 SUSPEND_OBUF(3);
579 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
580 }
581 }
582 continue;
583 case 0x00: case 0x04: case 0x08: case 0x0C:
584 case 0x10: case 0x14: case 0x18: case 0x1C:
585 SUSPEND_AFTER_OUTPUT(25);
586 while (in_p >= in_stop) {
587 if (!(opt & ECONV_PARTIAL_INPUT))
588 goto incomplete;
589 SUSPEND(econv_source_buffer_empty, 5);
590 }
591 next_byte = (unsigned char)*in_p++;
592 next_table = (unsigned int)next_info;
593 goto follow_byte;
594 case ZERObt: /* drop input */
595 continue;
596 case ONEbt:
597 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
598 continue;
599 case TWObt:
600 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
601 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
602 continue;
603 case THREEbt:
604 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
605 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
606 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
607 continue;
608 case FOURbt:
609 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
610 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
611 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
612 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
613 continue;
614 case GB4bt:
615 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
616 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
617 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
618 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
619 continue;
620 case STR1:
621 tc->output_index = 0;
622 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
623 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
624 tc->output_index++;
625 }
626 continue;
627 case FUNii:
628 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
629 goto follow_info;
630 case FUNsi:
631 {
632 const unsigned char *char_start;
633 size_t char_len;
634 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
635 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
636 goto follow_info;
637 }
638 case FUNio:
639 SUSPEND_OBUF(13);
640 if (tr->max_output <= out_stop - out_p)
641 out_p += tr->func_io(TRANSCODING_STATE(tc),
642 next_info, out_p, out_stop - out_p);
643 else {
644 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
645 next_info,
646 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
647 writebuf_off = 0;
648 while (writebuf_off < writebuf_len) {
649 SUSPEND_OBUF(20);
650 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
651 }
652 }
653 break;
654 case FUNso:
655 {
656 const unsigned char *char_start;
657 size_t char_len;
658 SUSPEND_OBUF(14);
659 if (tr->max_output <= out_stop - out_p) {
660 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
661 out_p += tr->func_so(TRANSCODING_STATE(tc),
662 char_start, (size_t)char_len,
663 out_p, out_stop - out_p);
664 }
665 else {
666 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
667 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
668 char_start, (size_t)char_len,
669 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
670 writebuf_off = 0;
671 while (writebuf_off < writebuf_len) {
672 SUSPEND_OBUF(22);
673 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
674 }
675 }
676 break;
677 }
678 case FUNsio:
679 {
680 const unsigned char *char_start;
681 size_t char_len;
682 SUSPEND_OBUF(33);
683 if (tr->max_output <= out_stop - out_p) {
684 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
685 out_p += tr->func_sio(TRANSCODING_STATE(tc),
686 char_start, (size_t)char_len, next_info,
687 out_p, out_stop - out_p);
688 }
689 else {
690 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
691 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
692 char_start, (size_t)char_len, next_info,
693 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
694 writebuf_off = 0;
695 while (writebuf_off < writebuf_len) {
696 SUSPEND_OBUF(34);
697 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
698 }
699 }
700 break;
701 }
702 case INVALID:
703 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
704 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
705 SUSPEND_AFTER_OUTPUT(26);
706 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
707 in_p = in_stop;
708 SUSPEND(econv_source_buffer_empty, 8);
709 }
710 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
711 in_p = in_stop;
712 }
713 else {
714 in_p = inchar_start + (unitlen - tc->recognized_len);
715 }
716 }
717 else {
718 ssize_t invalid_len; /* including the last byte which causes invalid */
719 ssize_t discard_len;
720 invalid_len = tc->recognized_len + (in_p - inchar_start);
721 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
722 readagain_len = invalid_len - discard_len;
723 }
724 goto invalid;
725 case UNDEF:
726 goto undef;
727 default:
728 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
729 }
730 continue;
731
732 invalid:
733 SUSPEND(econv_invalid_byte_sequence, 1);
734 continue;
735
736 incomplete:
737 SUSPEND(econv_incomplete_input, 27);
738 continue;
739
740 undef:
741 SUSPEND(econv_undefined_conversion, 2);
742 continue;
743 }
744
745 /* cleanup */
746 if (tr->finish_func) {
747 SUSPEND_OBUF(4);
748 if (tr->max_output <= out_stop - out_p) {
749 out_p += tr->finish_func(TRANSCODING_STATE(tc),
750 out_p, out_stop - out_p);
751 }
752 else {
753 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
754 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
755 writebuf_off = 0;
756 while (writebuf_off < writebuf_len) {
757 SUSPEND_OBUF(23);
758 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
759 }
760 }
761 }
762 while (1)
763 SUSPEND(econv_finished, 6);
764#undef SUSPEND
765#undef next_table
766#undef next_info
767#undef next_byte
768#undef writebuf_len
769#undef writebuf_off
770}
771
773transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
774 const unsigned char *in_stop, unsigned char *out_stop,
775 rb_transcoding *tc,
776 const int opt)
777{
778 if (tc->readagain_len) {
779 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
780 const unsigned char *readagain_pos = readagain_buf;
781 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
783
784 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
785 unsigned char, tc->readagain_len);
786 tc->readagain_len = 0;
787 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
788 if (res != econv_source_buffer_empty) {
789 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
790 readagain_pos, unsigned char, readagain_stop - readagain_pos);
791 tc->readagain_len += readagain_stop - readagain_pos;
792 return res;
793 }
794 }
795 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
796}
797
798static rb_transcoding *
799rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
800{
801 rb_transcoding *tc;
802
803 tc = ALLOC(rb_transcoding);
804 tc->transcoder = tr;
805 tc->flags = flags;
806 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
807 tc->state.ptr = xmalloc(tr->state_size);
808 if (tr->state_init_func) {
809 (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
810 }
811 tc->resume_position = 0;
812 tc->recognized_len = 0;
813 tc->readagain_len = 0;
814 tc->writebuf_len = 0;
815 tc->writebuf_off = 0;
816 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
817 tc->readbuf.ptr = xmalloc(tr->max_input);
818 }
819 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
820 tc->writebuf.ptr = xmalloc(tr->max_output);
821 }
822 return tc;
823}
824
826rb_transcoding_convert(rb_transcoding *tc,
827 const unsigned char **input_ptr, const unsigned char *input_stop,
828 unsigned char **output_ptr, unsigned char *output_stop,
829 int flags)
830{
831 return transcode_restartable(
832 input_ptr, output_ptr,
833 input_stop, output_stop,
834 tc, flags);
835}
836
837static void
838rb_transcoding_close(rb_transcoding *tc)
839{
840 const rb_transcoder *tr = tc->transcoder;
841 if (tr->state_fini_func) {
842 (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
843 }
844 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
845 xfree(tc->state.ptr);
846 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
847 xfree(tc->readbuf.ptr);
848 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
849 xfree(tc->writebuf.ptr);
850 xfree(tc);
851}
852
853static size_t
854rb_transcoding_memsize(rb_transcoding *tc)
855{
856 size_t size = sizeof(rb_transcoding);
857 const rb_transcoder *tr = tc->transcoder;
858
859 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
860 size += tr->state_size;
861 }
862 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
863 size += tr->max_input;
864 }
865 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
866 size += tr->max_output;
867 }
868 return size;
869}
870
871static rb_econv_t *
872rb_econv_alloc(int n_hint)
873{
874 rb_econv_t *ec;
875
876 if (n_hint <= 0)
877 n_hint = 1;
878
879 ec = ALLOC(rb_econv_t);
880 ec->flags = 0;
881 ec->source_encoding_name = NULL;
882 ec->destination_encoding_name = NULL;
883 ec->started = 0;
884 ec->replacement_str = NULL;
885 ec->replacement_len = 0;
886 ec->replacement_enc = NULL;
887 ec->replacement_allocated = 0;
888 ec->in_buf_start = NULL;
889 ec->in_data_start = NULL;
890 ec->in_data_end = NULL;
891 ec->in_buf_end = NULL;
892 ec->num_allocated = n_hint;
893 ec->num_trans = 0;
894 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
895 ec->num_finished = 0;
896 ec->last_tc = NULL;
897 ec->last_error.result = econv_source_buffer_empty;
898 ec->last_error.error_tc = NULL;
899 ec->last_error.source_encoding = NULL;
900 ec->last_error.destination_encoding = NULL;
901 ec->last_error.error_bytes_start = NULL;
902 ec->last_error.error_bytes_len = 0;
903 ec->last_error.readagain_len = 0;
904 ec->source_encoding = NULL;
905 ec->destination_encoding = NULL;
906 return ec;
907}
908
909static int
910rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
911{
912 int n, j;
913 int bufsize = 4096;
914 unsigned char *p;
915
916 if (ec->num_trans == ec->num_allocated) {
917 n = ec->num_allocated * 2;
918 REALLOC_N(ec->elems, rb_econv_elem_t, n);
919 ec->num_allocated = n;
920 }
921
922 p = xmalloc(bufsize);
923
924 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
925
926 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
927 ec->elems[i].out_buf_start = p;
928 ec->elems[i].out_buf_end = p + bufsize;
929 ec->elems[i].out_data_start = p;
930 ec->elems[i].out_data_end = p;
931 ec->elems[i].last_result = econv_source_buffer_empty;
932
933 ec->num_trans++;
934
935 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
936 for (j = ec->num_trans-1; i <= j; j--) {
937 rb_transcoding *tc = ec->elems[j].tc;
938 const rb_transcoder *tr2 = tc->transcoder;
939 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
940 ec->last_tc = tc;
941 break;
942 }
943 }
944
945 return 0;
946}
947
948static rb_econv_t *
949rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
950{
951 rb_econv_t *ec;
952 int i, ret;
953
954 for (i = 0; i < n; i++) {
955 const rb_transcoder *tr;
956 tr = load_transcoder_entry(entries[i]);
957 if (!tr)
958 return NULL;
959 }
960
961 ec = rb_econv_alloc(n);
962
963 for (i = 0; i < n; i++) {
964 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
965 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
966 if (ret == -1) {
967 rb_econv_close(ec);
968 return NULL;
969 }
970 }
971
972 return ec;
973}
974
976 transcoder_entry_t **entries;
977 int num_additional;
978};
979
980static void
981trans_open_i(const char *sname, const char *dname, int depth, void *arg)
982{
983 struct trans_open_t *toarg = arg;
984
985 if (!toarg->entries) {
986 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
987 }
988 toarg->entries[depth] = get_transcoder_entry(sname, dname);
989}
990
991static rb_econv_t *
992rb_econv_open0(const char *sname, const char *dname, int ecflags)
993{
994 transcoder_entry_t **entries = NULL;
995 int num_trans;
996 rb_econv_t *ec;
997
998 /* Just check if sname and dname are defined */
999 /* (This check is needed?) */
1000 if (*sname) rb_enc_find_index(sname);
1001 if (*dname) rb_enc_find_index(dname);
1002
1003 if (*sname == '\0' && *dname == '\0') {
1004 num_trans = 0;
1005 entries = NULL;
1006 sname = dname = "";
1007 }
1008 else {
1009 struct trans_open_t toarg;
1010 toarg.entries = NULL;
1011 toarg.num_additional = 0;
1012 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1013 entries = toarg.entries;
1014 if (num_trans < 0) {
1015 xfree(entries);
1016 return NULL;
1017 }
1018 }
1019
1020 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1021 xfree(entries);
1022 if (!ec)
1023 return NULL;
1024
1025 ec->flags = ecflags;
1026 ec->source_encoding_name = sname;
1027 ec->destination_encoding_name = dname;
1028
1029 return ec;
1030}
1031
1032#define MAX_ECFLAGS_DECORATORS 32
1033
1034static int
1035decorator_names(int ecflags, const char **decorators_ret)
1036{
1037 int num_decorators;
1038
1039 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1044 case 0:
1045 break;
1046 default:
1047 return -1;
1048 }
1049
1050 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1052 return -1;
1053
1054 num_decorators = 0;
1055
1056 if (ecflags & ECONV_XML_TEXT_DECORATOR)
1057 decorators_ret[num_decorators++] = "xml_text_escape";
1059 decorators_ret[num_decorators++] = "xml_attr_content_escape";
1060 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1061 decorators_ret[num_decorators++] = "xml_attr_quote";
1062
1063 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1064 decorators_ret[num_decorators++] = "crlf_newline";
1065 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1066 decorators_ret[num_decorators++] = "cr_newline";
1067 if (ecflags & ECONV_LF_NEWLINE_DECORATOR)
1068 decorators_ret[num_decorators++] = "lf_newline";
1070 decorators_ret[num_decorators++] = "universal_newline";
1071
1072 return num_decorators;
1073}
1074
1075rb_econv_t *
1076rb_econv_open(const char *sname, const char *dname, int ecflags)
1077{
1078 rb_econv_t *ec;
1079 int num_decorators;
1080 const char *decorators[MAX_ECFLAGS_DECORATORS];
1081 int i;
1082
1083 num_decorators = decorator_names(ecflags, decorators);
1084 if (num_decorators == -1)
1085 return NULL;
1086
1087 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1088 if (!ec)
1089 return NULL;
1090
1091 for (i = 0; i < num_decorators; i++)
1092 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1093 rb_econv_close(ec);
1094 return NULL;
1095 }
1096
1097 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1098
1099 return ec;
1100}
1101
1102static int
1103trans_sweep(rb_econv_t *ec,
1104 const unsigned char **input_ptr, const unsigned char *input_stop,
1105 unsigned char **output_ptr, unsigned char *output_stop,
1106 int flags,
1107 int start)
1108{
1109 int try;
1110 int i, f;
1111
1112 const unsigned char **ipp, *is, *iold;
1113 unsigned char **opp, *os, *oold;
1115
1116 try = 1;
1117 while (try) {
1118 try = 0;
1119 for (i = start; i < ec->num_trans; i++) {
1120 rb_econv_elem_t *te = &ec->elems[i];
1121
1122 if (i == 0) {
1123 ipp = input_ptr;
1124 is = input_stop;
1125 }
1126 else {
1127 rb_econv_elem_t *prev_te = &ec->elems[i-1];
1128 ipp = (const unsigned char **)&prev_te->out_data_start;
1129 is = prev_te->out_data_end;
1130 }
1131
1132 if (i == ec->num_trans-1) {
1133 opp = output_ptr;
1134 os = output_stop;
1135 }
1136 else {
1137 if (te->out_buf_start != te->out_data_start) {
1138 ssize_t len = te->out_data_end - te->out_data_start;
1139 ssize_t off = te->out_data_start - te->out_buf_start;
1140 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1141 te->out_data_start = te->out_buf_start;
1142 te->out_data_end -= off;
1143 }
1144 opp = &te->out_data_end;
1145 os = te->out_buf_end;
1146 }
1147
1148 f = flags;
1149 if (ec->num_finished != i)
1151 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1152 start = 1;
1153 flags &= ~ECONV_AFTER_OUTPUT;
1154 }
1155 if (i != 0)
1156 f &= ~ECONV_AFTER_OUTPUT;
1157 iold = *ipp;
1158 oold = *opp;
1159 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1160 if (iold != *ipp || oold != *opp)
1161 try = 1;
1162
1163 switch (res) {
1167 case econv_after_output:
1168 return i;
1169
1172 break;
1173
1174 case econv_finished:
1175 ec->num_finished = i+1;
1176 break;
1177 }
1178 }
1179 }
1180 return -1;
1181}
1182
1183static rb_econv_result_t
1184rb_trans_conv(rb_econv_t *ec,
1185 const unsigned char **input_ptr, const unsigned char *input_stop,
1186 unsigned char **output_ptr, unsigned char *output_stop,
1187 int flags,
1188 int *result_position_ptr)
1189{
1190 int i;
1191 int needreport_index;
1192 int sweep_start;
1193
1194 unsigned char empty_buf;
1195 unsigned char *empty_ptr = &empty_buf;
1196
1197 if (!input_ptr) {
1198 input_ptr = (const unsigned char **)&empty_ptr;
1199 input_stop = empty_ptr;
1200 }
1201
1202 if (!output_ptr) {
1203 output_ptr = &empty_ptr;
1204 output_stop = empty_ptr;
1205 }
1206
1207 if (ec->elems[0].last_result == econv_after_output)
1208 ec->elems[0].last_result = econv_source_buffer_empty;
1209
1210 for (i = ec->num_trans-1; 0 <= i; i--) {
1211 switch (ec->elems[i].last_result) {
1215 case econv_after_output:
1216 case econv_finished:
1217 sweep_start = i+1;
1218 goto found_needreport;
1219
1222 break;
1223
1224 default:
1225 rb_bug("unexpected transcode last result");
1226 }
1227 }
1228
1229 /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1230
1231 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
1232 (flags & ECONV_AFTER_OUTPUT)) {
1234
1235 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1237 result_position_ptr);
1238
1239 if (res == econv_source_buffer_empty)
1240 return econv_after_output;
1241 return res;
1242 }
1243
1244 sweep_start = 0;
1245
1246 found_needreport:
1247
1248 do {
1249 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1250 sweep_start = needreport_index + 1;
1251 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1252
1253 for (i = ec->num_trans-1; 0 <= i; i--) {
1254 if (ec->elems[i].last_result != econv_source_buffer_empty) {
1255 rb_econv_result_t res = ec->elems[i].last_result;
1256 if (res == econv_invalid_byte_sequence ||
1257 res == econv_incomplete_input ||
1259 res == econv_after_output) {
1260 ec->elems[i].last_result = econv_source_buffer_empty;
1261 }
1262 if (result_position_ptr)
1263 *result_position_ptr = i;
1264 return res;
1265 }
1266 }
1267 if (result_position_ptr)
1268 *result_position_ptr = -1;
1270}
1271
1272static rb_econv_result_t
1273rb_econv_convert0(rb_econv_t *ec,
1274 const unsigned char **input_ptr, const unsigned char *input_stop,
1275 unsigned char **output_ptr, unsigned char *output_stop,
1276 int flags)
1277{
1279 int result_position;
1280 int has_output = 0;
1281
1282 memset(&ec->last_error, 0, sizeof(ec->last_error));
1283
1284 if (ec->num_trans == 0) {
1285 size_t len;
1286 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1287 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1288 len = output_stop - *output_ptr;
1289 memcpy(*output_ptr, ec->in_data_start, len);
1290 *output_ptr = output_stop;
1291 ec->in_data_start += len;
1293 goto gotresult;
1294 }
1295 len = ec->in_data_end - ec->in_data_start;
1296 memcpy(*output_ptr, ec->in_data_start, len);
1297 *output_ptr += len;
1298 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1299 if (flags & ECONV_AFTER_OUTPUT) {
1300 res = econv_after_output;
1301 goto gotresult;
1302 }
1303 }
1304 if (output_stop - *output_ptr < input_stop - *input_ptr) {
1305 len = output_stop - *output_ptr;
1306 }
1307 else {
1308 len = input_stop - *input_ptr;
1309 }
1310 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1311 *(*output_ptr)++ = *(*input_ptr)++;
1312 res = econv_after_output;
1313 goto gotresult;
1314 }
1315 memcpy(*output_ptr, *input_ptr, len);
1316 *output_ptr += len;
1317 *input_ptr += len;
1318 if (*input_ptr != input_stop)
1320 else if (flags & ECONV_PARTIAL_INPUT)
1322 else
1323 res = econv_finished;
1324 goto gotresult;
1325 }
1326
1327 if (ec->elems[ec->num_trans-1].out_data_start) {
1328 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1329 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1330 if (data_start != data_end) {
1331 size_t len;
1332 if (output_stop - *output_ptr < data_end - data_start) {
1333 len = output_stop - *output_ptr;
1334 memcpy(*output_ptr, data_start, len);
1335 *output_ptr = output_stop;
1336 ec->elems[ec->num_trans-1].out_data_start += len;
1338 goto gotresult;
1339 }
1340 len = data_end - data_start;
1341 memcpy(*output_ptr, data_start, len);
1342 *output_ptr += len;
1343 ec->elems[ec->num_trans-1].out_data_start =
1344 ec->elems[ec->num_trans-1].out_data_end =
1345 ec->elems[ec->num_trans-1].out_buf_start;
1346 has_output = 1;
1347 }
1348 }
1349
1350 if (ec->in_buf_start &&
1351 ec->in_data_start != ec->in_data_end) {
1352 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1353 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1354 if (res != econv_source_buffer_empty)
1355 goto gotresult;
1356 }
1357
1358 if (has_output &&
1359 (flags & ECONV_AFTER_OUTPUT) &&
1360 *input_ptr != input_stop) {
1361 input_stop = *input_ptr;
1362 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1363 if (res == econv_source_buffer_empty)
1364 res = econv_after_output;
1365 }
1366 else if ((flags & ECONV_AFTER_OUTPUT) ||
1367 ec->num_trans == 1) {
1368 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1369 }
1370 else {
1371 flags |= ECONV_AFTER_OUTPUT;
1372 do {
1373 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1374 } while (res == econv_after_output);
1375 }
1376
1377 gotresult:
1378 ec->last_error.result = res;
1379 if (res == econv_invalid_byte_sequence ||
1380 res == econv_incomplete_input ||
1382 rb_transcoding *error_tc = ec->elems[result_position].tc;
1383 ec->last_error.error_tc = error_tc;
1384 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
1385 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
1386 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
1387 ec->last_error.error_bytes_len = error_tc->recognized_len;
1388 ec->last_error.readagain_len = error_tc->readagain_len;
1389 }
1390
1391 return res;
1392}
1393
1394static int output_replacement_character(rb_econv_t *ec);
1395
1396static int
1397output_hex_charref(rb_econv_t *ec)
1398{
1399 int ret;
1400 unsigned char utfbuf[1024];
1401 const unsigned char *utf;
1402 size_t utf_len;
1403 int utf_allocated = 0;
1404 char charef_buf[16];
1405 const unsigned char *p;
1406
1407 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1408 utf = ec->last_error.error_bytes_start;
1409 utf_len = ec->last_error.error_bytes_len;
1410 }
1411 else {
1412 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
1413 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
1414 utfbuf, sizeof(utfbuf),
1415 &utf_len);
1416 if (!utf)
1417 return -1;
1418 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1419 utf_allocated = 1;
1420 }
1421
1422 if (utf_len % 4 != 0)
1423 goto fail;
1424
1425 p = utf;
1426 while (4 <= utf_len) {
1427 unsigned int u = 0;
1428 u += p[0] << 24;
1429 u += p[1] << 16;
1430 u += p[2] << 8;
1431 u += p[3];
1432 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1433
1434 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1435 if (ret == -1)
1436 goto fail;
1437
1438 p += 4;
1439 utf_len -= 4;
1440 }
1441
1442 if (utf_allocated)
1443 xfree((void *)utf);
1444 return 0;
1445
1446 fail:
1447 if (utf_allocated)
1448 xfree((void *)utf);
1449 return -1;
1450}
1451
1454 const unsigned char **input_ptr, const unsigned char *input_stop,
1455 unsigned char **output_ptr, unsigned char *output_stop,
1456 int flags)
1457{
1459
1460 unsigned char empty_buf;
1461 unsigned char *empty_ptr = &empty_buf;
1462
1463 ec->started = 1;
1464
1465 if (!input_ptr) {
1466 input_ptr = (const unsigned char **)&empty_ptr;
1467 input_stop = empty_ptr;
1468 }
1469
1470 if (!output_ptr) {
1471 output_ptr = &empty_ptr;
1472 output_stop = empty_ptr;
1473 }
1474
1475 resume:
1476 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1477
1478 if (ret == econv_invalid_byte_sequence ||
1479 ret == econv_incomplete_input) {
1480 /* deal with invalid byte sequence */
1481 /* todo: add more alternative behaviors */
1482 switch (ec->flags & ECONV_INVALID_MASK) {
1484 if (output_replacement_character(ec) == 0)
1485 goto resume;
1486 }
1487 }
1488
1489 if (ret == econv_undefined_conversion) {
1490 /* valid character in source encoding
1491 * but no related character(s) in destination encoding */
1492 /* todo: add more alternative behaviors */
1493 switch (ec->flags & ECONV_UNDEF_MASK) {
1495 if (output_replacement_character(ec) == 0)
1496 goto resume;
1497 break;
1498
1500 if (output_hex_charref(ec) == 0)
1501 goto resume;
1502 break;
1503 }
1504 }
1505
1506 return ret;
1507}
1508
1509const char *
1511{
1512 rb_transcoding *tc = ec->last_tc;
1513 const rb_transcoder *tr;
1514
1515 if (tc == NULL)
1516 return "";
1517
1518 tr = tc->transcoder;
1519
1520 if (tr->asciicompat_type == asciicompat_encoder)
1521 return tr->src_encoding;
1522 return tr->dst_encoding;
1523}
1524
1525static unsigned char *
1526allocate_converted_string(const char *sname, const char *dname,
1527 const unsigned char *str, size_t len,
1528 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1529 size_t *dst_len_ptr)
1530{
1531 unsigned char *dst_str;
1532 size_t dst_len;
1533 size_t dst_bufsize;
1534
1535 rb_econv_t *ec;
1537
1538 const unsigned char *sp;
1539 unsigned char *dp;
1540
1541 if (caller_dst_buf)
1542 dst_bufsize = caller_dst_bufsize;
1543 else if (len == 0)
1544 dst_bufsize = 1;
1545 else
1546 dst_bufsize = len;
1547
1548 ec = rb_econv_open(sname, dname, 0);
1549 if (ec == NULL)
1550 return NULL;
1551 if (caller_dst_buf)
1552 dst_str = caller_dst_buf;
1553 else
1554 dst_str = xmalloc(dst_bufsize);
1555 dst_len = 0;
1556 sp = str;
1557 dp = dst_str+dst_len;
1558 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1559 dst_len = dp - dst_str;
1560 while (res == econv_destination_buffer_full) {
1561 if (SIZE_MAX/2 < dst_bufsize) {
1562 goto fail;
1563 }
1564 dst_bufsize *= 2;
1565 if (dst_str == caller_dst_buf) {
1566 unsigned char *tmp;
1567 tmp = xmalloc(dst_bufsize);
1568 memcpy(tmp, dst_str, dst_bufsize/2);
1569 dst_str = tmp;
1570 }
1571 else {
1572 dst_str = xrealloc(dst_str, dst_bufsize);
1573 }
1574 dp = dst_str+dst_len;
1575 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1576 dst_len = dp - dst_str;
1577 }
1578 if (res != econv_finished) {
1579 goto fail;
1580 }
1581 rb_econv_close(ec);
1582 *dst_len_ptr = dst_len;
1583 return dst_str;
1584
1585 fail:
1586 if (dst_str != caller_dst_buf)
1587 xfree(dst_str);
1588 rb_econv_close(ec);
1589 return NULL;
1590}
1591
1592/* result: 0:success -1:failure */
1593int
1595 const unsigned char *str, size_t len, const char *str_encoding)
1596{
1597 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1598 unsigned char insert_buf[4096];
1599 const unsigned char *insert_str = NULL;
1600 size_t insert_len;
1601
1602 int last_trans_index;
1603 rb_transcoding *tc;
1604
1605 unsigned char **buf_start_p;
1606 unsigned char **data_start_p;
1607 unsigned char **data_end_p;
1608 unsigned char **buf_end_p;
1609
1610 size_t need;
1611
1612 ec->started = 1;
1613
1614 if (len == 0)
1615 return 0;
1616
1617 if (encoding_equal(insert_encoding, str_encoding)) {
1618 insert_str = str;
1619 insert_len = len;
1620 }
1621 else {
1622 insert_str = allocate_converted_string(str_encoding, insert_encoding,
1623 str, len, insert_buf, sizeof(insert_buf), &insert_len);
1624 if (insert_str == NULL)
1625 return -1;
1626 }
1627
1628 need = insert_len;
1629
1630 last_trans_index = ec->num_trans-1;
1631 if (ec->num_trans == 0) {
1632 tc = NULL;
1633 buf_start_p = &ec->in_buf_start;
1634 data_start_p = &ec->in_data_start;
1635 data_end_p = &ec->in_data_end;
1636 buf_end_p = &ec->in_buf_end;
1637 }
1638 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1639 tc = ec->elems[last_trans_index].tc;
1640 need += tc->readagain_len;
1641 if (need < insert_len)
1642 goto fail;
1643 if (last_trans_index == 0) {
1644 buf_start_p = &ec->in_buf_start;
1645 data_start_p = &ec->in_data_start;
1646 data_end_p = &ec->in_data_end;
1647 buf_end_p = &ec->in_buf_end;
1648 }
1649 else {
1650 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1651 buf_start_p = &ee->out_buf_start;
1652 data_start_p = &ee->out_data_start;
1653 data_end_p = &ee->out_data_end;
1654 buf_end_p = &ee->out_buf_end;
1655 }
1656 }
1657 else {
1658 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1659 buf_start_p = &ee->out_buf_start;
1660 data_start_p = &ee->out_data_start;
1661 data_end_p = &ee->out_data_end;
1662 buf_end_p = &ee->out_buf_end;
1663 tc = ec->elems[last_trans_index].tc;
1664 }
1665
1666 if (*buf_start_p == NULL) {
1667 unsigned char *buf = xmalloc(need);
1668 *buf_start_p = buf;
1669 *data_start_p = buf;
1670 *data_end_p = buf;
1671 *buf_end_p = buf+need;
1672 }
1673 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1674 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1675 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1676 *data_start_p = *buf_start_p;
1677 if ((size_t)(*buf_end_p - *data_end_p) < need) {
1678 unsigned char *buf;
1679 size_t s = (*data_end_p - *buf_start_p) + need;
1680 if (s < need)
1681 goto fail;
1682 buf = xrealloc(*buf_start_p, s);
1683 *data_start_p = buf;
1684 *data_end_p = buf + (*data_end_p - *buf_start_p);
1685 *buf_start_p = buf;
1686 *buf_end_p = buf + s;
1687 }
1688 }
1689
1690 memcpy(*data_end_p, insert_str, insert_len);
1691 *data_end_p += insert_len;
1692 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1693 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1694 *data_end_p += tc->readagain_len;
1695 tc->readagain_len = 0;
1696 }
1697
1698 if (insert_str != str && insert_str != insert_buf)
1699 xfree((void*)insert_str);
1700 return 0;
1701
1702 fail:
1703 if (insert_str != str && insert_str != insert_buf)
1704 xfree((void*)insert_str);
1705 return -1;
1706}
1707
1708void
1710{
1711 int i;
1712
1713 if (ec->replacement_allocated) {
1714 xfree((void *)ec->replacement_str);
1715 }
1716 for (i = 0; i < ec->num_trans; i++) {
1717 rb_transcoding_close(ec->elems[i].tc);
1718 if (ec->elems[i].out_buf_start)
1719 xfree(ec->elems[i].out_buf_start);
1720 }
1721 xfree(ec->in_buf_start);
1722 xfree(ec->elems);
1723 xfree(ec);
1724}
1725
1726size_t
1727rb_econv_memsize(rb_econv_t *ec)
1728{
1729 size_t size = sizeof(rb_econv_t);
1730 int i;
1731
1732 if (ec->replacement_allocated) {
1733 size += ec->replacement_len;
1734 }
1735 for (i = 0; i < ec->num_trans; i++) {
1736 size += rb_transcoding_memsize(ec->elems[i].tc);
1737
1738 if (ec->elems[i].out_buf_start) {
1739 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1740 }
1741 }
1742 size += ec->in_buf_end - ec->in_buf_start;
1743 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1744
1745 return size;
1746}
1747
1748int
1750{
1751 if (ec->num_trans == 0)
1752 return 0;
1753#if SIZEOF_SIZE_T > SIZEOF_INT
1754 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1755#endif
1756 return (int)ec->elems[0].tc->readagain_len;
1757}
1758
1759void
1760rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1761{
1762 rb_transcoding *tc;
1763 if (ec->num_trans == 0 || n == 0)
1764 return;
1765 tc = ec->elems[0].tc;
1766 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1767 tc->readagain_len -= n;
1768}
1769
/* Argument record passed to asciicompat_encoding_i through st_foreach. */
struct asciicompat_encoding_t {
    const char *ascii_compat_name;
    const char *ascii_incompat_name;
};
1774
1775static int
1776asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
1777{
1778 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1779 transcoder_entry_t *entry = (transcoder_entry_t *)val;
1780 const rb_transcoder *tr;
1781
1782 if (DECORATOR_P(entry->sname, entry->dname))
1783 return ST_CONTINUE;
1784 tr = load_transcoder_entry(entry);
1785 if (tr && tr->asciicompat_type == asciicompat_decoder) {
1786 data->ascii_compat_name = tr->dst_encoding;
1787 return ST_STOP;
1788 }
1789 return ST_CONTINUE;
1790}
1791
1792const char *
1793rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
1794{
1795 st_data_t v;
1796 st_table *table2;
1797 struct asciicompat_encoding_t data;
1798
1799 if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1800 return NULL;
1801 table2 = (st_table *)v;
1802
1803 /*
1804 * Assumption:
1805 * There is at most one transcoder for
1806 * converting from ASCII incompatible encoding.
1807 *
1808 * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1809 */
1810 if (table2->num_entries != 1)
1811 return NULL;
1812
1813 data.ascii_incompat_name = ascii_incompat_name;
1814 data.ascii_compat_name = NULL;
1815 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1816 return data.ascii_compat_name;
1817}
1818
1819/*
1820 * Append `len` bytes pointed by `ss` to `dst` with converting with `ec`.
1821 *
1822 * If the result of the conversion is not compatible with the encoding of
1823 * `dst`, `dst` may not be valid encoding.
1824 */
1825VALUE
1826rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1827{
1828 unsigned const char *sp, *se;
1829 unsigned char *ds, *dp, *de;
1831 int max_output;
1832 enum ruby_coderange_type coderange;
1833 rb_encoding *dst_enc = ec->destination_encoding;
1834
1835 if (NIL_P(dst)) {
1836 dst = rb_str_buf_new(len);
1837 if (dst_enc) {
1838 rb_enc_associate(dst, dst_enc);
1839 }
1840 coderange = ENC_CODERANGE_7BIT; // scan from the start
1841 }
1842 else {
1843 dst_enc = rb_enc_get(dst);
1844 coderange = rb_enc_str_coderange(dst);
1845 }
1846
1847 if (ec->last_tc)
1848 max_output = ec->last_tc->transcoder->max_output;
1849 else
1850 max_output = 1;
1851
1852 do {
1853 int cr;
1854 long dlen = RSTRING_LEN(dst);
1855 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1856 unsigned long new_capa = (unsigned long)dlen + len + max_output;
1857 if (LONG_MAX < new_capa)
1858 rb_raise(rb_eArgError, "too long string");
1859 rb_str_modify_expand(dst, new_capa - dlen);
1860 }
1861 sp = (const unsigned char *)ss;
1862 se = sp + len;
1863 ds = (unsigned char *)RSTRING_PTR(dst);
1864 de = ds + rb_str_capacity(dst);
1865 dp = ds += dlen;
1866 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1867 switch (coderange) {
1868 case ENC_CODERANGE_7BIT:
1870 cr = (int)coderange;
1871 rb_str_coderange_scan_restartable((char *)ds, (char *)dp, dst_enc, &cr);
1872 coderange = cr;
1873 ENC_CODERANGE_SET(dst, coderange);
1874 break;
1877 break;
1878 }
1879 len -= (const char *)sp - ss;
1880 ss = (const char *)sp;
1881 rb_str_set_len(dst, dlen + (dp - ds));
1883 } while (res == econv_destination_buffer_full);
1884
1885 return dst;
1886}
1887
1888VALUE
1889rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1890{
1891 src = rb_str_new_frozen(src);
1892 dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1893 RB_GC_GUARD(src);
1894 return dst;
1895}
1896
1897VALUE
1899{
1900 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1901}
1902
1903VALUE
1904rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1905{
1906 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1907}
1908
1909VALUE
1911{
1912 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1913}
1914
1915static int
1916rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1917{
1918 transcoder_entry_t *entry;
1919 const rb_transcoder *tr;
1920
1921 if (ec->started != 0)
1922 return -1;
1923
1924 entry = get_transcoder_entry(sname, dname);
1925 if (!entry)
1926 return -1;
1927
1928 tr = load_transcoder_entry(entry);
1929 if (!tr) return -1;
1930
1931 return rb_econv_add_transcoder_at(ec, tr, n);
1932}
1933
1934static int
1935rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1936{
1937 return rb_econv_add_converter(ec, "", decorator_name, n);
1938}
1939
1940int
1941rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1942{
1943 const rb_transcoder *tr;
1944
1945 if (ec->num_trans == 0)
1946 return rb_econv_decorate_at(ec, decorator_name, 0);
1947
1948 tr = ec->elems[0].tc->transcoder;
1949
1950 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1951 tr->asciicompat_type == asciicompat_decoder)
1952 return rb_econv_decorate_at(ec, decorator_name, 1);
1953
1954 return rb_econv_decorate_at(ec, decorator_name, 0);
1955}
1956
1957int
1958rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1959{
1960 const rb_transcoder *tr;
1961
1962 if (ec->num_trans == 0)
1963 return rb_econv_decorate_at(ec, decorator_name, 0);
1964
1965 tr = ec->elems[ec->num_trans-1].tc->transcoder;
1966
1967 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1968 tr->asciicompat_type == asciicompat_encoder)
1969 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1970
1971 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1972}
1973
1974void
1976{
1977 const char *dname = 0;
1978
1979 switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
1981 dname = "universal_newline";
1982 break;
1984 dname = "crlf_newline";
1985 break;
1987 dname = "cr_newline";
1988 break;
1990 dname = "lf_newline";
1991 break;
1992 }
1993
1994 if (dname) {
1995 const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
1996 int num_trans = ec->num_trans;
1997 int i, j = 0;
1998
1999 for (i=0; i < num_trans; i++) {
2000 if (transcoder == ec->elems[i].tc->transcoder) {
2001 rb_transcoding_close(ec->elems[i].tc);
2002 xfree(ec->elems[i].out_buf_start);
2003 ec->num_trans--;
2004 }
2005 else
2006 ec->elems[j++] = ec->elems[i];
2007 }
2008 }
2009
2010 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2011}
2012
2013static VALUE
2014econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
2015{
2016 int has_description = 0;
2017
2018 if (NIL_P(mesg))
2019 mesg = rb_str_new(NULL, 0);
2020
2021 if (*sname != '\0' || *dname != '\0') {
2022 if (*sname == '\0')
2023 rb_str_cat2(mesg, dname);
2024 else if (*dname == '\0')
2025 rb_str_cat2(mesg, sname);
2026 else
2027 rb_str_catf(mesg, "%s to %s", sname, dname);
2028 has_description = 1;
2029 }
2030
2031 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2035 const char *pre = "";
2036 if (has_description)
2037 rb_str_cat2(mesg, " with ");
2038 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2039 rb_str_cat2(mesg, pre); pre = ",";
2040 rb_str_cat2(mesg, "universal_newline");
2041 }
2042 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2043 rb_str_cat2(mesg, pre); pre = ",";
2044 rb_str_cat2(mesg, "crlf_newline");
2045 }
2046 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2047 rb_str_cat2(mesg, pre); pre = ",";
2048 rb_str_cat2(mesg, "cr_newline");
2049 }
2050 if (ecflags & ECONV_LF_NEWLINE_DECORATOR) {
2051 rb_str_cat2(mesg, pre); pre = ",";
2052 rb_str_cat2(mesg, "lf_newline");
2053 }
2054 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2055 rb_str_cat2(mesg, pre); pre = ",";
2056 rb_str_cat2(mesg, "xml_text");
2057 }
2058 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2059 rb_str_cat2(mesg, pre); pre = ",";
2060 rb_str_cat2(mesg, "xml_attr_content");
2061 }
2062 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2063 rb_str_cat2(mesg, pre); pre = ",";
2064 rb_str_cat2(mesg, "xml_attr_quote");
2065 }
2066 has_description = 1;
2067 }
2068 if (!has_description) {
2069 rb_str_cat2(mesg, "no-conversion");
2070 }
2071
2072 return mesg;
2073}
2074
2075VALUE
2076rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2077{
2078 VALUE mesg, exc;
2079 mesg = rb_str_new_cstr("code converter not found (");
2080 econv_description(sname, dname, ecflags, mesg);
2081 rb_str_cat2(mesg, ")");
2082 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
2083 return exc;
2084}
2085
2086static VALUE
2087make_econv_exception(rb_econv_t *ec)
2088{
2089 VALUE mesg, exc;
2090 if (ec->last_error.result == econv_invalid_byte_sequence ||
2091 ec->last_error.result == econv_incomplete_input) {
2092 const char *err = (const char *)ec->last_error.error_bytes_start;
2093 size_t error_len = ec->last_error.error_bytes_len;
2094 VALUE bytes = rb_str_new(err, error_len);
2095 VALUE dumped = rb_str_dump(bytes);
2096 size_t readagain_len = ec->last_error.readagain_len;
2097 VALUE bytes2 = Qnil;
2098 VALUE dumped2;
2099 if (ec->last_error.result == econv_incomplete_input) {
2100 mesg = rb_sprintf("incomplete %s on %s",
2101 StringValueCStr(dumped),
2102 ec->last_error.source_encoding);
2103 }
2104 else if (readagain_len) {
2105 bytes2 = rb_str_new(err+error_len, readagain_len);
2106 dumped2 = rb_str_dump(bytes2);
2107 mesg = rb_sprintf("%s followed by %s on %s",
2108 StringValueCStr(dumped),
2109 StringValueCStr(dumped2),
2110 ec->last_error.source_encoding);
2111 }
2112 else {
2113 mesg = rb_sprintf("%s on %s",
2114 StringValueCStr(dumped),
2115 ec->last_error.source_encoding);
2116 }
2117
2118 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
2119 rb_ivar_set(exc, id_error_bytes, bytes);
2120 rb_ivar_set(exc, id_readagain_bytes, bytes2);
2121 rb_ivar_set(exc, id_incomplete_input, RBOOL(ec->last_error.result == econv_incomplete_input));
2122 goto set_encs;
2123 }
2124 if (ec->last_error.result == econv_undefined_conversion) {
2125 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2126 ec->last_error.error_bytes_len);
2127 VALUE dumped = Qnil;
2128 int idx;
2129 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2130 rb_encoding *utf8 = rb_utf8_encoding();
2131 const char *start, *end;
2132 int n;
2133 start = (const char *)ec->last_error.error_bytes_start;
2134 end = start + ec->last_error.error_bytes_len;
2135 n = rb_enc_precise_mbclen(start, end, utf8);
2136 if (MBCLEN_CHARFOUND_P(n) &&
2137 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2138 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2139 dumped = rb_sprintf("U+%04X", cc);
2140 }
2141 }
2142 if (NIL_P(dumped))
2143 dumped = rb_str_dump(bytes);
2144 if (strcmp(ec->last_error.source_encoding,
2145 ec->source_encoding_name) == 0 &&
2146 strcmp(ec->last_error.destination_encoding,
2147 ec->destination_encoding_name) == 0) {
2148 mesg = rb_sprintf("%s from %s to %s",
2149 StringValueCStr(dumped),
2150 ec->last_error.source_encoding,
2151 ec->last_error.destination_encoding);
2152 }
2153 else {
2154 int i;
2155 mesg = rb_sprintf("%s to %s in conversion from %s",
2156 StringValueCStr(dumped),
2157 ec->last_error.destination_encoding,
2158 ec->source_encoding_name);
2159 for (i = 0; i < ec->num_trans; i++) {
2160 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2161 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2162 rb_str_catf(mesg, " to %s",
2163 ec->elems[i].tc->transcoder->dst_encoding);
2164 }
2165 }
2166 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
2167 idx = rb_enc_find_index(ec->last_error.source_encoding);
2168 if (0 <= idx)
2169 rb_enc_associate_index(bytes, idx);
2170 rb_ivar_set(exc, id_error_char, bytes);
2171 goto set_encs;
2172 }
2173 return Qnil;
2174
2175 set_encs:
2176 rb_ivar_set(exc, id_source_encoding_name, rb_str_new2(ec->last_error.source_encoding));
2177 rb_ivar_set(exc, id_destination_encoding_name, rb_str_new2(ec->last_error.destination_encoding));
2178 int idx = rb_enc_find_index(ec->last_error.source_encoding);
2179 if (0 <= idx)
2180 rb_ivar_set(exc, id_source_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2181 idx = rb_enc_find_index(ec->last_error.destination_encoding);
2182 if (0 <= idx)
2183 rb_ivar_set(exc, id_destination_encoding, rb_enc_from_encoding(rb_enc_from_index(idx)));
2184 return exc;
2185}
2186
2187static void
2188more_output_buffer(
2189 VALUE destination,
2190 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2191 int max_output,
2192 unsigned char **out_start_ptr,
2193 unsigned char **out_pos,
2194 unsigned char **out_stop_ptr)
2195{
2196 size_t len = (*out_pos - *out_start_ptr);
2197 size_t new_len = (len + max_output) * 2;
2198 *out_start_ptr = resize_destination(destination, len, new_len);
2199 *out_pos = *out_start_ptr + len;
2200 *out_stop_ptr = *out_start_ptr + new_len;
2201}
2202
2203static int
2204make_replacement(rb_econv_t *ec)
2205{
2206 rb_transcoding *tc;
2207 const rb_transcoder *tr;
2208 const unsigned char *replacement;
2209 const char *repl_enc;
2210 const char *ins_enc;
2211 size_t len;
2212
2213 if (ec->replacement_str)
2214 return 0;
2215
2217
2218 tc = ec->last_tc;
2219 if (*ins_enc) {
2220 tr = tc->transcoder;
2221 rb_enc_find(tr->dst_encoding);
2222 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2223 }
2224 else {
2225 replacement = (unsigned char *)"?";
2226 len = 1;
2227 repl_enc = "";
2228 }
2229
2230 ec->replacement_str = replacement;
2231 ec->replacement_len = len;
2232 ec->replacement_enc = repl_enc;
2233 ec->replacement_allocated = 0;
2234 return 0;
2235}
2236
2237int
2239 const unsigned char *str, size_t len, const char *encname)
2240{
2241 unsigned char *str2;
2242 size_t len2;
2243 const char *encname2;
2244
2246
2247 if (!*encname2 || encoding_equal(encname, encname2)) {
2248 str2 = xmalloc(len);
2249 MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2250 len2 = len;
2251 encname2 = encname;
2252 }
2253 else {
2254 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2255 if (!str2)
2256 return -1;
2257 }
2258
2259 if (ec->replacement_allocated) {
2260 xfree((void *)ec->replacement_str);
2261 }
2262 ec->replacement_allocated = 1;
2263 ec->replacement_str = str2;
2264 ec->replacement_len = len2;
2265 ec->replacement_enc = encname2;
2266 return 0;
2267}
2268
2269static int
2270output_replacement_character(rb_econv_t *ec)
2271{
2272 int ret;
2273
2274 if (make_replacement(ec) == -1)
2275 return -1;
2276
2277 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
2278 if (ret == -1)
2279 return -1;
2280
2281 return 0;
2282}
2283
2284#if 1
2285#define hash_fallback rb_hash_aref
2286
2287static VALUE
2288proc_fallback(VALUE fallback, VALUE c)
2289{
2290 return rb_proc_call(fallback, rb_ary_new4(1, &c));
2291}
2292
2293static VALUE
2294method_fallback(VALUE fallback, VALUE c)
2295{
2296 return rb_method_call(1, &c, fallback);
2297}
2298
2299static VALUE
2300aref_fallback(VALUE fallback, VALUE c)
2301{
2302 return rb_funcallv_public(fallback, idAREF, 1, &c);
2303}
2304
2305static void
2306transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2307 const unsigned char *in_stop, unsigned char *out_stop,
2308 VALUE destination,
2309 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2310 const char *src_encoding,
2311 const char *dst_encoding,
2312 int ecflags,
2313 VALUE ecopts)
2314{
2315 rb_econv_t *ec;
2316 rb_transcoding *last_tc;
2318 unsigned char *out_start = *out_pos;
2319 int max_output;
2320 VALUE exc;
2321 VALUE fallback = Qnil;
2322 VALUE (*fallback_func)(VALUE, VALUE) = 0;
2323
2324 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2325 if (!ec)
2326 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2327
2328 if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2329 fallback = rb_hash_aref(ecopts, sym_fallback);
2330 if (RB_TYPE_P(fallback, T_HASH)) {
2331 fallback_func = hash_fallback;
2332 }
2333 else if (rb_obj_is_proc(fallback)) {
2334 fallback_func = proc_fallback;
2335 }
2336 else if (rb_obj_is_method(fallback)) {
2337 fallback_func = method_fallback;
2338 }
2339 else {
2340 fallback_func = aref_fallback;
2341 }
2342 }
2343 last_tc = ec->last_tc;
2344 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2345
2346 resume:
2347 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2348
2349 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2350 VALUE rep = rb_enc_str_new(
2351 (const char *)ec->last_error.error_bytes_start,
2352 ec->last_error.error_bytes_len,
2353 rb_enc_find(ec->last_error.source_encoding));
2354 rep = (*fallback_func)(fallback, rep);
2355 if (!UNDEF_P(rep) && !NIL_P(rep)) {
2356 StringValue(rep);
2357 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2358 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2359 if ((int)ret == -1) {
2360 rb_raise(rb_eArgError, "too big fallback string");
2361 }
2362 goto resume;
2363 }
2364 }
2365
2366 if (ret == econv_invalid_byte_sequence ||
2367 ret == econv_incomplete_input ||
2369 exc = make_econv_exception(ec);
2370 rb_econv_close(ec);
2371 rb_exc_raise(exc);
2372 }
2373
2374 if (ret == econv_destination_buffer_full) {
2375 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2376 goto resume;
2377 }
2378
2379 rb_econv_close(ec);
2380 return;
2381}
2382#else
2383/* sample transcode_loop implementation in byte-by-byte stream style */
2384static void
2385transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2386 const unsigned char *in_stop, unsigned char *out_stop,
2387 VALUE destination,
2388 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2389 const char *src_encoding,
2390 const char *dst_encoding,
2391 int ecflags,
2392 VALUE ecopts)
2393{
2394 rb_econv_t *ec;
2395 rb_transcoding *last_tc;
2397 unsigned char *out_start = *out_pos;
2398 const unsigned char *ptr;
2399 int max_output;
2400 VALUE exc;
2401
2402 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2403 if (!ec)
2404 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2405
2406 last_tc = ec->last_tc;
2407 max_output = last_tc ? last_tc->transcoder->max_output : 1;
2408
2410 ptr = *in_pos;
2411 while (ret != econv_finished) {
2412 unsigned char input_byte;
2413 const unsigned char *p = &input_byte;
2414
2415 if (ret == econv_source_buffer_empty) {
2416 if (ptr < in_stop) {
2417 input_byte = *ptr;
2418 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2419 }
2420 else {
2421 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2422 }
2423 }
2424 else {
2425 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2426 }
2427 if (&input_byte != p)
2428 ptr += p - &input_byte;
2429 switch (ret) {
2433 exc = make_econv_exception(ec);
2434 rb_econv_close(ec);
2435 rb_exc_raise(exc);
2436 break;
2437
2439 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2440 break;
2441
2443 break;
2444
2445 case econv_finished:
2446 break;
2447 }
2448 }
2449 rb_econv_close(ec);
2450 *in_pos = in_stop;
2451 return;
2452}
2453#endif
2454
2455
2456/*
2457 * String-specific code
2458 */
2459
2460static unsigned char *
2461str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2462{
2463 rb_str_resize(destination, new_len);
2464 return (unsigned char *)RSTRING_PTR(destination);
2465}
2466
2467static int
2468econv_opts(VALUE opt, int ecflags)
2469{
2470 VALUE v;
2471 int newlineflag = 0;
2472
2473 v = rb_hash_aref(opt, sym_invalid);
2474 if (NIL_P(v)) {
2475 }
2476 else if (v==sym_replace) {
2477 ecflags |= ECONV_INVALID_REPLACE;
2478 }
2479 else {
2480 rb_raise(rb_eArgError, "unknown value for invalid character option");
2481 }
2482
2483 v = rb_hash_aref(opt, sym_undef);
2484 if (NIL_P(v)) {
2485 }
2486 else if (v==sym_replace) {
2487 ecflags |= ECONV_UNDEF_REPLACE;
2488 }
2489 else {
2490 rb_raise(rb_eArgError, "unknown value for undefined character option");
2491 }
2492
2493 v = rb_hash_aref(opt, sym_replace);
2494 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2495 ecflags |= ECONV_UNDEF_REPLACE;
2496 }
2497
2498 v = rb_hash_aref(opt, sym_xml);
2499 if (!NIL_P(v)) {
2500 if (v==sym_text) {
2502 }
2503 else if (v==sym_attr) {
2505 }
2506 else if (SYMBOL_P(v)) {
2507 rb_raise(rb_eArgError, "unexpected value for xml option: %"PRIsVALUE, rb_sym2str(v));
2508 }
2509 else {
2510 rb_raise(rb_eArgError, "unexpected value for xml option");
2511 }
2512 }
2513
2514#ifdef ENABLE_ECONV_NEWLINE_OPTION
2515 v = rb_hash_aref(opt, sym_newline);
2516 if (!NIL_P(v)) {
2517 newlineflag = 2;
2518 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2519 if (v == sym_universal) {
2521 }
2522 else if (v == sym_crlf) {
2524 }
2525 else if (v == sym_cr) {
2526 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2527 }
2528 else if (v == sym_lf) {
2529 ecflags |= ECONV_LF_NEWLINE_DECORATOR;
2530 }
2531 else if (SYMBOL_P(v)) {
2532 rb_raise(rb_eArgError, "unexpected value for newline option: %"PRIsVALUE,
2533 rb_sym2str(v));
2534 }
2535 else {
2536 rb_raise(rb_eArgError, "unexpected value for newline option");
2537 }
2538 }
2539#endif
2540 {
2541 int setflags = 0;
2542
2543 v = rb_hash_aref(opt, sym_universal_newline);
2544 if (RTEST(v))
2546 newlineflag |= !NIL_P(v);
2547
2548 v = rb_hash_aref(opt, sym_crlf_newline);
2549 if (RTEST(v))
2550 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2551 newlineflag |= !NIL_P(v);
2552
2553 v = rb_hash_aref(opt, sym_cr_newline);
2554 if (RTEST(v))
2555 setflags |= ECONV_CR_NEWLINE_DECORATOR;
2556 newlineflag |= !NIL_P(v);
2557
2558 v = rb_hash_aref(opt, sym_lf_newline);
2559 if (RTEST(v))
2560 setflags |= ECONV_LF_NEWLINE_DECORATOR;
2561 newlineflag |= !NIL_P(v);
2562
2563 switch (newlineflag) {
2564 case 1:
2565 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2566 ecflags |= setflags;
2567 break;
2568
2569 case 3:
2570 rb_warning(":newline option precedes other newline options");
2571 break;
2572 }
2573 }
2574
2575 return ecflags;
2576}
2577
2578int
2579rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2580{
2581 VALUE newhash = Qnil;
2582 VALUE v;
2583
2584 if (NIL_P(opthash)) {
2585 *opts = Qnil;
2586 return ecflags;
2587 }
2588 ecflags = econv_opts(opthash, ecflags);
2589
2590 v = rb_hash_aref(opthash, sym_replace);
2591 if (!NIL_P(v)) {
2592 StringValue(v);
2593 if (is_broken_string(v)) {
2594 VALUE dumped = rb_str_dump(v);
2595 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2596 StringValueCStr(dumped),
2597 rb_enc_name(rb_enc_get(v)));
2598 }
2599 v = rb_str_new_frozen(v);
2600 newhash = rb_hash_new();
2601 rb_hash_aset(newhash, sym_replace, v);
2602 }
2603
2604 v = rb_hash_aref(opthash, sym_fallback);
2605 if (!NIL_P(v)) {
2606 VALUE h = rb_check_hash_type(v);
2607 if (NIL_P(h)
2608 ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, idAREF))
2609 : (v = h, 1)) {
2610 if (NIL_P(newhash))
2611 newhash = rb_hash_new();
2612 rb_hash_aset(newhash, sym_fallback, v);
2613 }
2614 }
2615
2616 if (!NIL_P(newhash))
2617 rb_hash_freeze(newhash);
2618 *opts = newhash;
2619
2620 return ecflags;
2621}
2622
2623int
2625{
2626 return rb_econv_prepare_options(opthash, opts, 0);
2627}
2628
2629rb_econv_t *
2630rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2631{
2632 rb_econv_t *ec;
2633 VALUE replacement;
2634
2635 if (NIL_P(opthash)) {
2636 replacement = Qnil;
2637 }
2638 else {
2639 if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2640 rb_bug("rb_econv_open_opts called with invalid opthash");
2641 replacement = rb_hash_aref(opthash, sym_replace);
2642 }
2643
2644 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2645 if (!ec)
2646 return ec;
2647
2648 if (!NIL_P(replacement)) {
2649 int ret;
2650 rb_encoding *enc = rb_enc_get(replacement);
2651
2652 ret = rb_econv_set_replacement(ec,
2653 (const unsigned char *)RSTRING_PTR(replacement),
2654 RSTRING_LEN(replacement),
2655 rb_enc_name(enc));
2656 if (ret == -1) {
2657 rb_econv_close(ec);
2658 return NULL;
2659 }
2660 }
2661 return ec;
2662}
2663
2664static int
2665enc_arg(VALUE *arg, const char **name_p, rb_encoding **enc_p)
2666{
2667 rb_encoding *enc;
2668 const char *n;
2669 int encidx;
2670 VALUE encval;
2671
2672 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2673 !(enc = rb_enc_from_index(encidx))) {
2674 enc = NULL;
2675 encidx = 0;
2676 n = StringValueCStr(*arg);
2677 }
2678 else {
2679 n = rb_enc_name(enc);
2680 }
2681
2682 *name_p = n;
2683 *enc_p = enc;
2684
2685 return encidx;
2686}
2687
2688static int
2689str_transcode_enc_args(VALUE str, VALUE *arg1, VALUE *arg2,
2690 const char **sname_p, rb_encoding **senc_p,
2691 const char **dname_p, rb_encoding **denc_p)
2692{
2693 rb_encoding *senc, *denc;
2694 const char *sname, *dname;
2695 int sencidx, dencidx;
2696
2697 dencidx = enc_arg(arg1, &dname, &denc);
2698
2699 if (NIL_P(*arg2)) {
2700 sencidx = rb_enc_get_index(str);
2701 senc = rb_enc_from_index(sencidx);
2702 sname = rb_enc_name(senc);
2703 }
2704 else {
2705 sencidx = enc_arg(arg2, &sname, &senc);
2706 }
2707
2708 *sname_p = sname;
2709 *senc_p = senc;
2710 *dname_p = dname;
2711 *denc_p = denc;
2712 return dencidx;
2713}
2714
2715static int
2716str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2717{
2718 VALUE dest;
2719 VALUE str = *self;
2720 VALUE arg1, arg2;
2721 long blen, slen;
2722 unsigned char *buf, *bp, *sp;
2723 const unsigned char *fromp;
2724 rb_encoding *senc, *denc;
2725 const char *sname, *dname;
2726 int dencidx;
2727 int explicitly_invalid_replace = TRUE;
2728
2729 rb_check_arity(argc, 0, 2);
2730
2731 if (argc == 0) {
2732 arg1 = rb_enc_default_internal();
2733 if (NIL_P(arg1)) {
2734 if (!ecflags) return -1;
2735 arg1 = rb_obj_encoding(str);
2736 }
2737 if (!(ecflags & ECONV_INVALID_MASK)) {
2738 explicitly_invalid_replace = FALSE;
2739 }
2741 }
2742 else {
2743 arg1 = argv[0];
2744 }
2745 arg2 = argc<=1 ? Qnil : argv[1];
2746 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2747
2748 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2752 if (senc && senc == denc) {
2753 if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2754 VALUE rep = Qnil;
2755 if (!NIL_P(ecopts)) {
2756 rep = rb_hash_aref(ecopts, sym_replace);
2757 }
2758 dest = rb_enc_str_scrub(senc, str, rep);
2759 if (NIL_P(dest)) dest = str;
2760 *self = dest;
2761 return dencidx;
2762 }
2763 return NIL_P(arg2) ? -1 : dencidx;
2764 }
2765 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
2766 if (is_ascii_string(str)) {
2767 return dencidx;
2768 }
2769 }
2770 if (encoding_equal(sname, dname)) {
2771 return NIL_P(arg2) ? -1 : dencidx;
2772 }
2773 }
2774 else {
2775 if (senc && denc && !rb_enc_asciicompat(senc) && !rb_enc_asciicompat(denc)) {
2776 rb_encoding *utf8 = rb_utf8_encoding();
2777 str = rb_str_conv_enc(str, senc, utf8);
2778 senc = utf8;
2779 sname = "UTF-8";
2780 }
2781 if (encoding_equal(sname, dname)) {
2782 sname = "";
2783 dname = "";
2784 }
2785 }
2786
2787 fromp = sp = (unsigned char *)RSTRING_PTR(str);
2788 slen = RSTRING_LEN(str);
2789 blen = slen + 30; /* len + margin */
2790 dest = rb_str_tmp_new(blen);
2791 bp = (unsigned char *)RSTRING_PTR(dest);
2792
2793 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2794 if (fromp != sp+slen) {
2795 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2796 }
2797 buf = (unsigned char *)RSTRING_PTR(dest);
2798 *bp = '\0';
2799 rb_str_set_len(dest, bp - buf);
2800
2801 /* set encoding */
2802 if (!denc) {
2803 dencidx = rb_define_dummy_encoding(dname);
2804 RB_GC_GUARD(arg1);
2805 RB_GC_GUARD(arg2);
2806 }
2807 *self = dest;
2808
2809 return dencidx;
2810}
2811
2812static int
2813str_transcode(int argc, VALUE *argv, VALUE *self)
2814{
2815 VALUE opt;
2816 int ecflags = 0;
2817 VALUE ecopts = Qnil;
2818
2819 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2820 if (!NIL_P(opt)) {
2821 ecflags = rb_econv_prepare_opts(opt, &ecopts);
2822 }
2823 return str_transcode0(argc, argv, self, ecflags, ecopts);
2824}
2825
2826static inline VALUE
2827str_encode_associate(VALUE str, int encidx)
2828{
2829 int cr = 0;
2830
2831 rb_enc_associate_index(str, encidx);
2832
2833 /* transcoded string never be broken. */
2834 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2835 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
2836 }
2837 else {
2839 }
2840 ENC_CODERANGE_SET(str, cr);
2841 return str;
2842}
2843
2844/*
2845 * call-seq:
2846 * encode!(dst_encoding = Encoding.default_internal, **enc_opts) -> self
2847 * encode!(dst_encoding, src_encoding, **enc_opts) -> self
2848 *
2849 * Like #encode, but applies encoding changes to +self+; returns +self+.
2850 *
2851 */
2852
2853static VALUE
2854str_encode_bang(int argc, VALUE *argv, VALUE str)
2855{
2856 VALUE newstr;
2857 int encidx;
2858
2859 rb_check_frozen(str);
2860
2861 newstr = str;
2862 encidx = str_transcode(argc, argv, &newstr);
2863
2864 if (encidx < 0) return str;
2865 if (newstr == str) {
2866 rb_enc_associate_index(str, encidx);
2867 return str;
2868 }
2869 rb_str_shared_replace(str, newstr);
2870 return str_encode_associate(str, encidx);
2871}
2872
2873static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2874
2875static VALUE
2876str_encode(int argc, VALUE *argv, VALUE str)
2877{
2878 VALUE newstr = str;
2879 int encidx = str_transcode(argc, argv, &newstr);
2880 return encoded_dup(newstr, str, encidx);
2881}
2882
2883VALUE
2884rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2885{
2886 int argc = 1;
2887 VALUE *argv = &to;
2888 VALUE newstr = str;
2889 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2890 return encoded_dup(newstr, str, encidx);
2891}
2892
2893static VALUE
2894encoded_dup(VALUE newstr, VALUE str, int encidx)
2895{
2896 if (encidx < 0) return rb_str_dup(str);
2897 if (newstr == str) {
2898 newstr = rb_str_dup(str);
2899 rb_enc_associate_index(newstr, encidx);
2900 return newstr;
2901 }
2902 else {
2903 RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2904 }
2905 return str_encode_associate(newstr, encidx);
2906}
2907
2908/*
2909 * Document-class: Encoding::Converter
2910 *
2911 * Encoding conversion class.
2912 */
2913static void
2914econv_free(void *ptr)
2915{
2916 rb_econv_t *ec = ptr;
2917 rb_econv_close(ec);
2918}
2919
2920static size_t
2921econv_memsize(const void *ptr)
2922{
2923 return sizeof(rb_econv_t);
2924}
2925
2926static const rb_data_type_t econv_data_type = {
2927 "econv",
2928 {0, econv_free, econv_memsize,},
2929 0, 0, RUBY_TYPED_FREE_IMMEDIATELY
2930};
2931
2932static VALUE
2933econv_s_allocate(VALUE klass)
2934{
2935 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2936}
2937
2938static rb_encoding *
2939make_dummy_encoding(const char *name)
2940{
2941 rb_encoding *enc;
2942 int idx;
2943 idx = rb_define_dummy_encoding(name);
2944 enc = rb_enc_from_index(idx);
2945 return enc;
2946}
2947
2948static rb_encoding *
2949make_encoding(const char *name)
2950{
2951 rb_encoding *enc;
2952 enc = rb_enc_find(name);
2953 if (!enc)
2954 enc = make_dummy_encoding(name);
2955 return enc;
2956}
2957
2958static VALUE
2959make_encobj(const char *name)
2960{
2961 return rb_enc_from_encoding(make_encoding(name));
2962}
2963
2964/*
2965 * call-seq:
2966 * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2967 * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2968 *
2969 * Returns the corresponding ASCII compatible encoding.
2970 *
2971 * Returns nil if the argument is an ASCII compatible encoding.
2972 *
 * "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
 * can represent exactly the same characters as the given ASCII incompatible encoding.
2975 * So, no conversion undefined error occurs when converting between the two encodings.
2976 *
2977 * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
2978 * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
2979 * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
2980 *
2981 */
2982static VALUE
2983econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
2984{
2985 const char *arg_name, *result_name;
2986 rb_encoding *arg_enc, *result_enc;
2987
2988 enc_arg(&arg, &arg_name, &arg_enc);
2989
2990 result_name = rb_econv_asciicompat_encoding(arg_name);
2991
2992 if (result_name == NULL)
2993 return Qnil;
2994
2995 result_enc = make_encoding(result_name);
2996
2997 return rb_enc_from_encoding(result_enc);
2998}
2999
3000static void
3001econv_args(int argc, VALUE *argv,
3002 VALUE *snamev_p, VALUE *dnamev_p,
3003 const char **sname_p, const char **dname_p,
3004 rb_encoding **senc_p, rb_encoding **denc_p,
3005 int *ecflags_p,
3006 VALUE *ecopts_p)
3007{
3008 VALUE opt, flags_v, ecopts;
3009 int sidx, didx;
3010 const char *sname, *dname;
3011 rb_encoding *senc, *denc;
3012 int ecflags;
3013
3014 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3015
3016 if (!NIL_P(flags_v)) {
3017 if (!NIL_P(opt)) {
3018 rb_error_arity(argc + 1, 2, 3);
3019 }
3020 ecflags = NUM2INT(rb_to_int(flags_v));
3021 ecopts = Qnil;
3022 }
3023 else if (!NIL_P(opt)) {
3024 ecflags = rb_econv_prepare_opts(opt, &ecopts);
3025 }
3026 else {
3027 ecflags = 0;
3028 ecopts = Qnil;
3029 }
3030
3031 senc = NULL;
3032 sidx = rb_to_encoding_index(*snamev_p);
3033 if (0 <= sidx) {
3034 senc = rb_enc_from_index(sidx);
3035 }
3036 else {
3037 StringValue(*snamev_p);
3038 }
3039
3040 denc = NULL;
3041 didx = rb_to_encoding_index(*dnamev_p);
3042 if (0 <= didx) {
3043 denc = rb_enc_from_index(didx);
3044 }
3045 else {
3046 StringValue(*dnamev_p);
3047 }
3048
3049 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3050 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3051
3052 *sname_p = sname;
3053 *dname_p = dname;
3054 *senc_p = senc;
3055 *denc_p = denc;
3056 *ecflags_p = ecflags;
3057 *ecopts_p = ecopts;
3058}
3059
3060static int
3061decorate_convpath(VALUE convpath, int ecflags)
3062{
3063 int num_decorators;
3064 const char *decorators[MAX_ECFLAGS_DECORATORS];
3065 int i;
3066 int n, len;
3067
3068 num_decorators = decorator_names(ecflags, decorators);
3069 if (num_decorators == -1)
3070 return -1;
3071
3072 len = n = RARRAY_LENINT(convpath);
3073 if (n != 0) {
3074 VALUE pair = RARRAY_AREF(convpath, n-1);
3075 if (RB_TYPE_P(pair, T_ARRAY)) {
3076 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3077 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3078 transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3079 const rb_transcoder *tr = load_transcoder_entry(entry);
3080 if (!tr)
3081 return -1;
3082 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3083 tr->asciicompat_type == asciicompat_encoder) {
3084 n--;
3085 rb_ary_store(convpath, len + num_decorators - 1, pair);
3086 }
3087 }
3088 else {
3089 rb_ary_store(convpath, len + num_decorators - 1, pair);
3090 }
3091 }
3092
3093 for (i = 0; i < num_decorators; i++)
3094 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3095
3096 return 0;
3097}
3098
3099static void
3100search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3101{
3102 VALUE *ary_p = arg;
3103 VALUE v;
3104
3105 if (NIL_P(*ary_p)) {
3106 *ary_p = rb_ary_new();
3107 }
3108
3109 if (DECORATOR_P(sname, dname)) {
3110 v = rb_str_new_cstr(dname);
3111 }
3112 else {
3113 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3114 }
3115 rb_ary_store(*ary_p, depth, v);
3116}
3117
3118/*
3119 * call-seq:
3120 * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3121 * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3122 *
3123 * Returns a conversion path.
3124 *
3125 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3126 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3127 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3128 *
3129 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3130 * or
3131 * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3132 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3133 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3134 * # "universal_newline"]
3135 *
3136 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3137 * or
3138 * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3139 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3140 * # "universal_newline",
3141 * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3142 */
3143static VALUE
3144econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
3145{
3146 VALUE snamev, dnamev;
3147 const char *sname, *dname;
3148 rb_encoding *senc, *denc;
3149 int ecflags;
3150 VALUE ecopts;
3151 VALUE convpath;
3152
3153 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3154
3155 convpath = Qnil;
3156 transcode_search_path(sname, dname, search_convpath_i, &convpath);
3157
3158 if (NIL_P(convpath)) {
3159 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3160 RB_GC_GUARD(snamev);
3161 RB_GC_GUARD(dnamev);
3162 rb_exc_raise(exc);
3163 }
3164
3165 if (decorate_convpath(convpath, ecflags) == -1) {
3166 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3167 RB_GC_GUARD(snamev);
3168 RB_GC_GUARD(dnamev);
3169 rb_exc_raise(exc);
3170 }
3171
3172 return convpath;
3173}
3174
/*
 * Check the existence of a conversion path.
 * Returns nonzero if a conversion path from from_encoding to to_encoding
 * was found, 0 otherwise.
 */
3180int
3181rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3182{
3183 VALUE convpath = Qnil;
3184 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3185 &convpath);
3186 return RTEST(convpath);
3187}
3188
3190 rb_econv_t *ec;
3191 int index;
3192 int ret;
3193};
3194
3195static void
3196rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3197{
3199 int ret;
3200
3201 if (a->ret == -1)
3202 return;
3203
3204 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3205
3206 a->ret = ret;
3207 return;
3208}
3209
3210static rb_econv_t *
3211rb_econv_init_by_convpath(VALUE self, VALUE convpath,
3212 const char **sname_p, const char **dname_p,
3213 rb_encoding **senc_p, rb_encoding**denc_p)
3214{
3215 rb_econv_t *ec;
3216 long i;
3217 int ret, first=1;
3218 VALUE elt;
3219 rb_encoding *senc = 0, *denc = 0;
3220 const char *sname, *dname;
3221
3222 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
3223 DATA_PTR(self) = ec;
3224
3225 for (i = 0; i < RARRAY_LEN(convpath); i++) {
3226 VALUE snamev, dnamev;
3227 VALUE pair;
3228 elt = rb_ary_entry(convpath, i);
3229 if (!NIL_P(pair = rb_check_array_type(elt))) {
3230 if (RARRAY_LEN(pair) != 2)
3231 rb_raise(rb_eArgError, "not a 2-element array in convpath");
3232 snamev = rb_ary_entry(pair, 0);
3233 enc_arg(&snamev, &sname, &senc);
3234 dnamev = rb_ary_entry(pair, 1);
3235 enc_arg(&dnamev, &dname, &denc);
3236 }
3237 else {
3238 sname = "";
3239 dname = StringValueCStr(elt);
3240 }
3241 if (DECORATOR_P(sname, dname)) {
3242 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3243 if (ret == -1) {
3244 VALUE msg = rb_sprintf("decoration failed: %s", dname);
3245 RB_GC_GUARD(snamev);
3246 RB_GC_GUARD(dnamev);
3248 }
3249 }
3250 else {
3251 int j = ec->num_trans;
3252 struct rb_econv_init_by_convpath_t arg;
3253 arg.ec = ec;
3254 arg.index = ec->num_trans;
3255 arg.ret = 0;
3256 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3257 if (ret == -1 || arg.ret == -1) {
3258 VALUE msg = rb_sprintf("adding conversion failed: %s to %s", sname, dname);
3259 RB_GC_GUARD(snamev);
3260 RB_GC_GUARD(dnamev);
3262 }
3263 if (first) {
3264 first = 0;
3265 *senc_p = senc;
3266 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3267 }
3268 *denc_p = denc;
3269 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3270 }
3271 }
3272
3273 if (first) {
3274 *senc_p = NULL;
3275 *denc_p = NULL;
3276 *sname_p = "";
3277 *dname_p = "";
3278 }
3279
3280 ec->source_encoding_name = *sname_p;
3281 ec->destination_encoding_name = *dname_p;
3282
3283 return ec;
3284}
3285
3286/*
3287 * call-seq:
3288 * Encoding::Converter.new(source_encoding, destination_encoding)
3289 * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3290 * Encoding::Converter.new(convpath)
3291 *
3292 * possible options elements:
3293 * hash form:
3294 * :invalid => nil # raise error on invalid byte sequence (default)
3295 * :invalid => :replace # replace invalid byte sequence
3296 * :undef => nil # raise error on undefined conversion (default)
3297 * :undef => :replace # replace undefined conversion
3298 * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3299 * :newline => :universal # decorator for converting CRLF and CR to LF
3300 * :newline => :lf # decorator for converting CRLF and CR to LF when writing
3301 * :newline => :crlf # decorator for converting LF to CRLF
3302 * :newline => :cr # decorator for converting LF to CR
3303 * :universal_newline => true # decorator for converting CRLF and CR to LF
3304 * :crlf_newline => true # decorator for converting LF to CRLF
3305 * :cr_newline => true # decorator for converting LF to CR
3306 * :lf_newline => true # decorator for converting CRLF and CR to LF when writing
3307 * :xml => :text # escape as XML CharData.
3308 * :xml => :attr # escape as XML AttValue
3309 * integer form:
3310 * Encoding::Converter::INVALID_REPLACE
3311 * Encoding::Converter::UNDEF_REPLACE
3312 * Encoding::Converter::UNDEF_HEX_CHARREF
3313 * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3314 * Encoding::Converter::LF_NEWLINE_DECORATOR
3315 * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3316 * Encoding::Converter::CR_NEWLINE_DECORATOR
3317 * Encoding::Converter::XML_TEXT_DECORATOR
3318 * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3319 * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3320 *
3321 * Encoding::Converter.new creates an instance of Encoding::Converter.
3322 *
3323 * Source_encoding and destination_encoding should be a string or
3324 * Encoding object.
3325 *
3326 * opt should be nil, a hash or an integer.
3327 *
3328 * convpath should be an array.
3329 * convpath may contain
3330 * - two-element arrays which contain encodings or encoding names, or
3331 * - strings representing decorator names.
3332 *
3333 * Encoding::Converter.new optionally takes an option.
3334 * The option should be a hash or an integer.
3335 * The option hash can contain :invalid => nil, etc.
3336 * The option integer should be logical-or of constants such as
3337 * Encoding::Converter::INVALID_REPLACE, etc.
3338 *
3339 * [:invalid => nil]
3340 * Raise error on invalid byte sequence. This is a default behavior.
3341 * [:invalid => :replace]
3342 * Replace invalid byte sequence by replacement string.
3343 * [:undef => nil]
3344 * Raise an error if a character in source_encoding is not defined in destination_encoding.
3345 * This is a default behavior.
3346 * [:undef => :replace]
3347 * Replace undefined character in destination_encoding with replacement string.
3348 * [:replace => string]
3349 * Specify the replacement string.
3350 * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3351 * [:universal_newline => true]
3352 * Convert CRLF and CR to LF.
3353 * [:crlf_newline => true]
3354 * Convert LF to CRLF.
3355 * [:cr_newline => true]
3356 * Convert LF to CR.
3357 * [:lf_newline => true]
3358 * Convert CRLF and CR to LF (when writing).
3359 * [:xml => :text]
3360 * Escape as XML CharData.
3361 * This form can be used as an HTML 4.0 #PCDATA.
3362 * - '&' -> '&amp;'
3363 * - '<' -> '&lt;'
3364 * - '>' -> '&gt;'
3365 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3366 * [:xml => :attr]
3367 * Escape as XML AttValue.
3368 * The converted result is quoted as "...".
3369 * This form can be used as an HTML 4.0 attribute value.
3370 * - '&' -> '&amp;'
3371 * - '<' -> '&lt;'
3372 * - '>' -> '&gt;'
3373 * - '"' -> '&quot;'
3374 * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3375 *
3376 * Examples:
3377 * # UTF-16BE to UTF-8
3378 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3379 *
3380 * # Usually, decorators such as newline conversion are inserted last.
3381 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3382 * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3383 * # "universal_newline"]
3384 *
3385 * # But, if the last encoding is ASCII incompatible,
3386 * # decorators are inserted before the last conversion.
3387 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3388 * p ec.convpath #=> ["crlf_newline",
3389 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3390 *
3391 * # Conversion path can be specified directly.
3392 * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3393 * p ec.convpath #=> ["universal_newline",
3394 * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3395 * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3396 */
3397static VALUE
3398econv_init(int argc, VALUE *argv, VALUE self)
3399{
3400 VALUE ecopts;
3401 VALUE snamev, dnamev;
3402 const char *sname, *dname;
3403 rb_encoding *senc, *denc;
3404 rb_econv_t *ec;
3405 int ecflags;
3406 VALUE convpath;
3407
3408 if (rb_check_typeddata(self, &econv_data_type)) {
3409 rb_raise(rb_eTypeError, "already initialized");
3410 }
3411
3412 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3413 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3414 ecflags = 0;
3415 ecopts = Qnil;
3416 }
3417 else {
3418 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3419 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3420 }
3421
3422 if (!ec) {
3423 VALUE exc = rb_econv_open_exc(sname, dname, ecflags);
3424 RB_GC_GUARD(snamev);
3425 RB_GC_GUARD(dnamev);
3426 rb_exc_raise(exc);
3427 }
3428
3429 if (!DECORATOR_P(sname, dname)) {
3430 if (!senc)
3431 senc = make_dummy_encoding(sname);
3432 if (!denc)
3433 denc = make_dummy_encoding(dname);
3434 RB_GC_GUARD(snamev);
3435 RB_GC_GUARD(dnamev);
3436 }
3437
3438 ec->source_encoding = senc;
3439 ec->destination_encoding = denc;
3440
3441 DATA_PTR(self) = ec;
3442
3443 return self;
3444}
3445
3446/*
3447 * call-seq:
3448 * ec.inspect -> string
3449 *
3450 * Returns a printable version of <i>ec</i>
3451 *
3452 * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3453 * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3454 *
3455 */
3456static VALUE
3457econv_inspect(VALUE self)
3458{
3459 const char *cname = rb_obj_classname(self);
3460 rb_econv_t *ec;
3461
3462 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3463 if (!ec)
3464 return rb_sprintf("#<%s: uninitialized>", cname);
3465 else {
3466 const char *sname = ec->source_encoding_name;
3467 const char *dname = ec->destination_encoding_name;
3468 VALUE str;
3469 str = rb_sprintf("#<%s: ", cname);
3470 econv_description(sname, dname, ec->flags, str);
3471 rb_str_cat2(str, ">");
3472 return str;
3473 }
3474}
3475
3476static rb_econv_t *
3477check_econv(VALUE self)
3478{
3479 rb_econv_t *ec;
3480
3481 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3482 if (!ec) {
3483 rb_raise(rb_eTypeError, "uninitialized encoding converter");
3484 }
3485 return ec;
3486}
3487
3488static VALUE
3489econv_get_encoding(rb_encoding *encoding)
3490{
3491 if (!encoding)
3492 return Qnil;
3493 return rb_enc_from_encoding(encoding);
3494}
3495
3496/*
3497 * call-seq:
3498 * ec.source_encoding -> encoding
3499 *
3500 * Returns the source encoding as an Encoding object.
3501 */
3502static VALUE
3503econv_source_encoding(VALUE self)
3504{
3505 rb_econv_t *ec = check_econv(self);
3506 return econv_get_encoding(ec->source_encoding);
3507}
3508
3509/*
3510 * call-seq:
3511 * ec.destination_encoding -> encoding
3512 *
3513 * Returns the destination encoding as an Encoding object.
3514 */
3515static VALUE
3516econv_destination_encoding(VALUE self)
3517{
3518 rb_econv_t *ec = check_econv(self);
3519 return econv_get_encoding(ec->destination_encoding);
3520}
3521
3522/*
3523 * call-seq:
3524 * ec.convpath -> ary
3525 *
3526 * Returns the conversion path of ec.
3527 *
3528 * The result is an array of conversions.
3529 *
3530 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3531 * p ec.convpath
3532 * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3533 * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3534 * # "crlf_newline"]
3535 *
3536 * Each element of the array is a pair of encodings or a string.
3537 * A pair means an encoding conversion.
3538 * A string means a decorator.
3539 *
3540 * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3541 * a converter from ISO-8859-1 to UTF-8.
3542 * "crlf_newline" means newline converter from LF to CRLF.
3543 */
3544static VALUE
3545econv_convpath(VALUE self)
3546{
3547 rb_econv_t *ec = check_econv(self);
3548 VALUE result;
3549 int i;
3550
3551 result = rb_ary_new();
3552 for (i = 0; i < ec->num_trans; i++) {
3553 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3554 VALUE v;
3555 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3556 v = rb_str_new_cstr(tr->dst_encoding);
3557 else
3558 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
3559 rb_ary_push(result, v);
3560 }
3561 return result;
3562}
3563
3564/*
3565 * call-seq:
3566 * ec == other -> true or false
3567 */
3568static VALUE
3569econv_equal(VALUE self, VALUE other)
3570{
3571 rb_econv_t *ec1 = check_econv(self);
3572 rb_econv_t *ec2;
3573 int i;
3574
3575 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3576 return Qnil;
3577 }
3578 ec2 = DATA_PTR(other);
3579 if (!ec2) return Qfalse;
3580 if (ec1->source_encoding_name != ec2->source_encoding_name &&
3581 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3582 return Qfalse;
3583 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
3584 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
3585 return Qfalse;
3586 if (ec1->flags != ec2->flags) return Qfalse;
3587 if (ec1->replacement_enc != ec2->replacement_enc &&
3588 strcmp(ec1->replacement_enc, ec2->replacement_enc))
3589 return Qfalse;
3590 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
3591 if (ec1->replacement_str != ec2->replacement_str &&
3592 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
3593 return Qfalse;
3594
3595 if (ec1->num_trans != ec2->num_trans) return Qfalse;
3596 for (i = 0; i < ec1->num_trans; i++) {
3597 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3598 return Qfalse;
3599 }
3600 return Qtrue;
3601}
3602
3603static VALUE
3604econv_result_to_symbol(rb_econv_result_t res)
3605{
3606 switch (res) {
3607 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
3608 case econv_incomplete_input: return sym_incomplete_input;
3609 case econv_undefined_conversion: return sym_undefined_conversion;
3610 case econv_destination_buffer_full: return sym_destination_buffer_full;
3611 case econv_source_buffer_empty: return sym_source_buffer_empty;
3612 case econv_finished: return sym_finished;
3613 case econv_after_output: return sym_after_output;
3614 default: return INT2NUM(res); /* should not be reached */
3615 }
3616}
3617
3618/*
3619 * call-seq:
3620 * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3621 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3622 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3623 * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3624 *
3625 * possible opt elements:
3626 * hash form:
3627 * :partial_input => true # source buffer may be part of larger source
3628 * :after_output => true # stop conversion after output before input
3629 * integer form:
3630 * Encoding::Converter::PARTIAL_INPUT
3631 * Encoding::Converter::AFTER_OUTPUT
3632 *
3633 * possible results:
3634 * :invalid_byte_sequence
3635 * :incomplete_input
3636 * :undefined_conversion
3637 * :after_output
3638 * :destination_buffer_full
3639 * :source_buffer_empty
3640 * :finished
3641 *
3642 * primitive_convert converts source_buffer into destination_buffer.
3643 *
3644 * source_buffer should be a string or nil.
3645 * nil means an empty string.
3646 *
3647 * destination_buffer should be a string.
3648 *
3649 * destination_byteoffset should be an integer or nil.
3650 * nil means the end of destination_buffer.
3651 * If it is omitted, nil is assumed.
3652 *
3653 * destination_bytesize should be an integer or nil.
3654 * nil means unlimited.
3655 * If it is omitted, nil is assumed.
3656 *
3657 * opt should be nil, a hash or an integer.
3658 * nil means no flags.
3659 * If it is omitted, nil is assumed.
3660 *
3661 * primitive_convert converts the content of source_buffer from beginning
3662 * and store the result into destination_buffer.
3663 *
3664 * destination_byteoffset and destination_bytesize specify the region which
3665 * the converted result is stored.
3666 * destination_byteoffset specifies the start position in destination_buffer in bytes.
3667 * If destination_byteoffset is nil,
3668 * destination_buffer.bytesize is used for appending the result.
3669 * destination_bytesize specifies maximum number of bytes.
3670 * If destination_bytesize is nil,
3671 * destination size is unlimited.
3672 * After conversion, destination_buffer is resized to
3673 * destination_byteoffset + actually produced number of bytes.
3674 * Also destination_buffer's encoding is set to destination_encoding.
3675 *
3676 * primitive_convert drops the converted part of source_buffer.
3677 * the dropped part is converted in destination_buffer or
3678 * buffered in Encoding::Converter object.
3679 *
3680 * primitive_convert stops conversion when one of following condition met.
3681 * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3682 * +primitive_errinfo+ and +last_error+ methods returns the detail of the error.
3683 * - unexpected end of source buffer (:incomplete_input)
3684 * this occur only when :partial_input is not specified.
3685 * +primitive_errinfo+ and +last_error+ methods returns the detail of the error.
3686 * - character not representable in output encoding (:undefined_conversion)
3687 * +primitive_errinfo+ and +last_error+ methods returns the detail of the error.
3688 * - after some output is generated, before input is done (:after_output)
3689 * this occur only when :after_output is specified.
3690 * - destination buffer is full (:destination_buffer_full)
3691 * this occur only when destination_bytesize is non-nil.
3692 * - source buffer is empty (:source_buffer_empty)
3693 * this occur only when :partial_input is specified.
3694 * - conversion is finished (:finished)
3695 *
3696 * example:
3697 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3698 * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3699 * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3700 *
3701 * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3702 * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3703 * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3704 * ret = ec.primitive_convert(src, dst="", nil, 1)
3705 * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3706 * ret = ec.primitive_convert(src, dst="", nil, 1)
3707 * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3708 * ret = ec.primitive_convert(src, dst="", nil, 1)
3709 * p [ret, src, dst] #=> [:finished, "", "i"]
3710 *
3711 */
3712static VALUE
3713econv_primitive_convert(int argc, VALUE *argv, VALUE self)
3714{
3715 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3716 rb_econv_t *ec = check_econv(self);
3718 const unsigned char *ip, *is;
3719 unsigned char *op, *os;
3720 long output_byteoffset, output_bytesize;
3721 unsigned long output_byteend;
3722 int flags;
3723
3724 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3725
3726 if (NIL_P(output_byteoffset_v))
3727 output_byteoffset = 0; /* dummy */
3728 else
3729 output_byteoffset = NUM2LONG(output_byteoffset_v);
3730
3731 if (NIL_P(output_bytesize_v))
3732 output_bytesize = 0; /* dummy */
3733 else
3734 output_bytesize = NUM2LONG(output_bytesize_v);
3735
3736 if (!NIL_P(flags_v)) {
3737 if (!NIL_P(opt)) {
3738 rb_error_arity(argc + 1, 2, 5);
3739 }
3740 flags = NUM2INT(rb_to_int(flags_v));
3741 }
3742 else if (!NIL_P(opt)) {
3743 VALUE v;
3744 flags = 0;
3745 v = rb_hash_aref(opt, sym_partial_input);
3746 if (RTEST(v))
3747 flags |= ECONV_PARTIAL_INPUT;
3748 v = rb_hash_aref(opt, sym_after_output);
3749 if (RTEST(v))
3750 flags |= ECONV_AFTER_OUTPUT;
3751 }
3752 else {
3753 flags = 0;
3754 }
3755
3756 StringValue(output);
3757 if (!NIL_P(input))
3758 StringValue(input);
3759 rb_str_modify(output);
3760
3761 if (NIL_P(output_bytesize_v)) {
3762#if USE_RVARGC
3763 output_bytesize = rb_str_capacity(output);
3764#else
3765 output_bytesize = RSTRING_EMBED_LEN_MAX;
3766#endif
3767 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3768 output_bytesize = RSTRING_LEN(input);
3769 }
3770
3771 retry:
3772
3773 if (NIL_P(output_byteoffset_v))
3774 output_byteoffset = RSTRING_LEN(output);
3775
3776 if (output_byteoffset < 0)
3777 rb_raise(rb_eArgError, "negative output_byteoffset");
3778
3779 if (RSTRING_LEN(output) < output_byteoffset)
3780 rb_raise(rb_eArgError, "output_byteoffset too big");
3781
3782 if (output_bytesize < 0)
3783 rb_raise(rb_eArgError, "negative output_bytesize");
3784
3785 output_byteend = (unsigned long)output_byteoffset +
3786 (unsigned long)output_bytesize;
3787
3788 if (output_byteend < (unsigned long)output_byteoffset ||
3789 LONG_MAX < output_byteend)
3790 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3791
3792 if (rb_str_capacity(output) < output_byteend)
3793 rb_str_resize(output, output_byteend);
3794
3795 if (NIL_P(input)) {
3796 ip = is = NULL;
3797 }
3798 else {
3799 ip = (const unsigned char *)RSTRING_PTR(input);
3800 is = ip + RSTRING_LEN(input);
3801 }
3802
3803 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3804 os = op + output_bytesize;
3805
3806 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
3807 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3808 if (!NIL_P(input)) {
3809 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3810 }
3811
3812 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3813 if (LONG_MAX / 2 < output_bytesize)
3814 rb_raise(rb_eArgError, "too long conversion result");
3815 output_bytesize *= 2;
3816 output_byteoffset_v = Qnil;
3817 goto retry;
3818 }
3819
3820 if (ec->destination_encoding) {
3821 rb_enc_associate(output, ec->destination_encoding);
3822 }
3823
3824 return econv_result_to_symbol(res);
3825}
3826
3827/*
3828 * call-seq:
3829 * ec.convert(source_string) -> destination_string
3830 *
3831 * Convert source_string and return destination_string.
3832 *
3833 * source_string is assumed as a part of source.
3834 * i.e. :partial_input=>true is specified internally.
3835 * finish method should be used last.
3836 *
3837 * ec = Encoding::Converter.new("utf-8", "euc-jp")
3838 * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3839 * puts ec.finish.dump #=> ""
3840 *
3841 * ec = Encoding::Converter.new("euc-jp", "utf-8")
3842 * puts ec.convert("\xA4").dump #=> ""
3843 * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3844 * puts ec.finish.dump #=> ""
3845 *
3846 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3847 * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3848 * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3849 * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3850 * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3851 *
3852 * If a conversion error occur,
3853 * Encoding::UndefinedConversionError or
3854 * Encoding::InvalidByteSequenceError is raised.
3855 * Encoding::Converter#convert doesn't supply methods to recover or restart
3856 * from these exceptions.
3857 * When you want to handle these conversion errors,
3858 * use Encoding::Converter#primitive_convert.
3859 *
3860 */
3861static VALUE
3862econv_convert(VALUE self, VALUE source_string)
3863{
3864 VALUE ret, dst;
3865 VALUE av[5];
3866 int ac;
3867 rb_econv_t *ec = check_econv(self);
3868
3869 StringValue(source_string);
3870
3871 dst = rb_str_new(NULL, 0);
3872
3873 av[0] = rb_str_dup(source_string);
3874 av[1] = dst;
3875 av[2] = Qnil;
3876 av[3] = Qnil;
3878 ac = 5;
3879
3880 ret = econv_primitive_convert(ac, av, self);
3881
3882 if (ret == sym_invalid_byte_sequence ||
3883 ret == sym_undefined_conversion ||
3884 ret == sym_incomplete_input) {
3885 VALUE exc = make_econv_exception(ec);
3886 rb_exc_raise(exc);
3887 }
3888
3889 if (ret == sym_finished) {
3890 rb_raise(rb_eArgError, "converter already finished");
3891 }
3892
3893 if (ret != sym_source_buffer_empty) {
3894 rb_bug("unexpected result of econv_primitive_convert");
3895 }
3896
3897 return dst;
3898}
3899
3900/*
3901 * call-seq:
3902 * ec.finish -> string
3903 *
3904 * Finishes the converter.
3905 * It returns the last part of the converted string.
3906 *
3907 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3908 * p ec.convert("\u3042") #=> "\e$B$\""
3909 * p ec.finish #=> "\e(B"
3910 */
3911static VALUE
3912econv_finish(VALUE self)
3913{
3914 VALUE ret, dst;
3915 VALUE av[5];
3916 int ac;
3917 rb_econv_t *ec = check_econv(self);
3918
3919 dst = rb_str_new(NULL, 0);
3920
3921 av[0] = Qnil;
3922 av[1] = dst;
3923 av[2] = Qnil;
3924 av[3] = Qnil;
3925 av[4] = INT2FIX(0);
3926 ac = 5;
3927
3928 ret = econv_primitive_convert(ac, av, self);
3929
3930 if (ret == sym_invalid_byte_sequence ||
3931 ret == sym_undefined_conversion ||
3932 ret == sym_incomplete_input) {
3933 VALUE exc = make_econv_exception(ec);
3934 rb_exc_raise(exc);
3935 }
3936
3937 if (ret != sym_finished) {
3938 rb_bug("unexpected result of econv_primitive_convert");
3939 }
3940
3941 return dst;
3942}
3943
3944/*
3945 * call-seq:
3946 * ec.primitive_errinfo -> array
3947 *
3948 * primitive_errinfo returns important information regarding the last error
3949 * as a 5-element array:
3950 *
3951 * [result, enc1, enc2, error_bytes, readagain_bytes]
3952 *
3953 * result is the last result of primitive_convert.
3954 *
3955 * Other elements are only meaningful when result is
3956 * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3957 *
3958 * enc1 and enc2 indicate a conversion step as a pair of strings.
3959 * For example, a converter from EUC-JP to ISO-8859-1 converts
3960 * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3961 * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3962 *
3963 * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3964 * error_bytes is discarded portion.
3965 * readagain_bytes is buffered portion which is read again on next conversion.
3966 *
3967 * Example:
3968 *
3969 * # \xff is invalid as EUC-JP.
3970 * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3971 * ec.primitive_convert(src="\xff", dst="", nil, 10)
3972 * p ec.primitive_errinfo
3973 * #=> [:invalid_byte_sequence, "EUC-JP", "Shift_JIS", "\xFF", ""]
3974 *
3975 * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
3976 * # Since this error is occur in UTF-8 to ISO-8859-1 conversion,
3977 * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
3978 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3979 * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
3980 * p ec.primitive_errinfo
3981 * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
3982 *
3983 * # partial character is invalid
3984 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3985 * ec.primitive_convert(src="\xa4", dst="", nil, 10)
3986 * p ec.primitive_errinfo
3987 * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
3988 *
3989 * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
3990 * # partial characters.
3991 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3992 * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
3993 * p ec.primitive_errinfo
3994 * #=> [:source_buffer_empty, nil, nil, nil, nil]
3995 *
3996 * # \xd8\x00\x00@ is invalid as UTF-16BE because
3997 * # no low surrogate after high surrogate (\xd8\x00).
3998 * # It is detected by 3rd byte (\00) which is part of next character.
3999 * # So the high surrogate (\xd8\x00) is discarded and
4000 * # the 3rd byte is read again later.
4001 * # Since the byte is buffered in ec, it is dropped from src.
4002 * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
4003 * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
4004 * p ec.primitive_errinfo
4005 * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
4006 * p src
4007 * #=> "@"
4008 *
4009 * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
4010 * # The problem is detected by 4th byte.
4011 * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
4012 * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
4013 * p ec.primitive_errinfo
4014 * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
4015 * p src
4016 * #=> ""
4017 *
4018 */
4019static VALUE
4020econv_primitive_errinfo(VALUE self)
4021{
4022 rb_econv_t *ec = check_econv(self);
4023
4024 VALUE ary;
4025
4026 ary = rb_ary_new2(5);
4027
4028 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
4029 rb_ary_store(ary, 4, Qnil);
4030
4031 if (ec->last_error.source_encoding)
4032 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
4033
4034 if (ec->last_error.destination_encoding)
4035 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
4036
4037 if (ec->last_error.error_bytes_start) {
4038 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
4039 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
4040 }
4041
4042 return ary;
4043}
4044
4045/*
4046 * call-seq:
4047 * ec.insert_output(string) -> nil
4048 *
4049 * Inserts string into the encoding converter.
4050 * The string will be converted to the destination encoding and
4051 * output on later conversions.
4052 *
4053 * If the destination encoding is stateful,
4054 * string is converted according to the state and the state is updated.
4055 *
4056 * This method should be used only when a conversion error occurs.
4057 *
4058 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4059 * src = "HIRAGANA LETTER A is \u{3042}."
4060 * dst = ""
4061 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4062 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4063 * ec.insert_output("<err>")
4064 * p ec.primitive_convert(src, dst) #=> :finished
4065 * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4066 *
4067 * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4068 * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4069 * dst = ""
4070 * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4071 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4072 * ec.insert_output "?" # state change required to output "?".
4073 * p ec.primitive_convert(src, dst) #=> :finished
4074 * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4075 *
4076 */
4077static VALUE
4078econv_insert_output(VALUE self, VALUE string)
4079{
4080 const char *insert_enc;
4081
4082 int ret;
4083
4084 rb_econv_t *ec = check_econv(self);
4085
4086 StringValue(string);
4087 insert_enc = rb_econv_encoding_to_insert_output(ec);
4088 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4089
4090 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
4091 if (ret == -1) {
4092 rb_raise(rb_eArgError, "too big string");
4093 }
4094
4095 return Qnil;
4096}
4097
4098/*
4099 * call-seq:
4100 * ec.putback -> string
4101 * ec.putback(max_numbytes) -> string
4102 *
4103 * Put back the bytes which will be converted.
4104 *
4105 * The bytes are caused by invalid_byte_sequence error.
4106 * When invalid_byte_sequence error, some bytes are discarded and
4107 * some bytes are buffered to be converted later.
4108 * The latter bytes can be put back.
4109 * It can be observed by
4110 * Encoding::InvalidByteSequenceError#readagain_bytes and
4111 * Encoding::Converter#primitive_errinfo.
4112 *
4113 * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4114 * src = "\x00\xd8\x61\x00"
4115 * dst = ""
4116 * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4117 * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4118 * p ec.putback #=> "a\x00"
4119 * p ec.putback #=> "" # no more bytes to put back
4120 *
4121 */
4122static VALUE
4123econv_putback(int argc, VALUE *argv, VALUE self)
4124{
4125 rb_econv_t *ec = check_econv(self);
4126 int n;
4127 int putbackable;
4128 VALUE str, max;
4129
4130 if (!rb_check_arity(argc, 0, 1) || NIL_P(max = argv[0])) {
4131 n = rb_econv_putbackable(ec);
4132 }
4133 else {
4134 n = NUM2INT(max);
4135 putbackable = rb_econv_putbackable(ec);
4136 if (putbackable < n)
4137 n = putbackable;
4138 }
4139
4140 str = rb_str_new(NULL, n);
4141 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4142
4143 if (ec->source_encoding) {
4144 rb_enc_associate(str, ec->source_encoding);
4145 }
4146
4147 return str;
4148}
4149
4150/*
4151 * call-seq:
4152 * ec.last_error -> exception or nil
4153 *
4154 * Returns an exception object for the last conversion.
4155 * Returns nil if the last conversion did not produce an error.
4156 *
4157 * "error" means that
4158 * Encoding::InvalidByteSequenceError and Encoding::UndefinedConversionError for
4159 * Encoding::Converter#convert and
4160 * :invalid_byte_sequence, :incomplete_input and :undefined_conversion for
4161 * Encoding::Converter#primitive_convert.
4162 *
4163 * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4164 * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4165 * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4166 * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4167 * p ec.last_error #=> nil
4168 *
4169 */
4170static VALUE
4171econv_last_error(VALUE self)
4172{
4173 rb_econv_t *ec = check_econv(self);
4174 VALUE exc;
4175
4176 exc = make_econv_exception(ec);
4177 if (NIL_P(exc))
4178 return Qnil;
4179 return exc;
4180}
4181
4182/*
4183 * call-seq:
4184 * ec.replacement -> string
4185 *
4186 * Returns the replacement string.
4187 *
4188 * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4189 * p ec.replacement #=> "?"
4190 *
4191 * ec = Encoding::Converter.new("euc-jp", "utf-8")
4192 * p ec.replacement #=> "\uFFFD"
4193 */
4194static VALUE
4195econv_get_replacement(VALUE self)
4196{
4197 rb_econv_t *ec = check_econv(self);
4198 int ret;
4199 rb_encoding *enc;
4200
4201 ret = make_replacement(ec);
4202 if (ret == -1) {
4203 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4204 }
4205
4206 enc = rb_enc_find(ec->replacement_enc);
4207 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4208}
4209
4210/*
4211 * call-seq:
4212 * ec.replacement = string
4213 *
4214 * Sets the replacement string.
4215 *
4216 * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4217 * ec.replacement = "<undef>"
4218 * p ec.convert("a \u3042 b") #=> "a <undef> b"
4219 */
4220static VALUE
4221econv_set_replacement(VALUE self, VALUE arg)
4222{
4223 rb_econv_t *ec = check_econv(self);
4224 VALUE string = arg;
4225 int ret;
4226 rb_encoding *enc;
4227
4228 StringValue(string);
4229 enc = rb_enc_get(string);
4230
4231 ret = rb_econv_set_replacement(ec,
4232 (const unsigned char *)RSTRING_PTR(string),
4233 RSTRING_LEN(string),
4234 rb_enc_name(enc));
4235
4236 if (ret == -1) {
4237 /* xxx: rb_eInvalidByteSequenceError? */
4238 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4239 }
4240
4241 return arg;
4242}
4243
4244VALUE
4246{
4247 return make_econv_exception(ec);
4248}
4249
4250void
4252{
4253 VALUE exc;
4254
4255 exc = make_econv_exception(ec);
4256 if (NIL_P(exc))
4257 return;
4258 rb_exc_raise(exc);
4259}
4260
4261/*
4262 * call-seq:
4263 * ecerr.source_encoding_name -> string
4264 *
4265 * Returns the source encoding name as a string.
4266 */
4267static VALUE
4268ecerr_source_encoding_name(VALUE self)
4269{
4270 return rb_attr_get(self, id_source_encoding_name);
4271}
4272
4273/*
4274 * call-seq:
4275 * ecerr.source_encoding -> encoding
4276 *
4277 * Returns the source encoding as an encoding object.
4278 *
4279 * Note that the result may not be equal to the source encoding of
4280 * the encoding converter if the conversion has multiple steps.
4281 *
4282 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4283 * begin
4284 * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4285 * rescue Encoding::UndefinedConversionError
4286 * p $!.source_encoding #=> #<Encoding:UTF-8>
4287 * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4288 * p $!.source_encoding_name #=> "UTF-8"
4289 * p $!.destination_encoding_name #=> "EUC-JP"
4290 * end
4291 *
4292 */
4293static VALUE
4294ecerr_source_encoding(VALUE self)
4295{
4296 return rb_attr_get(self, id_source_encoding);
4297}
4298
4299/*
4300 * call-seq:
4301 * ecerr.destination_encoding_name -> string
4302 *
4303 * Returns the destination encoding name as a string.
4304 */
4305static VALUE
4306ecerr_destination_encoding_name(VALUE self)
4307{
4308 return rb_attr_get(self, id_destination_encoding_name);
4309}
4310
/*
 *  call-seq:
 *     ecerr.destination_encoding         -> encoding
 *
 *  Returns the destination encoding as an encoding object.
 */
4317static VALUE
4318ecerr_destination_encoding(VALUE self)
4319{
4320 return rb_attr_get(self, id_destination_encoding);
4321}
4322
4323/*
4324 * call-seq:
4325 * ecerr.error_char -> string
4326 *
4327 * Returns the one-character string which cause Encoding::UndefinedConversionError.
4328 *
4329 * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4330 * begin
4331 * ec.convert("\xa0")
4332 * rescue Encoding::UndefinedConversionError
4333 * puts $!.error_char.dump #=> "\xC2\xA0"
4334 * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4335 * end
4336 *
4337 */
4338static VALUE
4339ecerr_error_char(VALUE self)
4340{
4341 return rb_attr_get(self, id_error_char);
4342}
4343
4344/*
4345 * call-seq:
4346 * ecerr.error_bytes -> string
4347 *
4348 * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4349 *
4350 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4351 * begin
4352 * ec.convert("abc\xA1\xFFdef")
4353 * rescue Encoding::InvalidByteSequenceError
4354 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4355 * puts $!.error_bytes.dump #=> "\xA1"
4356 * puts $!.readagain_bytes.dump #=> "\xFF"
4357 * end
4358 */
4359static VALUE
4360ecerr_error_bytes(VALUE self)
4361{
4362 return rb_attr_get(self, id_error_bytes);
4363}
4364
4365/*
4366 * call-seq:
4367 * ecerr.readagain_bytes -> string
4368 *
4369 * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4370 */
4371static VALUE
4372ecerr_readagain_bytes(VALUE self)
4373{
4374 return rb_attr_get(self, id_readagain_bytes);
4375}
4376
4377/*
4378 * call-seq:
4379 * ecerr.incomplete_input? -> true or false
4380 *
4381 * Returns true if the invalid byte sequence error is caused by
4382 * premature end of string.
4383 *
4384 * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4385 *
4386 * begin
4387 * ec.convert("abc\xA1z")
4388 * rescue Encoding::InvalidByteSequenceError
4389 * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4390 * p $!.incomplete_input? #=> false
4391 * end
4392 *
4393 * begin
4394 * ec.convert("abc\xA1")
4395 * ec.finish
4396 * rescue Encoding::InvalidByteSequenceError
4397 * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4398 * p $!.incomplete_input? #=> true
4399 * end
4400 */
4401static VALUE
4402ecerr_incomplete_input(VALUE self)
4403{
4404 return rb_attr_get(self, id_incomplete_input);
4405}
4406
4407/*
4408 * Document-class: Encoding::UndefinedConversionError
4409 *
4410 * Raised by Encoding and String methods when a transcoding operation
4411 * fails.
4412 */
4413
/*
 *  Document-class: Encoding::InvalidByteSequenceError
 *
 *  Raised by Encoding and String methods when the string being
 *  transcoded contains a byte invalid for either the source or
 *  target encoding.
 */
4421
4422/*
4423 * Document-class: Encoding::ConverterNotFoundError
4424 *
4425 * Raised by transcoding methods when a named encoding does not
4426 * correspond with a known converter.
4427 */
4428
4429void
4430Init_transcode(void)
4431{
4432 transcoder_table = st_init_strcasetable();
4433
4434 id_destination_encoding = rb_intern_const("destination_encoding");
4435 id_destination_encoding_name = rb_intern_const("destination_encoding_name");
4436 id_error_bytes = rb_intern_const("error_bytes");
4437 id_error_char = rb_intern_const("error_char");
4438 id_incomplete_input = rb_intern_const("incomplete_input");
4439 id_readagain_bytes = rb_intern_const("readagain_bytes");
4440 id_source_encoding = rb_intern_const("source_encoding");
4441 id_source_encoding_name = rb_intern_const("source_encoding_name");
4442
4443 sym_invalid = ID2SYM(rb_intern_const("invalid"));
4444 sym_undef = ID2SYM(rb_intern_const("undef"));
4445 sym_replace = ID2SYM(rb_intern_const("replace"));
4446 sym_fallback = ID2SYM(rb_intern_const("fallback"));
4447 sym_xml = ID2SYM(rb_intern_const("xml"));
4448 sym_text = ID2SYM(rb_intern_const("text"));
4449 sym_attr = ID2SYM(rb_intern_const("attr"));
4450
4451 sym_invalid_byte_sequence = ID2SYM(rb_intern_const("invalid_byte_sequence"));
4452 sym_undefined_conversion = ID2SYM(rb_intern_const("undefined_conversion"));
4453 sym_destination_buffer_full = ID2SYM(rb_intern_const("destination_buffer_full"));
4454 sym_source_buffer_empty = ID2SYM(rb_intern_const("source_buffer_empty"));
4455 sym_finished = ID2SYM(rb_intern_const("finished"));
4456 sym_after_output = ID2SYM(rb_intern_const("after_output"));
4457 sym_incomplete_input = ID2SYM(rb_intern_const("incomplete_input"));
4458 sym_universal_newline = ID2SYM(rb_intern_const("universal_newline"));
4459 sym_crlf_newline = ID2SYM(rb_intern_const("crlf_newline"));
4460 sym_cr_newline = ID2SYM(rb_intern_const("cr_newline"));
4461 sym_lf_newline = ID2SYM(rb_intern("lf_newline"));
4462 sym_partial_input = ID2SYM(rb_intern_const("partial_input"));
4463
4464#ifdef ENABLE_ECONV_NEWLINE_OPTION
4465 sym_newline = ID2SYM(rb_intern_const("newline"));
4466 sym_universal = ID2SYM(rb_intern_const("universal"));
4467 sym_crlf = ID2SYM(rb_intern_const("crlf"));
4468 sym_cr = ID2SYM(rb_intern_const("cr"));
4469 sym_lf = ID2SYM(rb_intern_const("lf"));
4470#endif
4471
4472 InitVM(transcode);
4473}
4474
4475void
4476InitVM_transcode(void)
4477{
4478 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
4479 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
4480 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
4481
4482 rb_define_method(rb_cString, "encode", str_encode, -1);
4483 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4484
4485 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
4486 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
4487 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
4488 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
4489 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
4490 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
4491 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
4492 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
4493 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
4494 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
4495 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
4496 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
4497 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
4498 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
4499 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
4500 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
4501 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
4502 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
4503 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
4504
4505 /* Document-const: INVALID_MASK
4506 *
4507 * Mask for invalid byte sequences
4508 */
4509 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
4510
4511 /* Document-const: INVALID_REPLACE
4512 *
4513 * Replace invalid byte sequences
4514 */
4515 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
4516
4517 /* Document-const: UNDEF_MASK
4518 *
4519 * Mask for a valid character in the source encoding but no related
4520 * character(s) in destination encoding.
4521 */
4522 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
4523
4524 /* Document-const: UNDEF_REPLACE
4525 *
4526 * Replace byte sequences that are undefined in the destination encoding.
4527 */
4528 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
4529
4530 /* Document-const: UNDEF_HEX_CHARREF
4531 *
4532 * Replace byte sequences that are undefined in the destination encoding
4533 * with an XML hexadecimal character reference. This is valid for XML
4534 * conversion.
4535 */
4536 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
4537
4538 /* Document-const: PARTIAL_INPUT
4539 *
4540 * Indicates the source may be part of a larger string. See
4541 * primitive_convert for an example.
4542 */
4543 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
4544
4545 /* Document-const: AFTER_OUTPUT
4546 *
4547 * Stop converting after some output is complete but before all of the
4548 * input was consumed. See primitive_convert for an example.
4549 */
4550 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
4551
4552 /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4553 *
4554 * Decorator for converting CRLF and CR to LF
4555 */
4556 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
4557
4558 /* Document-const: LF_NEWLINE_DECORATOR
4559 *
4560 * Decorator for converting CRLF and CR to LF when writing
4561 */
4562 rb_define_const(rb_cEncodingConverter, "LF_NEWLINE_DECORATOR", INT2FIX(ECONV_LF_NEWLINE_DECORATOR));
4563
4564 /* Document-const: CRLF_NEWLINE_DECORATOR
4565 *
4566 * Decorator for converting LF to CRLF
4567 */
4568 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
4569
4570 /* Document-const: CR_NEWLINE_DECORATOR
4571 *
4572 * Decorator for converting LF to CR
4573 */
4574 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
4575
4576 /* Document-const: XML_TEXT_DECORATOR
4577 *
4578 * Escape as XML CharData
4579 */
4580 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
4581
4582 /* Document-const: XML_ATTR_CONTENT_DECORATOR
4583 *
4584 * Escape as XML AttValue
4585 */
4586 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
4587
4588 /* Document-const: XML_ATTR_QUOTE_DECORATOR
4589 *
4590 * Escape as XML AttValue
4591 */
4592 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
4593
4594 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
4595 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4596 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
4597 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
4598 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
4599
4600 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
4601 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
4602 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
4603 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
4604 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
4605 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
4606 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
4607
4608 Init_newline();
4609}
ruby_coderange_type
What rb_enc_str_coderange() returns.
Definition coderange.h:33
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition class.c:920
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR.
Definition transcode.h:539
#define ECONV_AFTER_OUTPUT
Old name of RUBY_ECONV_AFTER_OUTPUT.
Definition transcode.h:555
#define rb_str_new2
Old name of rb_str_new_cstr.
Definition string.h:1675
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Old name of RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR.
Definition transcode.h:532
#define REALLOC_N
Old name of RB_REALLOC_N.
Definition memory.h:397
#define ALLOC
Old name of RB_ALLOC.
Definition memory.h:394
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:145
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Old name of RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR.
Definition transcode.h:537
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define ECONV_INVALID_MASK
Old name of RUBY_ECONV_INVALID_MASK.
Definition transcode.h:523
#define ECONV_CRLF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CRLF_NEWLINE_DECORATOR.
Definition transcode.h:533
#define xrealloc
Old name of ruby_xrealloc.
Definition xmalloc.h:56
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:143
#define ECONV_UNDEF_REPLACE
Old name of RUBY_ECONV_UNDEF_REPLACE.
Definition transcode.h:526
#define ECONV_XML_TEXT_DECORATOR
Old name of RUBY_ECONV_XML_TEXT_DECORATOR.
Definition transcode.h:536
#define rb_ary_new4
Old name of rb_ary_new_from_values.
Definition array.h:653
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define ECONV_CR_NEWLINE_DECORATOR
Old name of RUBY_ECONV_CR_NEWLINE_DECORATOR.
Definition transcode.h:534
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ECONV_INVALID_REPLACE
Old name of RUBY_ECONV_INVALID_REPLACE.
Definition transcode.h:524
#define T_HASH
Old name of RUBY_T_HASH.
Definition value_type.h:65
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:533
#define rb_exc_new3
Old name of rb_exc_new_str.
Definition error.h:38
#define ECONV_UNDEF_MASK
Old name of RUBY_ECONV_UNDEF_MASK.
Definition transcode.h:525
#define Qtrue
Old name of RUBY_Qtrue.
#define ECONV_PARTIAL_INPUT
Old name of RUBY_ECONV_PARTIAL_INPUT.
Definition transcode.h:554
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define ECONV_ERROR_HANDLER_MASK
Old name of RUBY_ECONV_ERROR_HANDLER_MASK.
Definition transcode.h:522
#define INT2NUM
Old name of RB_INT2NUM.
Definition int.h:43
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define ECONV_LF_NEWLINE_DECORATOR
Old name of RUBY_ECONV_LF_NEWLINE_DECORATOR.
Definition transcode.h:535
#define T_ARRAY
Old name of RUBY_T_ARRAY.
Definition value_type.h:56
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:532
#define ECONV_UNDEF_HEX_CHARREF
Old name of RUBY_ECONV_UNDEF_HEX_CHARREF.
Definition transcode.h:527
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ECONV_NEWLINE_DECORATOR_MASK
Old name of RUBY_ECONV_NEWLINE_DECORATOR_MASK.
Definition transcode.h:529
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
void rb_raise(VALUE exc, const char *fmt,...)
Exception entry point.
Definition error.c:3148
void rb_exc_raise(VALUE mesg)
Raises an exception in the current thread.
Definition eval.c:684
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Checks if the given object is of given kind.
Definition error.c:1041
void rb_bug(const char *fmt,...)
Interpreter panic switch.
Definition error.c:794
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1091
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1089
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Identical to rb_typeddata_is_kind_of(), except it raises exceptions instead of returning false.
Definition error.c:1058
VALUE rb_exc_new_str(VALUE etype, VALUE str)
Identical to rb_exc_new_cstr(), except it takes a Ruby string instead of a C string.
Definition error.c:1142
VALUE rb_eArgError
ArgumentError exception.
Definition error.c:1092
VALUE rb_eEncodingError
EncodingError exception.
Definition error.c:1097
void rb_warning(const char *fmt,...)
Issues a warning.
Definition error.c:442
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:190
VALUE rb_cEncoding
Encoding class.
Definition encoding.c:57
VALUE rb_cString
String class.
Definition string.c:79
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3022
Encoding-related APIs.
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1208
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:821
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:719
int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags)
Identical to rb_econv_prepare_opts(), except it additionally takes the initial value of flags.
Definition transcode.c:2579
VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags)
Creates a rb_eConverterNotFoundError exception object (but does not raise).
Definition transcode.c:2076
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Queries an encoding name which best suits for rb_econv_insert_output()'s last parameter.
Definition transcode.c:1510
int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts)
Splits a keyword arguments hash (that for instance String#encode took) into a set of enum ruby_econv_...
Definition transcode.c:2624
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1453
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_incomplete_input
The conversion stopped in middle of reading a character, possibly due to a partial read of a socket e...
Definition transcode.h:69
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_undefined_conversion
The conversion stopped when it found a character in the input which cannot be representable in the ou...
Definition transcode.h:41
@ econv_after_output
The conversion stopped after writing something to somewhere, before reading everything.
Definition transcode.h:63
@ econv_source_buffer_empty
The conversion stopped because there is no input.
Definition transcode.h:51
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
@ econv_invalid_byte_sequence
The conversion stopped when it found an invalid sequence.
Definition transcode.h:35
int rb_econv_putbackable(rb_econv_t *ec)
Queries if rb_econv_putback() makes sense, i.e.
Definition transcode.c:1749
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Queries if there is more than one way to convert between the passed two encodings.
Definition transcode.c:3181
rb_econv_t * rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags)
Creates a new instance of struct rb_econv_t.
Definition transcode.c:1076
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Identical to rb_econv_str_convert(), except it appends the conversion result to the additionally pass...
Definition transcode.c:1898
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags)
Identical to rb_econv_str_append(), except it appends only a part of the passed string with conversio...
Definition transcode.c:1889
const char * rb_econv_asciicompat_encoding(const char *encname)
Queries the passed encoding's corresponding ASCII compatible encoding.
Definition transcode.c:1793
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Appends the passed string to the passed converter's output buffer.
Definition transcode.c:1594
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Identical to rb_econv_convert(), except it takes Ruby's string instead of C's pointer.
Definition transcode.c:1910
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2630
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Identical to rb_econv_decorate_at_first(), except it adds to the opposite direction.
Definition transcode.c:1958
void rb_econv_binmode(rb_econv_t *ec)
This badly named function does not set the destination encoding to binary, but instead just nullifies...
Definition transcode.c:1975
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
"Decorate"s a converter.
Definition transcode.c:1941
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2884
VALUE rb_econv_make_exception(rb_econv_t *ec)
This function makes sense right after rb_econv_convert() returns.
Definition transcode.c:4245
void rb_econv_check_error(rb_econv_t *ec)
This is a rb_econv_make_exception() + rb_exc_raise() combo.
Definition transcode.c:4251
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Identical to rb_econv_str_convert(), except it converts only a part of the passed string.
Definition transcode.c:1904
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1709
VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags)
Converts the passed C's pointer according to the passed converter, then append the conversion result ...
Definition transcode.c:1826
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Puts back the bytes.
Definition transcode.c:1760
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Assigns the replacement string.
Definition transcode.c:2238
VALUE rb_funcallv_public(VALUE recv, ID mid, int argc, const VALUE *argv)
Identical to rb_funcallv(), except it only takes public methods into account.
Definition vm_eval.c:1154
#define rb_check_frozen
Just another name of rb_check_frozen.
Definition error.h:264
VALUE rb_proc_call(VALUE recv, VALUE args)
Evaluates the passed proc with the passed arguments.
Definition proc.c:1003
VALUE rb_obj_is_method(VALUE recv)
Queries if the given object is a method.
Definition proc.c:1637
VALUE rb_method_call(int argc, const VALUE *argv, VALUE recv)
Evaluates the passed method with the passed arguments.
Definition proc.c:2489
VALUE rb_obj_is_proc(VALUE recv)
Queries if the given object is a proc.
Definition proc.c:175
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1565
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1618
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:871
VALUE rb_str_new_frozen(VALUE str)
Creates a frozen copy of the string, if necessary.
Definition string.c:1382
VALUE rb_str_dup(VALUE str)
Duplicates a string.
Definition string.c:1834
void rb_str_modify(VALUE str)
Declares that the string is about to be modified.
Definition string.c:2437
void rb_str_set_len(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3019
VALUE rb_str_resize(VALUE str, long len)
Overwrites the length of the string.
Definition string.c:3036
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2445
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:6791
VALUE rb_str_buf_new(long capa)
Allocates a "string buffer".
Definition string.c:1532
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5223
VALUE rb_attr_get(VALUE obj, ID name)
Identical to rb_ivar_get()
Definition variable.c:1223
VALUE rb_ivar_set(VALUE obj, ID name, VALUE val)
Identical to rb_iv_set(), except it accepts the name as an ID instead of a C string.
Definition variable.c:1593
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2823
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition symbol.c:942
void rb_define_const(VALUE klass, const char *name, VALUE val)
Defines a Ruby level constant under a namespace.
Definition variable.c:3427
VALUE rb_sprintf(const char *fmt,...)
Ruby's extended sprintf(3).
Definition sprintf.c:1219
VALUE rb_str_catf(VALUE dst, const char *fmt,...)
Identical to rb_sprintf(), except it renders the output to the specified object rather than creating ...
Definition sprintf.c:1242
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:366
#define ALLOCA_N(type, n)
Definition memory.h:286
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:161
#define MEMMOVE(p1, p2, type, n)
Handy macro to call memmove.
Definition memory.h:378
#define RARRAY_LEN
Just another name of rb_array_len.
Definition rarray.h:68
#define RARRAY_AREF(a, i)
Definition rarray.h:583
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:71
@ RSTRING_EMBED_LEN_MAX
Max possible number of characters that can be embedded.
Definition rstring.h:215
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:72
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:95
#define TypedData_Get_Struct(obj, type, data_type, sval)
Obtains a C struct from inside of a wrapper Ruby object.
Definition rtypeddata.h:507
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:441
const char * rb_obj_classname(VALUE obj)
Queries the name of the class of the passed object.
Definition variable.c:322
#define InitVM(ext)
This macro is for internal use.
Definition ruby.h:230
#define RTEST
This is an old name of RB_TEST.
Definition st.h:79
Definition string.c:7746
Definition transcode.c:175
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52