Ruby 2.1.3p242 (2014-09-19 revision 47630)
transcode.c
Go to the documentation of this file.
1 /**********************************************************************
2 
3  transcode.c -
4 
5  $Author: naruse $
6  created at: Tue Oct 30 16:10:22 JST 2007
7 
8  Copyright (C) 2007 Martin Duerst
9 
10 **********************************************************************/
11 
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
14 #include "internal.h"
15 #include "transcode_data.h"
16 #include <ctype.h>
17 
18 #define ENABLE_ECONV_NEWLINE_OPTION 1
19 
20 /* VALUE rb_cEncoding = rb_define_class("Encoding", rb_cObject); */
24 
26 
32 #ifdef ENABLE_ECONV_NEWLINE_OPTION
34 #endif
36 
44 
45 static unsigned char *
46 allocate_converted_string(const char *sname, const char *dname,
47  const unsigned char *str, size_t len,
48  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
49  size_t *dst_len_ptr);
50 
51 /* dynamic structure, one per conversion (similar to iconv_t) */
52 /* may carry conversion state (e.g. for iso-2022-jp) */
53 typedef struct rb_transcoding {
55 
56  int flags;
57 
59  unsigned int next_table;
61  unsigned char next_byte;
62  unsigned int output_index;
63 
64  ssize_t recognized_len; /* already interpreted */
65  ssize_t readagain_len; /* not yet interpreted */
66  union {
67  unsigned char ary[8]; /* max_input <= sizeof(ary) */
68  unsigned char *ptr; /* length: max_input */
69  } readbuf; /* recognized_len + readagain_len used */
70 
71  ssize_t writebuf_off;
72  ssize_t writebuf_len;
73  union {
74  unsigned char ary[8]; /* max_output <= sizeof(ary) */
75  unsigned char *ptr; /* length: max_output */
76  } writebuf;
77 
78  union rb_transcoding_state_t { /* opaque data for stateful encoding */
79  void *ptr;
80  char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
82  } state;
/*
 * Accessors for the storage slots of an rb_transcoding.  Each slot is a
 * union of a small embedded array (ary) and a pointer (ptr); the macros
 * choose the member by comparing the size declared by the transcoder with
 * the size of the embedded array.
 */
/* Read buffer: readbuf.ary when max_input fits in it, otherwise readbuf.ptr. */
84 #define TRANSCODING_READBUF(tc) \
85  ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
86  (tc)->readbuf.ary : \
87  (tc)->readbuf.ptr)
/* Write buffer: writebuf.ary when max_output fits in it, otherwise writebuf.ptr. */
88 #define TRANSCODING_WRITEBUF(tc) \
89  ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
90  (tc)->writebuf.ary : \
91  (tc)->writebuf.ptr)
/* Byte capacity of the buffer selected by TRANSCODING_WRITEBUF: the embedded
 * array's size, or max_output (as size_t) when the pointer member is used. */
92 #define TRANSCODING_WRITEBUF_SIZE(tc) \
93  ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
94  sizeof((tc)->writebuf.ary) : \
95  (size_t)(tc)->transcoder->max_output)
/* Largest state_size that can live inside the state union itself. */
96 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/* Per-conversion state: state.ary when state_size fits, otherwise state.ptr. */
97 #define TRANSCODING_STATE(tc) \
98  ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
99  (tc)->state.ary : \
100  (tc)->state.ptr)
101 
102 typedef struct {
104  unsigned char *out_buf_start;
105  unsigned char *out_data_start;
106  unsigned char *out_data_end;
107  unsigned char *out_buf_end;
110 
111 struct rb_econv_t {
112  int flags;
113  const char *source_encoding_name;
115 
116  int started;
117 
118  const unsigned char *replacement_str;
120  const char *replacement_enc;
122 
123  unsigned char *in_buf_start;
124  unsigned char *in_data_start;
125  unsigned char *in_data_end;
126  unsigned char *in_buf_end;
132 
133  /* last error */
134  struct {
137  const char *source_encoding;
138  const char *destination_encoding;
139  const unsigned char *error_bytes_start;
142  } last_error;
143 
144  /* The following fields are only for Encoding::Converter.
145  * rb_econv_open sets them to NULL. */
148 };
149 
150 /*
151  * Dispatch data and logic
152  */
153 
/* True when sname is the empty string; dname is accepted but never examined. */
154 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
155 
156 typedef struct {
157  const char *sname;
158  const char *dname;
159  const char *lib; /* null means no need to load a library */
162 
164 
165 static transcoder_entry_t *
166 make_transcoder_entry(const char *sname, const char *dname)
167 {
168  st_data_t val;
169  st_table *table2;
170 
171  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
173  st_add_direct(transcoder_table, (st_data_t)sname, val);
174  }
175  table2 = (st_table *)val;
176  if (!st_lookup(table2, (st_data_t)dname, &val)) {
178  entry->sname = sname;
179  entry->dname = dname;
180  entry->lib = NULL;
181  entry->transcoder = NULL;
182  val = (st_data_t)entry;
183  st_add_direct(table2, (st_data_t)dname, val);
184  }
185  return (transcoder_entry_t *)val;
186 }
187 
188 static transcoder_entry_t *
189 get_transcoder_entry(const char *sname, const char *dname)
190 {
191  st_data_t val;
192  st_table *table2;
193 
194  if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
195  return NULL;
196  }
197  table2 = (st_table *)val;
198  if (!st_lookup(table2, (st_data_t)dname, &val)) {
199  return NULL;
200  }
201  return (transcoder_entry_t *)val;
202 }
203 
204 void
206 {
207  const char *const sname = tr->src_encoding;
208  const char *const dname = tr->dst_encoding;
209 
210  transcoder_entry_t *entry;
211 
212  entry = make_transcoder_entry(sname, dname);
213  if (entry->transcoder) {
214  rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
215  sname, dname);
216  }
217 
218  entry->transcoder = tr;
219 }
220 
221 static void
222 declare_transcoder(const char *sname, const char *dname, const char *lib)
223 {
224  transcoder_entry_t *entry;
225 
226  entry = make_transcoder_entry(sname, dname);
227  entry->lib = lib;
228 }
229 
/* Path prefix placed in front of an entry's lib name when the path of the
 * transcoder library to require is built. */
230 static const char transcoder_lib_prefix[] = "enc/trans/";
231 
232 void
233 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
234 {
235  if (!lib) {
236  rb_raise(rb_eArgError, "invalid library name - (null)");
237  }
238  declare_transcoder(enc1, enc2, lib);
239 }
240 
/* Two encoding names are treated as equal when STRCASECMP reports 0 for them. */
241 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
242 
243 typedef struct search_path_queue_tag {
245  const char *enc;
247 
248 typedef struct {
252  const char *base_enc;
254 
255 static int
257 {
258  const char *dname = (const char *)key;
259  search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
261 
262  if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
263  return ST_CONTINUE;
264  }
265 
267  q->enc = dname;
268  q->next = NULL;
269  *bfs->queue_last_ptr = q;
270  bfs->queue_last_ptr = &q->next;
271 
272  st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
273  return ST_CONTINUE;
274 }
275 
276 static int
277 transcode_search_path(const char *sname, const char *dname,
278  void (*callback)(const char *sname, const char *dname, int depth, void *arg),
279  void *arg)
280 {
281  search_path_bfs_t bfs;
283  st_data_t val;
284  st_table *table2;
285  int found;
286  int pathlen = -1;
287 
288  if (encoding_equal(sname, dname))
289  return -1;
290 
292  q->enc = sname;
293  q->next = NULL;
294  bfs.queue_last_ptr = &q->next;
295  bfs.queue = q;
296 
299 
300  while (bfs.queue) {
301  q = bfs.queue;
302  bfs.queue = q->next;
303  if (!bfs.queue)
304  bfs.queue_last_ptr = &bfs.queue;
305 
306  if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
307  xfree(q);
308  continue;
309  }
310  table2 = (st_table *)val;
311 
312  if (st_lookup(table2, (st_data_t)dname, &val)) {
313  st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
314  xfree(q);
315  found = 1;
316  goto cleanup;
317  }
318 
319  bfs.base_enc = q->enc;
321  bfs.base_enc = NULL;
322 
323  xfree(q);
324  }
325  found = 0;
326 
327  cleanup:
328  while (bfs.queue) {
329  q = bfs.queue;
330  bfs.queue = q->next;
331  xfree(q);
332  }
333 
334  if (found) {
335  const char *enc = dname;
336  int depth;
337  pathlen = 0;
338  while (1) {
339  st_lookup(bfs.visited, (st_data_t)enc, &val);
340  if (!val)
341  break;
342  pathlen++;
343  enc = (const char *)val;
344  }
345  depth = pathlen;
346  enc = dname;
347  while (1) {
348  st_lookup(bfs.visited, (st_data_t)enc, &val);
349  if (!val)
350  break;
351  callback((const char *)val, enc, --depth, arg);
352  enc = (const char *)val;
353  }
354  }
355 
356  st_free_table(bfs.visited);
357 
358  return pathlen; /* is -1 if not found */
359 }
360 
361 static const rb_transcoder *
363 {
364  if (entry->transcoder)
365  return entry->transcoder;
366 
367  if (entry->lib) {
368  const char *const lib = entry->lib;
369  const size_t len = strlen(lib);
370  const size_t total_len = sizeof(transcoder_lib_prefix) - 1 + len;
371  const VALUE fn = rb_str_new(0, total_len);
372  char *const path = RSTRING_PTR(fn);
373  const int safe = rb_safe_level();
374 
375  entry->lib = NULL;
376 
377  memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
378  memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len);
379  rb_str_set_len(fn, total_len);
380  FL_UNSET(fn, FL_TAINT);
381  OBJ_FREEZE(fn);
382  if (!rb_require_safe(fn, safe > 3 ? 3 : safe))
383  return NULL;
384  }
385 
386  if (entry->transcoder)
387  return entry->transcoder;
388 
389  return NULL;
390 }
391 
/*
 * Pick the replacement character for the encoding called encname.
 *
 * For "UTF-8" (compared with encoding_equal) the result is the three bytes
 * EF BF BD, reported as encoded in "UTF-8".  For every other name the
 * result is "?", reported as encoded in "US-ASCII".
 *
 * len_ret receives the byte length of the returned string and
 * repl_encname_ptr the name of the encoding it is written in.  The return
 * value and *repl_encname_ptr point at string literals.
 */
static const char*
get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
{
    const int is_utf8 = encoding_equal(encname, "UTF-8");

    *len_ret = is_utf8 ? 3 : 1;
    *repl_encname_ptr = is_utf8 ? "UTF-8" : "US-ASCII";
    return is_utf8 ? "\xEF\xBF\xBD" : "?";
}
406 
407 /*
408  * Transcoding engine logic
409  */
410 
411 static const unsigned char *
413  const unsigned char *in_start,
414  const unsigned char *inchar_start,
415  const unsigned char *in_p,
416  size_t *char_len_ptr)
417 {
418  const unsigned char *ptr;
419  if (inchar_start - in_start < tc->recognized_len) {
421  inchar_start, unsigned char, in_p - inchar_start);
422  ptr = TRANSCODING_READBUF(tc);
423  }
424  else {
425  ptr = inchar_start - tc->recognized_len;
426  }
427  *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
428  return ptr;
429 }
430 
431 static rb_econv_result_t
432 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
433  const unsigned char *in_stop, unsigned char *out_stop,
434  rb_transcoding *tc,
435  const int opt)
436 {
437  const rb_transcoder *tr = tc->transcoder;
438  int unitlen = tr->input_unit_length;
439  ssize_t readagain_len = 0;
440 
441  const unsigned char *inchar_start;
442  const unsigned char *in_p;
443 
444  unsigned char *out_p;
445 
446  in_p = inchar_start = *in_pos;
447 
448  out_p = *out_pos;
449 
450 #define SUSPEND(ret, num) \
451  do { \
452  tc->resume_position = (num); \
453  if (0 < in_p - inchar_start) \
454  MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
455  inchar_start, unsigned char, in_p - inchar_start); \
456  *in_pos = in_p; \
457  *out_pos = out_p; \
458  tc->recognized_len += in_p - inchar_start; \
459  if (readagain_len) { \
460  tc->recognized_len -= readagain_len; \
461  tc->readagain_len = readagain_len; \
462  } \
463  return (ret); \
464  resume_label ## num:; \
465  } while (0)
466 #define SUSPEND_OBUF(num) \
467  do { \
468  while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
469  } while (0)
470 
471 #define SUSPEND_AFTER_OUTPUT(num) \
472  if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
473  SUSPEND(econv_after_output, num); \
474  }
475 
476 #define next_table (tc->next_table)
477 #define next_info (tc->next_info)
478 #define next_byte (tc->next_byte)
479 #define writebuf_len (tc->writebuf_len)
480 #define writebuf_off (tc->writebuf_off)
481 
482  switch (tc->resume_position) {
483  case 0: break;
484  case 1: goto resume_label1;
485  case 2: goto resume_label2;
486  case 3: goto resume_label3;
487  case 4: goto resume_label4;
488  case 5: goto resume_label5;
489  case 6: goto resume_label6;
490  case 7: goto resume_label7;
491  case 8: goto resume_label8;
492  case 9: goto resume_label9;
493  case 10: goto resume_label10;
494  case 11: goto resume_label11;
495  case 12: goto resume_label12;
496  case 13: goto resume_label13;
497  case 14: goto resume_label14;
498  case 15: goto resume_label15;
499  case 16: goto resume_label16;
500  case 17: goto resume_label17;
501  case 18: goto resume_label18;
502  case 19: goto resume_label19;
503  case 20: goto resume_label20;
504  case 21: goto resume_label21;
505  case 22: goto resume_label22;
506  case 23: goto resume_label23;
507  case 24: goto resume_label24;
508  case 25: goto resume_label25;
509  case 26: goto resume_label26;
510  case 27: goto resume_label27;
511  case 28: goto resume_label28;
512  case 29: goto resume_label29;
513  case 30: goto resume_label30;
514  case 31: goto resume_label31;
515  case 32: goto resume_label32;
516  case 33: goto resume_label33;
517  case 34: goto resume_label34;
518  }
519 
520  while (1) {
521  inchar_start = in_p;
522  tc->recognized_len = 0;
523  next_table = tr->conv_tree_start;
524 
526 
527  if (in_stop <= in_p) {
528  if (!(opt & ECONV_PARTIAL_INPUT))
529  break;
531  continue;
532  }
533 
534 #define BYTE_ADDR(index) (tr->byte_array + (index))
535 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
536 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
537 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
538 #define BL_MIN_BYTE (BL_BASE[0])
539 #define BL_MAX_BYTE (BL_BASE[1])
540 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
541 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
542 
543  next_byte = (unsigned char)*in_p++;
544  follow_byte:
545  if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
546  next_info = INVALID;
547  else {
548  next_info = (VALUE)BL_ACTION(next_byte);
549  }
550  follow_info:
551  switch (next_info & 0x1F) {
552  case NOMAP:
553  {
554  const unsigned char *p = inchar_start;
555  writebuf_off = 0;
556  while (p < in_p) {
557  TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
558  }
559  writebuf_len = writebuf_off;
560  writebuf_off = 0;
561  while (writebuf_off < writebuf_len) {
562  SUSPEND_OBUF(3);
563  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
564  }
565  }
566  continue;
567  case 0x00: case 0x04: case 0x08: case 0x0C:
568  case 0x10: case 0x14: case 0x18: case 0x1C:
570  while (in_p >= in_stop) {
571  if (!(opt & ECONV_PARTIAL_INPUT))
572  goto incomplete;
574  }
575  next_byte = (unsigned char)*in_p++;
576  next_table = (unsigned int)next_info;
577  goto follow_byte;
578  case ZERObt: /* drop input */
579  continue;
580  case ONEbt:
581  SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
582  continue;
583  case TWObt:
584  SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
585  SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
586  continue;
587  case THREEbt:
588  SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
589  SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
590  SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
591  continue;
592  case FOURbt:
593  SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
594  SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
595  SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
596  SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
597  continue;
598  case GB4bt:
599  SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
600  SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
601  SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
602  SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
603  continue;
604  case STR1:
605  tc->output_index = 0;
606  while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
607  SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
608  tc->output_index++;
609  }
610  continue;
611  case FUNii:
612  next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
613  goto follow_info;
614  case FUNsi:
615  {
616  const unsigned char *char_start;
617  size_t char_len;
618  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
619  next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
620  goto follow_info;
621  }
622  case FUNio:
623  SUSPEND_OBUF(13);
624  if (tr->max_output <= out_stop - out_p)
625  out_p += tr->func_io(TRANSCODING_STATE(tc),
626  next_info, out_p, out_stop - out_p);
627  else {
628  writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
629  next_info,
631  writebuf_off = 0;
632  while (writebuf_off < writebuf_len) {
633  SUSPEND_OBUF(20);
634  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
635  }
636  }
637  break;
638  case FUNso:
639  {
640  const unsigned char *char_start;
641  size_t char_len;
642  SUSPEND_OBUF(14);
643  if (tr->max_output <= out_stop - out_p) {
644  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
645  out_p += tr->func_so(TRANSCODING_STATE(tc),
646  char_start, (size_t)char_len,
647  out_p, out_stop - out_p);
648  }
649  else {
650  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
651  writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
652  char_start, (size_t)char_len,
654  writebuf_off = 0;
655  while (writebuf_off < writebuf_len) {
656  SUSPEND_OBUF(22);
657  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
658  }
659  }
660  break;
661  }
662  case FUNsio:
663  {
664  const unsigned char *char_start;
665  size_t char_len;
666  SUSPEND_OBUF(33);
667  if (tr->max_output <= out_stop - out_p) {
668  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
669  out_p += tr->func_sio(TRANSCODING_STATE(tc),
670  char_start, (size_t)char_len, next_info,
671  out_p, out_stop - out_p);
672  }
673  else {
674  char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
675  writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
676  char_start, (size_t)char_len, next_info,
678  writebuf_off = 0;
679  while (writebuf_off < writebuf_len) {
680  SUSPEND_OBUF(34);
681  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
682  }
683  }
684  break;
685  }
686  case INVALID:
687  if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
688  if (tc->recognized_len + (in_p - inchar_start) < unitlen)
690  while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
691  in_p = in_stop;
693  }
694  if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
695  in_p = in_stop;
696  }
697  else {
698  in_p = inchar_start + (unitlen - tc->recognized_len);
699  }
700  }
701  else {
702  ssize_t invalid_len; /* including the last byte which causes invalid */
703  ssize_t discard_len;
704  invalid_len = tc->recognized_len + (in_p - inchar_start);
705  discard_len = ((invalid_len - 1) / unitlen) * unitlen;
706  readagain_len = invalid_len - discard_len;
707  }
708  goto invalid;
709  case UNDEF:
710  goto undef;
711  default:
712  rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
713  }
714  continue;
715 
716  invalid:
718  continue;
719 
720  incomplete:
722  continue;
723 
724  undef:
726  continue;
727  }
728 
729  /* cleanup */
730  if (tr->finish_func) {
731  SUSPEND_OBUF(4);
732  if (tr->max_output <= out_stop - out_p) {
733  out_p += tr->finish_func(TRANSCODING_STATE(tc),
734  out_p, out_stop - out_p);
735  }
736  else {
737  writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
739  writebuf_off = 0;
740  while (writebuf_off < writebuf_len) {
741  SUSPEND_OBUF(23);
742  *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
743  }
744  }
745  }
746  while (1)
748 #undef SUSPEND
749 #undef next_table
750 #undef next_info
751 #undef next_byte
752 #undef writebuf_len
753 #undef writebuf_off
754 }
755 
756 static rb_econv_result_t
757 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
758  const unsigned char *in_stop, unsigned char *out_stop,
759  rb_transcoding *tc,
760  const int opt)
761 {
762  if (tc->readagain_len) {
763  unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
764  const unsigned char *readagain_pos = readagain_buf;
765  const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
766  rb_econv_result_t res;
767 
768  MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
769  unsigned char, tc->readagain_len);
770  tc->readagain_len = 0;
771  res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
772  if (res != econv_source_buffer_empty) {
774  readagain_pos, unsigned char, readagain_stop - readagain_pos);
775  tc->readagain_len += readagain_stop - readagain_pos;
776  return res;
777  }
778  }
779  return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
780 }
781 
782 static rb_transcoding *
784 {
785  rb_transcoding *tc;
786 
787  tc = ALLOC(rb_transcoding);
788  tc->transcoder = tr;
789  tc->flags = flags;
790  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
791  tc->state.ptr = xmalloc(tr->state_size);
792  if (tr->state_init_func) {
793  (tr->state_init_func)(TRANSCODING_STATE(tc)); /* xxx: check return value */
794  }
795  tc->resume_position = 0;
796  tc->recognized_len = 0;
797  tc->readagain_len = 0;
798  tc->writebuf_len = 0;
799  tc->writebuf_off = 0;
800  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
801  tc->readbuf.ptr = xmalloc(tr->max_input);
802  }
803  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
804  tc->writebuf.ptr = xmalloc(tr->max_output);
805  }
806  return tc;
807 }
808 
809 static rb_econv_result_t
811  const unsigned char **input_ptr, const unsigned char *input_stop,
812  unsigned char **output_ptr, unsigned char *output_stop,
813  int flags)
814 {
815  return transcode_restartable(
816  input_ptr, output_ptr,
817  input_stop, output_stop,
818  tc, flags);
819 }
820 
821 static void
823 {
824  const rb_transcoder *tr = tc->transcoder;
825  if (tr->state_fini_func) {
826  (tr->state_fini_func)(TRANSCODING_STATE(tc)); /* check return value? */
827  }
828  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
829  xfree(tc->state.ptr);
830  if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
831  xfree(tc->readbuf.ptr);
832  if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
833  xfree(tc->writebuf.ptr);
834  xfree(tc);
835 }
836 
837 static size_t
839 {
840  size_t size = sizeof(rb_transcoding);
841  const rb_transcoder *tr = tc->transcoder;
842 
843  if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
844  size += tr->state_size;
845  }
846  if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
847  size += tr->max_input;
848  }
849  if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
850  size += tr->max_output;
851  }
852  return size;
853 }
854 
855 static rb_econv_t *
856 rb_econv_alloc(int n_hint)
857 {
858  rb_econv_t *ec;
859 
860  if (n_hint <= 0)
861  n_hint = 1;
862 
863  ec = ALLOC(rb_econv_t);
864  ec->flags = 0;
867  ec->started = 0;
868  ec->replacement_str = NULL;
869  ec->replacement_len = 0;
870  ec->replacement_enc = NULL;
871  ec->replacement_allocated = 0;
872  ec->in_buf_start = NULL;
873  ec->in_data_start = NULL;
874  ec->in_data_end = NULL;
875  ec->in_buf_end = NULL;
876  ec->num_allocated = n_hint;
877  ec->num_trans = 0;
879  ec->num_finished = 0;
880  ec->last_tc = NULL;
882  ec->last_error.error_tc = NULL;
886  ec->last_error.error_bytes_len = 0;
887  ec->last_error.readagain_len = 0;
888  ec->source_encoding = NULL;
890  return ec;
891 }
892 
893 static int
895 {
896  int n, j;
897  int bufsize = 4096;
898  unsigned char *p;
899 
900  if (ec->num_trans == ec->num_allocated) {
901  n = ec->num_allocated * 2;
903  ec->num_allocated = n;
904  }
905 
906  p = xmalloc(bufsize);
907 
908  MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
909 
911  ec->elems[i].out_buf_start = p;
912  ec->elems[i].out_buf_end = p + bufsize;
913  ec->elems[i].out_data_start = p;
914  ec->elems[i].out_data_end = p;
916 
917  ec->num_trans++;
918 
919  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
920  for (j = ec->num_trans-1; i <= j; j--) {
921  rb_transcoding *tc = ec->elems[j].tc;
922  const rb_transcoder *tr2 = tc->transcoder;
923  if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
924  ec->last_tc = tc;
925  break;
926  }
927  }
928 
929  return 0;
930 }
931 
932 static rb_econv_t *
934 {
935  rb_econv_t *ec;
936  int i, ret;
937 
938  for (i = 0; i < n; i++) {
939  const rb_transcoder *tr;
940  tr = load_transcoder_entry(entries[i]);
941  if (!tr)
942  return NULL;
943  }
944 
945  ec = rb_econv_alloc(n);
946 
947  for (i = 0; i < n; i++) {
948  const rb_transcoder *tr = load_transcoder_entry(entries[i]);
949  ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
950  if (ret == -1) {
951  rb_econv_close(ec);
952  return NULL;
953  }
954  }
955 
956  return ec;
957 }
958 
959 struct trans_open_t {
962 };
963 
964 static void
965 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
966 {
967  struct trans_open_t *toarg = arg;
968 
969  if (!toarg->entries) {
970  toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
971  }
972  toarg->entries[depth] = get_transcoder_entry(sname, dname);
973 }
974 
975 static rb_econv_t *
976 rb_econv_open0(const char *sname, const char *dname, int ecflags)
977 {
979  int num_trans;
980  rb_econv_t *ec;
981 
982  int sidx, didx;
983 
984  if (*sname) {
985  sidx = rb_enc_find_index(sname);
986  if (0 <= sidx) {
987  rb_enc_from_index(sidx);
988  }
989  }
990 
991  if (*dname) {
992  didx = rb_enc_find_index(dname);
993  if (0 <= didx) {
994  rb_enc_from_index(didx);
995  }
996  }
997 
998  if (*sname == '\0' && *dname == '\0') {
999  num_trans = 0;
1000  entries = NULL;
1001  }
1002  else {
1003  struct trans_open_t toarg;
1004  toarg.entries = NULL;
1005  toarg.num_additional = 0;
1006  num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
1007  entries = toarg.entries;
1008  if (num_trans < 0) {
1009  xfree(entries);
1010  return NULL;
1011  }
1012  }
1013 
1014  ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
1015  xfree(entries);
1016  if (!ec)
1017  return NULL;
1018 
1019  ec->flags = ecflags;
1020  ec->source_encoding_name = sname;
1021  ec->destination_encoding_name = dname;
1022 
1023  return ec;
1024 }
1025 
/* Capacity of the decorator-name array that rb_econv_open() passes to
 * decorator_names(). */
1026 #define MAX_ECFLAGS_DECORATORS 32
1027 
1028 static int
1029 decorator_names(int ecflags, const char **decorators_ret)
1030 {
1031  int num_decorators;
1032 
1033  switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
1037  case 0:
1038  break;
1039  default:
1040  return -1;
1041  }
1042 
1043  if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
1045  return -1;
1046 
1047  num_decorators = 0;
1048 
1049  if (ecflags & ECONV_XML_TEXT_DECORATOR)
1050  decorators_ret[num_decorators++] = "xml_text_escape";
1051  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
1052  decorators_ret[num_decorators++] = "xml_attr_content_escape";
1053  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
1054  decorators_ret[num_decorators++] = "xml_attr_quote";
1055 
1056  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
1057  decorators_ret[num_decorators++] = "crlf_newline";
1058  if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
1059  decorators_ret[num_decorators++] = "cr_newline";
1060  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
1061  decorators_ret[num_decorators++] = "universal_newline";
1062 
1063  return num_decorators;
1064 }
1065 
1066 rb_econv_t *
1067 rb_econv_open(const char *sname, const char *dname, int ecflags)
1068 {
1069  rb_econv_t *ec;
1070  int num_decorators;
1071  const char *decorators[MAX_ECFLAGS_DECORATORS];
1072  int i;
1073 
1074  num_decorators = decorator_names(ecflags, decorators);
1075  if (num_decorators == -1)
1076  return NULL;
1077 
1078  ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
1079  if (!ec)
1080  return NULL;
1081 
1082  for (i = 0; i < num_decorators; i++)
1083  if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
1084  rb_econv_close(ec);
1085  return NULL;
1086  }
1087 
1088  ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
1089 
1090  return ec;
1091 }
1092 
1093 static int
1095  const unsigned char **input_ptr, const unsigned char *input_stop,
1096  unsigned char **output_ptr, unsigned char *output_stop,
1097  int flags,
1098  int start)
1099 {
1100  int try;
1101  int i, f;
1102 
1103  const unsigned char **ipp, *is, *iold;
1104  unsigned char **opp, *os, *oold;
1105  rb_econv_result_t res;
1106 
1107  try = 1;
1108  while (try) {
1109  try = 0;
1110  for (i = start; i < ec->num_trans; i++) {
1111  rb_econv_elem_t *te = &ec->elems[i];
1112 
1113  if (i == 0) {
1114  ipp = input_ptr;
1115  is = input_stop;
1116  }
1117  else {
1118  rb_econv_elem_t *prev_te = &ec->elems[i-1];
1119  ipp = (const unsigned char **)&prev_te->out_data_start;
1120  is = prev_te->out_data_end;
1121  }
1122 
1123  if (i == ec->num_trans-1) {
1124  opp = output_ptr;
1125  os = output_stop;
1126  }
1127  else {
1128  if (te->out_buf_start != te->out_data_start) {
1129  ssize_t len = te->out_data_end - te->out_data_start;
1130  ssize_t off = te->out_data_start - te->out_buf_start;
1131  MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
1132  te->out_data_start = te->out_buf_start;
1133  te->out_data_end -= off;
1134  }
1135  opp = &te->out_data_end;
1136  os = te->out_buf_end;
1137  }
1138 
1139  f = flags;
1140  if (ec->num_finished != i)
1141  f |= ECONV_PARTIAL_INPUT;
1142  if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
1143  start = 1;
1144  flags &= ~ECONV_AFTER_OUTPUT;
1145  }
1146  if (i != 0)
1147  f &= ~ECONV_AFTER_OUTPUT;
1148  iold = *ipp;
1149  oold = *opp;
1150  te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
1151  if (iold != *ipp || oold != *opp)
1152  try = 1;
1153 
1154  switch (res) {
1158  case econv_after_output:
1159  return i;
1160 
1163  break;
1164 
1165  case econv_finished:
1166  ec->num_finished = i+1;
1167  break;
1168  }
1169  }
1170  }
1171  return -1;
1172 }
1173 
1174 static rb_econv_result_t
1176  const unsigned char **input_ptr, const unsigned char *input_stop,
1177  unsigned char **output_ptr, unsigned char *output_stop,
1178  int flags,
1179  int *result_position_ptr)
1180 {
1181  int i;
1182  int needreport_index;
1183  int sweep_start;
1184 
1185  unsigned char empty_buf;
1186  unsigned char *empty_ptr = &empty_buf;
1187 
1188  if (!input_ptr) {
1189  input_ptr = (const unsigned char **)&empty_ptr;
1190  input_stop = empty_ptr;
1191  }
1192 
1193  if (!output_ptr) {
1194  output_ptr = &empty_ptr;
1195  output_stop = empty_ptr;
1196  }
1197 
1198  if (ec->elems[0].last_result == econv_after_output)
1200 
1201  needreport_index = -1;
1202  for (i = ec->num_trans-1; 0 <= i; i--) {
1203  switch (ec->elems[i].last_result) {
1207  case econv_after_output:
1208  case econv_finished:
1209  sweep_start = i+1;
1210  needreport_index = i;
1211  goto found_needreport;
1212 
1215  break;
1216 
1217  default:
1218  rb_bug("unexpected transcode last result");
1219  }
1220  }
1221 
1222  /* /^[sd]+$/ is confirmed. but actually /^s*d*$/. */
1223 
1225  (flags & ECONV_AFTER_OUTPUT)) {
1226  rb_econv_result_t res;
1227 
1228  res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
1229  (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
1230  result_position_ptr);
1231 
1232  if (res == econv_source_buffer_empty)
1233  return econv_after_output;
1234  return res;
1235  }
1236 
1237  sweep_start = 0;
1238 
1239  found_needreport:
1240 
1241  do {
1242  needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
1243  sweep_start = needreport_index + 1;
1244  } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
1245 
1246  for (i = ec->num_trans-1; 0 <= i; i--) {
1248  rb_econv_result_t res = ec->elems[i].last_result;
1249  if (res == econv_invalid_byte_sequence ||
1250  res == econv_incomplete_input ||
1251  res == econv_undefined_conversion ||
1252  res == econv_after_output) {
1254  }
1255  if (result_position_ptr)
1256  *result_position_ptr = i;
1257  return res;
1258  }
1259  }
1260  if (result_position_ptr)
1261  *result_position_ptr = -1;
1263 }
1264 
1265 static rb_econv_result_t
1267  const unsigned char **input_ptr, const unsigned char *input_stop,
1268  unsigned char **output_ptr, unsigned char *output_stop,
1269  int flags)
1270 {
1271  rb_econv_result_t res;
1272  int result_position;
1273  int has_output = 0;
1274 
1275  memset(&ec->last_error, 0, sizeof(ec->last_error));
1276 
1277  if (ec->num_trans == 0) {
1278  size_t len;
1279  if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
1280  if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
1281  len = output_stop - *output_ptr;
1282  memcpy(*output_ptr, ec->in_data_start, len);
1283  *output_ptr = output_stop;
1284  ec->in_data_start += len;
1286  goto gotresult;
1287  }
1288  len = ec->in_data_end - ec->in_data_start;
1289  memcpy(*output_ptr, ec->in_data_start, len);
1290  *output_ptr += len;
1291  ec->in_data_start = ec->in_data_end = ec->in_buf_start;
1292  if (flags & ECONV_AFTER_OUTPUT) {
1293  res = econv_after_output;
1294  goto gotresult;
1295  }
1296  }
1297  if (output_stop - *output_ptr < input_stop - *input_ptr) {
1298  len = output_stop - *output_ptr;
1299  }
1300  else {
1301  len = input_stop - *input_ptr;
1302  }
1303  if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
1304  *(*output_ptr)++ = *(*input_ptr)++;
1305  res = econv_after_output;
1306  goto gotresult;
1307  }
1308  memcpy(*output_ptr, *input_ptr, len);
1309  *output_ptr += len;
1310  *input_ptr += len;
1311  if (*input_ptr != input_stop)
1313  else if (flags & ECONV_PARTIAL_INPUT)
1315  else
1316  res = econv_finished;
1317  goto gotresult;
1318  }
1319 
1320  if (ec->elems[ec->num_trans-1].out_data_start) {
1321  unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
1322  unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
1323  if (data_start != data_end) {
1324  size_t len;
1325  if (output_stop - *output_ptr < data_end - data_start) {
1326  len = output_stop - *output_ptr;
1327  memcpy(*output_ptr, data_start, len);
1328  *output_ptr = output_stop;
1329  ec->elems[ec->num_trans-1].out_data_start += len;
1331  goto gotresult;
1332  }
1333  len = data_end - data_start;
1334  memcpy(*output_ptr, data_start, len);
1335  *output_ptr += len;
1336  ec->elems[ec->num_trans-1].out_data_start =
1337  ec->elems[ec->num_trans-1].out_data_end =
1338  ec->elems[ec->num_trans-1].out_buf_start;
1339  has_output = 1;
1340  }
1341  }
1342 
1343  if (ec->in_buf_start &&
1344  ec->in_data_start != ec->in_data_end) {
1345  res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
1346  (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
1347  if (res != econv_source_buffer_empty)
1348  goto gotresult;
1349  }
1350 
1351  if (has_output &&
1352  (flags & ECONV_AFTER_OUTPUT) &&
1353  *input_ptr != input_stop) {
1354  input_stop = *input_ptr;
1355  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1356  if (res == econv_source_buffer_empty)
1357  res = econv_after_output;
1358  }
1359  else if ((flags & ECONV_AFTER_OUTPUT) ||
1360  ec->num_trans == 1) {
1361  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1362  }
1363  else {
1364  flags |= ECONV_AFTER_OUTPUT;
1365  do {
1366  res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
1367  } while (res == econv_after_output);
1368  }
1369 
1370  gotresult:
1371  ec->last_error.result = res;
1372  if (res == econv_invalid_byte_sequence ||
1373  res == econv_incomplete_input ||
1374  res == econv_undefined_conversion) {
1375  rb_transcoding *error_tc = ec->elems[result_position].tc;
1376  ec->last_error.error_tc = error_tc;
1380  ec->last_error.error_bytes_len = error_tc->recognized_len;
1381  ec->last_error.readagain_len = error_tc->readagain_len;
1382  }
1383 
1384  return res;
1385 }
1386 
1388 
1389 static int
1391 {
1392  int ret;
1393  unsigned char utfbuf[1024];
1394  const unsigned char *utf;
1395  size_t utf_len;
1396  int utf_allocated = 0;
1397  char charef_buf[16];
1398  const unsigned char *p;
1399 
1400  if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
1401  utf = ec->last_error.error_bytes_start;
1402  utf_len = ec->last_error.error_bytes_len;
1403  }
1404  else {
1407  utfbuf, sizeof(utfbuf),
1408  &utf_len);
1409  if (!utf)
1410  return -1;
1411  if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
1412  utf_allocated = 1;
1413  }
1414 
1415  if (utf_len % 4 != 0)
1416  goto fail;
1417 
1418  p = utf;
1419  while (4 <= utf_len) {
1420  unsigned int u = 0;
1421  u += p[0] << 24;
1422  u += p[1] << 16;
1423  u += p[2] << 8;
1424  u += p[3];
1425  snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
1426 
1427  ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
1428  if (ret == -1)
1429  goto fail;
1430 
1431  p += 4;
1432  utf_len -= 4;
1433  }
1434 
1435  if (utf_allocated)
1436  xfree((void *)utf);
1437  return 0;
1438 
1439  fail:
1440  if (utf_allocated)
1441  xfree((void *)utf);
1442  return -1;
1443 }
1444 
1447  const unsigned char **input_ptr, const unsigned char *input_stop,
1448  unsigned char **output_ptr, unsigned char *output_stop,
1449  int flags)
1450 {
1451  rb_econv_result_t ret;
1452 
1453  unsigned char empty_buf;
1454  unsigned char *empty_ptr = &empty_buf;
1455 
1456  ec->started = 1;
1457 
1458  if (!input_ptr) {
1459  input_ptr = (const unsigned char **)&empty_ptr;
1460  input_stop = empty_ptr;
1461  }
1462 
1463  if (!output_ptr) {
1464  output_ptr = &empty_ptr;
1465  output_stop = empty_ptr;
1466  }
1467 
1468  resume:
1469  ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
1470 
1471  if (ret == econv_invalid_byte_sequence ||
1472  ret == econv_incomplete_input) {
1473  /* deal with invalid byte sequence */
1474  /* todo: add more alternative behaviors */
1475  switch (ec->flags & ECONV_INVALID_MASK) {
1476  case ECONV_INVALID_REPLACE:
1477  if (output_replacement_character(ec) == 0)
1478  goto resume;
1479  }
1480  }
1481 
1482  if (ret == econv_undefined_conversion) {
1483  /* valid character in source encoding
1484  * but no related character(s) in destination encoding */
1485  /* todo: add more alternative behaviors */
1486  switch (ec->flags & ECONV_UNDEF_MASK) {
1487  case ECONV_UNDEF_REPLACE:
1488  if (output_replacement_character(ec) == 0)
1489  goto resume;
1490  break;
1491 
1493  if (output_hex_charref(ec) == 0)
1494  goto resume;
1495  break;
1496  }
1497  }
1498 
1499  return ret;
1500 }
1501 
1502 const char *
1504 {
1505  rb_transcoding *tc = ec->last_tc;
1506  const rb_transcoder *tr;
1507 
1508  if (tc == NULL)
1509  return "";
1510 
1511  tr = tc->transcoder;
1512 
1514  return tr->src_encoding;
1515  return tr->dst_encoding;
1516 }
1517 
1518 static unsigned char *
1519 allocate_converted_string(const char *sname, const char *dname,
1520  const unsigned char *str, size_t len,
1521  unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
1522  size_t *dst_len_ptr)
1523 {
1524  unsigned char *dst_str;
1525  size_t dst_len;
1526  size_t dst_bufsize;
1527 
1528  rb_econv_t *ec;
1529  rb_econv_result_t res;
1530 
1531  const unsigned char *sp;
1532  unsigned char *dp;
1533 
1534  if (caller_dst_buf)
1535  dst_bufsize = caller_dst_bufsize;
1536  else if (len == 0)
1537  dst_bufsize = 1;
1538  else
1539  dst_bufsize = len;
1540 
1541  ec = rb_econv_open(sname, dname, 0);
1542  if (ec == NULL)
1543  return NULL;
1544  if (caller_dst_buf)
1545  dst_str = caller_dst_buf;
1546  else
1547  dst_str = xmalloc(dst_bufsize);
1548  dst_len = 0;
1549  sp = str;
1550  dp = dst_str+dst_len;
1551  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1552  dst_len = dp - dst_str;
1553  while (res == econv_destination_buffer_full) {
1554  if (SIZE_MAX/2 < dst_bufsize) {
1555  goto fail;
1556  }
1557  dst_bufsize *= 2;
1558  if (dst_str == caller_dst_buf) {
1559  unsigned char *tmp;
1560  tmp = xmalloc(dst_bufsize);
1561  memcpy(tmp, dst_str, dst_bufsize/2);
1562  dst_str = tmp;
1563  }
1564  else {
1565  dst_str = xrealloc(dst_str, dst_bufsize);
1566  }
1567  dp = dst_str+dst_len;
1568  res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
1569  dst_len = dp - dst_str;
1570  }
1571  if (res != econv_finished) {
1572  goto fail;
1573  }
1574  rb_econv_close(ec);
1575  *dst_len_ptr = dst_len;
1576  return dst_str;
1577 
1578  fail:
1579  if (dst_str != caller_dst_buf)
1580  xfree(dst_str);
1581  rb_econv_close(ec);
1582  return NULL;
1583 }
1584 
1585 /* result: 0:success -1:failure */
1586 int
1588  const unsigned char *str, size_t len, const char *str_encoding)
1589 {
1590  const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
1591  unsigned char insert_buf[4096];
1592  const unsigned char *insert_str = NULL;
1593  size_t insert_len;
1594 
1595  int last_trans_index;
1596  rb_transcoding *tc;
1597 
1598  unsigned char **buf_start_p;
1599  unsigned char **data_start_p;
1600  unsigned char **data_end_p;
1601  unsigned char **buf_end_p;
1602 
1603  size_t need;
1604 
1605  ec->started = 1;
1606 
1607  if (len == 0)
1608  return 0;
1609 
1610  if (encoding_equal(insert_encoding, str_encoding)) {
1611  insert_str = str;
1612  insert_len = len;
1613  }
1614  else {
1615  insert_str = allocate_converted_string(str_encoding, insert_encoding,
1616  str, len, insert_buf, sizeof(insert_buf), &insert_len);
1617  if (insert_str == NULL)
1618  return -1;
1619  }
1620 
1621  need = insert_len;
1622 
1623  last_trans_index = ec->num_trans-1;
1624  if (ec->num_trans == 0) {
1625  tc = NULL;
1626  buf_start_p = &ec->in_buf_start;
1627  data_start_p = &ec->in_data_start;
1628  data_end_p = &ec->in_data_end;
1629  buf_end_p = &ec->in_buf_end;
1630  }
1631  else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
1632  tc = ec->elems[last_trans_index].tc;
1633  need += tc->readagain_len;
1634  if (need < insert_len)
1635  goto fail;
1636  if (last_trans_index == 0) {
1637  buf_start_p = &ec->in_buf_start;
1638  data_start_p = &ec->in_data_start;
1639  data_end_p = &ec->in_data_end;
1640  buf_end_p = &ec->in_buf_end;
1641  }
1642  else {
1643  rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
1644  buf_start_p = &ee->out_buf_start;
1645  data_start_p = &ee->out_data_start;
1646  data_end_p = &ee->out_data_end;
1647  buf_end_p = &ee->out_buf_end;
1648  }
1649  }
1650  else {
1651  rb_econv_elem_t *ee = &ec->elems[last_trans_index];
1652  buf_start_p = &ee->out_buf_start;
1653  data_start_p = &ee->out_data_start;
1654  data_end_p = &ee->out_data_end;
1655  buf_end_p = &ee->out_buf_end;
1656  tc = ec->elems[last_trans_index].tc;
1657  }
1658 
1659  if (*buf_start_p == NULL) {
1660  unsigned char *buf = xmalloc(need);
1661  *buf_start_p = buf;
1662  *data_start_p = buf;
1663  *data_end_p = buf;
1664  *buf_end_p = buf+need;
1665  }
1666  else if ((size_t)(*buf_end_p - *data_end_p) < need) {
1667  MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
1668  *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
1669  *data_start_p = *buf_start_p;
1670  if ((size_t)(*buf_end_p - *data_end_p) < need) {
1671  unsigned char *buf;
1672  size_t s = (*data_end_p - *buf_start_p) + need;
1673  if (s < need)
1674  goto fail;
1675  buf = xrealloc(*buf_start_p, s);
1676  *data_start_p = buf;
1677  *data_end_p = buf + (*data_end_p - *buf_start_p);
1678  *buf_start_p = buf;
1679  *buf_end_p = buf + s;
1680  }
1681  }
1682 
1683  memcpy(*data_end_p, insert_str, insert_len);
1684  *data_end_p += insert_len;
1685  if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
1686  memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
1687  *data_end_p += tc->readagain_len;
1688  tc->readagain_len = 0;
1689  }
1690 
1691  if (insert_str != str && insert_str != insert_buf)
1692  xfree((void*)insert_str);
1693  return 0;
1694 
1695  fail:
1696  if (insert_str != str && insert_str != insert_buf)
1697  xfree((void*)insert_str);
1698  return -1;
1699 }
1700 
1701 void
1703 {
1704  int i;
1705 
1706  if (ec->replacement_allocated) {
1707  xfree((void *)ec->replacement_str);
1708  }
1709  for (i = 0; i < ec->num_trans; i++) {
1710  rb_transcoding_close(ec->elems[i].tc);
1711  if (ec->elems[i].out_buf_start)
1712  xfree(ec->elems[i].out_buf_start);
1713  }
1714  xfree(ec->in_buf_start);
1715  xfree(ec->elems);
1716  xfree(ec);
1717 }
1718 
1719 size_t
1721 {
1722  size_t size = sizeof(rb_econv_t);
1723  int i;
1724 
1725  if (ec->replacement_allocated) {
1726  size += ec->replacement_len;
1727  }
1728  for (i = 0; i < ec->num_trans; i++) {
1729  size += rb_transcoding_memsize(ec->elems[i].tc);
1730 
1731  if (ec->elems[i].out_buf_start) {
1732  size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
1733  }
1734  }
1735  size += ec->in_buf_end - ec->in_buf_start;
1736  size += sizeof(rb_econv_elem_t) * ec->num_allocated;
1737 
1738  return size;
1739 }
1740 
1741 int
1743 {
1744  if (ec->num_trans == 0)
1745  return 0;
1746 #if SIZEOF_SIZE_T > SIZEOF_INT
1747  if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
1748 #endif
1749  return (int)ec->elems[0].tc->readagain_len;
1750 }
1751 
1752 void
1753 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
1754 {
1755  rb_transcoding *tc;
1756  if (ec->num_trans == 0 || n == 0)
1757  return;
1758  tc = ec->elems[0].tc;
1759  memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
1760  tc->readagain_len -= n;
1761 }
1762 
1764  const char *ascii_compat_name;
1765  const char *ascii_incompat_name;
1766 };
1767 
1768 static int
1770 {
1771  struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
1772  transcoder_entry_t *entry = (transcoder_entry_t *)val;
1773  const rb_transcoder *tr;
1774 
1775  if (DECORATOR_P(entry->sname, entry->dname))
1776  return ST_CONTINUE;
1777  tr = load_transcoder_entry(entry);
1778  if (tr && tr->asciicompat_type == asciicompat_decoder) {
1779  data->ascii_compat_name = tr->dst_encoding;
1780  return ST_STOP;
1781  }
1782  return ST_CONTINUE;
1783 }
1784 
1785 const char *
1787 {
1788  st_data_t v;
1789  st_table *table2;
1790  struct asciicompat_encoding_t data;
1791 
1792  if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
1793  return NULL;
1794  table2 = (st_table *)v;
1795 
1796  /*
1797  * Assumption:
1798  * There is at most one transcoder for
1799  * converting from ASCII incompatible encoding.
1800  *
1801  * For ISO-2022-JP, there is ISO-2022-JP -> stateless-ISO-2022-JP and no others.
1802  */
1803  if (table2->num_entries != 1)
1804  return NULL;
1805 
1807  data.ascii_compat_name = NULL;
1808  st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
1809  return data.ascii_compat_name;
1810 }
1811 
1812 VALUE
1813 rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
1814 {
1815  unsigned const char *sp, *se;
1816  unsigned char *ds, *dp, *de;
1817  rb_econv_result_t res;
1818  int max_output;
1819 
1820  if (NIL_P(dst)) {
1821  dst = rb_str_buf_new(len);
1822  if (ec->destination_encoding)
1824  }
1825 
1826  if (ec->last_tc)
1827  max_output = ec->last_tc->transcoder->max_output;
1828  else
1829  max_output = 1;
1830 
1831  do {
1832  long dlen = RSTRING_LEN(dst);
1833  if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
1834  unsigned long new_capa = (unsigned long)dlen + len + max_output;
1835  if (LONG_MAX < new_capa)
1836  rb_raise(rb_eArgError, "too long string");
1837  rb_str_resize(dst, new_capa);
1838  rb_str_set_len(dst, dlen);
1839  }
1840  sp = (const unsigned char *)ss;
1841  se = sp + len;
1842  ds = (unsigned char *)RSTRING_PTR(dst);
1843  de = ds + rb_str_capacity(dst);
1844  dp = ds += dlen;
1845  res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
1846  len -= (const char *)sp - ss;
1847  ss = (const char *)sp;
1848  rb_str_set_len(dst, dlen + (dp - ds));
1850  } while (res == econv_destination_buffer_full);
1851 
1852  return dst;
1853 }
1854 
1855 VALUE
1856 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
1857 {
1858  src = rb_str_new_frozen(src);
1859  dst = rb_econv_append(ec, RSTRING_PTR(src) + off, len, dst, flags);
1860  RB_GC_GUARD(src);
1861  return dst;
1862 }
1863 
1864 VALUE
1865 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
1866 {
1867  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
1868 }
1869 
1870 VALUE
1871 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
1872 {
1873  return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
1874 }
1875 
1876 VALUE
1878 {
1879  return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
1880 }
1881 
1882 static int
1883 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
1884 {
1885  transcoder_entry_t *entry;
1886  const rb_transcoder *tr;
1887 
1888  if (ec->started != 0)
1889  return -1;
1890 
1891  entry = get_transcoder_entry(sname, dname);
1892  if (!entry)
1893  return -1;
1894 
1895  tr = load_transcoder_entry(entry);
1896  if (!tr) return -1;
1897 
1898  return rb_econv_add_transcoder_at(ec, tr, n);
1899 }
1900 
1901 static int
1902 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
1903 {
1904  return rb_econv_add_converter(ec, "", decorator_name, n);
1905 }
1906 
1907 int
1908 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
1909 {
1910  const rb_transcoder *tr;
1911 
1912  if (ec->num_trans == 0)
1913  return rb_econv_decorate_at(ec, decorator_name, 0);
1914 
1915  tr = ec->elems[0].tc->transcoder;
1916 
1917  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1919  return rb_econv_decorate_at(ec, decorator_name, 1);
1920 
1921  return rb_econv_decorate_at(ec, decorator_name, 0);
1922 }
1923 
1924 int
1925 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
1926 {
1927  const rb_transcoder *tr;
1928 
1929  if (ec->num_trans == 0)
1930  return rb_econv_decorate_at(ec, decorator_name, 0);
1931 
1932  tr = ec->elems[ec->num_trans-1].tc->transcoder;
1933 
1934  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
1936  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
1937 
1938  return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
1939 }
1940 
1941 void
1943 {
1944  const char *dname = 0;
1945 
1946  switch (ec->flags & ECONV_NEWLINE_DECORATOR_MASK) {
1948  dname = "universal_newline";
1949  break;
1951  dname = "crlf_newline";
1952  break;
1954  dname = "cr_newline";
1955  break;
1956  }
1957 
1958  if (dname) {
1959  const rb_transcoder *transcoder = get_transcoder_entry("", dname)->transcoder;
1960  int num_trans = ec->num_trans;
1961  int i, j = 0;
1962 
1963  for (i=0; i < num_trans; i++) {
1964  if (transcoder == ec->elems[i].tc->transcoder) {
1965  rb_transcoding_close(ec->elems[i].tc);
1966  xfree(ec->elems[i].out_buf_start);
1967  ec->num_trans--;
1968  }
1969  else
1970  ec->elems[j++] = ec->elems[i];
1971  }
1972  }
1973 
1975 }
1976 
1977 static VALUE
1978 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
1979 {
1980  int has_description = 0;
1981 
1982  if (NIL_P(mesg))
1983  mesg = rb_str_new(NULL, 0);
1984 
1985  if (*sname != '\0' || *dname != '\0') {
1986  if (*sname == '\0')
1987  rb_str_cat2(mesg, dname);
1988  else if (*dname == '\0')
1989  rb_str_cat2(mesg, sname);
1990  else
1991  rb_str_catf(mesg, "%s to %s", sname, dname);
1992  has_description = 1;
1993  }
1994 
1995  if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
1999  const char *pre = "";
2000  if (has_description)
2001  rb_str_cat2(mesg, " with ");
2002  if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
2003  rb_str_cat2(mesg, pre); pre = ",";
2004  rb_str_cat2(mesg, "universal_newline");
2005  }
2006  if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
2007  rb_str_cat2(mesg, pre); pre = ",";
2008  rb_str_cat2(mesg, "crlf_newline");
2009  }
2010  if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
2011  rb_str_cat2(mesg, pre); pre = ",";
2012  rb_str_cat2(mesg, "cr_newline");
2013  }
2014  if (ecflags & ECONV_XML_TEXT_DECORATOR) {
2015  rb_str_cat2(mesg, pre); pre = ",";
2016  rb_str_cat2(mesg, "xml_text");
2017  }
2018  if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
2019  rb_str_cat2(mesg, pre); pre = ",";
2020  rb_str_cat2(mesg, "xml_attr_content");
2021  }
2022  if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
2023  rb_str_cat2(mesg, pre); pre = ",";
2024  rb_str_cat2(mesg, "xml_attr_quote");
2025  }
2026  has_description = 1;
2027  }
2028  if (!has_description) {
2029  rb_str_cat2(mesg, "no-conversion");
2030  }
2031 
2032  return mesg;
2033 }
2034 
2035 VALUE
2036 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
2037 {
2038  VALUE mesg, exc;
2039  mesg = rb_str_new_cstr("code converter not found (");
2040  econv_description(sname, dname, ecflags, mesg);
2041  rb_str_cat2(mesg, ")");
2043  return exc;
2044 }
2045 
2046 static VALUE
2048 {
2049  VALUE mesg, exc;
2052  const char *err = (const char *)ec->last_error.error_bytes_start;
2053  size_t error_len = ec->last_error.error_bytes_len;
2054  VALUE bytes = rb_str_new(err, error_len);
2055  VALUE dumped = rb_str_dump(bytes);
2056  size_t readagain_len = ec->last_error.readagain_len;
2057  VALUE bytes2 = Qnil;
2058  VALUE dumped2;
2059  int idx;
2061  mesg = rb_sprintf("incomplete %s on %s",
2062  StringValueCStr(dumped),
2064  }
2065  else if (readagain_len) {
2066  bytes2 = rb_str_new(err+error_len, readagain_len);
2067  dumped2 = rb_str_dump(bytes2);
2068  mesg = rb_sprintf("%s followed by %s on %s",
2069  StringValueCStr(dumped),
2070  StringValueCStr(dumped2),
2072  }
2073  else {
2074  mesg = rb_sprintf("%s on %s",
2075  StringValueCStr(dumped),
2077  }
2078 
2080  rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
2081  rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
2082  rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
2083 
2084  set_encs:
2085  rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
2086  rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
2088  if (0 <= idx)
2089  rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2091  if (0 <= idx)
2092  rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
2093  return exc;
2094  }
2096  VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
2098  VALUE dumped = Qnil;
2099  int idx;
2100  if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
2101  rb_encoding *utf8 = rb_utf8_encoding();
2102  const char *start, *end;
2103  int n;
2104  start = (const char *)ec->last_error.error_bytes_start;
2105  end = start + ec->last_error.error_bytes_len;
2106  n = rb_enc_precise_mbclen(start, end, utf8);
2107  if (MBCLEN_CHARFOUND_P(n) &&
2108  (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
2109  unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
2110  dumped = rb_sprintf("U+%04X", cc);
2111  }
2112  }
2113  if (dumped == Qnil)
2114  dumped = rb_str_dump(bytes);
2115  if (strcmp(ec->last_error.source_encoding,
2116  ec->source_encoding_name) == 0 &&
2117  strcmp(ec->last_error.destination_encoding,
2118  ec->destination_encoding_name) == 0) {
2119  mesg = rb_sprintf("%s from %s to %s",
2120  StringValueCStr(dumped),
2123  }
2124  else {
2125  int i;
2126  mesg = rb_sprintf("%s to %s in conversion from %s",
2127  StringValueCStr(dumped),
2129  ec->source_encoding_name);
2130  for (i = 0; i < ec->num_trans; i++) {
2131  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
2132  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
2133  rb_str_catf(mesg, " to %s",
2134  ec->elems[i].tc->transcoder->dst_encoding);
2135  }
2136  }
2139  if (0 <= idx)
2140  rb_enc_associate_index(bytes, idx);
2141  rb_ivar_set(exc, rb_intern("error_char"), bytes);
2142  goto set_encs;
2143  }
2144  return Qnil;
2145 }
2146 
2147 static void
2149  VALUE destination,
2150  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2151  int max_output,
2152  unsigned char **out_start_ptr,
2153  unsigned char **out_pos,
2154  unsigned char **out_stop_ptr)
2155 {
2156  size_t len = (*out_pos - *out_start_ptr);
2157  size_t new_len = (len + max_output) * 2;
2158  *out_start_ptr = resize_destination(destination, len, new_len);
2159  *out_pos = *out_start_ptr + len;
2160  *out_stop_ptr = *out_start_ptr + new_len;
2161 }
2162 
2163 static int
2165 {
2166  rb_transcoding *tc;
2167  const rb_transcoder *tr;
2168  const unsigned char *replacement;
2169  const char *repl_enc;
2170  const char *ins_enc;
2171  size_t len;
2172 
2173  if (ec->replacement_str)
2174  return 0;
2175 
2176  ins_enc = rb_econv_encoding_to_insert_output(ec);
2177 
2178  tc = ec->last_tc;
2179  if (*ins_enc) {
2180  tr = tc->transcoder;
2182  replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
2183  }
2184  else {
2185  replacement = (unsigned char *)"?";
2186  len = 1;
2187  repl_enc = "";
2188  }
2189 
2190  ec->replacement_str = replacement;
2191  ec->replacement_len = len;
2192  ec->replacement_enc = repl_enc;
2193  ec->replacement_allocated = 0;
2194  return 0;
2195 }
2196 
2197 int
2199  const unsigned char *str, size_t len, const char *encname)
2200 {
2201  unsigned char *str2;
2202  size_t len2;
2203  const char *encname2;
2204 
2205  encname2 = rb_econv_encoding_to_insert_output(ec);
2206 
2207  if (encoding_equal(encname, encname2)) {
2208  str2 = xmalloc(len);
2209  MEMCPY(str2, str, unsigned char, len); /* xxx: str may be invalid */
2210  len2 = len;
2211  encname2 = encname;
2212  }
2213  else {
2214  str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
2215  if (!str2)
2216  return -1;
2217  }
2218 
2219  if (ec->replacement_allocated) {
2220  xfree((void *)ec->replacement_str);
2221  }
2222  ec->replacement_allocated = 1;
2223  ec->replacement_str = str2;
2224  ec->replacement_len = len2;
2225  ec->replacement_enc = encname2;
2226  return 0;
2227 }
2228 
2229 static int
2231 {
2232  int ret;
2233 
2234  if (make_replacement(ec) == -1)
2235  return -1;
2236 
2238  if (ret == -1)
2239  return -1;
2240 
2241  return 0;
2242 }
2243 
2244 #if 1
2245 #define hash_fallback rb_hash_aref
2246 
2247 static VALUE
2249 {
2250  return rb_proc_call(fallback, rb_ary_new4(1, &c));
2251 }
2252 
2253 static VALUE
2255 {
2256  return rb_method_call(1, &c, fallback);
2257 }
2258 
2259 static VALUE
2261 {
2262  return rb_funcall3(fallback, sym_aref, 1, &c);
2263 }
2264 
2265 static void
2266 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2267  const unsigned char *in_stop, unsigned char *out_stop,
2268  VALUE destination,
2269  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2270  const char *src_encoding,
2271  const char *dst_encoding,
2272  int ecflags,
2273  VALUE ecopts)
2274 {
2275  rb_econv_t *ec;
2276  rb_transcoding *last_tc;
2277  rb_econv_result_t ret;
2278  unsigned char *out_start = *out_pos;
2279  int max_output;
2280  VALUE exc;
2281  VALUE fallback = Qnil;
2282  VALUE (*fallback_func)(VALUE, VALUE) = 0;
2283 
2284  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2285  if (!ec)
2286  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2287 
2288  if (!NIL_P(ecopts) && RB_TYPE_P(ecopts, T_HASH)) {
2289  fallback = rb_hash_aref(ecopts, sym_fallback);
2290  if (RB_TYPE_P(fallback, T_HASH)) {
2291  fallback_func = hash_fallback;
2292  }
2293  else if (rb_obj_is_proc(fallback)) {
2294  fallback_func = proc_fallback;
2295  }
2296  else if (rb_obj_is_method(fallback)) {
2297  fallback_func = method_fallback;
2298  }
2299  else {
2300  fallback_func = aref_fallback;
2301  }
2302  }
2303  last_tc = ec->last_tc;
2304  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2305 
2306  resume:
2307  ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
2308 
2309  if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
2310  VALUE rep = rb_enc_str_new(
2311  (const char *)ec->last_error.error_bytes_start,
2314  rep = (*fallback_func)(fallback, rep);
2315  if (rep != Qundef && !NIL_P(rep)) {
2316  StringValue(rep);
2317  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
2318  RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
2319  if ((int)ret == -1) {
2320  rb_raise(rb_eArgError, "too big fallback string");
2321  }
2322  goto resume;
2323  }
2324  }
2325 
2326  if (ret == econv_invalid_byte_sequence ||
2327  ret == econv_incomplete_input ||
2328  ret == econv_undefined_conversion) {
2329  exc = make_econv_exception(ec);
2330  rb_econv_close(ec);
2331  rb_exc_raise(exc);
2332  }
2333 
2334  if (ret == econv_destination_buffer_full) {
2335  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2336  goto resume;
2337  }
2338 
2339  rb_econv_close(ec);
2340  return;
2341 }
2342 #else
2343 /* sample transcode_loop implementation in byte-by-byte stream style */
2344 static void
2345 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
2346  const unsigned char *in_stop, unsigned char *out_stop,
2347  VALUE destination,
2348  unsigned char *(*resize_destination)(VALUE, size_t, size_t),
2349  const char *src_encoding,
2350  const char *dst_encoding,
2351  int ecflags,
2352  VALUE ecopts)
2353 {
2354  rb_econv_t *ec;
2355  rb_transcoding *last_tc;
2356  rb_econv_result_t ret;
2357  unsigned char *out_start = *out_pos;
2358  const unsigned char *ptr;
2359  int max_output;
2360  VALUE exc;
2361 
2362  ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
2363  if (!ec)
2364  rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
2365 
2366  last_tc = ec->last_tc;
2367  max_output = last_tc ? last_tc->transcoder->max_output : 1;
2368 
2370  ptr = *in_pos;
2371  while (ret != econv_finished) {
2372  unsigned char input_byte;
2373  const unsigned char *p = &input_byte;
2374 
2375  if (ret == econv_source_buffer_empty) {
2376  if (ptr < in_stop) {
2377  input_byte = *ptr;
2378  ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2379  }
2380  else {
2381  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
2382  }
2383  }
2384  else {
2385  ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
2386  }
2387  if (&input_byte != p)
2388  ptr += p - &input_byte;
2389  switch (ret) {
2393  exc = make_econv_exception(ec);
2394  rb_econv_close(ec);
2395  rb_exc_raise(exc);
2396  break;
2397 
2399  more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
2400  break;
2401 
2403  break;
2404 
2405  case econv_finished:
2406  break;
2407  }
2408  }
2409  rb_econv_close(ec);
2410  *in_pos = in_stop;
2411  return;
2412 }
2413 #endif
2414 
2415 
2416 /*
2417  * String-specific code
2418  */
2419 
2420 static unsigned char *
2421 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
2422 {
2423  rb_str_resize(destination, new_len);
2424  return (unsigned char *)RSTRING_PTR(destination);
2425 }
2426 
/*
 * Fold the conversion options found in the hash `opt` into `ecflags` and
 * return the updated flags.
 *
 * Visible handling:
 *  - :invalid and :undef accept nil or :replace; any other value raises
 *    ArgumentError.
 *  - a non-nil :replace turns on ECONV_UNDEF_REPLACE unless
 *    ECONV_INVALID_REPLACE is already set.
 *  - :xml accepts :text or :attr; any other value raises ArgumentError.
 *  - :newline (only when ENABLE_ECONV_NEWLINE_OPTION is defined) accepts
 *    :universal, :crlf, :cr or :lf; the separate *_newline keys are
 *    consulted only when :newline is nil (or the option is compiled out).
 *
 * NOTE(review): listing lines 2460, 2463, 2478, 2502 and 2504 are missing
 * from this view; the gaps are marked below.
 */
2427 static int
2428 econv_opts(VALUE opt, int ecflags)
2429 {
2430  VALUE v;
2431 
2432  v = rb_hash_aref(opt, sym_invalid);
2433  if (NIL_P(v)) {
2434  }
2435  else if (v==sym_replace) {
2436  ecflags |= ECONV_INVALID_REPLACE;
2437  }
2438  else {
2439  rb_raise(rb_eArgError, "unknown value for invalid character option");
2440  }
2441 
2442  v = rb_hash_aref(opt, sym_undef);
2443  if (NIL_P(v)) {
2444  }
2445  else if (v==sym_replace) {
2446  ecflags |= ECONV_UNDEF_REPLACE;
2447  }
2448  else {
2449  rb_raise(rb_eArgError, "unknown value for undefined character option");
2450  }
2451 
2452  v = rb_hash_aref(opt, sym_replace);
2453  if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
2454  ecflags |= ECONV_UNDEF_REPLACE;
2455  }
2456 
2457  v = rb_hash_aref(opt, sym_xml);
2458  if (!NIL_P(v)) {
2459  if (v==sym_text) {
/* NOTE(review): listing line 2460 (flags set for :text) is missing here. */
2461  }
2462  else if (v==sym_attr) {
/* NOTE(review): listing line 2463 (flags set for :attr) is missing here. */
2464  }
2465  else if (RB_TYPE_P(v, T_SYMBOL)) {
2466  rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
2467  }
2468  else {
2469  rb_raise(rb_eArgError, "unexpected value for xml option");
2470  }
2471  }
2472 
2473 #ifdef ENABLE_ECONV_NEWLINE_OPTION
2474  v = rb_hash_aref(opt, sym_newline);
2475  if (!NIL_P(v)) {
/* An explicit :newline replaces any newline decorator already in ecflags. */
2476  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2477  if (v == sym_universal) {
/* NOTE(review): listing line 2478 (flag set for :universal) is missing here. */
2479  }
2480  else if (v == sym_crlf) {
2481  ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2482  }
2483  else if (v == sym_cr) {
2484  ecflags |= ECONV_CR_NEWLINE_DECORATOR;
2485  }
2486  else if (v == sym_lf) {
2487  /* ecflags |= ECONV_LF_NEWLINE_DECORATOR; */
2488  }
2489  else if (SYMBOL_P(v)) {
2490  rb_raise(rb_eArgError, "unexpected value for newline option: %s",
2491  rb_id2name(SYM2ID(v)));
2492  }
2493  else {
2494  rb_raise(rb_eArgError, "unexpected value for newline option");
2495  }
2496  }
2497  else
2498 #endif
2499  {
/*
 * setflags collects the decorators requested with a true value;
 * newlineflag records whether any of the keys was present (non-nil) at all.
 */
2500  int setflags = 0, newlineflag = 0;
2501 
/* NOTE(review): listing lines 2502 and 2504 (first key lookup and its flag) are missing here. */
2503  if (RTEST(v))
2505  newlineflag |= !NIL_P(v);
2506 
2507  v = rb_hash_aref(opt, sym_crlf_newline);
2508  if (RTEST(v))
2509  setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
2510  newlineflag |= !NIL_P(v);
2511 
2512  v = rb_hash_aref(opt, sym_cr_newline);
2513  if (RTEST(v))
2514  setflags |= ECONV_CR_NEWLINE_DECORATOR;
2515  newlineflag |= !NIL_P(v);
2516 
/* Only touch the newline bits when at least one key was given, even as false. */
2517  if (newlineflag) {
2518  ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
2519  ecflags |= setflags;
2520  }
2521  }
2522 
2523  return ecflags;
2524 }
2525 
/*
 * Turn a user-supplied option hash into converter flags plus a hash of
 * the non-flag options.
 *
 * opthash: nil or the option hash.
 * opts:    receives nil, or a new frozen hash holding the frozen :replace
 *          string and/or the :fallback object.
 * ecflags: initial flags; the return value is these flags as updated by
 *          econv_opts().
 *
 * Raises ArgumentError for a broken replacement string.
 *
 * NOTE(review): listing lines 2541 and 2556 are missing from this view
 * (the condition guarding the "broken" error, and part of the :fallback
 * acceptance test).
 */
2526 int
2527 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
2528 {
2529  VALUE newhash = Qnil;
2530  VALUE v;
2531 
2532  if (NIL_P(opthash)) {
2533  *opts = Qnil;
2534  return ecflags;
2535  }
2536  ecflags = econv_opts(opthash, ecflags);
2537 
2538  v = rb_hash_aref(opthash, sym_replace);
2539  if (!NIL_P(v)) {
2540  StringValue(v);
/* NOTE(review): listing line 2541 (the check that opens this error branch) is missing. */
2542  VALUE dumped = rb_str_dump(v);
2543  rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
2544  StringValueCStr(dumped),
2545  rb_enc_name(rb_enc_get(v)));
2546  }
/* Store a frozen copy of the replacement string. */
2547  v = rb_str_new_frozen(v);
2548  newhash = rb_hash_new();
2549  rb_hash_aset(newhash, sym_replace, v);
2550  }
2551 
2552  v = rb_hash_aref(opthash, sym_fallback);
2553  if (!NIL_P(v)) {
2554  VALUE h = rb_check_hash_type(v);
/* NOTE(review): listing line 2556 (the branch taken when v is not a Hash) is missing. */
2555  if (NIL_P(h)
2557  : (v = h, 1)) {
2558  if (NIL_P(newhash))
2559  newhash = rb_hash_new();
2560  rb_hash_aset(newhash, sym_fallback, v);
2561  }
2562  }
2563 
2564  if (!NIL_P(newhash))
2565  rb_hash_freeze(newhash);
2566  *opts = newhash;
2567 
2568  return ecflags;
2569 }
2570 
/*
 * Convenience wrapper: rb_econv_prepare_options() with initial flags 0.
 * NOTE(review): the signature line (listing line 2572) is missing from
 * this view; presumably rb_econv_prepare_opts(VALUE opthash, VALUE *opts),
 * judging by the callers in this file -- confirm.
 */
2571 int
2573 {
2574  return rb_econv_prepare_options(opthash, opts, 0);
2575 }
2576 
2577 rb_econv_t *
2578 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
2579 {
2580  rb_econv_t *ec;
2581  VALUE replacement;
2582 
2583  if (NIL_P(opthash)) {
2584  replacement = Qnil;
2585  }
2586  else {
2587  if (!RB_TYPE_P(opthash, T_HASH) || !OBJ_FROZEN(opthash))
2588  rb_bug("rb_econv_open_opts called with invalid opthash");
2589  replacement = rb_hash_aref(opthash, sym_replace);
2590  }
2591 
2592  ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
2593  if (!ec)
2594  return ec;
2595 
2596  if (!NIL_P(replacement)) {
2597  int ret;
2598  rb_encoding *enc = rb_enc_get(replacement);
2599 
2600  ret = rb_econv_set_replacement(ec,
2601  (const unsigned char *)RSTRING_PTR(replacement),
2602  RSTRING_LEN(replacement),
2603  rb_enc_name(enc));
2604  if (ret == -1) {
2605  rb_econv_close(ec);
2606  return NULL;
2607  }
2608  }
2609  return ec;
2610 }
2611 
2612 static int
2613 enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
2614 {
2615  rb_encoding *enc;
2616  const char *n;
2617  int encidx;
2618  VALUE encval;
2619 
2620  if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
2621  !(enc = rb_enc_from_index(encidx))) {
2622  enc = NULL;
2623  encidx = 0;
2624  n = StringValueCStr(*arg);
2625  }
2626  else {
2627  n = rb_enc_name(enc);
2628  }
2629 
2630  *name_p = n;
2631  *enc_p = enc;
2632 
2633  return encidx;
2634 }
2635 
2636 static int
2637 str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
2638  const char **sname_p, rb_encoding **senc_p,
2639  const char **dname_p, rb_encoding **denc_p)
2640 {
2641  rb_encoding *senc, *denc;
2642  const char *sname, *dname;
2643  int sencidx, dencidx;
2644 
2645  dencidx = enc_arg(arg1, &dname, &denc);
2646 
2647  if (NIL_P(*arg2)) {
2648  sencidx = rb_enc_get_index(str);
2649  senc = rb_enc_from_index(sencidx);
2650  sname = rb_enc_name(senc);
2651  }
2652  else {
2653  sencidx = enc_arg(arg2, &sname, &senc);
2654  }
2655 
2656  *sname_p = sname;
2657  *senc_p = senc;
2658  *dname_p = dname;
2659  *denc_p = denc;
2660  return dencidx;
2661 }
2662 
/*
 * Shared worker behind str_transcode() and rb_str_encode(): transcode
 * *self according to argv ([dst_encoding [, src_encoding]]) and the
 * already prepared ecflags/ecopts.
 *
 * Returns the destination encoding index, or -1 on the paths where no
 * conversion is needed.  When a new string is produced, *self is replaced
 * by it.  With no arguments the destination is the default internal
 * encoding or, if that is nil and ecflags is non-zero, the receiver's own
 * encoding.
 *
 * Raises ArgumentError if transcode_loop leaves input unconverted.
 *
 * NOTE(review): listing lines 2688, 2697-2699 and 2714 are missing from
 * this view, so the condition opened at listing line 2696 and the body of
 * the ASCII-compatible branch are only partly visible.
 */
2663 static int
2664 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
2665 {
2666  VALUE dest;
2667  VALUE str = *self;
2668  volatile VALUE arg1, arg2;
2669  long blen, slen;
2670  unsigned char *buf, *bp, *sp;
2671  const unsigned char *fromp;
2672  rb_encoding *senc, *denc;
2673  const char *sname, *dname;
2674  int dencidx;
2675  int explicitly_invalid_replace = TRUE;
2676 
2677  rb_check_arity(argc, 0, 2);
2678 
2679  if (argc == 0) {
2680  arg1 = rb_enc_default_internal();
2681  if (NIL_P(arg1)) {
2682  if (!ecflags) return -1;
2683  arg1 = rb_obj_encoding(str);
2684  }
/* Remember that the caller did not ask for any invalid-byte handling. */
2685  if (!(ecflags & ECONV_INVALID_MASK)) {
2686  explicitly_invalid_replace = FALSE;
2687  }
/* NOTE(review): listing line 2688 is missing here. */
2689  }
2690  else {
2691  arg1 = argv[0];
2692  }
2693  arg2 = argc<=1 ? Qnil : argv[1];
2694  dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
2695 
/* NOTE(review): listing lines 2697-2699 (rest of this condition) are missing. */
2696  if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
2700  if (senc && senc == denc) {
/* Same source and destination: scrub instead of converting when invalid handling was explicitly requested. */
2701  if ((ecflags & ECONV_INVALID_MASK) && explicitly_invalid_replace) {
2702  VALUE rep = Qnil;
2703  if (!NIL_P(ecopts)) {
2704  rep = rb_hash_aref(ecopts, sym_replace);
2705  }
2706  dest = rb_str_scrub(str, rep);
2707  if (NIL_P(dest)) dest = str;
2708  *self = dest;
2709  return dencidx;
2710  }
2711  return NIL_P(arg2) ? -1 : dencidx;
2712  }
2713  if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
/* NOTE(review): listing line 2714 (an inner condition) is missing here. */
2715  return dencidx;
2716  }
2717  }
2718  if (encoding_equal(sname, dname)) {
2719  return NIL_P(arg2) ? -1 : dencidx;
2720  }
2721  }
2722  else {
/* Identical names here: convert with empty encoding names. */
2723  if (encoding_equal(sname, dname)) {
2724  sname = "";
2725  dname = "";
2726  }
2727  }
2728 
2729  fromp = sp = (unsigned char *)RSTRING_PTR(str);
2730  slen = RSTRING_LEN(str);
2731  blen = slen + 30; /* len + margin */
2732  dest = rb_str_tmp_new(blen);
2733  bp = (unsigned char *)RSTRING_PTR(dest);
2734 
2735  transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
2736  if (fromp != sp+slen) {
2737  rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
2738  }
/* Re-read the buffer start from dest, then terminate and set the final length. */
2739  buf = (unsigned char *)RSTRING_PTR(dest);
2740  *bp = '\0';
2741  rb_str_set_len(dest, bp - buf);
2742 
2743  /* set encoding */
2744  if (!denc) {
2745  dencidx = rb_define_dummy_encoding(dname);
2746  }
2747  *self = dest;
2748 
2749  return dencidx;
2750 }
2751 
/*
 * Parse up to two positional arguments plus an optional trailing option
 * hash, prepare the conversion flags/options from that hash, and hand
 * over to str_transcode0().
 * NOTE(review): the signature line (listing line 2753) is missing from
 * this view; presumably str_transcode(int argc, VALUE *argv, VALUE *self),
 * judging by the callers in this file -- confirm.
 */
2752 static int
2754 {
2755  VALUE opt;
2756  int ecflags = 0;
2757  VALUE ecopts = Qnil;
2758 
2759  argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
2760  if (!NIL_P(opt)) {
2761  ecflags = rb_econv_prepare_opts(opt, &ecopts);
2762  }
2763  return str_transcode0(argc, argv, self, ecflags, ecopts);
2764 }
2765 
/*
 * Associate `str` with the encoding at index encidx, set its code range,
 * and return str.
 * NOTE(review): listing line 2775 (the code range chosen for
 * ASCII-compatible encodings) is missing from this view.
 */
2766 static inline VALUE
2767 str_encode_associate(VALUE str, int encidx)
2768 {
2769  int cr = 0;
2770 
2771  rb_enc_associate_index(str, encidx);
2772 
2773  /* a transcoded string is never broken. */
2774  if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
2776  }
2777  else {
2778  cr = ENC_CODERANGE_VALID;
2779  }
2780  ENC_CODERANGE_SET(str, cr);
2781  return str;
2782 }
2783 
2784 /*
2785  * call-seq:
2786  * str.encode!(encoding [, options] ) -> str
2787  * str.encode!(dst_encoding, src_encoding [, options] ) -> str
2788  *
2789  * The first form transcodes the contents of <i>str</i> from
2790  * str.encoding to +encoding+.
2791  * The second form transcodes the contents of <i>str</i> from
2792  * src_encoding to dst_encoding.
2793  * The options Hash gives details for conversion. See String#encode
2794  * for details.
2795  * Returns the string even if no changes were made.
2796  */
2797 
2798 static VALUE
2800 {
/* NOTE(review): the signature line (listing line 2799) is missing from this view. */
2801  VALUE newstr;
2802  int encidx;
2803 
2804  rb_check_frozen(str);
2805 
2806  newstr = str;
2807  encidx = str_transcode(argc, argv, &newstr);
2808 
/* Negative index: nothing to do, return the receiver untouched. */
2809  if (encidx < 0) return str;
/* No new string was produced: only re-label the receiver's encoding. */
2810  if (newstr == str) {
2811  rb_enc_associate_index(str, encidx);
2812  return str;
2813  }
/* Otherwise replace the receiver's contents with the converted string. */
2814  rb_str_shared_replace(str, newstr);
2815  return str_encode_associate(str, encidx);
2816 }
2817 
2818 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
2819 
2820 /*
2821  * call-seq:
2822  * str.encode(encoding [, options] ) -> str
2823  * str.encode(dst_encoding, src_encoding [, options] ) -> str
2824  * str.encode([options]) -> str
2825  *
2826  * The first form returns a copy of +str+ transcoded
2827  * to encoding +encoding+.
2828  * The second form returns a copy of +str+ transcoded
2829  * from src_encoding to dst_encoding.
2830  * The last form returns a copy of +str+ transcoded to
2831  * <tt>Encoding.default_internal</tt>.
2832  *
2833  * By default, the first and second form raise
2834  * Encoding::UndefinedConversionError for characters that are
2835  * undefined in the destination encoding, and
2836  * Encoding::InvalidByteSequenceError for invalid byte sequences
2837  * in the source encoding. The last form by default does not raise
2838  * exceptions but uses replacement strings.
2839  *
2840  * The +options+ Hash gives details for conversion and can have the following
2841  * keys:
2842  *
2843  * :invalid ::
2844  * If the value is +:replace+, #encode replaces invalid byte sequences in
2845  * +str+ with the replacement character. The default is to raise the
2846  * Encoding::InvalidByteSequenceError exception
2847  * :undef ::
2848  * If the value is +:replace+, #encode replaces characters which are
2849  * undefined in the destination encoding with the replacement character.
2850  * The default is to raise the Encoding::UndefinedConversionError.
2851  * :replace ::
2852  * Sets the replacement string to the given value. The default replacement
2853  * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
2854  * :fallback ::
2855  * Sets the replacement string by the given object for undefined
2856  * character. The object should be a Hash, a Proc, a Method, or an
2857  * object which has [] method.
2858  * Its key is an undefined character encoded in the source encoding
2859  * of the current transcoder. Its value can be in any encoding as long as
2860  * it can be converted into the destination encoding of the transcoder.
2861  * :xml ::
2862  * The value must be +:text+ or +:attr+.
2863  * If the value is +:text+ #encode replaces undefined characters with their
2864  * (upper-case hexadecimal) numeric character references. '&', '<', and '>'
2865  * are converted to "&amp;", "&lt;", and "&gt;", respectively.
2866  * If the value is +:attr+, #encode also quotes the replacement result
2867  * (using '"'), and replaces '"' with "&quot;".
2868  * :cr_newline ::
2869  * Replaces LF ("\n") with CR ("\r") if value is true.
2870  * :crlf_newline ::
2871  * Replaces LF ("\n") with CRLF ("\r\n") if value is true.
2872  * :universal_newline ::
2873  * Replaces CRLF ("\r\n") and CR ("\r") with LF ("\n") if value is true.
2874  */
2875 
2876 static VALUE
2878 {
/* NOTE(review): the signature line (listing line 2877) is missing from this view. */
/* Transcode without touching the receiver; encoded_dup() builds the returned copy. */
2879  VALUE newstr = str;
2880  int encidx = str_transcode(argc, argv, &newstr);
2881  return encoded_dup(newstr, str, encidx);
2882 }
2883 
2884 VALUE
2885 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
2886 {
2887  int argc = 1;
2888  VALUE *argv = &to;
2889  VALUE newstr = str;
2890  int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
2891  return encoded_dup(newstr, str, encidx);
2892 }
2893 
2894 static VALUE
2895 encoded_dup(VALUE newstr, VALUE str, int encidx)
2896 {
2897  if (encidx < 0) return rb_str_dup(str);
2898  if (newstr == str) {
2899  newstr = rb_str_dup(str);
2900  rb_enc_associate_index(newstr, encidx);
2901  return newstr;
2902  }
2903  else {
2904  RBASIC_SET_CLASS(newstr, rb_obj_class(str));
2905  }
2906  return str_encode_associate(newstr, encidx);
2907 }
2908 
2909 static void
2910 econv_free(void *ptr)
2911 {
2912  rb_econv_t *ec = ptr;
2913  rb_econv_close(ec);
2914 }
2915 
2916 static size_t
2917 econv_memsize(const void *ptr)
2918 {
2919  return ptr ? sizeof(rb_econv_t) : 0;
2920 }
2921 
/*
 * NOTE(review): fragment of a data-type table whose declaration line and
 * remaining entries (listing lines 2922, 2924-2925) are missing from this
 * view; presumably econv_data_type, which the surrounding code passes to
 * the TypedData_* macros -- confirm.
 */
2923  "econv",
2926 };
2927 
/*
 * Wrap a NULL converter pointer in a new econv_data_type object of class
 * klass.
 * NOTE(review): the signature line (listing line 2929) is missing from
 * this view.
 */
2928 static VALUE
2930 {
2931  return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
2932 }
2933 
/*
 * Define a dummy encoding called `name` and return its rb_encoding.
 * NOTE(review): the signature line (listing line 2935) is missing from
 * this view; presumably make_dummy_encoding(const char *name), judging by
 * the callers in this file -- confirm.
 */
2934 static rb_encoding *
2936 {
2937  rb_encoding *enc;
2938  int idx;
2939  idx = rb_define_dummy_encoding(name);
2940  enc = rb_enc_from_index(idx);
2941  return enc;
2942 }
2943 
2944 static rb_encoding *
2945 make_encoding(const char *name)
2946 {
2947  rb_encoding *enc;
2948  enc = rb_enc_find(name);
2949  if (!enc)
2950  enc = make_dummy_encoding(name);
2951  return enc;
2952 }
2953 
2954 static VALUE
2955 make_encobj(const char *name)
2956 {
2957  return rb_enc_from_encoding(make_encoding(name));
2958 }
2959 
2960 /*
2961  * call-seq:
2962  * Encoding::Converter.asciicompat_encoding(string) -> encoding or nil
2963  * Encoding::Converter.asciicompat_encoding(encoding) -> encoding or nil
2964  *
2965  * Returns the corresponding ASCII compatible encoding.
2966  *
2967  * Returns nil if the argument is an ASCII compatible encoding.
2968  *
2969  * The "corresponding ASCII compatible encoding" is an ASCII compatible encoding which
2970  * can represent exactly the same characters as the given ASCII incompatible encoding.
2971  * So, no conversion undefined error occurs when converting between the two encodings.
2972  *
2973  * Encoding::Converter.asciicompat_encoding("ISO-2022-JP") #=> #<Encoding:stateless-ISO-2022-JP>
2974  * Encoding::Converter.asciicompat_encoding("UTF-16BE") #=> #<Encoding:UTF-8>
2975  * Encoding::Converter.asciicompat_encoding("UTF-8") #=> nil
2976  *
2977  */
2978 static VALUE
2980 {
/* NOTE(review): the signature line (listing line 2979) is missing from this view. */
2981  const char *arg_name, *result_name;
2982  rb_encoding *arg_enc, *result_enc;
2983 
/* Only the resolved name is used for the lookup; arg_enc is not read afterwards. */
2984  enc_arg(&arg, &arg_name, &arg_enc);
2985 
2986  result_name = rb_econv_asciicompat_encoding(arg_name);
2987 
2988  if (result_name == NULL)
2989  return Qnil;
2990 
2991  result_enc = make_encoding(result_name);
2992 
2993  return rb_enc_from_encoding(result_enc);
2994 }
2995 
/*
 * Parse the arguments shared by the converter entry points:
 * (source_encoding, destination_encoding [, flags-or-options]).
 *
 * The third argument may be an Integer of flags or an option hash, but
 * not both (an arity error is raised in that case).  On return:
 *  - *snamev_p / *dnamev_p hold the two encoding arguments,
 *  - *sname_p / *dname_p their names,
 *  - *senc_p / *denc_p the resolved encodings (NULL when the argument
 *    does not map to a known encoding),
 *  - *ecflags_p / *ecopts_p the conversion flags and prepared options.
 *
 * NOTE(review): the first signature line (listing line 2997) is missing
 * from this view; presumably econv_args(int argc, VALUE *argv, ...),
 * judging by the callers in this file -- confirm.
 */
2996 static void
2998  volatile VALUE *snamev_p, volatile VALUE *dnamev_p,
2999  const char **sname_p, const char **dname_p,
3000  rb_encoding **senc_p, rb_encoding **denc_p,
3001  int *ecflags_p,
3002  VALUE *ecopts_p)
3003 {
3004  VALUE opt, flags_v, ecopts;
3005  int sidx, didx;
3006  const char *sname, *dname;
3007  rb_encoding *senc, *denc;
3008  int ecflags;
3009 
3010  argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
3011 
3012  if (!NIL_P(flags_v)) {
/* Integer flags and an option hash are mutually exclusive. */
3013  if (!NIL_P(opt)) {
3014  rb_error_arity(argc + 1, 2, 3);
3015  }
3016  ecflags = NUM2INT(rb_to_int(flags_v));
3017  ecopts = Qnil;
3018  }
3019  else if (!NIL_P(opt)) {
3020  ecflags = rb_econv_prepare_opts(opt, &ecopts);
3021  }
3022  else {
3023  ecflags = 0;
3024  ecopts = Qnil;
3025  }
3026 
3027  senc = NULL;
3028  sidx = rb_to_encoding_index(*snamev_p);
3029  if (0 <= sidx) {
3030  senc = rb_enc_from_index(sidx);
3031  }
3032  else {
3033  StringValue(*snamev_p);
3034  }
3035 
3036  denc = NULL;
3037  didx = rb_to_encoding_index(*dnamev_p);
3038  if (0 <= didx) {
3039  denc = rb_enc_from_index(didx);
3040  }
3041  else {
3042  StringValue(*dnamev_p);
3043  }
3044 
/* Prefer the resolved encoding's name; otherwise use the argument string itself. */
3045  sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
3046  dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
3047 
3048  *sname_p = sname;
3049  *dname_p = dname;
3050  *senc_p = senc;
3051  *denc_p = denc;
3052  *ecflags_p = ecflags;
3053  *ecopts_p = ecopts;
3054 }
3055 
/*
 * Insert the decorator names implied by ecflags into the array convpath.
 *
 * Returns 0 on success, or -1 when decorator_names() fails or the
 * transcoder for the last pair cannot be loaded.  When the last element
 * is moved to the new end of the array, the decorators are written
 * before it; otherwise they are written starting at the old length.
 *
 * NOTE(review): listing line 3079 (second half of the condition that
 * decides whether the last pair stays last) is missing from this view.
 */
3056 static int
3057 decorate_convpath(VALUE convpath, int ecflags)
3058 {
3059  int num_decorators;
3060  const char *decorators[MAX_ECFLAGS_DECORATORS];
3061  int i;
3062  int n, len;
3063 
3064  num_decorators = decorator_names(ecflags, decorators);
3065  if (num_decorators == -1)
3066  return -1;
3067 
3068  len = n = RARRAY_LENINT(convpath);
3069  if (n != 0) {
3070  VALUE pair = RARRAY_AREF(convpath, n-1);
3071  if (RB_TYPE_P(pair, T_ARRAY)) {
3072  const char *sname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 0)));
3073  const char *dname = rb_enc_name(rb_to_encoding(RARRAY_AREF(pair, 1)));
3074  transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
3075  const rb_transcoder *tr = load_transcoder_entry(entry);
3076  if (!tr)
3077  return -1;
/* NOTE(review): listing line 3079 (rest of this condition) is missing. */
3078  if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
3080  n--;
3081  rb_ary_store(convpath, len + num_decorators - 1, pair);
3082  }
3083  }
3084  else {
3085  rb_ary_store(convpath, len + num_decorators - 1, pair);
3086  }
3087  }
3088 
3089  for (i = 0; i < num_decorators; i++)
3090  rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
3091 
3092  return 0;
3093 }
3094 
3095 static void
3096 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3097 {
3098  VALUE *ary_p = arg;
3099  VALUE v;
3100 
3101  if (*ary_p == Qnil) {
3102  *ary_p = rb_ary_new();
3103  }
3104 
3105  if (DECORATOR_P(sname, dname)) {
3106  v = rb_str_new_cstr(dname);
3107  }
3108  else {
3109  v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
3110  }
3111  rb_ary_store(*ary_p, depth, v);
3112 }
3113 
3114 /*
3115  * call-seq:
3116  * Encoding::Converter.search_convpath(source_encoding, destination_encoding) -> ary
3117  * Encoding::Converter.search_convpath(source_encoding, destination_encoding, opt) -> ary
3118  *
3119  * Returns a conversion path.
3120  *
3121  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP")
3122  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3123  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>]]
3124  *
3125  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", universal_newline: true)
3126  * or
3127  * p Encoding::Converter.search_convpath("ISO-8859-1", "EUC-JP", newline: :universal)
3128  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3129  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3130  * # "universal_newline"]
3131  *
3132  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", universal_newline: true)
3133  * or
3134  * p Encoding::Converter.search_convpath("ISO-8859-1", "UTF-32BE", newline: :universal)
3135  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3136  * # "universal_newline",
3137  * # [#<Encoding:UTF-8>, #<Encoding:UTF-32BE>]]
3138  */
3139 static VALUE
3141 {
/* NOTE(review): the signature line (listing line 3140) is missing from this view. */
3142  volatile VALUE snamev, dnamev;
3143  const char *sname, *dname;
3144  rb_encoding *senc, *denc;
3145  int ecflags;
3146  VALUE ecopts;
3147  VALUE convpath;
3148 
3149  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3150 
/* convpath stays nil unless the search invokes the callback at least once. */
3151  convpath = Qnil;
3152  transcode_search_path(sname, dname, search_convpath_i, &convpath);
3153 
3154  if (NIL_P(convpath))
3155  rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
3156 
/* Add the decorators requested through ecflags; failure is reported with the same exception. */
3157  if (decorate_convpath(convpath, ecflags) == -1)
3158  rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
3159 
3160  return convpath;
3161 }
3162 
3163 /*
3164  * Check the existence of a conversion path.
3165  * Returns non-zero if a conversion path from from_encoding to
3166  * to_encoding exists, and 0 otherwise.
3167  */
3168 int
3169 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
3170 {
3171  VALUE convpath = Qnil;
3172  transcode_search_path(from_encoding, to_encoding, search_convpath_i,
3173  &convpath);
3174  return RTEST(convpath);
3175 }
3176 
/*
 * NOTE(review): tail of a struct whose opening lines (listing lines
 * 3177-3178) are missing from this view; presumably
 * struct rb_econv_init_by_convpath_t, whose users also read an `ec`
 * member -- confirm.
 */
3179  int index; /* position passed to rb_econv_add_converter() by the callback */
3180  int ret; /* result of the last add; -1 stops further adds */
3181 };
3182 
3183 static void
3184 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
3185 {
3186  struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
3187  int ret;
3188 
3189  if (a->ret == -1)
3190  return;
3191 
3192  ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
3193 
3194  a->ret = ret;
3195  return;
3196 }
3197 
/*
 * Build a converter from an explicit conversion path array.
 *
 * Each element of convpath is either a two-element array (an encoding
 * conversion) or a string (a decorator name).  The source name/encoding
 * of the first conversion and the destination name/encoding of the last
 * one are reported through the out-parameters; when the path contains no
 * encoding conversion, the names are "" and the encodings NULL.
 *
 * Raises ArgumentError for a pair that does not have exactly two
 * elements, or when a decorator or conversion cannot be added.
 *
 * NOTE(review): the first signature line (listing line 3199) is missing
 * from this view; presumably
 * rb_econv_init_by_convpath(VALUE self, VALUE convpath, ...), judging by
 * the caller in this file -- confirm.
 */
3198 static rb_econv_t *
3200  const char **sname_p, const char **dname_p,
3201  rb_encoding **senc_p, rb_encoding**denc_p)
3202 {
3203  rb_econv_t *ec;
3204  long i;
3205  int ret, first=1;
3206  VALUE elt;
3207  rb_encoding *senc = 0, *denc = 0;
3208  const char *sname, *dname;
3209 
3210  ec = rb_econv_alloc(RARRAY_LENINT(convpath));
/* Attach ec to self before any of the raising calls below. */
3211  DATA_PTR(self) = ec;
3212 
3213  for (i = 0; i < RARRAY_LEN(convpath); i++) {
3214  volatile VALUE snamev, dnamev;
3215  VALUE pair;
3216  elt = rb_ary_entry(convpath, i);
3217  if (!NIL_P(pair = rb_check_array_type(elt))) {
3218  if (RARRAY_LEN(pair) != 2)
3219  rb_raise(rb_eArgError, "not a 2-element array in convpath");
3220  snamev = rb_ary_entry(pair, 0);
3221  enc_arg(&snamev, &sname, &senc);
3222  dnamev = rb_ary_entry(pair, 1);
3223  enc_arg(&dnamev, &dname, &denc);
3224  }
3225  else {
/* A non-array element is a decorator: empty source name, the string as destination. */
3226  sname = "";
3227  dname = StringValueCStr(elt);
3228  }
3229  if (DECORATOR_P(sname, dname)) {
3230  ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
3231  if (ret == -1)
3232  rb_raise(rb_eArgError, "decoration failed: %s", dname);
3233  }
3234  else {
/* j: index of the first transcoder this conversion will add. */
3235  int j = ec->num_trans;
3236  struct rb_econv_init_by_convpath_t arg;
3237  arg.ec = ec;
3238  arg.index = ec->num_trans;
3239  arg.ret = 0;
3240  ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
3241  if (ret == -1 || arg.ret == -1)
3242  rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname);
/* The overall source comes from the first conversion only. */
3243  if (first) {
3244  first = 0;
3245  *senc_p = senc;
3246  *sname_p = ec->elems[j].tc->transcoder->src_encoding;
3247  }
/* The overall destination is overwritten by every conversion, so the last one wins. */
3248  *denc_p = denc;
3249  *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
3250  }
3251  }
3252 
/* No encoding conversion in the path (decorators only, or empty). */
3253  if (first) {
3254  *senc_p = NULL;
3255  *denc_p = NULL;
3256  *sname_p = "";
3257  *dname_p = "";
3258  }
3259 
3260  ec->source_encoding_name = *sname_p;
3261  ec->destination_encoding_name = *dname_p;
3262 
3263  return ec;
3264 }
3265 
3266 /*
3267  * call-seq:
3268  * Encoding::Converter.new(source_encoding, destination_encoding)
3269  * Encoding::Converter.new(source_encoding, destination_encoding, opt)
3270  * Encoding::Converter.new(convpath)
3271  *
3272  * possible options elements:
3273  * hash form:
3274  * :invalid => nil # raise error on invalid byte sequence (default)
3275  * :invalid => :replace # replace invalid byte sequence
3276  * :undef => nil # raise error on undefined conversion (default)
3277  * :undef => :replace # replace undefined conversion
3278  * :replace => string # replacement string ("?" or "\uFFFD" if not specified)
3279  * :newline => :universal # decorator for converting CRLF and CR to LF
3280  * :newline => :crlf # decorator for converting LF to CRLF
3281  * :newline => :cr # decorator for converting LF to CR
3282  * :universal_newline => true # decorator for converting CRLF and CR to LF
3283  * :crlf_newline => true # decorator for converting LF to CRLF
3284  * :cr_newline => true # decorator for converting LF to CR
3285  * :xml => :text # escape as XML CharData.
3286  * :xml => :attr # escape as XML AttValue
3287  * integer form:
3288  * Encoding::Converter::INVALID_REPLACE
3289  * Encoding::Converter::UNDEF_REPLACE
3290  * Encoding::Converter::UNDEF_HEX_CHARREF
3291  * Encoding::Converter::UNIVERSAL_NEWLINE_DECORATOR
3292  * Encoding::Converter::CRLF_NEWLINE_DECORATOR
3293  * Encoding::Converter::CR_NEWLINE_DECORATOR
3294  * Encoding::Converter::XML_TEXT_DECORATOR
3295  * Encoding::Converter::XML_ATTR_CONTENT_DECORATOR
3296  * Encoding::Converter::XML_ATTR_QUOTE_DECORATOR
3297  *
3298  * Encoding::Converter.new creates an instance of Encoding::Converter.
3299  *
3300  * Source_encoding and destination_encoding should be a string or
3301  * Encoding object.
3302  *
3303  * opt should be nil, a hash or an integer.
3304  *
3305  * convpath should be an array.
3306  * convpath may contain
3307  * - two-element arrays which contain encodings or encoding names, or
3308  * - strings representing decorator names.
3309  *
3310  * Encoding::Converter.new optionally takes an option.
3311  * The option should be a hash or an integer.
3312  * The option hash can contain :invalid => nil, etc.
3313  * The option integer should be logical-or of constants such as
3314  * Encoding::Converter::INVALID_REPLACE, etc.
3315  *
3316  * [:invalid => nil]
3317  * Raise error on invalid byte sequence. This is a default behavior.
3318  * [:invalid => :replace]
3319  * Replace invalid byte sequence by replacement string.
3320  * [:undef => nil]
3321  * Raise an error if a character in source_encoding is not defined in destination_encoding.
3322  * This is a default behavior.
3323  * [:undef => :replace]
3324  * Replace undefined character in destination_encoding with replacement string.
3325  * [:replace => string]
3326  * Specify the replacement string.
3327  * If not specified, "\uFFFD" is used for Unicode encodings and "?" for others.
3328  * [:universal_newline => true]
3329  * Convert CRLF and CR to LF.
3330  * [:crlf_newline => true]
3331  * Convert LF to CRLF.
3332  * [:cr_newline => true]
3333  * Convert LF to CR.
3334  * [:xml => :text]
3335  * Escape as XML CharData.
3336  * This form can be used as a HTML 4.0 #PCDATA.
3337  * - '&' -> '&amp;'
3338  * - '<' -> '&lt;'
3339  * - '>' -> '&gt;'
3340  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3341  * [:xml => :attr]
3342  * Escape as XML AttValue.
3343  * The converted result is quoted as "...".
3344  * This form can be used as a HTML 4.0 attribute value.
3345  * - '&' -> '&amp;'
3346  * - '<' -> '&lt;'
3347  * - '>' -> '&gt;'
3348  * - '"' -> '&quot;'
3349  * - undefined characters in destination_encoding -> hexadecimal CharRef such as &#xHH;
3350  *
3351  * Examples:
3352  * # UTF-16BE to UTF-8
3353  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3354  *
3355  * # Usually, decorators such as newline conversion are inserted last.
3356  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8", :universal_newline => true)
3357  * p ec.convpath #=> [[#<Encoding:UTF-16BE>, #<Encoding:UTF-8>],
3358  * # "universal_newline"]
3359  *
3360  * # But, if the last encoding is ASCII incompatible,
3361  * # decorators are inserted before the last conversion.
3362  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE", :crlf_newline => true)
3363  * p ec.convpath #=> ["crlf_newline",
3364  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3365  *
3366  * # Conversion path can be specified directly.
3367  * ec = Encoding::Converter.new(["universal_newline", ["EUC-JP", "UTF-8"], ["UTF-8", "UTF-16BE"]])
3368  * p ec.convpath #=> ["universal_newline",
3369  * # [#<Encoding:EUC-JP>, #<Encoding:UTF-8>],
3370  * # [#<Encoding:UTF-8>, #<Encoding:UTF-16BE>]]
3371  */
3372 static VALUE
3374 {
/* NOTE(review): the signature line (listing line 3373) is missing from this view. */
3375  VALUE ecopts;
3376  volatile VALUE snamev, dnamev;
3377  const char *sname, *dname;
3378  rb_encoding *senc, *denc;
3379  rb_econv_t *ec;
3380  int ecflags;
3381  VALUE convpath;
3382 
/* A non-NULL data pointer means this object already went through initialization. */
3383  if (rb_check_typeddata(self, &econv_data_type)) {
3384  rb_raise(rb_eTypeError, "already initialized");
3385  }
3386 
/* A single Array argument is an explicit conversion path; anything else is (src, dst [, opt]). */
3387  if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
3388  ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
3389  ecflags = 0;
3390  ecopts = Qnil;
3391  }
3392  else {
3393  econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
3394  ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
3395  }
3396 
3397  if (!ec) {
3398  rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
3399  }
3400 
/* For real encoding conversions, substitute dummy encodings for unresolved names. */
3401  if (!DECORATOR_P(sname, dname)) {
3402  if (!senc)
3403  senc = make_dummy_encoding(sname);
3404  if (!denc)
3405  denc = make_dummy_encoding(dname);
3406  }
3407 
3408  ec->source_encoding = senc;
3409  ec->destination_encoding = denc;
3410 
3411  DATA_PTR(self) = ec;
3412 
3413  return self;
3414 }
3415 
3416 /*
3417  * call-seq:
3418  * ec.inspect -> string
3419  *
3420  * Returns a printable version of <i>ec</i>
3421  *
3422  * ec = Encoding::Converter.new("iso-8859-1", "utf-8")
3423  * puts ec.inspect #=> #<Encoding::Converter: ISO-8859-1 to UTF-8>
3424  *
3425  */
3426 static VALUE
3428 {
/* NOTE(review): the signature line (listing line 3427) is missing from this view. */
3429  const char *cname = rb_obj_classname(self);
3430  rb_econv_t *ec;
3431 
3432  TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
/* An allocated but not yet initialized object has a NULL converter. */
3433  if (!ec)
3434  return rb_sprintf("#<%s: uninitialized>", cname);
3435  else {
3436  const char *sname = ec->source_encoding_name;
3437  const char *dname = ec->destination_encoding_name;
3438  VALUE str;
3439  str = rb_sprintf("#<%s: ", cname);
/* econv_description() is given str and adds the conversion description to it. */
3440  econv_description(sname, dname, ec->flags, str);
3441  rb_str_cat2(str, ">");
3442  return str;
3443  }
3444 }
3445 
/*
 * Fetch the rb_econv_t wrapped by self, raising TypeError when the
 * object has not been initialized (NULL converter).
 * NOTE(review): the signature line (listing line 3447) is missing from
 * this view; presumably check_econv(VALUE self), judging by the callers
 * in this file -- confirm.
 */
3446 static rb_econv_t *
3448 {
3449  rb_econv_t *ec;
3450 
3451  TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
3452  if (!ec) {
3453  rb_raise(rb_eTypeError, "uninitialized encoding converter");
3454  }
3455  return ec;
3456 }
3457 
3458 /*
3459  * call-seq:
3460  * ec.source_encoding -> encoding
3461  *
3462  * Returns the source encoding as an Encoding object.
3463  */
3464 static VALUE
3466 {
/* NOTE(review): the signature line (listing line 3465) and the final return (3470) are missing from this view. */
3467  rb_econv_t *ec = check_econv(self);
/* nil when the converter has no source encoding recorded. */
3468  if (!ec->source_encoding)
3469  return Qnil;
3471 }
3472 
3473 /*
3474  * call-seq:
3475  * ec.destination_encoding -> encoding
3476  *
3477  * Returns the destination encoding as an Encoding object.
3478  */
3479 static VALUE
3481 {
/* NOTE(review): the signature line (listing line 3480) and the final return (3485) are missing from this view. */
3482  rb_econv_t *ec = check_econv(self);
/* nil when the converter has no destination encoding recorded. */
3483  if (!ec->destination_encoding)
3484  return Qnil;
3486 }
3487 
3488 /*
3489  * call-seq:
3490  * ec.convpath -> ary
3491  *
3492  * Returns the conversion path of ec.
3493  *
3494  * The result is an array of conversions.
3495  *
3496  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP", crlf_newline: true)
3497  * p ec.convpath
3498  * #=> [[#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>],
3499  * # [#<Encoding:UTF-8>, #<Encoding:EUC-JP>],
3500  * # "crlf_newline"]
3501  *
3502  * Each element of the array is a pair of encodings or a string.
3503  * A pair means an encoding conversion.
3504  * A string means a decorator.
3505  *
3506  * In the above example, [#<Encoding:ISO-8859-1>, #<Encoding:UTF-8>] means
3507  * a converter from ISO-8859-1 to UTF-8.
3508  * "crlf_newline" means newline converter from LF to CRLF.
3509  */
3510 static VALUE
3512 {
/* NOTE(review): the signature line (listing line 3511) is missing from this view. */
3513  rb_econv_t *ec = check_econv(self);
3514  VALUE result;
3515  int i;
3516 
3517  result = rb_ary_new();
/* One result element per transcoder, in converter order. */
3518  for (i = 0; i < ec->num_trans; i++) {
3519  const rb_transcoder *tr = ec->elems[i].tc->transcoder;
3520  VALUE v;
/* Decorators are reported by name (their destination "encoding"). */
3521  if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
3522  v = rb_str_new_cstr(tr->dst_encoding);
3523  else
/* NOTE(review): listing line 3524 (the value built for a real conversion) is missing here. */
3525  rb_ary_push(result, v);
3526  }
3527  return result;
3528 }
3529 
3530 /*
3531  * call-seq:
3532  * ec == other -> true or false
3533  */
3534 static VALUE
3536 {
/* NOTE(review): the signature line (listing line 3535) is missing from this view. */
3537  rb_econv_t *ec1 = check_econv(self);
3538  rb_econv_t *ec2;
3539  int i;
3540 
/* Note: nil, not false, is returned when other is not a converter object. */
3541  if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
3542  return Qnil;
3543  }
3544  ec2 = DATA_PTR(other);
3545  if (!ec2) return Qfalse;
/* Names are compared by pointer first; strcmp() runs only when the pointers differ. */
3546  if (ec1->source_encoding_name != ec2->source_encoding_name &&
3547  strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
3548  return Qfalse;
/* NOTE(review): listing lines 3549-3550 (presumably the destination-name comparison) are missing here. */
3551  return Qfalse;
3552  if (ec1->flags != ec2->flags) return Qfalse;
/* NOTE(review): assumes replacement_enc is non-NULL on both sides whenever the pointers differ -- TODO confirm. */
3553  if (ec1->replacement_enc != ec2->replacement_enc &&
3554  strcmp(ec1->replacement_enc, ec2->replacement_enc))
3555  return Qfalse;
3556  if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
/* NOTE(review): listing line 3558 (the content comparison of the replacement strings) is missing here. */
3557  if (ec1->replacement_str != ec2->replacement_str &&
3559  return Qfalse;
3560 
/* Finally, both converters must use the same transcoders in the same order. */
3561  if (ec1->num_trans != ec2->num_trans) return Qfalse;
3562  for (i = 0; i < ec1->num_trans; i++) {
3563  if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
3564  return Qfalse;
3565  }
3566  return Qtrue;
3567 }
3568 
3569 static VALUE
3571 {
 /* Maps a conversion result code (res) to its Ruby symbol. */
 /* NOTE(review): listing lines 3570 (signature) and 3573-3577 (the first five case labels) are absent from this extract. */
3572  switch (res) {
3578  case econv_finished: return sym_finished;
3579  case econv_after_output: return sym_after_output;
 /* Unknown codes fall back to the raw integer value. */
3580  default: return INT2NUM(res); /* should not be reached */
3581  }
3582 }
3583 
3584 /*
3585  * call-seq:
3586  * ec.primitive_convert(source_buffer, destination_buffer) -> symbol
3587  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset) -> symbol
3588  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize) -> symbol
3589  * ec.primitive_convert(source_buffer, destination_buffer, destination_byteoffset, destination_bytesize, opt) -> symbol
3590  *
3591  * possible opt elements:
3592  * hash form:
3593  * :partial_input => true # source buffer may be part of larger source
3594  * :after_output => true # stop conversion after output before input
3595  * integer form:
3596  * Encoding::Converter::PARTIAL_INPUT
3597  * Encoding::Converter::AFTER_OUTPUT
3598  *
3599  * possible results:
3600  * :invalid_byte_sequence
3601  * :incomplete_input
3602  * :undefined_conversion
3603  * :after_output
3604  * :destination_buffer_full
3605  * :source_buffer_empty
3606  * :finished
3607  *
3608  * primitive_convert converts source_buffer into destination_buffer.
3609  *
3610  * source_buffer should be a string or nil.
3611  * nil means an empty string.
3612  *
3613  * destination_buffer should be a string.
3614  *
3615  * destination_byteoffset should be an integer or nil.
3616  * nil means the end of destination_buffer.
3617  * If it is omitted, nil is assumed.
3618  *
3619  * destination_bytesize should be an integer or nil.
3620  * nil means unlimited.
3621  * If it is omitted, nil is assumed.
3622  *
3623  * opt should be nil, a hash or an integer.
3624  * nil means no flags.
3625  * If it is omitted, nil is assumed.
3626  *
3627  * primitive_convert converts the content of source_buffer from the beginning
3628  * and stores the result into destination_buffer.
3629  *
3630  * destination_byteoffset and destination_bytesize specify the region in which
3631  * the converted result is stored.
3632  * destination_byteoffset specifies the start position in destination_buffer in bytes.
3633  * If destination_byteoffset is nil,
3634  * destination_buffer.bytesize is used for appending the result.
3635  * destination_bytesize specifies maximum number of bytes.
3636  * If destination_bytesize is nil,
3637  * destination size is unlimited.
3638  * After conversion, destination_buffer is resized to
3639  * destination_byteoffset + actually produced number of bytes.
3640  * Also destination_buffer's encoding is set to destination_encoding.
3641  *
3642  * primitive_convert drops the converted part of source_buffer.
3643  * The dropped part is either converted into destination_buffer or
3644  * buffered in the Encoding::Converter object.
3645  *
3646  * primitive_convert stops conversion when one of the following conditions is met.
3647  * - invalid byte sequence found in source buffer (:invalid_byte_sequence)
3648  * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3649  * - unexpected end of source buffer (:incomplete_input)
3650  * this occurs only when :partial_input is not specified.
3651  * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3652  * - character not representable in output encoding (:undefined_conversion)
3653  * +primitive_errinfo+ and +last_error+ methods return the details of the error.
3654  * - after some output is generated, before input is done (:after_output)
3655  * this occurs only when :after_output is specified.
3656  * - destination buffer is full (:destination_buffer_full)
3657  * this occurs only when destination_bytesize is non-nil.
3658  * - source buffer is empty (:source_buffer_empty)
3659  * this occurs only when :partial_input is specified.
3660  * - conversion is finished (:finished)
3661  *
3662  * example:
3663  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3664  * ret = ec.primitive_convert(src="pi", dst="", nil, 100)
3665  * p [ret, src, dst] #=> [:finished, "", "\x00p\x00i"]
3666  *
3667  * ec = Encoding::Converter.new("UTF-8", "UTF-16BE")
3668  * ret = ec.primitive_convert(src="pi", dst="", nil, 1)
3669  * p [ret, src, dst] #=> [:destination_buffer_full, "i", "\x00"]
3670  * ret = ec.primitive_convert(src, dst="", nil, 1)
3671  * p [ret, src, dst] #=> [:destination_buffer_full, "", "p"]
3672  * ret = ec.primitive_convert(src, dst="", nil, 1)
3673  * p [ret, src, dst] #=> [:destination_buffer_full, "", "\x00"]
3674  * ret = ec.primitive_convert(src, dst="", nil, 1)
3675  * p [ret, src, dst] #=> [:finished, "", "i"]
3676  *
3677  */
3678 static VALUE
3680 {
 /* NOTE(review): listing line 3679 (the signature) is absent from this extract; the body uses argc, argv and self. */
3681  VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
3682  rb_econv_t *ec = check_econv(self);
3683  rb_econv_result_t res;
3684  const unsigned char *ip, *is;
3685  unsigned char *op, *os;
3686  long output_byteoffset, output_bytesize;
3687  unsigned long output_byteend;
3688  int flags;
3689 
 /* "23:" = two required and up to three optional positional arguments, plus an optional trailing option hash (opt). */
3690  argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
3691 
 /* A nil offset or size gets a placeholder here; the real value is computed further down. */
3692  if (NIL_P(output_byteoffset_v))
3693  output_byteoffset = 0; /* dummy */
3694  else
3695  output_byteoffset = NUM2LONG(output_byteoffset_v);
3696 
3697  if (NIL_P(output_bytesize_v))
3698  output_bytesize = 0; /* dummy */
3699  else
3700  output_bytesize = NUM2LONG(output_bytesize_v);
3701 
 /* Flags come either from the integer argument or from the option hash; supplying both is reported as an arity error. */
3702  if (!NIL_P(flags_v)) {
3703  if (!NIL_P(opt)) {
3704  rb_error_arity(argc + 1, 2, 5);
3705  }
3706  flags = NUM2INT(rb_to_int(flags_v));
3707  }
3708  else if (!NIL_P(opt)) {
3709  VALUE v;
3710  flags = 0;
3711  v = rb_hash_aref(opt, sym_partial_input);
3712  if (RTEST(v))
3713  flags |= ECONV_PARTIAL_INPUT;
3714  v = rb_hash_aref(opt, sym_after_output);
3715  if (RTEST(v))
3716  flags |= ECONV_AFTER_OUTPUT;
3717  }
3718  else {
3719  flags = 0;
3720  }
3721 
3722  StringValue(output);
3723  if (!NIL_P(input))
3724  StringValue(input);
3725  rb_str_modify(output);
3726 
 /* Unlimited destination size: start with a window of RSTRING_EMBED_LEN_MAX bytes, or the input length if that is larger. */
3727  if (NIL_P(output_bytesize_v)) {
3728  output_bytesize = RSTRING_EMBED_LEN_MAX;
3729  if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
3730  output_bytesize = RSTRING_LEN(input);
3731  }
3732 
 /* Re-entered from the bottom when an automatically sized window filled up. */
3733  retry:
3734 
 /* A nil offset means "append": write at the current end of output. */
3735  if (NIL_P(output_byteoffset_v))
3736  output_byteoffset = RSTRING_LEN(output);
3737 
3738  if (output_byteoffset < 0)
3739  rb_raise(rb_eArgError, "negative output_byteoffset");
3740 
3741  if (RSTRING_LEN(output) < output_byteoffset)
3742  rb_raise(rb_eArgError, "output_byteoffset too big");
3743 
3744  if (output_bytesize < 0)
3745  rb_raise(rb_eArgError, "negative output_bytesize");
3746 
 /* The sum is computed in unsigned long so that wrap-around can be detected by the check below. */
3747  output_byteend = (unsigned long)output_byteoffset +
3748  (unsigned long)output_bytesize;
3749 
3750  if (output_byteend < (unsigned long)output_byteoffset ||
3751  LONG_MAX < output_byteend)
3752  rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
3753 
 /* Grow the destination string when its capacity cannot hold the whole window. */
3754  if (rb_str_capacity(output) < output_byteend)
3755  rb_str_resize(output, output_byteend);
3756 
 /* ip/is delimit the source bytes; both are NULL when input is nil. */
3757  if (NIL_P(input)) {
3758  ip = is = NULL;
3759  }
3760  else {
3761  ip = (const unsigned char *)RSTRING_PTR(input);
3762  is = ip + RSTRING_LEN(input);
3763  }
3764 
 /* op/os delimit the destination window inside output. */
3765  op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
3766  os = op + output_bytesize;
3767 
3768  res = rb_econv_convert(ec, &ip, is, &op, os, flags);
 /* Set output's length to where op ended up, and drop the bytes up to ip from the front of input. */
3769  rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
3770  if (!NIL_P(input))
3771  rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
3772 
 /* With no caller-supplied size limit, a full window is not reported: double the window and go round again, */
 /* switching to append mode so the next pass continues at the new end of output. */
3773  if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
3774  if (LONG_MAX / 2 < output_bytesize)
3775  rb_raise(rb_eArgError, "too long conversion result");
3776  output_bytesize *= 2;
3777  output_byteoffset_v = Qnil;
3778  goto retry;
3779  }
3780 
 /* NOTE(review): listing line 3782 (the body of this if) is absent from this extract; the method documentation says the destination buffer's encoding is set to destination_encoding. */
3781  if (ec->destination_encoding) {
3783  }
3784 
3785  return econv_result_to_symbol(res);
3786 }
3787 
3788 /*
3789  * call-seq:
3790  * ec.convert(source_string) -> destination_string
3791  *
3792  * Convert source_string and return destination_string.
3793  *
3794  * source_string is assumed as a part of source.
3795  * i.e. :partial_input=>true is specified internally.
3796  * finish method should be used last.
3797  *
3798  * ec = Encoding::Converter.new("utf-8", "euc-jp")
3799  * puts ec.convert("\u3042").dump #=> "\xA4\xA2"
3800  * puts ec.finish.dump #=> ""
3801  *
3802  * ec = Encoding::Converter.new("euc-jp", "utf-8")
3803  * puts ec.convert("\xA4").dump #=> ""
3804  * puts ec.convert("\xA2").dump #=> "\xE3\x81\x82"
3805  * puts ec.finish.dump #=> ""
3806  *
3807  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3808  * puts ec.convert("\xE3").dump #=> "".force_encoding("ISO-2022-JP")
3809  * puts ec.convert("\x81").dump #=> "".force_encoding("ISO-2022-JP")
3810  * puts ec.convert("\x82").dump #=> "\e$B$\"".force_encoding("ISO-2022-JP")
3811  * puts ec.finish.dump #=> "\e(B".force_encoding("ISO-2022-JP")
3812  *
3813  * If a conversion error occurs,
3814  * Encoding::UndefinedConversionError or
3815  * Encoding::InvalidByteSequenceError is raised.
3816  * Encoding::Converter#convert doesn't supply methods to recover or restart
3817  * from these exceptions.
3818  * When you want to handle these conversion errors,
3819  * use Encoding::Converter#primitive_convert.
3820  *
3821  */
3822 static VALUE
3823 econv_convert(VALUE self, VALUE source_string)
3824 {
 /* Implemented on top of econv_primitive_convert: builds a 5-element argument vector and interprets the returned symbol. */
3825  VALUE ret, dst;
3826  VALUE av[5];
3827  int ac;
3828  rb_econv_t *ec = check_econv(self);
3829 
3830  StringValue(source_string);
3831 
3832  dst = rb_str_new(NULL, 0);
3833 
 /* A duplicate is passed as the source buffer because primitive_convert drops the converted part of the buffer it is given. */
3834  av[0] = rb_str_dup(source_string);
3835  av[1] = dst;
 /* nil offset and nil size: append to dst with no size limit. */
3836  av[2] = Qnil;
3837  av[3] = Qnil;
3838  av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
3839  ac = 5;
3840 
3841  ret = econv_primitive_convert(ac, av, self);
3842 
 /* Conversion errors are raised as the exception built from the converter's state. */
3843  if (ret == sym_invalid_byte_sequence ||
3844  ret == sym_undefined_conversion ||
3845  ret == sym_incomplete_input) {
3846  VALUE exc = make_econv_exception(ec);
3847  rb_exc_raise(exc);
3848  }
3849 
3850  if (ret == sym_finished) {
3851  rb_raise(rb_eArgError, "converter already finished");
3852  }
3853 
 /* Any result other than :source_buffer_empty at this point is treated as an interpreter bug. */
3854  if (ret != sym_source_buffer_empty) {
3855  rb_bug("unexpected result of econv_primitive_convert");
3856  }
3857 
3858  return dst;
3859 }
3860 
3861 /*
3862  * call-seq:
3863  * ec.finish -> string
3864  *
3865  * Finishes the converter.
3866  * It returns the last part of the converted string.
3867  *
3868  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
3869  * p ec.convert("\u3042") #=> "\e$B$\""
3870  * p ec.finish #=> "\e(B"
3871  */
3872 static VALUE
3874 {
 /* NOTE(review): listing line 3873 (the signature) is absent from this extract. */
3875  VALUE ret, dst;
3876  VALUE av[5];
3877  int ac;
3878  rb_econv_t *ec = check_econv(self);
3879 
3880  dst = rb_str_new(NULL, 0);
3881 
 /* nil source buffer and flags 0 (no PARTIAL_INPUT): nothing more to feed in. */
3882  av[0] = Qnil;
3883  av[1] = dst;
3884  av[2] = Qnil;
3885  av[3] = Qnil;
3886  av[4] = INT2FIX(0);
3887  ac = 5;
3888 
3889  ret = econv_primitive_convert(ac, av, self);
3890 
 /* Conversion errors are raised as the exception built from the converter's state. */
3891  if (ret == sym_invalid_byte_sequence ||
3892  ret == sym_undefined_conversion ||
3893  ret == sym_incomplete_input) {
3894  VALUE exc = make_econv_exception(ec);
3895  rb_exc_raise(exc);
3896  }
3897 
 /* Only :finished is acceptable here; anything else is treated as an interpreter bug. */
3898  if (ret != sym_finished) {
3899  rb_bug("unexpected result of econv_primitive_convert");
3900  }
3901 
3902  return dst;
3903 }
3904 
3905 /*
3906  * call-seq:
3907  * ec.primitive_errinfo -> array
3908  *
3909  * primitive_errinfo returns important information regarding the last error
3910  * as a 5-element array:
3911  *
3912  * [result, enc1, enc2, error_bytes, readagain_bytes]
3913  *
3914  * result is the last result of primitive_convert.
3915  *
3916  * Other elements are only meaningful when result is
3917  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion.
3918  *
3919  * enc1 and enc2 indicate a conversion step as a pair of strings.
3920  * For example, a converter from EUC-JP to ISO-8859-1 converts
3921  * a string as follows: EUC-JP -> UTF-8 -> ISO-8859-1.
3922  * So [enc1, enc2] is either ["EUC-JP", "UTF-8"] or ["UTF-8", "ISO-8859-1"].
3923  *
3924  * error_bytes and readagain_bytes indicate the byte sequences which caused the error.
3925  * error_bytes is discarded portion.
3926  * readagain_bytes is buffered portion which is read again on next conversion.
3927  *
3928  * Example:
3929  *
3930  * # \xff is invalid as EUC-JP.
3931  * ec = Encoding::Converter.new("EUC-JP", "Shift_JIS")
3932  * ec.primitive_convert(src="\xff", dst="", nil, 10)
3933  * p ec.primitive_errinfo
3934  * #=> [:invalid_byte_sequence, "EUC-JP", "UTF-8", "\xFF", ""]
3935  *
3936  * # HIRAGANA LETTER A (\xa4\xa2 in EUC-JP) is not representable in ISO-8859-1.
3937  * # Since this error occurs in the UTF-8 to ISO-8859-1 conversion,
3938  * # error_bytes is HIRAGANA LETTER A in UTF-8 (\xE3\x81\x82).
3939  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3940  * ec.primitive_convert(src="\xa4\xa2", dst="", nil, 10)
3941  * p ec.primitive_errinfo
3942  * #=> [:undefined_conversion, "UTF-8", "ISO-8859-1", "\xE3\x81\x82", ""]
3943  *
3944  * # partial character is invalid
3945  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3946  * ec.primitive_convert(src="\xa4", dst="", nil, 10)
3947  * p ec.primitive_errinfo
3948  * #=> [:incomplete_input, "EUC-JP", "UTF-8", "\xA4", ""]
3949  *
3950  * # Encoding::Converter::PARTIAL_INPUT prevents invalid errors by
3951  * # partial characters.
3952  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
3953  * ec.primitive_convert(src="\xa4", dst="", nil, 10, Encoding::Converter::PARTIAL_INPUT)
3954  * p ec.primitive_errinfo
3955  * #=> [:source_buffer_empty, nil, nil, nil, nil]
3956  *
3957  * # \xd8\x00\x00@ is invalid as UTF-16BE because
3958  * # no low surrogate after high surrogate (\xd8\x00).
3959  * # It is detected by the 3rd byte (\x00), which is part of the next character.
3960  * # So the high surrogate (\xd8\x00) is discarded and
3961  * # the 3rd byte is read again later.
3962  * # Since the byte is buffered in ec, it is dropped from src.
3963  * ec = Encoding::Converter.new("UTF-16BE", "UTF-8")
3964  * ec.primitive_convert(src="\xd8\x00\x00@", dst="", nil, 10)
3965  * p ec.primitive_errinfo
3966  * #=> [:invalid_byte_sequence, "UTF-16BE", "UTF-8", "\xD8\x00", "\x00"]
3967  * p src
3968  * #=> "@"
3969  *
3970  * # Similar to UTF-16BE, \x00\xd8@\x00 is invalid as UTF-16LE.
3971  * # The problem is detected by 4th byte.
3972  * ec = Encoding::Converter.new("UTF-16LE", "UTF-8")
3973  * ec.primitive_convert(src="\x00\xd8@\x00", dst="", nil, 10)
3974  * p ec.primitive_errinfo
3975  * #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "@\x00"]
3976  * p src
3977  * #=> ""
3978  *
3979  */
3980 static VALUE
3982 {
 /* NOTE(review): listing lines 3981, 3989, 3993, 3995-3996 and 3999-4000 are absent from this extract, so most of the stores into ary are not visible here. */
3983  rb_econv_t *ec = check_econv(self);
3984 
3985  VALUE ary;
3986 
3987  ary = rb_ary_new2(5);
3988 
 /* Storing nil at index 4 up front gives the array five slots before the conditional stores run. */
3990  rb_ary_store(ary, 4, Qnil);
3991 
3992  if (ec->last_error.source_encoding)
3994 
3997 
3998  if (ec->last_error.error_bytes_start) {
4001  }
4002 
4003  return ary;
4004 }
4005 
4006 /*
4007  * call-seq:
4008  * ec.insert_output(string) -> nil
4009  *
4010  * Inserts string into the encoding converter.
4011  * The string will be converted to the destination encoding and
4012  * output on later conversions.
4013  *
4014  * If the destination encoding is stateful,
4015  * string is converted according to the state and the state is updated.
4016  *
4017  * This method should be used only when a conversion error occurs.
4018  *
4019  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4020  * src = "HIRAGANA LETTER A is \u{3042}."
4021  * dst = ""
4022  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4023  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is ", "."]
4024  * ec.insert_output("<err>")
4025  * p ec.primitive_convert(src, dst) #=> :finished
4026  * puts "[#{dst.dump}, #{src.dump}]" #=> ["HIRAGANA LETTER A is <err>.", ""]
4027  *
4028  * ec = Encoding::Converter.new("utf-8", "iso-2022-jp")
4029  * src = "\u{306F 3041 3068 2661 3002}" # U+2661 is not representable in iso-2022-jp
4030  * dst = ""
4031  * p ec.primitive_convert(src, dst) #=> :undefined_conversion
4032  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H".force_encoding("ISO-2022-JP"), "\xE3\x80\x82"]
4033  * ec.insert_output "?" # state change required to output "?".
4034  * p ec.primitive_convert(src, dst) #=> :finished
4035  * puts "[#{dst.dump}, #{src.dump}]" #=> ["\e$B$O$!$H\e(B?\e$B!#\e(B".force_encoding("ISO-2022-JP"), ""]
4036  *
4037  */
4038 static VALUE
4040 {
 /* NOTE(review): listing line 4039 (the signature) is absent from this extract; the body uses self and string. */
4041  const char *insert_enc;
4042 
4043  int ret;
4044 
4045  rb_econv_t *ec = check_econv(self);
4046 
4047  StringValue(string);
 /* Transcode the string to the encoding named by rb_econv_encoding_to_insert_output before inserting its bytes. */
4048  insert_enc = rb_econv_encoding_to_insert_output(ec);
4049  string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
4050 
4051  ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
 /* A -1 result is surfaced to Ruby as ArgumentError. */
4052  if (ret == -1) {
4053  rb_raise(rb_eArgError, "too big string");
4054  }
4055 
4056  return Qnil;
4057 }
4058 
4059 /*
4060  * call-seq:
4061  * ec.putback -> string
4062  * ec.putback(max_numbytes) -> string
4063  *
4064  * Put back the bytes which will be converted.
4065  *
4066  * The bytes result from an invalid_byte_sequence error.
4067  * When an invalid_byte_sequence error occurs, some bytes are discarded and
4068  * some bytes are buffered to be converted later.
4069  * The latter bytes can be put back.
4070  * It can be observed by
4071  * Encoding::InvalidByteSequenceError#readagain_bytes and
4072  * Encoding::Converter#primitive_errinfo.
4073  *
4074  * ec = Encoding::Converter.new("utf-16le", "iso-8859-1")
4075  * src = "\x00\xd8\x61\x00"
4076  * dst = ""
4077  * p ec.primitive_convert(src, dst) #=> :invalid_byte_sequence
4078  * p ec.primitive_errinfo #=> [:invalid_byte_sequence, "UTF-16LE", "UTF-8", "\x00\xD8", "a\x00"]
4079  * p ec.putback #=> "a\x00"
4080  * p ec.putback #=> "" # no more bytes to put back
4081  *
4082  */
4083 static VALUE
4085 {
 /* NOTE(review): listing line 4084 (the signature) is absent from this extract; the body uses argc, argv and self. */
4086  rb_econv_t *ec = check_econv(self);
4087  int n;
4088  int putbackable;
4089  VALUE str, max;
4090 
 /* "01": zero required and one optional argument (max). */
4091  rb_scan_args(argc, argv, "01", &max);
4092 
 /* Without max, put back everything rb_econv_putbackable reports; with max, clamp the request to that amount. */
4093  if (NIL_P(max))
4094  n = rb_econv_putbackable(ec);
4095  else {
4096  n = NUM2INT(max);
4097  putbackable = rb_econv_putbackable(ec);
4098  if (putbackable < n)
4099  n = putbackable;
4100  }
4101 
 /* Allocate an n-byte string and hand its buffer to rb_econv_putback. */
4102  str = rb_str_new(NULL, n);
4103  rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
4104 
 /* NOTE(review): listing line 4106 (the body of this if) is absent from this extract; presumably it tags str with the source encoding -- confirm against the full file. */
4105  if (ec->source_encoding) {
4107  }
4108 
4109  return str;
4110 }
4111 
4112 /*
4113  * call-seq:
4114  * ec.last_error -> exception or nil
4115  *
4116  * Returns an exception object for the last conversion.
4117  * Returns nil if the last conversion did not produce an error.
4118  *
4119  * "error" means
4120  * Encoding::InvalidByteSequenceError or Encoding::UndefinedConversionError for
4121  * Encoding::Converter#convert, and
4122  * :invalid_byte_sequence, :incomplete_input or :undefined_conversion for
4123  * Encoding::Converter#primitive_convert.
4124  *
4125  * ec = Encoding::Converter.new("utf-8", "iso-8859-1")
4126  * p ec.primitive_convert(src="\xf1abcd", dst="") #=> :invalid_byte_sequence
4127  * p ec.last_error #=> #<Encoding::InvalidByteSequenceError: "\xF1" followed by "a" on UTF-8>
4128  * p ec.primitive_convert(src, dst, nil, 1) #=> :destination_buffer_full
4129  * p ec.last_error #=> nil
4130  *
4131  */
4132 static VALUE
4134 {
 /* NOTE(review): listing line 4133 (the signature) is absent from this extract. */
4135  rb_econv_t *ec = check_econv(self);
4136  VALUE exc;
4137 
 /* The exception object is built and returned, never raised; a nil result is passed through as nil. */
4138  exc = make_econv_exception(ec);
4139  if (NIL_P(exc))
4140  return Qnil;
4141  return exc;
4142 }
4143 
4144 /*
4145  * call-seq:
4146  * ec.replacement -> string
4147  *
4148  * Returns the replacement string.
4149  *
4150  * ec = Encoding::Converter.new("euc-jp", "us-ascii")
4151  * p ec.replacement #=> "?"
4152  *
4153  * ec = Encoding::Converter.new("euc-jp", "utf-8")
4154  * p ec.replacement #=> "\uFFFD"
4155  */
4156 static VALUE
4158 {
 /* NOTE(review): listing line 4157 (the signature) is absent from this extract. */
4159  rb_econv_t *ec = check_econv(self);
4160  int ret;
4161  rb_encoding *enc;
4162 
 /* make_replacement runs first; a -1 result is raised as UndefinedConversionError. */
4163  ret = make_replacement(ec);
4164  if (ret == -1) {
4165  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4166  }
4167 
 /* Build a new string from the replacement bytes, tagged with the encoding looked up from ec->replacement_enc. */
4168  enc = rb_enc_find(ec->replacement_enc);
4169  return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
4170 }
4171 
4172 /*
4173  * call-seq:
4174  * ec.replacement = string
4175  *
4176  * Sets the replacement string.
4177  *
4178  * ec = Encoding::Converter.new("utf-8", "us-ascii", :undef => :replace)
4179  * ec.replacement = "<undef>"
4180  * p ec.convert("a \u3042 b") #=> "a <undef> b"
4181  */
4182 static VALUE
4184 {
 /* NOTE(review): listing line 4183 (the signature) is absent from this extract; the body uses self and arg. */
4185  rb_econv_t *ec = check_econv(self);
4186  VALUE string = arg;
4187  int ret;
4188  rb_encoding *enc;
4189 
4190  StringValue(string);
4191  enc = rb_enc_get(string);
4192 
 /* Pass the string's bytes, byte length and encoding name to the converter. */
4193  ret = rb_econv_set_replacement(ec,
4194  (const unsigned char *)RSTRING_PTR(string),
4195  RSTRING_LEN(string),
4196  rb_enc_name(enc));
4197 
4198  if (ret == -1) {
4199  /* xxx: rb_eInvalidByteSequenceError? */
4200  rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
4201  }
4202 
 /* Returns the argument as originally passed, not the StringValue-converted copy. */
4203  return arg;
4204 }
4205 
4206 VALUE
4208 {
 /* Non-static wrapper that forwards to make_econv_exception and returns its result unchanged. */
 /* NOTE(review): listing line 4207 (the signature) is absent from this extract. */
4209  return make_econv_exception(ec);
4210 }
4211 
4212 void
4214 {
 /* Raises the exception built by make_econv_exception; returns normally when that result is nil. */
 /* NOTE(review): listing line 4213 (the signature) is absent from this extract. */
4215  VALUE exc;
4216 
4217  exc = make_econv_exception(ec);
4218  if (NIL_P(exc))
4219  return;
4220  rb_exc_raise(exc);
4221 }
4222 
4223 /*
4224  * call-seq:
4225  * ecerr.source_encoding_name -> string
4226  *
4227  * Returns the source encoding name as a string.
4228  */
4229 static VALUE
4231 {
 /* Reads the "source_encoding_name" attribute of self via rb_attr_get. */
4232  return rb_attr_get(self, rb_intern("source_encoding_name"));
4233 }
4234 
4235 /*
4236  * call-seq:
4237  * ecerr.source_encoding -> encoding
4238  *
4239  * Returns the source encoding as an encoding object.
4240  *
4241  * Note that the result may not be equal to the source encoding of
4242  * the encoding converter if the conversion has multiple steps.
4243  *
4244  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP") # ISO-8859-1 -> UTF-8 -> EUC-JP
4245  * begin
4246  * ec.convert("\xa0") # NO-BREAK SPACE, which is available in UTF-8 but not in EUC-JP.
4247  * rescue Encoding::UndefinedConversionError
4248  * p $!.source_encoding #=> #<Encoding:UTF-8>
4249  * p $!.destination_encoding #=> #<Encoding:EUC-JP>
4250  * p $!.source_encoding_name #=> "UTF-8"
4251  * p $!.destination_encoding_name #=> "EUC-JP"
4252  * end
4253  *
4254  */
4255 static VALUE
4257 {
 /* Reads the "source_encoding" attribute of self via rb_attr_get. */
4258  return rb_attr_get(self, rb_intern("source_encoding"));
4259 }
4260 
4261 /*
4262  * call-seq:
4263  * ecerr.destination_encoding_name -> string
4264  *
4265  * Returns the destination encoding name as a string.
4266  */
4267 static VALUE
4269 {
 /* Reads the "destination_encoding_name" attribute of self via rb_attr_get. */
4270  return rb_attr_get(self, rb_intern("destination_encoding_name"));
4271 }
4272 
4273 /*
4274  * call-seq:
4275  * ecerr.destination_encoding -> encoding
4276  *
4277  * Returns the destination encoding as an encoding object.
4278  */
4279 static VALUE
4281 {
 /* Reads the "destination_encoding" attribute of self via rb_attr_get. */
4282  return rb_attr_get(self, rb_intern("destination_encoding"));
4283 }
4284 
4285 /*
4286  * call-seq:
4287  * ecerr.error_char -> string
4288  *
4289  * Returns the one-character string which caused Encoding::UndefinedConversionError.
4290  *
4291  * ec = Encoding::Converter.new("ISO-8859-1", "EUC-JP")
4292  * begin
4293  * ec.convert("\xa0")
4294  * rescue Encoding::UndefinedConversionError
4295  * puts $!.error_char.dump #=> "\xC2\xA0"
4296  * p $!.error_char.encoding #=> #<Encoding:UTF-8>
4297  * end
4298  *
4299  */
4300 static VALUE
4302 {
 /* Reads the "error_char" attribute of self via rb_attr_get. */
4303  return rb_attr_get(self, rb_intern("error_char"));
4304 }
4305 
4306 /*
4307  * call-seq:
4308  * ecerr.error_bytes -> string
4309  *
4310  * Returns the discarded bytes when Encoding::InvalidByteSequenceError occurs.
4311  *
4312  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4313  * begin
4314  * ec.convert("abc\xA1\xFFdef")
4315  * rescue Encoding::InvalidByteSequenceError
4316  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "\xFF" on EUC-JP>
4317  * puts $!.error_bytes.dump #=> "\xA1"
4318  * puts $!.readagain_bytes.dump #=> "\xFF"
4319  * end
4320  */
4321 static VALUE
4323 {
 /* Reads the "error_bytes" attribute of self via rb_attr_get. */
4324  return rb_attr_get(self, rb_intern("error_bytes"));
4325 }
4326 
4327 /*
4328  * call-seq:
4329  * ecerr.readagain_bytes -> string
4330  *
4331  * Returns the bytes to be read again when Encoding::InvalidByteSequenceError occurs.
4332  */
4333 static VALUE
4335 {
 /* Reads the "readagain_bytes" attribute of self via rb_attr_get. */
4336  return rb_attr_get(self, rb_intern("readagain_bytes"));
4337 }
4338 
4339 /*
4340  * call-seq:
4341  * ecerr.incomplete_input? -> true or false
4342  *
4343  * Returns true if the invalid byte sequence error is caused by
4344  * premature end of string.
4345  *
4346  * ec = Encoding::Converter.new("EUC-JP", "ISO-8859-1")
4347  *
4348  * begin
4349  * ec.convert("abc\xA1z")
4350  * rescue Encoding::InvalidByteSequenceError
4351  * p $! #=> #<Encoding::InvalidByteSequenceError: "\xA1" followed by "z" on EUC-JP>
4352  * p $!.incomplete_input? #=> false
4353  * end
4354  *
4355  * begin
4356  * ec.convert("abc\xA1")
4357  * ec.finish
4358  * rescue Encoding::InvalidByteSequenceError
4359  * p $! #=> #<Encoding::InvalidByteSequenceError: incomplete "\xA1" on EUC-JP>
4360  * p $!.incomplete_input? #=> true
4361  * end
4362  */
4363 static VALUE
4365 {
 /* Reads the "incomplete_input" attribute of self via rb_attr_get. */
4366  return rb_attr_get(self, rb_intern("incomplete_input"));
4367 }
4368 
4369 /*
4370  * Document-class: Encoding::UndefinedConversionError
4371  *
4372  * Raised by Encoding and String methods when a transcoding operation
4373  * fails.
4374  */
4375 
4376 /*
4377  * Document-class: Encoding::InvalidByteSequenceError
4378  *
4379  * Raised by Encoding and String methods when the string being
4380  * transcoded contains a byte invalid for either the source or
4381  * target encoding.
4382  */
4383 
4384 /*
4385  * Document-class: Encoding::ConverterNotFoundError
4386  *
4387  * Raised by transcoding methods when a named encoding does not
4388  * correspond with a known converter.
4389  */
4390 
4391 void
4393 {
 /* Initialisation visible in this extract: creates the transcoder table, interns the option and result symbols, */
 /* defines String#encode and String#encode!, and finally calls Init_newline(). */
 /* NOTE(review): many listing lines are absent from this extract (e.g. 4392, 4394-4396, 4432-4450 and the line after each Document-const comment), */
 /* so the class, method and constant registrations those comments refer to are not visible here. */
4397 
4398  transcoder_table = st_init_strcasetable();
4399 
4400  sym_invalid = ID2SYM(rb_intern("invalid"));
4401  sym_undef = ID2SYM(rb_intern("undef"));
4402  sym_replace = ID2SYM(rb_intern("replace"));
4403  sym_fallback = ID2SYM(rb_intern("fallback"));
4404  sym_aref = ID2SYM(rb_intern("[]"));
4405  sym_xml = ID2SYM(rb_intern("xml"));
4406  sym_text = ID2SYM(rb_intern("text"));
4407  sym_attr = ID2SYM(rb_intern("attr"));
4408 
4409  sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
4410  sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
4411  sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
4412  sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
4413  sym_finished = ID2SYM(rb_intern("finished"));
4414  sym_after_output = ID2SYM(rb_intern("after_output"));
4415  sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
4416  sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
4417  sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
4418  sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
4419  sym_partial_input = ID2SYM(rb_intern("partial_input"));
4420 
4421 #ifdef ENABLE_ECONV_NEWLINE_OPTION
4422  sym_newline = ID2SYM(rb_intern("newline"));
4423  sym_universal = ID2SYM(rb_intern("universal"));
4424  sym_crlf = ID2SYM(rb_intern("crlf"));
4425  sym_cr = ID2SYM(rb_intern("cr"));
4426  sym_lf = ID2SYM(rb_intern("lf"));
4427 #endif
4428 
 /* The -1 arity passes the arguments to the C function as argc/argv. */
4429  rb_define_method(rb_cString, "encode", str_encode, -1);
4430  rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
4431 
4451 
4452  /* Document-const: INVALID_MASK
4453  *
4454  * Mask for invalid byte sequences
4455  */
4457 
4458  /* Document-const: INVALID_REPLACE
4459  *
4460  * Replace invalid byte sequences
4461  */
4463 
4464  /* Document-const: UNDEF_MASK
4465  *
4466  * Mask for a valid character in the source encoding but no related
4467  * character(s) in destination encoding.
4468  */
4470 
4471  /* Document-const: UNDEF_REPLACE
4472  *
4473  * Replace byte sequences that are undefined in the destination encoding.
4474  */
4476 
4477  /* Document-const: UNDEF_HEX_CHARREF
4478  *
4479  * Replace byte sequences that are undefined in the destination encoding
4480  * with an XML hexadecimal character reference. This is valid for XML
4481  * conversion.
4482  */
4484 
4485  /* Document-const: PARTIAL_INPUT
4486  *
4487  * Indicates the source may be part of a larger string. See
4488  * primitive_convert for an example.
4489  */
4491 
4492  /* Document-const: AFTER_OUTPUT
4493  *
4494  * Stop converting after some output is complete but before all of the
4495  * input was consumed. See primitive_convert for an example.
4496  */
4498 
4499  /* Document-const: UNIVERSAL_NEWLINE_DECORATOR
4500  *
4501  * Decorator for converting CRLF and CR to LF
4502  */
4504 
4505  /* Document-const: CRLF_NEWLINE_DECORATOR
4506  *
4507  * Decorator for converting LF to CRLF
4508  */
4510 
4511  /* Document-const: CR_NEWLINE_DECORATOR
4512  *
4513  * Decorator for converting LF to CR
4514  */
4516 
4517  /* Document-const: XML_TEXT_DECORATOR
4518  *
4519  * Escape as XML CharData
4520  */
4522 
4523  /* Document-const: XML_ATTR_CONTENT_DECORATOR
4524  *
4525  * Escape as XML AttValue
4526  */
4528 
4529  /* Document-const: XML_ATTR_QUOTE_DECORATOR
4530  *
4531  * Escape as XML AttValue
4532  */
4534 
4540 
4548 
4549  Init_newline();
4550 }
RUBY_EXTERN VALUE rb_cString
Definition: ruby.h:1583
#define BL_ACTION(byte)
#define FOURbt
static VALUE sym_replace
Definition: transcode.c:27
const char * ascii_incompat_name
Definition: transcode.c:1765
unsigned char ary[8]
Definition: transcode.c:67
int rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
Definition: transcode.c:2572
#define ECONV_XML_TEXT_DECORATOR
Definition: encoding.h:335
#define T_SYMBOL
Definition: ruby.h:494
Definition: string.c:5340
#define FUNio
VALUE(* func_si)(void *, const unsigned char *, size_t)
search_path_queue_t * queue
Definition: transcode.c:250
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:739
void rb_econv_check_error(rb_econv_t *ec)
Definition: transcode.c:4213
VALUE next_info
Definition: transcode.c:60
RUBY_EXTERN VALUE rb_cData
Definition: ruby.h:1560
static VALUE econv_destination_encoding(VALUE self)
Definition: transcode.c:3480
#define MBCLEN_CHARFOUND_P(ret)
Definition: encoding.h:139
static VALUE sym_undefined_conversion
Definition: transcode.c:38
#define NOMAP
VALUE rb_eConverterNotFoundError
Definition: transcode.c:23
rb_econv_result_t
Definition: encoding.h:252
int(* state_fini_func)(void *)
VALUE rb_ary_entry(VALUE ary, long offset)
Definition: array.c:1171
#define MBCLEN_CHARFOUND_LEN(ret)
Definition: encoding.h:140
unsigned char * in_buf_end
Definition: transcode.c:126
const unsigned char * error_bytes_start
Definition: transcode.c:139
#define RARRAY_LEN(a)
Definition: ruby.h:878
void rb_bug(const char *fmt,...)
Definition: error.c:327
rb_econv_result_t last_result
Definition: transcode.c:108
#define rb_enc_mbc_to_codepoint(p, e, enc)
Definition: encoding.h:156
VALUE rb_econv_make_exception(rb_econv_t *ec)
Definition: transcode.c:4207
#define FALSE
Definition: nkf.h:174
#define RUBY_TYPED_FREE_IMMEDIATELY
Definition: ruby.h:1015
const char * dst_encoding
rb_econv_result_t result
Definition: transcode.c:135
long rb_str_coderange_scan_restartable(const char *, const char *, rb_encoding *, int *)
Definition: string.c:340
static VALUE sym_invalid_byte_sequence
Definition: transcode.c:37
size_t strlen(const char *)
#define INT2NUM(x)
Definition: ruby.h:1288
struct search_path_queue_tag search_path_queue_t
#define DECORATOR_P(sname, dname)
Definition: transcode.c:154
Definition: st.h:69
#define GB4bt
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:1446
VALUE rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:2036
Definition: st.h:100
VALUE rb_cEncoding
Definition: encoding.c:37
VALUE rb_econv_append(rb_econv_t *ec, const char *ss, long len, VALUE dst, int flags)
Definition: transcode.c:1813
#define NUM2INT(x)
Definition: ruby.h:630
static int max(int a, int b)
Definition: strftime.c:141
#define ZERObt
void rb_define_singleton_method(VALUE obj, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a singleton method for obj.
Definition: class.c:1655
static void transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, VALUE destination, unsigned char *(*resize_destination)(VALUE, size_t, size_t), const char *src_encoding, const char *dst_encoding, int ecflags, VALUE ecopts)
Definition: transcode.c:2266
VALUE rb_eInvalidByteSequenceError
Definition: transcode.c:22
#define ECONV_XML_ATTR_CONTENT_DECORATOR
Definition: encoding.h:336
static void econv_args(int argc, VALUE *argv, volatile VALUE *snamev_p, volatile VALUE *dnamev_p, const char **sname_p, const char **dname_p, rb_encoding **senc_p, rb_encoding **denc_p, int *ecflags_p, VALUE *ecopts_p)
Definition: transcode.c:2997
int(* state_init_func)(void *)
#define getGB4bt1(a)
#define FL_TAINT
Definition: ruby.h:1137
void rb_econv_binmode(rb_econv_t *ec)
Definition: transcode.c:1942
ssize_t writebuf_len
Definition: transcode.c:72
static void rb_transcoding_close(rb_transcoding *tc)
Definition: transcode.c:822
rb_encoding * source_encoding
Definition: transcode.c:146
static VALUE sym_newline
Definition: transcode.c:33
#define Qtrue
Definition: ruby.h:426
unsigned char * out_data_start
Definition: transcode.c:105
static int decorate_convpath(VALUE convpath, int ecflags)
Definition: transcode.c:3057
static int enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
Definition: transcode.c:2613
static VALUE sym_crlf_newline
Definition: transcode.c:30
void Init_newline(void)
Definition: newline.c:183
#define TypedData_Wrap_Struct(klass, data_type, sval)
Definition: ruby.h:1027
#define MAX_ECFLAGS_DECORATORS
Definition: transcode.c:1026
static size_t rb_transcoding_memsize(rb_transcoding *tc)
Definition: transcode.c:838
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:54
#define TypedData_Get_Struct(obj, type, data_type, sval)
Definition: ruby.h:1041
int rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
Definition: transcode.c:2527
unsigned char * in_data_start
Definition: transcode.c:124
#define ECONV_ERROR_HANDLER_MASK
Definition: encoding.h:318
int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1925
static int str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2, const char **sname_p, rb_encoding **senc_p, const char **dname_p, rb_encoding **denc_p)
Definition: transcode.c:2637
VALUE rb_method_call(int, VALUE *, VALUE)
Definition: proc.c:1800
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:219
#define getBT3(a)
rb_encoding * destination_encoding
Definition: transcode.c:147
#define ECONV_XML_ATTR_QUOTE_DECORATOR
Definition: encoding.h:339
struct rb_transcoding * tc
Definition: transcode.c:103
#define SUSPEND(ret, num)
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:102
static VALUE sym_cr_newline
Definition: transcode.c:31
VALUE rb_eTypeError
Definition: error.c:548
static int str_transcode(int argc, VALUE *argv, VALUE *self)
Definition: transcode.c:2753
#define rb_check_arity
Definition: intern.h:296
static VALUE sym_aref
Definition: transcode.c:27
VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
Definition: transcode.c:1856
VALUE rb_ary_push(VALUE ary, VALUE item)
Definition: array.c:896
VALUE rb_eEncodingError
Definition: error.c:554
void st_free_table(st_table *)
Definition: st.c:334
static VALUE econv_last_error(VALUE self)
Definition: transcode.c:4133
#define SYM2ID(x)
Definition: ruby.h:356
VALUE rb_obj_is_method(VALUE)
Definition: proc.c:1116
#define UNDEF
struct rb_transcoding * error_tc
Definition: transcode.c:136
static rb_econv_t * rb_econv_alloc(int n_hint)
Definition: transcode.c:856
void rb_str_set_len(VALUE, long)
Definition: string.c:2008
#define RBASIC_SET_CLASS(obj, cls)
Definition: internal.h:609
int rb_enc_str_coderange(VALUE)
Definition: string.c:435
static rb_econv_t * rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
Definition: transcode.c:933
VALUE rb_define_class_under(VALUE outer, const char *name, VALUE super)
Defines a class under the namespace of outer.
Definition: class.c:676
VALUE rb_to_int(VALUE)
Definition: object.c:2680
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:1854
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:826
unsigned int conv_tree_start
static void rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
Definition: transcode.c:3184
#define RB_GC_GUARD(v)
Definition: ruby.h:523
void rb_define_alloc_func(VALUE, rb_alloc_func_t)
#define T_HASH
Definition: ruby.h:485
const char * lib
Definition: transcode.c:159
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Definition: transcode.c:2885
#define THREEbt
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
Definition: transcode.c:2578
#define STR1
#define DATA_PTR(dta)
Definition: ruby.h:992
const rb_transcoder * transcoder
Definition: transcode.c:160
#define next_info
static int output_replacement_character(rb_econv_t *ec)
Definition: transcode.c:2230
#define T_ARRAY
Definition: ruby.h:484
const char * dname
Definition: transcode.c:158
static rb_econv_result_t rb_trans_conv(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags, int *result_position_ptr)
Definition: transcode.c:1175
void callback(ffi_cif *cif, void *resp, void **args, void *ctx)
Definition: closure.c:59
static rb_econv_result_t transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, rb_transcoding *tc, const int opt)
Definition: transcode.c:757
static VALUE econv_finish(VALUE self)
Definition: transcode.c:3873
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1242
static VALUE econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
Definition: transcode.c:1978
static transcoder_entry_t * make_transcoder_entry(const char *sname, const char *dname)
Definition: transcode.c:166
static const rb_transcoder * load_transcoder_entry(transcoder_entry_t *entry)
Definition: transcode.c:362
VALUE rb_str_tmp_new(long)
Definition: string.c:919
static int transcode_search_path(const char *sname, const char *dname, void(*callback)(const char *sname, const char *dname, int depth, void *arg), void *arg)
Definition: transcode.c:277
struct rb_econv_t::@157 last_error
unsigned char * in_buf_start
Definition: transcode.c:123
static rb_econv_t * rb_econv_open0(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:976
static void econv_free(void *ptr)
Definition: transcode.c:2910
const char * enc
Definition: transcode.c:245
static VALUE sym_source_buffer_empty
Definition: transcode.c:40
void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
Definition: transcode.c:1753
#define FUNsio
ssize_t(* func_so)(void *, const unsigned char *, size_t, unsigned char *, size_t)
#define ENC_CODERANGE_7BIT
Definition: encoding.h:49
size_t error_bytes_len
Definition: transcode.c:140
const char * rb_obj_classname(VALUE)
Definition: variable.c:406
#define rb_ary_new2
Definition: intern.h:90
#define getGB4bt2(a)
static VALUE sym_crlf
Definition: transcode.c:33
static VALUE econv_convert(VALUE self, VALUE source_string)
Definition: transcode.c:3823
static VALUE sym_partial_input
Definition: transcode.c:35
static const char transcoder_lib_prefix[]
Definition: transcode.c:230
RUBY_SYMBOL_EXPORT_BEGIN typedef unsigned long st_data_t
Definition: st.h:20
static rb_econv_t * rb_econv_init_by_convpath(VALUE self, VALUE convpath, const char **sname_p, const char **dname_p, rb_encoding **senc_p, rb_encoding **denc_p)
Definition: transcode.c:3199
void rb_exc_raise(VALUE mesg)
Definition: eval.c:567
static unsigned char * output
Definition: nkf.c:32
static const char * get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
Definition: transcode.c:393
static VALUE str_encode_associate(VALUE str, int encidx)
Definition: transcode.c:2767
st_table * st_init_strcasetable(void)
Definition: st.c:296
#define FUNii
st_table * visited
Definition: transcode.c:249
#define RB_TYPE_P(obj, type)
Definition: ruby.h:1664
static VALUE ecerr_incomplete_input(VALUE self)
Definition: transcode.c:4364
int rb_econv_has_convpath_p(const char *from_encoding, const char *to_encoding)
Definition: transcode.c:3169
#define fail()
int st_lookup(st_table *, st_data_t, st_data_t *)
static unsigned char * str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
Definition: transcode.c:2421
int rb_to_encoding_index(VALUE enc)
Definition: encoding.c:171
ssize_t readagain_len
Definition: transcode.c:65
static VALUE econv_primitive_errinfo(VALUE self)
Definition: transcode.c:3981
ssize_t(* finish_func)(void *, unsigned char *, size_t)
unsigned int output_index
Definition: transcode.c:62
unsigned int input
Definition: nkf.c:4311
#define TRANSCODING_READBUF(tc)
Definition: transcode.c:84
static size_t econv_memsize(const void *ptr)
Definition: transcode.c:2917
#define ALLOC_N(type, n)
Definition: ruby.h:1333
void Init_transcode(void)
Definition: transcode.c:4392
VALUE rb_hash_aset(VALUE hash, VALUE key, VALUE val)
Definition: hash.c:1393
unsigned char * in_data_end
Definition: transcode.c:125
static VALUE str_encode_bang(int argc, VALUE *argv, VALUE str)
Definition: transcode.c:2799
Definition: transcode.c:156
static VALUE str_encode(int argc, VALUE *argv, VALUE str)
Definition: transcode.c:2877
int num_finished
Definition: transcode.c:130
const char * destination_encoding
Definition: transcode.c:138
static int rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
Definition: transcode.c:1902
#define val
int resume_position
Definition: transcode.c:58
#define ECONV_INVALID_MASK
Definition: encoding.h:320
VALUE rb_eRuntimeError
Definition: error.c:547
#define RSTRING_END(str)
Definition: ruby.h:849
struct rb_econv_t rb_econv_t
Definition: encoding.h:262
#define SUSPEND_AFTER_OUTPUT(num)
#define getGB4bt3(a)
int rb_typeddata_is_kind_of(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:510
VALUE rb_str_cat2(VALUE, const char *)
Definition: string.c:2159
#define ECONV_INVALID_REPLACE
Definition: encoding.h:321
void rb_econv_close(rb_econv_t *ec)
Definition: transcode.c:1702
VALUE rb_ary_new(void)
Definition: array.c:495
#define dp(v)
Definition: vm_debug.h:21
static VALUE econv_get_replacement(VALUE self)
Definition: transcode.c:4157
#define ECONV_PARTIAL_INPUT
Definition: encoding.h:350
#define ECONV_AFTER_OUTPUT
Definition: encoding.h:351
#define snprintf
Definition: subst.h:6
#define NIL_P(v)
Definition: ruby.h:438
void st_add_direct(st_table *, st_data_t, st_data_t)
Definition: st.c:629
static void more_output_buffer(VALUE destination, unsigned char *(*resize_destination)(VALUE, size_t, size_t), int max_output, unsigned char **out_start_ptr, unsigned char **out_pos, unsigned char **out_stop_ptr)
Definition: transcode.c:2148
union rb_transcoding::@155 readbuf
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:2225
void rb_ary_store(VALUE ary, long idx, VALUE val)
Definition: array.c:790
static VALUE sym_attr
Definition: transcode.c:28
static VALUE econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
Definition: transcode.c:3140
#define OBJ_FROZEN(x)
Definition: ruby.h:1185
VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
Definition: transcode.c:1865
static st_table * transcoder_table
Definition: transcode.c:163
int rb_econv_insert_output(rb_econv_t *ec, const unsigned char *str, size_t len, const char *str_encoding)
Definition: transcode.c:1587
const char * sname
Definition: transcode.c:157
int argc
Definition: ruby.c:131
#define Qfalse
Definition: ruby.h:425
static VALUE make_econv_exception(rb_econv_t *ec)
Definition: transcode.c:2047
VALUE rb_cEncodingConverter
Definition: transcode.c:25
VALUE rb_require_safe(VALUE, int)
Definition: load.c:943
static const rb_data_type_t econv_data_type
Definition: transcode.c:2922
ssize_t(* func_sio)(void *, const unsigned char *, size_t, VALUE, unsigned char *, size_t)
#define ALLOCA_N(type, n)
Definition: ruby.h:1337
static VALUE econv_set_replacement(VALUE self, VALUE arg)
Definition: transcode.c:4183
#define TRANSCODING_STATE(tc)
Definition: transcode.c:97
#define LONG_MAX
Definition: ruby.h:191
#define MEMCPY(p1, p2, type, n)
Definition: ruby.h:1352
#define ENC_CODERANGE_BROKEN
Definition: encoding.h:51
static VALUE sym_fallback
Definition: transcode.c:27
char ary[sizeof(double) > sizeof(void *)?sizeof(double):sizeof(void *)]
Definition: transcode.c:80
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:798
#define rb_ary_new4
Definition: intern.h:92
#define rb_str_new2
Definition: intern.h:840
int err
Definition: win32.c:114
#define OBJ_FREEZE(x)
Definition: ruby.h:1186
static VALUE method_fallback(VALUE fallback, VALUE c)
Definition: transcode.c:2254
rb_transcoder_asciicompat_type_t asciicompat_type
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
Definition: transcode.c:233
#define PRIdPTRDIFF
Definition: ruby.h:161
static VALUE econv_equal(VALUE self, VALUE other)
Definition: transcode.c:3535
int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
Definition: transcode.c:1908
#define ENC_CODERANGE_VALID
Definition: encoding.h:50
#define ECONV_UNDEF_MASK
Definition: encoding.h:323
#define ALLOC(type)
Definition: ruby.h:1334
#define SUSPEND_OBUF(num)
VALUE rb_str_resize(VALUE, long)
Definition: string.c:2025
static int str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
Definition: transcode.c:2664
void rb_register_transcoder(const rb_transcoder *tr)
Definition: transcode.c:205
size_t rb_str_capacity(VALUE)
Definition: string.c:468
unsigned char * out_buf_start
Definition: transcode.c:104
static int transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
Definition: transcode.c:256
#define getGB4bt0(a)
static VALUE econv_putback(int argc, VALUE *argv, VALUE self)
Definition: transcode.c:4084
ssize_t recognized_len
Definition: transcode.c:64
static VALUE sym_xml
Definition: transcode.c:28
int num_trans
Definition: transcode.c:129
#define FUNso
static void search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
Definition: transcode.c:3096
#define RSTRING_LEN(str)
Definition: ruby.h:841
static rb_econv_t * check_econv(VALUE self)
Definition: transcode.c:3447
int num_additional
Definition: transcode.c:961
#define REALLOC_N(var, type, n)
Definition: ruby.h:1335
#define TRUE
Definition: nkf.h:175
VALUE rb_obj_is_proc(VALUE)
Definition: proc.c:94
static VALUE econv_s_allocate(VALUE klass)
Definition: transcode.c:2929
search_path_queue_t ** queue_last_ptr
Definition: transcode.c:251
VALUE rb_sprintf(const char *format,...)
Definition: sprintf.c:1250
int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
Definition: encoding.c:958
static VALUE econv_insert_output(VALUE self, VALUE string)
Definition: transcode.c:4039
static VALUE ecerr_destination_encoding(VALUE self)
Definition: transcode.c:4280
int rb_econv_putbackable(rb_econv_t *ec)
Definition: transcode.c:1742
#define rb_enc_name(enc)
Definition: encoding.h:125
#define RSTRING_EMBED_LEN_MAX
Definition: ruby.h:819
unsigned char * out_buf_end
Definition: transcode.c:107
static int decorator_names(int ecflags, const char **decorators_ret)
Definition: transcode.c:1029
unsigned char next_byte
Definition: transcode.c:61
int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname)
Definition: transcode.c:2198
struct rb_transcoding * last_tc
Definition: transcode.c:131
#define MEMMOVE(p1, p2, type, n)
Definition: ruby.h:1353
#define STR1_BYTEINDEX(w)
VALUE rb_hash_new(void)
Definition: hash.c:298
static VALUE aref_fallback(VALUE fallback, VALUE c)
Definition: transcode.c:2260
static VALUE make_encobj(const char *name)
Definition: transcode.c:2955
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Definition: class.c:1728
const char * base_enc
Definition: transcode.c:252
VALUE rb_ivar_set(VALUE, ID, VALUE)
Definition: variable.c:1133
VALUE rb_check_hash_type(VALUE hash)
Definition: hash.c:588
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:4308
VALUE rb_assoc_new(VALUE car, VALUE cdr)
Definition: array.c:616
#define ECONV_CRLF_NEWLINE_DECORATOR
Definition: encoding.h:333
const char * source_encoding
Definition: transcode.c:137
#define Qnil
Definition: ruby.h:427
static VALUE sym_lf
Definition: transcode.c:33
int rb_define_dummy_encoding(const char *name)
Definition: encoding.c:437
VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
Definition: transcode.c:1871
static VALUE econv_init(int argc, VALUE *argv, VALUE self)
Definition: transcode.c:3373
unsigned long VALUE
Definition: ruby.h:88
static VALUE result
Definition: nkf.c:40
static VALUE sym_universal_newline
Definition: transcode.c:29
union rb_transcoding::rb_transcoding_state_t state
#define ECONV_NEWLINE_DECORATOR_MASK
Definition: encoding.h:328
const char * src_encoding
VALUE rb_obj_encoding(VALUE obj)
Definition: encoding.c:930
#define ECONV_UNDEF_HEX_CHARREF
Definition: encoding.h:325
#define getBT1(a)
static void trans_open_i(const char *sname, const char *dname, int depth, void *arg)
Definition: transcode.c:965
#define rb_enc_asciicompat(enc)
Definition: encoding.h:188
static VALUE sym_universal
Definition: transcode.c:33
VALUE rb_str_new_cstr(const char *)
Definition: string.c:560
int memcmp(const void *s1, const void *s2, size_t len)
Definition: memcmp.c:7
static VALUE ecerr_error_char(VALUE self)
Definition: transcode.c:4301
VALUE rb_str_dump(VALUE)
Definition: string.c:4899
VALUE rb_proc_call(VALUE, VALUE)
Definition: proc.c:754
const char * ascii_compat_name
Definition: transcode.c:1764
unsigned char * ptr
Definition: transcode.c:68
static rb_encoding * make_encoding(const char *name)
Definition: transcode.c:2945
#define ECONV_CR_NEWLINE_DECORATOR
Definition: encoding.h:334
#define RARRAY_LENINT(ary)
Definition: ruby.h:884
VALUE rb_str_dup(VALUE)
Definition: string.c:1062
static VALUE econv_source_encoding(VALUE self)
Definition: transcode.c:3465
static VALUE proc_fallback(VALUE fallback, VALUE c)
Definition: transcode.c:2248
static VALUE sym_cr
Definition: transcode.c:33
static VALUE sym_finished
Definition: transcode.c:41
VALUE rb_hash_freeze(VALUE hash)
Definition: hash.c:62
#define FUNsi
#define FL_UNSET(x, f)
Definition: ruby.h:1173
#define INVALID
#define BL_MIN_BYTE
int rb_respond_to(VALUE, ID)
Definition: vm_method.c:1638
#define StringValueCStr(v)
Definition: ruby.h:541
static int make_replacement(rb_econv_t *ec)
Definition: transcode.c:2164
#define RSTRING_PTR(str)
Definition: ruby.h:845
static rb_econv_result_t transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, rb_transcoding *tc, const int opt)
Definition: transcode.c:432
#define ONEbt
#define rb_exc_new3
Definition: intern.h:248
#define ECONV_UNDEF_REPLACE
Definition: encoding.h:324
void rb_str_modify(VALUE)
Definition: string.c:1484
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:832
static VALUE sym_after_output
Definition: transcode.c:42
int size
Definition: encoding.c:49
static VALUE econv_inspect(VALUE self)
Definition: transcode.c:3427
#define f
#define INT2FIX(i)
Definition: ruby.h:231
static rb_transcoding * rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
Definition: transcode.c:783
void * rb_check_typeddata(VALUE obj, const rb_data_type_t *data_type)
Definition: error.c:520
#define RARRAY_AREF(a, i)
Definition: ruby.h:901
unsigned char * out_data_end
Definition: transcode.c:106
static rb_econv_result_t rb_econv_convert0(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:1266
#define xmalloc
Definition: defines.h:108
#define SIZE_MAX
Definition: ruby.h:274
static int asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
Definition: transcode.c:1769
size_t rb_econv_memsize(rb_econv_t *ec)
Definition: transcode.c:1720
rb_econv_t * rb_econv_open(const char *sname, const char *dname, int ecflags)
Definition: transcode.c:1067
union rb_transcoding::@156 writebuf
#define TRANSCODING_WRITEBUF(tc)
Definition: transcode.c:88
static const unsigned char * transcode_char_start(rb_transcoding *tc, const unsigned char *in_start, const unsigned char *inchar_start, const unsigned char *in_p, size_t *char_len_ptr)
Definition: transcode.c:412
VALUE rb_check_array_type(VALUE ary)
Definition: array.c:628
VALUE rb_hash_aref(VALUE hash, VALUE key)
Definition: hash.c:697
void rb_error_arity(int argc, int min, int max)
static VALUE ecerr_error_bytes(VALUE self)
Definition: transcode.c:4322
static rb_econv_result_t rb_transcoding_convert(rb_transcoding *tc, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags)
Definition: transcode.c:810
VALUE rb_str_catf(VALUE str, const char *format,...)
Definition: sprintf.c:1290
#define rb_funcall3
Definition: ruby.h:1457
uint8_t key[16]
Definition: random.c:1250
void rb_str_shared_replace(VALUE, VALUE)
Definition: string.c:972
#define RTEST(v)
Definition: ruby.h:437
static void declare_transcoder(const char *sname, const char *dname, const char *lib)
Definition: transcode.c:222
unsigned int next_table
Definition: transcode.c:59
size_t readagain_len
Definition: transcode.c:141
static int rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
Definition: transcode.c:1883
static VALUE sym_invalid
Definition: transcode.c:27
static int rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
Definition: transcode.c:894
#define getBT2(a)
static VALUE econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
Definition: transcode.c:2979
int num_allocated
Definition: transcode.c:128
#define BYTE_ADDR(index)
const char * destination_encoding_name
Definition: transcode.c:114
static VALUE econv_convpath(VALUE self)
Definition: transcode.c:3511
static int trans_sweep(rb_econv_t *ec, const unsigned char **input_ptr, const unsigned char *input_stop, unsigned char **output_ptr, unsigned char *output_stop, int flags, int start)
Definition: transcode.c:1094
VALUE rb_enc_default_internal(void)
Definition: encoding.c:1445
const char * rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
Definition: transcode.c:1786
VALUE rb_enc_str_new(const char *, long, rb_encoding *)
Definition: string.c:548
#define rb_safe_level()
Definition: tcltklib.c:95
const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec)
Definition: transcode.c:1503
static VALUE ecerr_source_encoding(VALUE self)
Definition: transcode.c:4256
static int output_hex_charref(rb_econv_t *ec)
Definition: transcode.c:1390
VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
Definition: transcode.c:1877
#define hash_fallback
Definition: transcode.c:2245
static VALUE ecerr_readagain_bytes(VALUE self)
Definition: transcode.c:4334
const char * name
Definition: nkf.c:208
#define xrealloc
Definition: defines.h:111
#define ID2SYM(x)
Definition: ruby.h:355
VALUE rb_eUndefinedConversionError
Definition: transcode.c:21
const char * rb_id2name(ID id)
Definition: ripper.c:17227
int started
Definition: transcode.c:116
rb_econv_elem_t * elems
Definition: transcode.c:127
static VALUE sym_text
Definition: transcode.c:28
const char * replacement_enc
Definition: transcode.c:120
VALUE rb_str_new_frozen(VALUE)
Definition: string.c:833
VALUE rb_str_drop_bytes(VALUE, long)
Definition: string.c:3635
const char * source_encoding_name
Definition: transcode.c:113
size_t replacement_len
Definition: transcode.c:119
int replacement_allocated
Definition: transcode.c:121
static VALUE sym_undef
Definition: transcode.c:27
#define BL_MAX_BYTE
struct search_path_queue_tag * next
Definition: transcode.c:244
int rb_enc_find_index(const char *name)
Definition: encoding.c:684
static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx)
Definition: transcode.c:2895
static int econv_opts(VALUE opt, int ecflags)
Definition: transcode.c:2428
#define rb_check_frozen(obj)
Definition: intern.h:277
static VALUE sym_destination_buffer_full
Definition: transcode.c:39
#define getBT0(a)
static unsigned char * allocate_converted_string(const char *sname, const char *dname, const unsigned char *str, size_t len, unsigned char *caller_dst_buf, size_t caller_dst_bufsize, size_t *dst_len_ptr)
Definition: transcode.c:1519
void void xfree(void *)
const rb_transcoder * transcoder
Definition: transcode.c:54
static transcoder_entry_t * get_transcoder_entry(const char *sname, const char *dname)
Definition: transcode.c:189
#define rb_intern(str)
ssize_t writebuf_off
Definition: transcode.c:71
VALUE rb_str_buf_new(long)
Definition: string.c:891
#define SYMBOL_P(x)
Definition: ruby.h:354
#define TWObt
VALUE rb_str_scrub(VALUE, VALUE)
Definition: string.c:8037
#define NULL
Definition: _sdbm.c:103
struct rb_transcoding rb_transcoding
#define Qundef
Definition: ruby.h:428
st_index_t num_entries
Definition: st.h:85
void rb_define_method(VALUE klass, const char *name, VALUE(*func)(ANYARGS), int argc)
Definition: class.c:1488
int st_foreach(st_table *, int(*)(ANYARGS), st_data_t)
Definition: st.c:1034
const unsigned char * replacement_str
Definition: transcode.c:118
#define bp()
Definition: vm_debug.h:25
#define STR1_LENGTH(byte_addr)
VALUE(* func_ii)(void *, VALUE)
#define encoding_equal(enc1, enc2)
Definition: transcode.c:241
#define TRANSCODING_WRITEBUF_SIZE(tc)
Definition: transcode.c:92
static rb_encoding * make_dummy_encoding(const char *name)
Definition: transcode.c:2935
VALUE rb_eArgError
Definition: error.c:549
#define ECONV_UNIVERSAL_NEWLINE_DECORATOR
Definition: encoding.h:332
#define writebuf_off
rb_encoding * rb_enc_find(const char *name)
Definition: encoding.c:708
#define NUM2LONG(x)
Definition: ruby.h:600
transcoder_entry_t ** entries
Definition: transcode.c:960
static VALUE econv_result_to_symbol(rb_econv_result_t res)
Definition: transcode.c:3570
VALUE rb_attr_get(VALUE, ID)
Definition: variable.c:1127
char ** argv
Definition: ruby.c:132
#define StringValue(v)
Definition: ruby.h:539
static VALUE ecerr_source_encoding_name(VALUE self)
Definition: transcode.c:4230
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:590
static VALUE econv_primitive_convert(int argc, VALUE *argv, VALUE self)
Definition: transcode.c:3679
ssize_t(* func_io)(void *, VALUE, const unsigned char *, size_t)
VALUE rb_obj_class(VALUE)
Definition: object.c:227
VALUE rb_str_new(const char *, long)
Definition: string.c:534
static VALUE ecerr_destination_encoding_name(VALUE self)
Definition: transcode.c:4268
static VALUE sym_incomplete_input
Definition: transcode.c:43