Ruby Forum Ruby-dev > iso-2022-jp implementation

Posted by Tanaka Akira (Guest)
on 07.08.2008 14:34
(Received via mailing list)
ISO-2022-JP の実装ですが、0x1e が shift だといってみたり、

% ./ruby -ve 'p "\x1e".encode("euc-jp", "iso-2022-jp")'
ruby 1.9.0 (2008-08-07 revision 18416) [i686-linux]
-e:1:in `encode': shift is not supported (RuntimeError)
        from -e:1:in `<main>'

:invalid=>:replace, :undef=>:replace の両方を指定しても例外が出たり、

% ./ruby -e 'p "\e(X".encode("EUC-JP", "ISO-2022-JP", :invalid=>:replace, :undef=>:replace)'
-e:1:in `encode': this mode is not supported (ESC ( X) (RuntimeError)
        from -e:1:in `<main>'

エラーメッセージに NUL が入ったり、

% ./ruby -e 'p "\e".encode("EUC-JP", "ISO-2022-JP")'|& cat -v
-e:1:in `encode': this mode is not supported (ESC ^@) (RuntimeError)
        from -e:1:in `<main>'

-1 bytes left という怪しいメッセージが出たり、

% ./ruby -e 'p "\e$(Da".encode("EUC-JP", "ISO-2022-JP")'
-e:1:in `encode': not fully converted, -1 bytes left (ArgumentError)
        from -e:1:in `<main>'

するので、作りなおして以下のようにするのはどうでしょうか。

Index: enc/trans/iso2022.erb.c
===================================================================
--- enc/trans/iso2022.erb.c  (revision 0)
+++ enc/trans/iso2022.erb.c  (revision 0)
@@ -0,0 +1,142 @@
+#include "transcode_data.h"
+
+<%
+  map = {}
+  map["1b2842"] = :func_so       # designate US-ASCII to G0. "ESC ( B"
+  map["1b284a"] = :func_so       # designate JIS X 0201 latin to G0. "ESC ( J"
+  map["1b2440"] = :func_so       # designate JIS X 0208 1978 to G0. "ESC $ @"
+  map["1b2442"] = :func_so       # designate JIS X 0208 1983 to G0. "ESC $ B"
+  map["{00-0d,10-1a,1c-7f}"] = :func_si
+
+  map_jisx0208_rest = {}
+  map_jisx0208_rest["{21-7e}"] = :func_so
+%>
+
+<%= transcode_generate_node(ActionMap.parse(map), "iso2022jp_to_eucjp", []) %>
+<%= transcode_generate_node(ActionMap.parse(map_jisx0208_rest), "iso2022jp_to_eucjp_jisx0208_rest", []) %>
+
+static VALUE
+fun_si_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l)
+{
+    if (t->stateful[0] == 0)
+        return (VALUE)NOMAP;
+    else if (0x21 <= s[0] && s[0] <= 0x7e)
+        return (VALUE)&iso2022jp_to_eucjp_jisx0208_rest;
+    else
+        return (VALUE)INVALID;
+}
+
+static int
+fun_so_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
+{
+    if (s[0] == 0x1b) {
+        if (s[1] == '(') {
+            switch (s[l-1]) {
+              case 'B':
+              case 'J':
+                t->stateful[0] = 0;
+                break;
+            }
+        }
+        else {
+            switch (s[l-1]) {
+              case '@':
+              case 'B':
+                t->stateful[0] = 1;
+                break;
+            }
+        }
+        return 0;
+    }
+    else {
+        o[0] = s[0] | 0x80;
+        o[1] = s[1] | 0x80;
+        return 2;
+    }
+}
+
+static const rb_transcoder
+rb_ISO_2022_JP_to_EUC_JP = {
+    "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 3, 0,
+    NULL, fun_si_iso2022jp_to_eucjp, NULL, fun_so_iso2022jp_to_eucjp
+};
+
+<%
+  map_eucjp = {
+    "{0e,0f,1b}" => :undef,
+    "{00-0d,10-1a,1c-7f}" => :func_so,
+    "{a1-fe}{a1-fe}" => :func_so,
+    "8e{a1-fe}" => :undef,
+    "8f{a1-fe}{a1-fe}" => :undef,
+  }
+%>
+
+<%= transcode_generate_node(ActionMap.parse(map_eucjp), "eucjp_to_iso2022jp", []) %>
+
+static int
+fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, unsigned char *o)
+{
+    unsigned char *output0 = o;
+
+    if (t->stateful[0] == 0) {
+        t->stateful[0] = 1; /* initialized flag */
+        t->stateful[1] = 1; /* ASCII mode */
+    }
+
+    if (l != t->stateful[1]) {
+        if (l == 1) {
+            *o++ = 0x1b;
+            *o++ = '(';
+            *o++ = 'B';
+            t->stateful[1] = 1;
+        }
+        else {
+            *o++ = 0x1b;
+            *o++ = '$';
+            *o++ = 'B';
+            t->stateful[1] = 2;
+        }
+    }
+
+    if (l == 1) {
+        *o++ = s[0] & 0x7f;
+    }
+    else {
+        *o++ = s[0] & 0x7f;
+        *o++ = s[1] & 0x7f;
+    }
+
+    return o - output0;
+}
+
+static int
+finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o)
+{
+    unsigned char *output0 = o;
+
+    if (t->stateful[0] == 0)
+        return 0;
+
+    if (t->stateful[1] != 1) {
+        *o++ = 0x1b;
+        *o++ = '(';
+        *o++ = 'B';
+        t->stateful[1] = 1;
+    }
+
+    return o - output0;
+}
+
+static const rb_transcoder
+rb_EUC_JP_to_ISO_2022_JP = {
+    "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 5, 0,
+    NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
+};
+
+void
+Init_iso2022(void)
+{
+    rb_register_transcoder(&rb_ISO_2022_JP_to_EUC_JP);
+    rb_register_transcoder(&rb_EUC_JP_to_ISO_2022_JP);
+}
+
Index: enc/trans/japanese.erb.c
===================================================================
--- enc/trans/japanese.erb.c  (revision 18417)
+++ enc/trans/japanese.erb.c  (working copy)
@@ -17,235 +17,8 @@
 <%= transcode_tblgen "UTF-8", "EUC-JP", [["{00-7f}", :nomap], 
*UCS_TO_EUCJP_TBL] %>
 <%= transcode_tblgen "UTF-8", "CP51932", [["{00-7f}", :nomap], 
*UCS_TO_EUCJP_TBL] %>

-#define ISO_2022_ENCODING(escseq, byte) ((escseq<<8)|byte)
-enum ISO_2022_ESCSEQ {
-    ISO_2022_CZD   = '!',
-    ISO_2022_C1D   = '"',
-    ISO_2022_GZD4  = '(',
-    ISO_2022_G1D4  = ')',
-    ISO_2022_G2D4  = '*',
-    ISO_2022_G3D4  = '+',
-    ISO_2022_G1D6  = '-',
-    ISO_2022_G2D6  = '.',
-    ISO_2022_G3D6  = '/',
-    ISO_2022_GZDM4 = ISO_2022_ENCODING('$','('),
-    ISO_2022_G1DM4 = ISO_2022_ENCODING('$',')'),
-    ISO_2022_G2DM4 = ISO_2022_ENCODING('$','*'),
-    ISO_2022_G3DM4 = ISO_2022_ENCODING('$','+'),
-    ISO_2022_G1DM6 = ISO_2022_ENCODING('$','-'),
-    ISO_2022_G2DM6 = ISO_2022_ENCODING('$','.'),
-    ISO_2022_G3DM6 = ISO_2022_ENCODING('$','/'),
-    ISO_2022_DOCS  = ISO_2022_ENCODING('%','I'),
-    ISO_2022_IRR   = '&'
-};
-
-
-#define ISO_2022_GZ_ASCII 
ISO_2022_ENCODING(ISO_2022_GZD4, 'B')
-#define ISO_2022_GZ_JIS_X_0201_Katakana 
ISO_2022_ENCODING(ISO_2022_GZD4, 'I')
-#define ISO_2022_GZ_JIS_X_0201_Roman 
ISO_2022_ENCODING(ISO_2022_GZD4, 'J')
-#define ISO_2022_GZ_JIS_C_6226_1978 
ISO_2022_ENCODING(ISO_2022_GZDM4,'@')
-#define ISO_2022_GZ_JIS_X_0208_1983 
ISO_2022_ENCODING(ISO_2022_GZDM4,'B')
-#define ISO_2022_GZ_JIS_X_0212_1990 
ISO_2022_ENCODING(ISO_2022_GZDM4,'D')
-#define ISO_2022_GZ_JIS_X_0213_2000_1 
ISO_2022_ENCODING(ISO_2022_GZDM4,'O')
-#define ISO_2022_GZ_JIS_X_0213_2000_2 
ISO_2022_ENCODING(ISO_2022_GZDM4,'P')
-#define ISO_2022_GZ_JIS_X_0213_2004_1 
ISO_2022_ENCODING(ISO_2022_GZDM4,'Q')
-
-#define UNSUPPORTED_MODE TRANSCODE_ERROR
-
-static int
-get_iso_2022_mode(const unsigned char **in_pos)
-{
-    int new_mode;
-    const unsigned char *in_p = *in_pos;
-    switch (*in_p++) {
-      case '(':
-  switch (*in_p++) {
-    case 'B': case 'I': case 'J':
-      new_mode = ISO_2022_ENCODING(ISO_2022_GZD4, *(in_p-1));
-      break;
-    default:
-      rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC ( 
%c)", *(in_p-1));
-      break;
-  }
-  break;
-      case '$':
-  switch (*in_p++) {
-    case '@': case 'A': case 'B':
-      new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1));
-      break;
-    case '(':
-      switch (*in_p++) {
-        case 'D': case 'O': case 'P': case 'Q':
-    new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1));
-    break;
-        default:
-    rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC $ ( 
%c)", *(in_p-1));
-    break;
-      }
-      break;
-    default:
-      rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC $ 
%c)", *(in_p-1));
-      break;
-  }
-  break;
-      default:
-  rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC %c)", 
*(in_p-1));
-  break;
-    }
-    *in_pos = in_p;
-    return new_mode;
-}
-
-static void
-from_iso_2022_jp_transcoder_preprocessor(const unsigned char **in_pos, 
unsigned char **out_pos,
-           const unsigned char *in_stop, unsigned char *out_stop,
-           rb_transcoding *my_transcoding)
-{
-    const rb_transcoder *my_transcoder = my_transcoding->transcoder;
-    const unsigned char *in_p = *in_pos;
-    unsigned char *out_p = *out_pos;
-    int cur_mode = ISO_2022_GZ_ASCII;
-    unsigned char c1;
-    unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
-    while (in_p < in_stop) {
-  if (out_p >= out_s) {
-      int len = (out_p - *out_pos);
-      int new_len = (len + my_transcoder->max_output) * 2;
-      *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, 
new_len);
-      out_p = *out_pos + len;
-      out_s = *out_pos + new_len - my_transcoder->max_output;
-  }
-  c1 = *in_p++;
-  if (c1 == 0x1B) {
-      cur_mode = get_iso_2022_mode(&in_p);
-  }
-  else if (c1 == 0x1E || c1 == 0x1F) {
-      /* SHIFT */
-      rb_raise(UNSUPPORTED_MODE, "shift is not supported");
-  }
-  else if (c1 >= 0x80) {
-      rb_raise(TRANSCODE_ERROR, "invalid byte sequence");
-  }
-  else {
-      switch (cur_mode) {
-        case ISO_2022_GZ_ASCII:
-        case ISO_2022_GZ_JIS_X_0201_Roman:
-    *out_p++ = c1;
-    break;
-        case ISO_2022_GZ_JIS_X_0201_Katakana:
-    *out_p++ = 0x8E;
-    *out_p++ = c1 | 0x80;
-    break;
-        case ISO_2022_GZ_JIS_X_0212_1990:
-    *out_p++ = 0x8F;
-        case ISO_2022_GZ_JIS_C_6226_1978:
-        case ISO_2022_GZ_JIS_X_0208_1983:
-    *out_p++ = c1 | 0x80;
-    *out_p++ = *in_p++ | 0x80;
-    break;
-      }
-  }
-    }
-    /* cleanup */
-    *in_pos  = in_p;
-    *out_pos = out_p;
-}
-
-static int
-select_iso_2022_mode(unsigned char **out_pos, int new_mode)
-{
-    unsigned char *out_p = *out_pos;
-    *out_p++ = '\x1b';
-    switch (new_mode>>8) {
-      case ISO_2022_GZD4:
-  *out_p++ = new_mode >> 8;
-  *out_p++ = new_mode & 0x7F;
-  break;
-      case ISO_2022_GZDM4:
-  *out_p++ = new_mode >> 16;
-  if ((new_mode & 0x7F) != '@' &&
-      (new_mode & 0x7F) != 'A' &&
-      (new_mode & 0x7F) != 'B')
-  {
-      *out_p++ = (new_mode>>8) & 0x7F;
-  }
-  *out_p++ = new_mode & 0x7F;
-  break;
-      default:
-  rb_raise(UNSUPPORTED_MODE, "this mode is not supported.");
-  break;
-    }
-    *out_pos = out_p;
-    return new_mode;
-}
-
-static void
-to_iso_2022_jp_transcoder_postprocessor(const unsigned char **in_pos, 
unsigned char **out_pos,
-          const unsigned char *in_stop, unsigned char *out_stop,
-          rb_transcoding *my_transcoding)
-{
-    const rb_transcoder *my_transcoder = my_transcoding->transcoder;
-    const unsigned char *in_p = *in_pos;
-    unsigned char *out_p = *out_pos;
-    int cur_mode = ISO_2022_GZ_ASCII, new_mode = 0;
-    unsigned char next_byte;
-    unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
-    while (in_p < in_stop) {
-  if (out_p >= out_s) {
-      int len = (out_p - *out_pos);
-      int new_len = (len + my_transcoder->max_output) * 2;
-      *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, 
new_len);
-      out_p = *out_pos + len;
-      out_s = *out_pos + new_len - my_transcoder->max_output;
-  }
-  next_byte = *in_p++;
-  if (next_byte < 0x80) {
-      new_mode = ISO_2022_GZ_ASCII;
-  }
-  else if (next_byte == 0x8E) {
-      new_mode = ISO_2022_GZ_JIS_X_0201_Katakana;
-      next_byte = *in_p++;
-  }
-  else if (next_byte == 0x8F) {
-      new_mode = ISO_2022_GZ_JIS_X_0212_1990;
-      next_byte = *in_p++;
-  }
-  else {
-      new_mode = ISO_2022_GZ_JIS_X_0208_1983;
-  }
-  if (cur_mode != new_mode)
-      cur_mode = select_iso_2022_mode(&out_p, new_mode);
-  if (cur_mode < 0xFFFF) {
-      *out_p++ = next_byte & 0x7F;
-  }
-  else {
-      *out_p++ = next_byte & 0x7F;
-      *out_p++ = *in_p++ & 0x7F;
-  }
-    }
-    if (cur_mode != ISO_2022_GZ_ASCII)
-  cur_mode = select_iso_2022_mode(&out_p, ISO_2022_GZ_ASCII);
-    /* cleanup */
-    *in_pos  = in_p;
-    *out_pos = out_p;
-}
-
-static const rb_transcoder
-rb_from_ISO_2022_JP = {
-    "ISO-2022-JP", "UTF-8", &from_EUC_JP, 8, 0,
-    &from_iso_2022_jp_transcoder_preprocessor, NULL,
-};
-
-static const rb_transcoder
-rb_to_ISO_2022_JP = {
-    "UTF-8", "ISO-2022-JP", &to_EUC_JP, 8, 1,
-    NULL, &to_iso_2022_jp_transcoder_postprocessor,
-};
-
 void
 Init_japanese(void)
 {
 <%= transcode_register_code %>
-    rb_register_transcoder(&rb_from_ISO_2022_JP);
-    rb_register_transcoder(&rb_to_ISO_2022_JP);
 }
Index: enc/trans/utf_16_32.erb.c
===================================================================
--- enc/trans/utf_16_32.erb.c  (revision 18417)
+++ enc/trans/utf_16_32.erb.c  (working copy)
@@ -1,7 +1,7 @@
 #include "transcode_data.h"

 static int
-fun_so_from_utf_16be(const unsigned char* s, unsigned char* o)
+fun_so_from_utf_16be(rb_transcoding* t, const unsigned char* s, size_t 
l, unsigned char* o)
 {
     if (!s[0] && s[1]<0x80) {
         o[0] = s[1];
@@ -29,7 +29,7 @@ fun_so_from_utf_16be(const unsigned char
 }

 static int
-fun_so_to_utf_16be(const unsigned char* s, unsigned char* o)
+fun_so_to_utf_16be(rb_transcoding* t, const unsigned char* s, size_t l, 
unsigned char* o)
 {
     if (!(s[0]&0x80)) {
         o[0] = 0x00;
@@ -57,7 +57,7 @@ fun_so_to_utf_16be(const unsigned char*
 }

 static int
-fun_so_from_utf_16le(const unsigned char* s, unsigned char* o)
+fun_so_from_utf_16le(rb_transcoding* t, const unsigned char* s, size_t 
l, unsigned char* o)
 {
     if (!s[1] && s[0]<0x80) {
         o[0] = s[0];
@@ -85,7 +85,7 @@ fun_so_from_utf_16le(const unsigned char
 }

 static int
-fun_so_to_utf_16le(const unsigned char* s, unsigned char* o)
+fun_so_to_utf_16le(rb_transcoding* t, const unsigned char* s, size_t l, 
unsigned char* o)
 {
     if (!(s[0]&0x80)) {
         o[1] = 0x00;
@@ -113,7 +113,7 @@ fun_so_to_utf_16le(const unsigned char*
 }

 static int
-fun_so_from_utf_32be(const unsigned char* s, unsigned char* o)
+fun_so_from_utf_32be(rb_transcoding* t, const unsigned char* s, size_t 
l, unsigned char* o)
 {
     if (!s[1]) {
         if (s[2]==0 && s[3]<0x80) {
@@ -142,7 +142,7 @@ fun_so_from_utf_32be(const unsigned char
 }

 static int
-fun_so_to_utf_32be(const unsigned char* s, unsigned char* o)
+fun_so_to_utf_32be(rb_transcoding* t, const unsigned char* s, size_t l, 
unsigned char* o)
 {
     o[0] = 0;
     if (!(s[0]&0x80)) {
@@ -168,13 +168,13 @@ fun_so_to_utf_32be(const unsigned char*
 }

 static int
-fun_so_from_utf_32le(const unsigned char* s, unsigned char* o)
+fun_so_from_utf_32le(rb_transcoding* t, const unsigned char* s, size_t 
l, unsigned char* o)
 {
     return 1;
 }

 static int
-fun_so_to_utf_32le(const unsigned char* s, unsigned char* o)
+fun_so_to_utf_32le(rb_transcoding* t, const unsigned char* s, size_t l, 
unsigned char* o)
 {
     return 4;
 }
@@ -191,7 +191,7 @@ fun_so_to_utf_32le(const unsigned char*
 static const rb_transcoder
 rb_from_UTF_16BE = {
     "UTF-16BE", "UTF-8", &from_UTF_16BE, 4, 0,
-    NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_16be
+    NULL, NULL, NULL, &fun_so_from_utf_16be
 };

 <%=
@@ -217,7 +217,7 @@ rb_from_UTF_16BE = {
 static const rb_transcoder
 rb_to_UTF_16BE = {
     "UTF-8", "UTF-16BE", &to_UTF_16BE, 4, 1,
-    NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16be
+    NULL, NULL, NULL, &fun_so_to_utf_16be
 };

 <%=
@@ -232,13 +232,13 @@ rb_to_UTF_16BE = {
 static const rb_transcoder
 rb_from_UTF_16LE = {
     "UTF-16LE", "UTF-8", &from_UTF_16LE, 4, 0,
-    NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_16le
+    NULL, NULL, NULL, &fun_so_from_utf_16le
 };

 static const rb_transcoder
 rb_to_UTF_16LE = {
     "UTF-8", "UTF-16LE", &to_UTF_16BE, 4, 1,
-    NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16le
+    NULL, NULL, NULL, &fun_so_to_utf_16le
 };

 <%=
@@ -254,13 +254,13 @@ rb_to_UTF_16LE = {
 static const rb_transcoder
 rb_from_UTF_32BE = {
     "UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 0,
-    NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_32be
+    NULL, NULL, NULL, &fun_so_from_utf_32be
 };

 static const rb_transcoder
 rb_to_UTF_32BE = {
     "UTF-8", "UTF-32BE", &to_UTF_16BE, 4, 1,
-    NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_32be
+    NULL, NULL, NULL, &fun_so_to_utf_32be
 };

 <%=
@@ -276,13 +276,13 @@ rb_to_UTF_32BE = {
 static const rb_transcoder
 rb_from_UTF_32LE = {
     "UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 0,
-    NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_32le
+    NULL, NULL, NULL, &fun_so_from_utf_32le
 };

 static const rb_transcoder
 rb_to_UTF_32LE = {
     "UTF-8", "UTF-32LE", &to_UTF_16BE, 4, 1,
-    NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_32le
+    NULL, NULL, NULL, &fun_so_to_utf_32le
 };

 void
Index: transcode_data.h
===================================================================
--- transcode_data.h  (revision 18417)
+++ transcode_data.h  (working copy)
@@ -63,6 +63,8 @@ typedef struct rb_transcoding {
     VALUE ruby_string_dest; /* the String used as the conversion 
destination,
              or NULL if something else is being converted */
     unsigned char *(*flush_func)(struct rb_transcoding*, int, int);
+
+    unsigned char stateful[256]; /* opaque data for stateful encoding 
*/
 } rb_transcoding;

 /* static structure, one per supported encoding pair */
@@ -72,12 +74,11 @@ typedef struct rb_transcoder {
     const BYTE_LOOKUP *conv_tree_start;
     int max_output;
     int from_utf8;
-    void (*preprocessor)(const unsigned char**, unsigned char**, const 
unsigned char*, unsigned char*, struct rb_transcoding *);
-    void (*postprocessor)(const unsigned char**, unsigned char**, const 
unsigned char*, unsigned char*, struct rb_transcoding *);
-    VALUE (*func_ii)(VALUE); /* info  -> info   */
-    VALUE (*func_si)(const unsigned char *); /* start -> info   */
-    int (*func_io)(VALUE, const unsigned char*); /* info  -> output */
-    int (*func_so)(const unsigned char*, unsigned char*); /* start -> 
output */
+    VALUE (*func_ii)(rb_transcoding*, VALUE); /* info  -> info   */
+    VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* 
start -> info   */
+    int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* 
info  -> output */
+    int (*func_so)(rb_transcoding*, const unsigned char*, size_t, 
unsigned char*); /* start -> output */
+    int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output 
*/
 } rb_transcoder;

 void rb_declare_transcoder(const char *enc1, const char *enc2, const 
char *lib);
Index: tool/transcode-tblgen.rb
===================================================================
--- tool/transcode-tblgen.rb  (revision 18417)
+++ tool/transcode-tblgen.rb  (working copy)
@@ -234,6 +234,12 @@ class ActionMap
       "UNDEF"
     when :invalid
       "INVALID"
+    when :func_ii
+      "FUNii"
+    when :func_si
+      "FUNsi"
+    when :func_io
+      "FUNio"
     when :func_so
       "FUNso"
     when /\A([0-9a-f][0-9a-f])\z/i
Index: test/ruby/test_transcode.rb
===================================================================
--- test/ruby/test_transcode.rb  (revision 18417)
+++ test/ruby/test_transcode.rb  (working copy)
@@ -321,12 +321,13 @@ class TestTranscode < Test::Unit::TestCa
     assert_raise(RuntimeError) { "\x1b(A".encode("utf-8", 
"iso-2022-jp") }
     assert_raise(RuntimeError) { "\x1b$(A".encode("utf-8", 
"iso-2022-jp") }
     assert_raise(RuntimeError) { "\x1b$C".encode("utf-8", 
"iso-2022-jp") }
-    assert_raise(RuntimeError) { "\x1e".encode("utf-8", "iso-2022-jp") 
}
+    assert_raise(RuntimeError) { "\x0e".encode("utf-8", "iso-2022-jp") 
}
     assert_raise(RuntimeError) { "\x80".encode("utf-8", "iso-2022-jp") 
}
     assert_raise(RuntimeError) { "\x1b$(Dd!\x1b(B".encode("utf-8", 
"iso-2022-jp") }
     assert_raise(RuntimeError) { "\u9299".encode("iso-2022-jp") }
-    #@@@@ TODO: the next test should actually fail, because iso-2022-jp 
does not include half-width kana
-    check_both_ways("\uff71\uff72\uff73\uff74\uff75", "\x1b(I12345\x1b(B", "iso-2022-jp") # JIS X 0201 ｱｲｳｴｵ
+    assert_raise(RuntimeError) { "\u9299".encode("iso-2022-jp") }
+    assert_raise(RuntimeError) { 
"\uff71\uff72\uff73\uff74\uff75".encode("iso-2022-jp") }
+    assert_raise(RuntimeError) { "\x1b(I12345\x1b(B".encode("utf-8", 
"iso-2022-jp") }
   end

   def test_iso_2022_jp_1
Index: transcode.c
===================================================================
--- transcode.c  (revision 18417)
+++ transcode.c  (working copy)
@@ -25,53 +25,78 @@ static VALUE sym_invalid, sym_undef, sym
  *  Dispatch data and logic
  */

-static st_table *transcoder_table, *transcoder_lib_table;
+typedef struct {
+    const char *from;
+    const char *to;
+    const char *lib; /* maybe null.  it means that don't load the 
library. */
+    const rb_transcoder *transcoder;
+} transcoder_entry_t;

-#define TRANSCODER_INTERNAL_SEPARATOR '\t'
+static st_table *transcoder_table;

-static char *
-transcoder_key(const char *from_e, const char *to_e)
+static transcoder_entry_t *
+make_transcoder_entry(const char *from, const char *to)
 {
-    int to_len = strlen(to_e);
-    int from_len = strlen(from_e);
-    char *const key = xmalloc(to_len + from_len + 2);
+    st_data_t val;
+    st_table *table2;

-    memcpy(key, to_e, to_len);
-    memcpy(key + to_len + 1, from_e, from_len + 1);
-    key[to_len] = TRANSCODER_INTERNAL_SEPARATOR;
-    return key;
+    if (!st_lookup(transcoder_table, (st_data_t)from, &val)) {
+        val = (st_data_t)st_init_strcasetable();
+        st_add_direct(transcoder_table, (st_data_t)from, val);
+    }
+    table2 = (st_table *)val;
+    if (!st_lookup(table2, (st_data_t)to, &val)) {
+        transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
+        entry->from = from;
+        entry->to = to;
+        entry->lib = NULL;
+        entry->transcoder = NULL;
+        val = (st_data_t)entry;
+        st_add_direct(table2, (st_data_t)to, val);
+    }
+    return (transcoder_entry_t *)val;
+}
+
+static transcoder_entry_t *
+get_transcoder_entry(const char *from, const char *to)
+{
+    st_data_t val;
+    st_table *table2;
+
+    if (!st_lookup(transcoder_table, (st_data_t)from, &val)) {
+        return NULL;
+    }
+    table2 = (st_table *)val;
+    if (!st_lookup(table2, (st_data_t)to, &val)) {
+        return NULL;
+    }
+    return (transcoder_entry_t *)val;
 }

 void
 rb_register_transcoder(const rb_transcoder *tr)
 {
-    st_data_t k, val = 0;
     const char *const from_e = tr->from_encoding;
     const char *const to_e = tr->to_encoding;
-    char *const key = transcoder_key(from_e, to_e);

-    if (st_lookup(transcoder_table, (st_data_t)key, &val)) {
-  xfree(key);
+    transcoder_entry_t *entry;
+
+    entry = make_transcoder_entry(from_e, to_e);
+    if (entry->transcoder) {
   rb_raise(rb_eArgError, "transcoder from %s to %s has been already 
registered",
      from_e, to_e);
     }
-    k = (st_data_t)key;
-    if (st_delete(transcoder_lib_table, &k, &val)) {
-  xfree((char *)k);
-    }
-    st_insert(transcoder_table, (st_data_t)key, (st_data_t)tr);
+
+    entry->transcoder = tr;
 }

 static void
 declare_transcoder(const char *to, const char *from, const char *lib)
 {
-    const char *const key = transcoder_key(to, from);
-    st_data_t k = (st_data_t)key, val;
+    transcoder_entry_t *entry;

-    if (st_delete(transcoder_lib_table, &k, &val)) {
-  xfree((char *)k);
-    }
-    st_insert(transcoder_lib_table, (st_data_t)key, (st_data_t)lib);
+    entry = make_transcoder_entry(from, to);
+    entry->lib = lib;
 }

 #define MAX_TRANSCODER_LIBNAME_LEN 64
@@ -90,38 +115,166 @@ rb_declare_transcoder(const char *enc1,

 #define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0)

+typedef struct search_path_queue_tag {
+    struct search_path_queue_tag *next;
+    const char *enc;
+} search_path_queue_t;
+
+typedef struct {
+    st_table *visited;
+    search_path_queue_t *queue;
+    search_path_queue_t **queue_last_ptr;
+    const char *base_enc;
+} search_path_bfs_t;
+
+static int
+transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
+{
+    const char *to = (const char *)key;
+    search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
+    search_path_queue_t *q;
+
+    if (st_lookup(bfs->visited, (st_data_t)to, &val)) {
+        return ST_CONTINUE;
+    }
+
+    q = ALLOC(search_path_queue_t);
+    q->enc = to;
+    q->next = NULL;
+    *bfs->queue_last_ptr = q;
+    bfs->queue_last_ptr = &q->next;
+
+    st_add_direct(bfs->visited, (st_data_t)to, 
(st_data_t)bfs->base_enc);
+    return ST_CONTINUE;
+}
+
+static int
+transcode_search_path(const char *from, const char *to,
+    void (*callback)(const char *from, const char *to, int depth, void 
*arg),
+    void *arg)
+{
+    search_path_bfs_t bfs;
+    search_path_queue_t *q;
+    st_data_t val;
+    st_table *table2;
+    int found;
+
+    q = ALLOC(search_path_queue_t);
+    q->enc = from;
+    q->next = NULL;
+    bfs.queue_last_ptr = &q->next;
+    bfs.queue = q;
+
+    bfs.visited = st_init_strcasetable();
+    st_add_direct(bfs.visited, (st_data_t)from, (st_data_t)NULL);
+
+    while (bfs.queue) {
+        q = bfs.queue;
+        bfs.queue = q->next;
+        if (!bfs.queue)
+            bfs.queue_last_ptr = &bfs.queue;
+
+        if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
+            xfree(q);
+            continue;
+        }
+        table2 = (st_table *)val;
+
+        if (st_lookup(table2, (st_data_t)to, &val)) {
+            st_add_direct(bfs.visited, (st_data_t)to, 
(st_data_t)q->enc);
+            xfree(q);
+            found = 1;
+            goto cleanup;
+        }
+
+        bfs.base_enc = q->enc;
+        st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
+        bfs.base_enc = NULL;
+
+        xfree(q);
+    }
+    found = 0;
+
+cleanup:
+    while (bfs.queue) {
+        q = bfs.queue;
+        bfs.queue = q->next;
+        xfree(q);
+    }
+
+    if (found) {
+        const char *enc = to;
+        int depth = 0;
+        while (1) {
+            st_lookup(bfs.visited, (st_data_t)enc, &val);
+            if (!val)
+                break;
+            depth++;
+            enc = (const char *)val;
+        }
+        enc = to;
+        while (1) {
+            st_lookup(bfs.visited, (st_data_t)enc, &val);
+            if (!val)
+                break;
+            callback((const char *)val, enc, --depth, arg);
+            enc = (const char *)val;
+        }
+    }
+
+    st_free_table(bfs.visited);
+
+    return found;
+}
+
+static void
+transcode_dispatch_cb(const char *from, const char *to, int depth, void 
*arg)
+{
+    const rb_transcoder **first_transcoder_ptr = (const rb_transcoder 
**)arg;
+
+    transcoder_entry_t *entry;
+
+    if (!*first_transcoder_ptr)
+        return;
+
+    entry = get_transcoder_entry(from, to);
+    if (!entry)
+        goto failed;
+
+    if (!entry->transcoder && entry->lib) {
+        const char *lib = entry->lib;
+        int len = strlen(lib);
+        char path[sizeof(transcoder_lib_prefix) + 
MAX_TRANSCODER_LIBNAME_LEN];
+
+        entry->lib = NULL;
+
+        if (len > MAX_TRANSCODER_LIBNAME_LEN) goto failed;
+        memcpy(path, transcoder_lib_prefix, 
sizeof(transcoder_lib_prefix) - 1);
+        memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
+        if (!rb_require(path)) goto failed;
+    }
+    if (!entry->transcoder)
+        goto failed;
+
+    if (depth == 0)
+        *first_transcoder_ptr = entry->transcoder;
+
+    return;
+
+failed:
+    *first_transcoder_ptr = NULL;
+    return;
+}
+
 static const rb_transcoder *
 transcode_dispatch(const char *from_encoding, const char *to_encoding)
 {
-    char *const key = transcoder_key(from_encoding, to_encoding);
-    st_data_t k, val = 0;
+    const rb_transcoder *first_transcoder = (rb_transcoder *)1;

-    while (!st_lookup(transcoder_table, (k = (st_data_t)key), &val) &&
-     st_delete(transcoder_lib_table, &k, &val)) {
-  const char *const lib = (const char *)val;
-  int len = strlen(lib);
-  char path[sizeof(transcoder_lib_prefix) + 
MAX_TRANSCODER_LIBNAME_LEN];
-
-  xfree((char *)k);
-  if (len > MAX_TRANSCODER_LIBNAME_LEN) return NULL;
-  memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 
1);
-  memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
-  if (!rb_require(path)) return NULL;
-    }
-    if (!val) {
-  if (!st_lookup(transcoder_table, (st_data_t)key, &val)) {
-      xfree(key);
-      /* multistep logic, via UTF-8 */
-      if (!encoding_equal(from_encoding, "UTF-8") &&
-    !encoding_equal(to_encoding, "UTF-8") &&
-    transcode_dispatch("UTF-8", to_encoding)) {  /* check that we have 
a second step */
-    return transcode_dispatch(from_encoding, "UTF-8"); /* return first 
step */
-      }
-      return NULL;
-  }
+    if (transcode_search_path(from_encoding, to_encoding, 
transcode_dispatch_cb, (void *)&first_transcoder)) {
+        return first_transcoder;
     }
-    xfree(key);
-    return (rb_transcoder *)val;
+    return NULL;
 }

 static void
@@ -245,17 +398,17 @@ transcode_loop(const unsigned char **in_
       *out_p++ = getBT3(next_info);
       continue;
     case FUNii:
-      next_info = (VALUE)(*my_transcoder->func_ii)(next_info);
+      next_info = (VALUE)(*my_transcoder->func_ii)(my_transcoding, 
next_info);
       goto follow_info;
     case FUNsi:
-      next_info = (VALUE)(*my_transcoder->func_si)(char_start);
+      next_info = (VALUE)(*my_transcoder->func_si)(my_transcoding, 
char_start, (size_t)(in_p-char_start));
       goto follow_info;
       break;
     case FUNio:
-      out_p += (VALUE)(*my_transcoder->func_io)(next_info, out_p);
+      out_p += (VALUE)(*my_transcoder->func_io)(my_transcoding, 
next_info, out_p);
       break;
     case FUNso:
-      out_p += (VALUE)(*my_transcoder->func_so)(char_start, out_p);
+      out_p += (VALUE)(*my_transcoder->func_so)(my_transcoding, 
char_start, (size_t)(in_p-char_start), out_p);
       break;
     case INVALID:
       goto invalid;
@@ -290,6 +443,16 @@ transcode_loop(const unsigned char **in_
   continue;
     }
     /* cleanup */
+    if (my_transcoder->finish_func) {
+  if (out_p >= out_s) {
+      int len = (out_p - *out_pos);
+      int new_len = (len + my_transcoder->max_output) * 2;
+      *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, 
new_len);
+      out_p = *out_pos + len;
+      out_s = *out_pos + new_len - my_transcoder->max_output;
+  }
+        out_p += my_transcoder->finish_func(my_transcoding, out_p);
+    }
     *in_pos  = in_p;
     *out_pos = out_p;
 }
@@ -401,21 +564,8 @@ str_transcode(int argc, VALUE *argv, VAL
   }

   my_transcoding.transcoder = my_transcoder;
+        memset(my_transcoding.stateful, 0, 
sizeof(my_transcoding.stateful));

-  if (my_transcoder->preprocessor) {
-      fromp = sp = (unsigned char *)RSTRING_PTR(str);
-      slen = RSTRING_LEN(str);
-      blen = slen + 30; /* len + margin */
-      dest = rb_str_tmp_new(blen);
-      bp = (unsigned char *)RSTRING_PTR(dest);
-      my_transcoding.ruby_string_dest = dest;
-      (*my_transcoder->preprocessor)(&fromp, &bp, (sp+slen), (bp+blen), 
&my_transcoding);
-      if (fromp != sp+slen) {
-    rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes 
left", sp+slen-fromp);
-      }
-      rb_str_set_len(dest, (char *)bp - RSTRING_PTR(dest));
-      str = dest;
-  }
   fromp = sp = (unsigned char *)RSTRING_PTR(str);
   slen = RSTRING_LEN(str);
   blen = slen + 30; /* len + margin */
@@ -431,21 +581,6 @@ str_transcode(int argc, VALUE *argv, VAL
   buf = (unsigned char *)RSTRING_PTR(dest);
   *bp = '\0';
   rb_str_set_len(dest, bp - buf);
-  if (my_transcoder->postprocessor) {
-      str = dest;
-      fromp = sp = (unsigned char *)RSTRING_PTR(str);
-      slen = RSTRING_LEN(str);
-      blen = slen + 30; /* len + margin */
-      dest = rb_str_tmp_new(blen);
-      bp = (unsigned char *)RSTRING_PTR(dest);
-      my_transcoding.ruby_string_dest = dest;
-      (*my_transcoder->postprocessor)(&fromp, &bp, (sp+slen), 
(bp+blen), &my_transcoding);
-      if (fromp != sp+slen) {
-    rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes 
left", sp+slen-fromp);
-      }
-      buf = (unsigned char *)RSTRING_PTR(dest);
-      rb_str_set_len(dest, bp - buf);
-  }

   if (encoding_equal(my_transcoder->to_encoding, to_e)) {
       final_encoding = 1;
@@ -541,7 +676,6 @@ void
 Init_transcode(void)
 {
     transcoder_table = st_init_strcasetable();
-    transcoder_lib_table = st_init_strcasetable();

     sym_invalid = ID2SYM(rb_intern("invalid"));
     sym_undef = ID2SYM(rb_intern("undef"));
--
[田中 哲][たなか あきら][Tanaka Akira]
Posted by NARUSE, Yui (Guest)
on 07.08.2008 16:33
(Received via mailing list)
成瀬です。

Tanaka Akira wrote:
> ISO-2022-JP の実装ですが、0x1e が shift だといってみたり、
> ....
> するので、作りなおして以下のようにするのはどうでしょうか。

いいのではないでしょうか、一時的な実装のつもりでしたし。

一応ログに残しておく意味も含めて確認しておきますが、
この「ISO-2022-JP」は RFC 1468 であり、いわゆる半角カナは含まないのですよね。
Posted by Tanaka Akira (Guest)
on 07.08.2008 17:09
(Received via mailing list)
In article <489B07A6.8020303@airemix.jp>,
  "NARUSE, Yui" <naruse@airemix.jp> writes:

> 一応ログに残しておく意味も含めて確認しておきますが、
> この「ISO-2022-JP」は RFC 1468 であり、いわゆる半角カナは含まないのですよね。

えぇ、JIS X 0201 片仮名は含んでいません。