Ruby  2.4.2p198(2017-09-14revision59899)
nkf.c
Go to the documentation of this file.
1 /*
2  * NKF - Ruby extension for Network Kanji Filter
3  *
4  * original nkf2.x is maintained at http://sourceforge.jp/projects/nkf/
5  *
6  * $Id: nkf.c 47744 2014-09-30 05:25:32Z nobu $
7  *
8  */
9 
10 #define RUBY_NKF_REVISION "$Revision: 47744 $"
11 #define RUBY_NKF_VERSION NKF_VERSION " (" NKF_RELEASE_DATE ")"
12 
13 #include "ruby/ruby.h"
14 #include "ruby/encoding.h"
15 
16 /* Replace nkf's getchar/putchar for variable modification */
17 /* we never use getc, ungetc */
18 
/*
 * Route nkf's stream reads through the in-memory buffer declared below:
 * getc() yields the next byte of `input`, or -1 once `input_ctr` has reached
 * `i_len`; ungetc() steps the read index back by one. The stream argument
 * (and the character handed to ungetc) is ignored.
 */
19 #undef getc
20 #undef ungetc
21 #define getc(f) (input_ctr>=i_len?-1:input[input_ctr++])
22 #define ungetc(c,f) input_ctr--
23 
/* Initial growth step for the output buffer; rb_nkf_convert copies it into `incsize`. */
24 #define INCSIZE 32
25 #undef putchar
/* NOTE(review): TRUE/FALSE are presumably cleared so the included nkf headers can define their own - confirm. */
26 #undef TRUE
27 #undef FALSE
/* Every putchar() in the code that follows expands to a call to rb_nkf_putchar(). */
28 #define putchar(c) rb_nkf_putchar(c)
29 
30 /* Input/Output pointers */
31 
32 static unsigned char *output; /* storage of the Ruby string currently being filled */
33 static unsigned char *input; /* bytes of the source string being read */
34 static int input_ctr; /* index of the next byte getc() will return */
35 static int i_len; /* number of bytes available in `input` */
36 static int output_ctr; /* index of the next byte rb_nkf_putchar() will write */
37 static int o_len; /* current capacity of `output` in bytes */
38 static int incsize; /* amount added to `o_len` on the next growth; doubled each time */
39 
/* Ruby string backing `output`; rb_nkf_convert sets it before converting and resets it to Qnil afterwards. */
40 static VALUE result;
41 
42 static int
43 rb_nkf_putchar(unsigned int c)
44 {
45  if (output_ctr >= o_len) {
46  o_len += incsize;
48  incsize *= 2;
49  output = (unsigned char *)RSTRING_PTR(result);
50  }
51  output[output_ctr++] = c;
52 
53  return c;
54 }
55 
56 /* Include kanji filter main part */
57 /* getchar and putchar will be replaced during inclusion */
58 
59 #define PERL_XS 1
60 #include "nkf-utf8/config.h"
61 #include "nkf-utf8/utf8tbl.c"
62 #include "nkf-utf8/nkf.c"
63 
65 {
66  int idx = rb_enc_find_index(name);
67  if (idx < 0) {
68  nkf_encoding *nkf_enc = nkf_enc_find(name);
70  if (idx < 0) {
71  idx = rb_define_dummy_encoding(name);
72  }
73  }
74  return rb_enc_from_index(idx);
75 }
76 
77 int nkf_split_options(const char *arg)
78 {
79  int count = 0;
80  unsigned char option[256];
81  int i = 0, j = 0;
82  int is_escaped = FALSE;
83  int is_single_quoted = FALSE;
84  int is_double_quoted = FALSE;
85  for(i = 0; arg[i]; i++){
86  if(j == 255){
87  return -1;
88  }else if(is_single_quoted){
89  if(arg[i] == '\''){
90  is_single_quoted = FALSE;
91  }else{
92  option[j++] = arg[i];
93  }
94  }else if(is_escaped){
95  is_escaped = FALSE;
96  option[j++] = arg[i];
97  }else if(arg[i] == '\\'){
98  is_escaped = TRUE;
99  }else if(is_double_quoted){
100  if(arg[i] == '"'){
101  is_double_quoted = FALSE;
102  }else{
103  option[j++] = arg[i];
104  }
105  }else if(arg[i] == '\''){
106  is_single_quoted = TRUE;
107  }else if(arg[i] == '"'){
108  is_double_quoted = TRUE;
109  }else if(arg[i] == ' '){
110  option[j] = '\0';
111  options(option);
112  j = 0;
113  }else{
114  option[j++] = arg[i];
115  }
116  }
117  if(j){
118  option[j] = '\0';
119  options(option);
120  }
121  return count;
122 }
123 
124 /*
125  * call-seq:
126  * NKF.nkf(opt, str) => string
127  *
128  * Convert _str_ and return converted result.
129  * Conversion details are specified by _opt_ as String.
130  *
131  * require 'nkf'
132  * output = NKF.nkf("-s", input)
133  */
134 
135 static VALUE
137 {
/*
 * NOTE(review): this listing skips numbered lines 136, 141, 144-149, 151,
 * 168, 172, 176 and 178 of this function. The parameter list, the handling
 * of `opt`, the statements closed by the stray brace below, the call made
 * while `result` is set, and both branches of the final if/else are therefore
 * not visible here. Restore them from the original file before building.
 */
138  VALUE tmp;
139  reinit(); /* presumably resets nkf's global state - confirm */
140  StringValue(opt);
142  if (!output_encoding) rb_raise(rb_eArgError, "no output encoding given");
143 
150  }
152 
/* Start each call with the default growth step for the output buffer. */
153  incsize = INCSIZE;
154 
/* Point the getc() replacement at the bytes of src. */
155  input_ctr = 0;
156  StringValue(src);
157  input = (unsigned char *)RSTRING_PTR(src);
158  i_len = RSTRING_LENINT(src);
/* Initial output capacity: three times the input length plus 10 bytes. */
/* NOTE(review): i_len*3 is int arithmetic; confirm very long inputs cannot overflow it. */
159  tmp = rb_str_new(0, i_len*3 + 10);
160 
/* Point rb_nkf_putchar() at the start of tmp's storage. */
161  output_ctr = 0;
162  output = (unsigned char *)RSTRING_PTR(tmp);
163  o_len = RSTRING_LENINT(tmp);
164  *output = '\0';
165 
166  /* `result` holds the output string only between these two markers; rb_nkf_putchar reads the storage pointer from it */
167  result = tmp;
169  result = Qnil;
170  /* end of the region in which `result` is set */
171 
173  OBJ_INFECT(tmp, src);
174 
175  if (mimeout_f)
177  else
179 
180  return tmp;
181 }
182 
183 
184 /*
185  * call-seq:
186  * NKF.guess(str) => encoding
187  *
188  * Returns guessed encoding of _str_ by nkf routine.
189  *
190  */
191 
192 static VALUE
194 {
195  reinit();
196 
197  input_ctr = 0;
198  StringValue(src);
199  input = (unsigned char *)RSTRING_PTR(src);
200  i_len = RSTRING_LENINT(src);
201 
202  guess_f = TRUE;
203  kanji_convert( NULL );
204  guess_f = FALSE;
205 
207 }
208 
209 
210 /*
211  * NKF - Ruby extension for Network Kanji Filter
212  *
213  * == Description
214  *
215  * This is a Ruby Extension version of nkf (Network Kanji Filter).
216  * It converts the first argument and returns converted result. Conversion
217  * details are specified by flags as the first argument.
218  *
219  * *Nkf* is yet another kanji code converter among networks, hosts and terminals.
220  * It converts input kanji code to designated kanji code
221  * such as ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 or UTF-16.
222  *
223  * One of the most distinctive features of *nkf* is its ability to guess the input kanji encoding.
224  * It currently recognizes ISO-2022-JP, Shift_JIS, EUC-JP, UTF-8 and UTF-16.
225  * So users needn't set the input kanji code explicitly.
226  *
227  * By default, X0201 kana is converted into X0208 kana.
228  * For X0201 kana, SO/SI, SSO and ESC-(-I methods are supported.
229  * For automatic code detection, nkf assumes no X0201 kana in Shift_JIS.
230  * To accept X0201 in Shift_JIS, use <b>-X</b>, <b>-x</b> or <b>-S</b>.
231  *
232  * == Flags
233  *
234  * === -b -u
235  *
236  * Output is buffered (DEFAULT), Output is unbuffered.
237  *
238  * === -j -s -e -w -w16 -w32
239  *
240  * Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP,
241  * UTF-8N, UTF-16BE, UTF-32BE.
242  * Without this option and compile option, ISO-2022-JP is assumed.
243  *
244  * === -J -S -E -W -W16 -W32
245  *
246  * Input assumption is JIS 7 bit, Shift_JIS, EUC-JP,
247  * UTF-8, UTF-16, UTF-32.
248  *
249  * ==== -J
250  *
251  * Assume JIS input. It also accepts EUC-JP.
252  * This is the default. This flag does not exclude Shift_JIS.
253  *
254  * ==== -S
255  *
256  * Assume Shift_JIS and X0201 kana input. It also accepts JIS.
257  * EUC-JP is recognized as X0201 kana. Without <b>-x</b> flag,
258  * X0201 kana (halfwidth kana) is converted into X0208.
259  *
260  * ==== -E
261  *
262  * Assume EUC-JP input. It also accepts JIS.
263  * Same as -J.
264  *
265  * === -t
266  *
267  * No conversion.
268  *
269  * === -i_
270  *
271  * Output sequence to designate JIS-kanji. (DEFAULT B)
272  *
273  * === -o_
274  *
275  * Output sequence to designate ASCII. (DEFAULT B)
276  *
277  * === -r
278  *
279  * {de/en}crypt ROT13/47
280  *
281  * === -h[123] --hiragana --katakana --katakana-hiragana
282  *
283  * [-h1 --hiragana] Katakana to Hiragana conversion.
284  *
285  * [-h2 --katakana] Hiragana to Katakana conversion.
286  *
287  * [-h3 --katakana-hiragana] Katakana to Hiragana and Hiragana to Katakana conversion.
288  *
289  * === -T
290  *
291  * Text mode output (MS-DOS)
292  *
293  * === -l
294  *
295  * ISO8859-1 (Latin-1) support
296  *
297  * === -f[<code>m</code> [- <code>n</code>]]
298  *
299  * Folding on <code>m</code> length with <code>n</code> margin in a line.
300  * Without this option, fold length is 60 and fold margin is 10.
301  *
302  * === -F
303  *
304  * New line preserving line folding.
305  *
306  * === -Z[0-3]
307  *
308  * Convert X0208 alphabet (Fullwidth Alphabets) to ASCII.
309  *
310  * [-Z -Z0] Convert X0208 alphabet to ASCII.
311  *
312  * [-Z1] Converts X0208 kankaku to single ASCII space.
313  *
314  * [-Z2] Converts X0208 kankaku to double ASCII spaces.
315  *
316  * [-Z3] Replacing Fullwidth >, <, ", & into '&gt;', '&lt;', '&quot;', '&amp;' as in HTML.
317  *
318  * === -X -x
319  *
320  * Assume X0201 kana in MS-Kanji.
321  * With <b>-X</b> or without this option, X0201 is converted into X0208 Kana.
322  * With <b>-x</b>, try to preserve X0208 kana and do not convert X0201 kana to X0208.
323  * In JIS output, ESC-(-I is used. In EUC output, SSO is used.
324  *
325  * === -B[0-2]
326  *
327  * Assume broken JIS-Kanji input, which lost ESC.
328  * Useful when your site is using old B-News Nihongo patch.
329  *
330  * [-B1] allows any char after ESC-( or ESC-$.
331  *
332  * [-B2] forces ASCII after NL.
333  *
334  * === -I
335  *
336  * Replacing non iso-2022-jp char into a geta character
337  * (substitute character in Japanese).
338  *
339  * === -d -c
340  *
341  * Delete \r in line feed, Add \r in line feed.
342  *
343  * === -m[BQN0]
344  *
345  * MIME ISO-2022-JP/ISO8859-1 decode. (DEFAULT)
346  * To see ISO8859-1 (Latin-1) -l is necessary.
347  *
348  * [-mB] Decode MIME base64 encoded stream. Remove header or other part before
349  * conversion.
350  *
351  * [-mQ] Decode MIME quoted stream. '_' in quoted stream is converted to space.
352  *
353  * [-mN] Non-strict decoding.
354  * It allows line break in the middle of the base64 encoding.
355  *
356  * [-m0] No MIME decode.
357  *
358  * === -M
359  *
360  * MIME encode. Header style. All ASCII code and control characters are intact.
361  * Kanji conversion is performed before encoding, so this cannot be used as a picture encoder.
362  *
363  * [-MB] MIME encode Base64 stream.
364  *
365  * [-MQ] Perform quoted encoding.
366  *
367  * === -l
368  *
369  * Input and output code is ISO8859-1 (Latin-1) and ISO-2022-JP.
370  * <b>-s</b>, <b>-e</b> and <b>-x</b> are not compatible with this option.
371  *
372  * === -L[uwm]
373  *
374  * new line mode
375  * Without this option, nkf doesn't convert line breaks.
376  *
377  * [-Lu] unix (LF)
378  *
379  * [-Lw] windows (CRLF)
380  *
381  * [-Lm] mac (CR)
382  *
383  * === --fj --unix --mac --msdos --windows
384  *
385  * convert for these systems
386  *
387  * === --jis --euc --sjis --mime --base64
388  *
389  * convert for named code
390  *
391  * === --jis-input --euc-input --sjis-input --mime-input --base64-input
392  *
393  * assume input system
394  *
395  * === --ic=<code>input codeset</code> --oc=<code>output codeset</code>
396  *
397  * Set the input or output codeset.
398  * NKF supports the following codesets; codeset names are case insensitive.
399  *
400  * [ISO-2022-JP] a.k.a. RFC1468, 7bit JIS, JUNET
401  *
402  * [EUC-JP (eucJP-nkf)] a.k.a. AT&T JIS, Japanese EUC, UJIS
403  *
404  * [eucJP-ascii] a.k.a. x-eucjp-open-19970715-ascii
405  *
406  * [eucJP-ms] a.k.a. x-eucjp-open-19970715-ms
407  *
408  * [CP51932] Microsoft Version of EUC-JP.
409  *
410  * [Shift_JIS] SJIS, MS-Kanji
411  *
412  * [Windows-31J] a.k.a. CP932
413  *
414  * [UTF-8] same as UTF-8N
415  *
416  * [UTF-8N] UTF-8 without BOM
417  *
418  * [UTF-8-BOM] UTF-8 with BOM
419  *
420  * [UTF-16] same as UTF-16BE
421  *
422  * [UTF-16BE] UTF-16 Big Endian without BOM
423  *
424  * [UTF-16BE-BOM] UTF-16 Big Endian with BOM
425  *
426  * [UTF-16LE] UTF-16 Little Endian without BOM
427  *
428  * [UTF-16LE-BOM] UTF-16 Little Endian with BOM
429  *
430  * [UTF-32] same as UTF-32BE
431  *
432  * [UTF-32BE] UTF-32 Big Endian without BOM
433  *
434  * [UTF-32BE-BOM] UTF-32 Big Endian with BOM
435  *
436  * [UTF-32LE] UTF-32 Little Endian without BOM
437  *
438  * [UTF-32LE-BOM] UTF-32 Little Endian with BOM
439  *
440  * [UTF8-MAC] NFDed UTF-8, a.k.a. UTF8-NFD (input only)
441  *
442  * === --fb-{skip, html, xml, perl, java, subchar}
443  *
444  * Specify the way that nkf handles unassigned characters.
445  * Without this option, --fb-skip is assumed.
446  *
447  * === --prefix= <code>escape character</code> <code>target character</code> ..
448  *
449  * When nkf converts to Shift_JIS,
450  * nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
451  * 1st byte of argument is the escape character and following bytes are target characters.
452  *
453  * === --no-cp932ext
454  *
455  * Handle the characters extended in CP932 as unassigned characters.
456  *
457  * === --no-best-fit-chars
458  *
459  * When converting from Unicode to encoded bytes,
460  * don't convert characters which are not round trip safe.
461  * When converting from Unicode to Unicode,
462  * with this and the -x option, nkf can be used as a UTF converter.
463  * (In other words, without this and the -x option, nkf doesn't preserve some characters.)
464  *
465  * When nkf converts strings related to paths, you should use this option.
466  *
467  * === --cap-input
468  *
469  * Decode hex encoded characters.
470  *
471  * === --url-input
472  *
473  * Unescape percent escaped characters.
474  *
475  * === --
476  *
477  * Ignore rest of -option.
478  */
479 
480 void
481 Init_nkf(void)
482 {
483  VALUE mNKF = rb_define_module("NKF");
484 
485  rb_define_module_function(mNKF, "nkf", rb_nkf_convert, 2);
486  rb_define_module_function(mNKF, "guess", rb_nkf_guess, 1);
487  rb_define_alias(rb_singleton_class(mNKF), "guess", "guess");
488 
489  rb_define_const(mNKF, "AUTO", Qnil);
490  rb_define_const(mNKF, "NOCONV", Qnil);
491  rb_define_const(mNKF, "UNKNOWN", Qnil);
492  rb_define_const(mNKF, "BINARY", rb_enc_from_encoding(rb_nkf_enc_get("BINARY")));
493  rb_define_const(mNKF, "ASCII", rb_enc_from_encoding(rb_nkf_enc_get("US-ASCII")));
494  rb_define_const(mNKF, "JIS", rb_enc_from_encoding(rb_nkf_enc_get("ISO-2022-JP")));
495  rb_define_const(mNKF, "EUC", rb_enc_from_encoding(rb_nkf_enc_get("EUC-JP")));
496  rb_define_const(mNKF, "SJIS", rb_enc_from_encoding(rb_nkf_enc_get("Shift_JIS")));
498  rb_define_const(mNKF, "UTF16", rb_enc_from_encoding(rb_nkf_enc_get("UTF-16BE")));
499  rb_define_const(mNKF, "UTF32", rb_enc_from_encoding(rb_nkf_enc_get("UTF-32BE")));
500 
501  /* Full version string of nkf */
502  rb_define_const(mNKF, "VERSION", rb_str_new2(RUBY_NKF_VERSION));
503  /* Version of nkf */
504  rb_define_const(mNKF, "NKF_VERSION", rb_str_new2(NKF_VERSION));
505  /* Release date of nkf */
506  rb_define_const(mNKF, "NKF_RELEASE_DATE", rb_str_new2(NKF_RELEASE_DATE));
507 }
static int incsize
Definition: nkf.c:38
#define FALSE
Definition: nkf.h:174
static VALUE rb_nkf_convert(VALUE obj, VALUE opt, VALUE src)
Definition: nkf.c:136
rb_encoding * rb_nkf_enc_get(const char *name)
Definition: nkf.c:64
#define NKF_RELEASE_DATE
Definition: nkf.c:24
void Init_nkf(void)
Definition: nkf.c:481
static int rb_nkf_putchar(unsigned int c)
Definition: nkf.c:43
#define nkf_enc_name(enc)
Definition: nkf.c:758
VALUE rb_enc_from_encoding(rb_encoding *encoding)
Definition: encoding.c:117
Definition: nkf.c:115
void rb_str_set_len(VALUE, long)
Definition: string.c:2545
void rb_raise(VALUE exc, const char *fmt,...)
Definition: error.c:2207
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:854
static VALUE rb_nkf_guess(VALUE obj, VALUE src)
Definition: nkf.c:193
static void reinit(void)
Definition: nkf.c:5579
static int kanji_convert(FILE *f)
Definition: nkf.c:5834
rb_encoding * rb_utf8_encoding(void)
Definition: encoding.c:1320
static int mimeout_f
Definition: nkf.c:402
#define RUBY_NKF_VERSION
Definition: nkf.c:11
static unsigned char * output
Definition: nkf.c:32
VALUE rb_singleton_class(VALUE obj)
Returns the singleton class of obj.
Definition: class.c:1689
#define NKF_VERSION
Definition: nkf.c:23
static int output_ctr
Definition: nkf.c:36
unsigned int input
Definition: nkf.c:4312
static int output_bom_f
Definition: nkf.c:365
void rb_define_const(VALUE, const char *, VALUE)
Definition: variable.c:2734
#define INCSIZE
Definition: nkf.c:24
static const char * get_guessed_code(void)
Definition: nkf.c:4551
#define rb_str_new2
Definition: intern.h:857
static int i_len
Definition: nkf.c:35
VALUE rb_str_resize(VALUE, long)
Definition: string.c:2562
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition: class.c:1758
void rb_define_module_function(VALUE module, const char *name, VALUE(*func)(ANYARGS), int argc)
Defines a module function for module.
Definition: class.c:1731
#define TRUE
Definition: nkf.h:175
#define nkf_enc_to_base_encoding(enc)
Definition: nkf.c:760
static int input_ctr
Definition: nkf.c:34
rb_encoding * rb_usascii_encoding(void)
Definition: encoding.c:1335
#define nkf_enc_to_index(enc)
Definition: nkf.c:759
#define Qnil
Definition: ruby.h:438
int rb_define_dummy_encoding(const char *name)
Definition: encoding.c:466
static int options(unsigned char *cp)
Definition: nkf.c:6358
unsigned long VALUE
Definition: ruby.h:85
static VALUE result
Definition: nkf.c:40
int count
Definition: nkf.c:5042
#define RSTRING_PTR(str)
Definition: ruby.h:982
static int o_len
Definition: nkf.c:37
static nkf_encoding * nkf_enc_find(const char *name)
Definition: nkf.c:750
#define OBJ_INFECT(x, s)
Definition: ruby.h:1304
Definition: nkf.c:110
Definition: nkf.c:113
const char * name
Definition: nkf.c:208
int rb_enc_find_index(const char *name)
Definition: encoding.c:704
#define RSTRING_LENINT(str)
Definition: ruby.h:990
Definition: nkf.c:108
VALUE rb_define_module(const char *name)
Definition: class.c:768
int nkf_split_options(const char *arg)
Definition: nkf.c:77
static nkf_encoding * output_encoding
Definition: nkf.c:338
#define NULL
Definition: _sdbm.c:102
Definition: nkf.c:118
static int guess_f
Definition: nkf.c:450
VALUE rb_eArgError
Definition: error.c:763
#define StringValue(v)
Definition: ruby.h:569
static nkf_encoding * nkf_enc_from_index(int idx)
Definition: nkf.c:728
rb_encoding * rb_enc_from_index(int index)
Definition: encoding.c:616
Definition: nkf.c:120
VALUE rb_str_new(const char *, long)
Definition: string.c:736