Python-2.7.3/Modules/cjkcodecs/_codecs_kr.c

No issues found

  1 /*
  2  * _codecs_kr.c: Codecs collection for Korean encodings
  3  *
  4  * Written by Hye-Shik Chang <perky@FreeBSD.org>
  5  */
  6 
  7 #include "cjkcodecs.h"
  8 #include "mappings_kr.h"
  9 
 10 /*
 11  * EUC-KR codec
 12  */
 13 
 14 #define EUCKR_JAMO_FIRSTBYTE    0xA4
 15 #define EUCKR_JAMO_FILLER       0xD4
 16 
 17 static const unsigned char u2cgk_choseong[19] = {
 18     0xa1, 0xa2, 0xa4, 0xa7, 0xa8, 0xa9, 0xb1, 0xb2,
 19     0xb3, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb,
 20     0xbc, 0xbd, 0xbe
 21 };
 22 static const unsigned char u2cgk_jungseong[21] = {
 23     0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6,
 24     0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce,
 25     0xcf, 0xd0, 0xd1, 0xd2, 0xd3
 26 };
 27 static const unsigned char u2cgk_jongseong[28] = {
 28     0xd4, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
 29     0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
 30     0xb1, 0xb2, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xba,
 31     0xbb, 0xbc, 0xbd, 0xbe
 32 };
 33 
 34 ENCODER(euc_kr)
 35 {
 36     while (inleft > 0) {
 37         Py_UNICODE c = IN1;
 38         DBCHAR code;
 39 
 40         if (c < 0x80) {
 41             WRITE1((unsigned char)c)
 42             NEXT(1, 1)
 43             continue;
 44         }
 45         UCS4INVALID(c)
 46 
 47         REQUIRE_OUTBUF(2)
 48         TRYMAP_ENC(cp949, code, c);
 49         else return 1;
 50 
 51         if ((code & 0x8000) == 0) {
 52             /* KS X 1001 coded character */
 53             OUT1((code >> 8) | 0x80)
 54             OUT2((code & 0xFF) | 0x80)
 55             NEXT(1, 2)
 56         }
 57         else {          /* Mapping is found in CP949 extension,
 58                  * but we encode it in KS X 1001:1998 Annex 3,
 59                  * make-up sequence for EUC-KR. */
 60 
 61             REQUIRE_OUTBUF(8)
 62 
 63             /* syllable composition precedence */
 64             OUT1(EUCKR_JAMO_FIRSTBYTE)
 65             OUT2(EUCKR_JAMO_FILLER)
 66 
 67             /* All codepoints in CP949 extension are in unicode
 68              * Hangul Syllable area. */
 69             assert(0xac00 <= c && c <= 0xd7a3);
 70             c -= 0xac00;
 71 
 72             OUT3(EUCKR_JAMO_FIRSTBYTE)
 73             OUT4(u2cgk_choseong[c / 588])
 74             NEXT_OUT(4)
 75 
 76             OUT1(EUCKR_JAMO_FIRSTBYTE)
 77             OUT2(u2cgk_jungseong[(c / 28) % 21])
 78             OUT3(EUCKR_JAMO_FIRSTBYTE)
 79             OUT4(u2cgk_jongseong[c % 28])
 80             NEXT(1, 4)
 81         }
 82     }
 83 
 84     return 0;
 85 }
 86 
 87 #define NONE    127
 88 
 89 static const unsigned char cgk2u_choseong[] = { /* [A1, BE] */
 90        0,    1, NONE,    2, NONE, NONE,    3,    4,
 91        5, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
 92        6,    7,    8, NONE,    9,   10,   11,   12,
 93       13,   14,   15,   16,   17,   18
 94 };
 95 static const unsigned char cgk2u_jongseong[] = { /* [A1, BE] */
 96        1,    2,    3,    4,    5,    6,    7, NONE,
 97        8,    9,   10,   11,   12,   13,   14,   15,
 98       16,   17, NONE,   18,   19,   20,   21,   22,
 99     NONE,   23,   24,   25,   26,   27
100 };
101 
102 DECODER(euc_kr)
103 {
104     while (inleft > 0) {
105         unsigned char c = IN1;
106 
107         REQUIRE_OUTBUF(1)
108 
109         if (c < 0x80) {
110             OUT1(c)
111             NEXT(1, 1)
112             continue;
113         }
114 
115         REQUIRE_INBUF(2)
116 
117         if (c == EUCKR_JAMO_FIRSTBYTE &&
118             IN2 == EUCKR_JAMO_FILLER) {
119             /* KS X 1001:1998 Annex 3 make-up sequence */
120             DBCHAR cho, jung, jong;
121 
122             REQUIRE_INBUF(8)
123             if ((*inbuf)[2] != EUCKR_JAMO_FIRSTBYTE ||
124                 (*inbuf)[4] != EUCKR_JAMO_FIRSTBYTE ||
125                 (*inbuf)[6] != EUCKR_JAMO_FIRSTBYTE)
126                 return 8;
127 
128             c = (*inbuf)[3];
129             if (0xa1 <= c && c <= 0xbe)
130                 cho = cgk2u_choseong[c - 0xa1];
131             else
132                 cho = NONE;
133 
134             c = (*inbuf)[5];
135             jung = (0xbf <= c && c <= 0xd3) ? c - 0xbf : NONE;
136 
137             c = (*inbuf)[7];
138             if (c == EUCKR_JAMO_FILLER)
139                 jong = 0;
140             else if (0xa1 <= c && c <= 0xbe)
141                 jong = cgk2u_jongseong[c - 0xa1];
142             else
143                 jong = NONE;
144 
145             if (cho == NONE || jung == NONE || jong == NONE)
146                 return 8;
147 
148             OUT1(0xac00 + cho*588 + jung*28 + jong);
149             NEXT(8, 1)
150         }
151         else TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
152             NEXT(2, 1)
153         }
154         else
155             return 2;
156     }
157 
158     return 0;
159 }
160 #undef NONE
161 
162 
163 /*
164  * CP949 codec
165  */
166 
167 ENCODER(cp949)
168 {
169     while (inleft > 0) {
170         Py_UNICODE c = IN1;
171         DBCHAR code;
172 
173         if (c < 0x80) {
174             WRITE1((unsigned char)c)
175             NEXT(1, 1)
176             continue;
177         }
178         UCS4INVALID(c)
179 
180         REQUIRE_OUTBUF(2)
181         TRYMAP_ENC(cp949, code, c);
182         else return 1;
183 
184         OUT1((code >> 8) | 0x80)
185         if (code & 0x8000)
186             OUT2(code & 0xFF) /* MSB set: CP949 */
187         else
188             OUT2((code & 0xFF) | 0x80) /* MSB unset: ks x 1001 */
189         NEXT(1, 2)
190     }
191 
192     return 0;
193 }
194 
195 DECODER(cp949)
196 {
197     while (inleft > 0) {
198         unsigned char c = IN1;
199 
200         REQUIRE_OUTBUF(1)
201 
202         if (c < 0x80) {
203             OUT1(c)
204             NEXT(1, 1)
205             continue;
206         }
207 
208         REQUIRE_INBUF(2)
209         TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80);
210         else TRYMAP_DEC(cp949ext, **outbuf, c, IN2);
211         else return 2;
212 
213         NEXT(2, 1)
214     }
215 
216     return 0;
217 }
218 
219 
220 /*
221  * JOHAB codec
222  */
223 
/*
 * 5-bit JOHAB field codes for the three jamo of a Hangul syllable, as
 * used by the JOHAB encoder.  With s = (code point - 0xAC00) the encoder
 * reads [s / 588], [(s / 28) % 21] and [s % 28] respectively, so only
 * the first 19, 21 and 28 slots are ever indexed; the remaining slots of
 * each 32-entry array are implicitly zero.
 */
static const unsigned char u2johabidx_choseong[32] = {
    /*  0.. 9 */ 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
    /* 10..18 */ 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14
};
static const unsigned char u2johabidx_jungseong[32] = {
    /* each row is one contiguous run of field codes */
    /*  0.. 4 */ 0x03, 0x04, 0x05, 0x06, 0x07,
    /*  5..10 */ 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    /* 11..16 */ 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    /* 17..20 */ 0x1a, 0x1b, 0x1c, 0x1d
};
static const unsigned char u2johabidx_jongseong[32] = {
    /* field code 0x12 is skipped between the two runs */
    /*  0.. 8 */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
    /*  9..16 */ 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
    /* 17..27 */ 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b,
                 0x1c, 0x1d
};
241 static const DBCHAR u2johabjamo[] = {
242             0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441,
243     0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f,
244     0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441,
245     0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461,
246     0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1,
247     0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
248     0x8741, 0x8761, 0x8781, 0x87a1,
249 };
250 
251 ENCODER(johab)
252 {
253     while (inleft > 0) {
254         Py_UNICODE c = IN1;
255         DBCHAR code;
256 
257         if (c < 0x80) {
258             WRITE1((unsigned char)c)
259             NEXT(1, 1)
260             continue;
261         }
262         UCS4INVALID(c)
263 
264         REQUIRE_OUTBUF(2)
265 
266         if (c >= 0xac00 && c <= 0xd7a3) {
267             c -= 0xac00;
268             code = 0x8000 |
269                 (u2johabidx_choseong[c / 588] << 10) |
270                 (u2johabidx_jungseong[(c / 28) % 21] << 5) |
271                 u2johabidx_jongseong[c % 28];
272         }
273         else if (c >= 0x3131 && c <= 0x3163)
274             code = u2johabjamo[c - 0x3131];
275         else TRYMAP_ENC(cp949, code, c) {
276             unsigned char c1, c2, t2;
277             unsigned short t1;
278 
279             assert((code & 0x8000) == 0);
280             c1 = code >> 8;
281             c2 = code & 0xff;
282             if (((c1 >= 0x21 && c1 <= 0x2c) ||
283                 (c1 >= 0x4a && c1 <= 0x7d)) &&
284                 (c2 >= 0x21 && c2 <= 0x7e)) {
285                 t1 = (c1 < 0x4a ? (c1 - 0x21 + 0x1b2) :
286                           (c1 - 0x21 + 0x197));
287                 t2 = ((t1 & 1) ? 0x5e : 0) + (c2 - 0x21);
288                 OUT1(t1 >> 1)
289                 OUT2(t2 < 0x4e ? t2 + 0x31 : t2 + 0x43)
290                 NEXT(1, 2)
291                 continue;
292             }
293             else
294                 return 1;
295         }
296         else
297             return 1;
298 
299         OUT1(code >> 8)
300         OUT2(code & 0xff)
301         NEXT(1, 2)
302     }
303 
304     return 0;
305 }
306 
/* Table markers used by the JOHAB decoder: FILL flags a field code that
 * the decoder treats as "this position is empty", NONE flags a field
 * code that makes the decoder reject the byte pair. */
#define FILL 0xfd
#define NONE 0xff

/*
 * johabidx_*: 5-bit JOHAB field code -> jamo index used in the Hangul
 * syllable formula (or FILL / NONE).  Rows are labelled with the field
 * codes they cover, in hex.
 */
static const unsigned char johabidx_choseong[32] = {
    /* 00-03 */ NONE, FILL, 0x00, 0x01,
    /* 04-07 */ 0x02, 0x03, 0x04, 0x05,
    /* 08-0b */ 0x06, 0x07, 0x08, 0x09,
    /* 0c-0f */ 0x0a, 0x0b, 0x0c, 0x0d,
    /* 10-13 */ 0x0e, 0x0f, 0x10, 0x11,
    /* 14-17 */ 0x12, NONE, NONE, NONE,
    /* 18-1b */ NONE, NONE, NONE, NONE,
    /* 1c-1f */ NONE, NONE, NONE, NONE
};
static const unsigned char johabidx_jungseong[32] = {
    /* 00-03 */ NONE, NONE, FILL, 0x00,
    /* 04-07 */ 0x01, 0x02, 0x03, 0x04,
    /* 08-0b */ NONE, NONE, 0x05, 0x06,
    /* 0c-0f */ 0x07, 0x08, 0x09, 0x0a,
    /* 10-13 */ NONE, NONE, 0x0b, 0x0c,
    /* 14-17 */ 0x0d, 0x0e, 0x0f, 0x10,
    /* 18-1b */ NONE, NONE, 0x11, 0x12,
    /* 1c-1f */ 0x13, 0x14, NONE, NONE
};
static const unsigned char johabidx_jongseong[32] = {
    /* 00-03 */ NONE, FILL, 0x01, 0x02,
    /* 04-07 */ 0x03, 0x04, 0x05, 0x06,
    /* 08-0b */ 0x07, 0x08, 0x09, 0x0a,
    /* 0c-0f */ 0x0b, 0x0c, 0x0d, 0x0e,
    /* 10-13 */ 0x0f, 0x10, NONE, 0x11,
    /* 14-17 */ 0x12, 0x13, 0x14, 0x15,
    /* 18-1b */ 0x16, 0x17, 0x18, 0x19,
    /* 1c-1f */ 0x1a, 0x1b, NONE, NONE
};

/*
 * johabjamo_*: 5-bit JOHAB field code -> low byte that the decoder ORs
 * into 0x3100 when a byte pair carries a single jamo.
 */
static const unsigned char johabjamo_choseong[32] = {
    /* 00-03 */ NONE, FILL, 0x31, 0x32,
    /* 04-07 */ 0x34, 0x37, 0x38, 0x39,
    /* 08-0b */ 0x41, 0x42, 0x43, 0x45,
    /* 0c-0f */ 0x46, 0x47, 0x48, 0x49,
    /* 10-13 */ 0x4a, 0x4b, 0x4c, 0x4d,
    /* 14-17 */ 0x4e, NONE, NONE, NONE,
    /* 18-1b */ NONE, NONE, NONE, NONE,
    /* 1c-1f */ NONE, NONE, NONE, NONE
};
static const unsigned char johabjamo_jungseong[32] = {
    /* 00-03 */ NONE, NONE, FILL, 0x4f,
    /* 04-07 */ 0x50, 0x51, 0x52, 0x53,
    /* 08-0b */ NONE, NONE, 0x54, 0x55,
    /* 0c-0f */ 0x56, 0x57, 0x58, 0x59,
    /* 10-13 */ NONE, NONE, 0x5a, 0x5b,
    /* 14-17 */ 0x5c, 0x5d, 0x5e, 0x5f,
    /* 18-1b */ NONE, NONE, 0x60, 0x61,
    /* 1c-1f */ 0x62, 0x63, NONE, NONE
};
static const unsigned char johabjamo_jongseong[32] = {
    /* 00-03 */ NONE, FILL, 0x31, 0x32,
    /* 04-07 */ 0x33, 0x34, 0x35, 0x36,
    /* 08-0b */ 0x37, 0x39, 0x3a, 0x3b,
    /* 0c-0f */ 0x3c, 0x3d, 0x3e, 0x3f,
    /* 10-13 */ 0x40, 0x41, NONE, 0x42,
    /* 14-17 */ 0x44, 0x45, 0x46, 0x47,
    /* 18-1b */ 0x48, 0x4a, 0x4b, 0x4c,
    /* 1c-1f */ 0x4d, 0x4e, NONE, NONE
};
347 
/*
 * JOHAB decoder.
 *
 * Bytes below 0x80 are copied through.  Any other byte is taken together
 * with the following byte:
 *   - lead byte below 0xd8: the pair is split into three 5-bit jamo
 *     fields and turned into a Hangul syllable, a single U+31xx jamo, or
 *     U+3000 when all three fields are fillers;
 *   - lead byte 0xd8 or above: the pair is transposed arithmetically to
 *     a ksx1001 row/column and looked up in that map.
 *
 * Returns 0 when all input has been consumed and 2 when the byte pair is
 * rejected.
 */
DECODER(johab)
{
    while (inleft > 0) {
        unsigned char    c = IN1, c2;

        REQUIRE_OUTBUF(1)

        if (c < 0x80) {
            OUT1(c)
            NEXT(1, 1)
            continue;
        }

        REQUIRE_INBUF(2)
        c2 = IN2;

        if (c < 0xd8) {
            /* johab hangul */
            unsigned char c_cho, c_jung, c_jong;
            unsigned char i_cho, i_jung, i_jong;

            /* Split the 16-bit unit: bits 10-14 initial, bits 5-9
             * medial (straddling both bytes), bits 0-4 final. */
            c_cho = (c >> 2) & 0x1f;
            c_jung = ((c << 3) | c2 >> 5) & 0x1f;
            c_jong = c2 & 0x1f;

            i_cho = johabidx_choseong[c_cho];
            i_jung = johabidx_jungseong[c_jung];
            i_jong = johabidx_jongseong[c_jong];

            /* any field code without a table entry rejects the pair */
            if (i_cho == NONE || i_jung == NONE || i_jong == NONE)
                return 2;

            /* we don't use U+1100 hangul jamo yet. */
            /* Consequence: a pair with exactly one non-filler field is
             * emitted as a single U+31xx jamo; combinations that are
             * neither a lone jamo nor initial+medial(+final) are
             * rejected. */
            if (i_cho == FILL) {
                if (i_jung == FILL) {
                    if (i_jong == FILL)
                        /* all three fields are fillers */
                        OUT1(0x3000)
                    else
                        /* lone final */
                        OUT1(0x3100 |
                          johabjamo_jongseong[c_jong])
                }
                else {
                    if (i_jong == FILL)
                        /* lone medial */
                        OUT1(0x3100 |
                          johabjamo_jungseong[c_jung])
                    else
                        /* medial + final without an initial */
                        return 2;
                }
            } else {
                if (i_jung == FILL) {
                    if (i_jong == FILL)
                        /* lone initial */
                        OUT1(0x3100 |
                          johabjamo_choseong[c_cho])
                    else
                        /* initial + final without a medial */
                        return 2;
                }
                else
                    /* full syllable; a filler final contributes 0 */
                    OUT1(0xac00 +
                         i_cho * 588 +
                         i_jung * 28 +
                         (i_jong == FILL ? 0 : i_jong))
            }
            NEXT(2, 1)
        } else {
            /* KS X 1001 except hangul jamos and syllables */
            /* Reject lead bytes 0xdf and above 0xf9, trail bytes below
             * 0x31, in 0x80..0x90, or equal to 0x7f/0xff, and the
             * 0xda 0xa1..0xd3 range. */
            if (c == 0xdf || c > 0xf9 ||
                c2 < 0x31 || (c2 >= 0x80 && c2 < 0x91) ||
                (c2 & 0x7f) == 0x7f ||
                (c == 0xda && (c2 >= 0xa1 && c2 <= 0xd3)))
                return 2;
            else {
                unsigned char t1, t2;

                /* Each lead byte covers two ksx1001 rows: t1 becomes
                 * the row and t2 the column, both offset by 0x21.
                 * NOTE(review): lead byte 0xd8 passes the check above,
                 * and 2 * (c - 0xd9) is then negative and wraps when
                 * stored in the unsigned char t1; presumably the
                 * resulting row has no ksx1001 entry so the lookup
                 * below fails -- confirm against the mapping table. */
                t1 = (c < 0xe0 ? 2 * (c - 0xd9) :
                         2 * c - 0x197);
                t2 = (c2 < 0x91 ? c2 - 0x31 : c2 - 0x43);
                t1 = t1 + (t2 < 0x5e ? 0 : 1) + 0x21;
                t2 = (t2 < 0x5e ? t2 : t2 - 0x5e) + 0x21;

                TRYMAP_DEC(ksx1001, **outbuf, t1, t2);
                else return 2;
                NEXT(2, 1)
            }
        }
    }

    return 0;
}
436 #undef NONE
437 #undef FILL
438 
439 
/* Mapping tables registered by this module.  Within this file ksx1001
 * and cp949ext are only used through TRYMAP_DEC, and cp949 only through
 * TRYMAP_ENC. */
BEGIN_MAPPINGS_LIST
  MAPPING_DECONLY(ksx1001)
  MAPPING_ENCONLY(cp949)
  MAPPING_DECONLY(cp949ext)
END_MAPPINGS_LIST

/* Codecs registered by this module: one entry per ENCODER/DECODER pair
 * defined above.  NOTE(review): CODEC_STATELESS presumably registers a
 * codec without init/reset hooks -- confirm in cjkcodecs.h. */
BEGIN_CODECS_LIST
  CODEC_STATELESS(euc_kr)
  CODEC_STATELESS(cp949)
  CODEC_STATELESS(johab)
END_CODECS_LIST

/* Module boilerplate for the "kr" codec collection; the expansion is
 * defined in cjkcodecs.h. */
I_AM_A_MODULE_FOR(kr)