No issues found
1 /*
2 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
3 *
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
5 */
6
7 #include "cjkcodecs.h"
8 #include "mappings_cn.h"
9
/*
 * AIX predefines "hz" as 100.  Drop that definition there so it cannot
 * clash with the hz codec identifiers used further down in this file.
 */
#if defined(_AIX)
#undef hz
#endif
17
/* GBK and GB2312 map a few code points differently; they are listed below:
 *
 *              gb2312                          gbk
 * A1A4         U+30FB KATAKANA MIDDLE DOT      U+00B7 MIDDLE DOT
 * A1AA         U+2015 HORIZONTAL BAR           U+2014 EM DASH
 * A844         undefined                       U+2015 HORIZONTAL BAR
 */
25
/*
 * GBK_DECODE(dc1, dc2, assi): decode the two-byte GBK sequence dc1,dc2
 * into assi.
 *
 * The three code points where GBK departs from GB2312 are handled first;
 * otherwise the gb2312 table is tried with the high bit stripped from both
 * bytes, then the gbkext table with the raw bytes.
 *
 * The expansion is an open if/else-if chain whose last branch is a
 * TRYMAP_DEC "if": the caller must follow the macro with an "else" clause
 * that handles the "no mapping" case.  Arguments are evaluated several
 * times, so they must be free of side effects.  Every argument is
 * parenthesized so that expression arguments (e.g. "a | b") keep their
 * meaning when combined with "^ 0x80".
 */
#define GBK_DECODE(dc1, dc2, assi) \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
    else TRYMAP_DEC(gb2312, assi, (dc1) ^ 0x80, (dc2) ^ 0x80); \
    else TRYMAP_DEC(gbkext, assi, (dc1), (dc2));
32
/*
 * GBK_ENCODE(code, assi): encode the code point "code" into assi.
 *
 * U+00B7, U+2014 and U+2015 get their GBK-specific values directly;
 * U+30FB is never mapped; everything else goes through the gbcommon
 * encode table.  The expansion ends in an open "else if (...);", so the
 * caller must append an "else" clause for the "no mapping" case.
 */
#define GBK_ENCODE(code, assi) \
    if ((code) == 0x00b7) (assi) = 0xa1a4; \
    else if ((code) == 0x2014) (assi) = 0xa1aa; \
    else if ((code) == 0x2015) (assi) = 0xa844; \
    else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code));
38
39 /*
40 * GB2312 codec
41 */
42
43 ENCODER(gb2312)
44 {
45 while (inleft > 0) {
46 Py_UNICODE c = IN1;
47 DBCHAR code;
48
49 if (c < 0x80) {
50 WRITE1((unsigned char)c)
51 NEXT(1, 1)
52 continue;
53 }
54 UCS4INVALID(c)
55
56 REQUIRE_OUTBUF(2)
57 TRYMAP_ENC(gbcommon, code, c);
58 else return 1;
59
60 if (code & 0x8000) /* MSB set: GBK */
61 return 1;
62
63 OUT1((code >> 8) | 0x80)
64 OUT2((code & 0xFF) | 0x80)
65 NEXT(1, 2)
66 }
67
68 return 0;
69 }
70
71 DECODER(gb2312)
72 {
73 while (inleft > 0) {
74 unsigned char c = **inbuf;
75
76 REQUIRE_OUTBUF(1)
77
78 if (c < 0x80) {
79 OUT1(c)
80 NEXT(1, 1)
81 continue;
82 }
83
84 REQUIRE_INBUF(2)
85 TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {
86 NEXT(2, 1)
87 }
88 else return 2;
89 }
90
91 return 0;
92 }
93
94
95 /*
96 * GBK codec
97 */
98
99 ENCODER(gbk)
100 {
101 while (inleft > 0) {
102 Py_UNICODE c = IN1;
103 DBCHAR code;
104
105 if (c < 0x80) {
106 WRITE1((unsigned char)c)
107 NEXT(1, 1)
108 continue;
109 }
110 UCS4INVALID(c)
111
112 REQUIRE_OUTBUF(2)
113
114 GBK_ENCODE(c, code)
115 else return 1;
116
117 OUT1((code >> 8) | 0x80)
118 if (code & 0x8000)
119 OUT2((code & 0xFF)) /* MSB set: GBK */
120 else
121 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
122 NEXT(1, 2)
123 }
124
125 return 0;
126 }
127
128 DECODER(gbk)
129 {
130 while (inleft > 0) {
131 unsigned char c = IN1;
132
133 REQUIRE_OUTBUF(1)
134
135 if (c < 0x80) {
136 OUT1(c)
137 NEXT(1, 1)
138 continue;
139 }
140
141 REQUIRE_INBUF(2)
142
143 GBK_DECODE(c, IN2, **outbuf)
144 else return 2;
145
146 NEXT(2, 1)
147 }
148
149 return 0;
150 }
151
152
153 /*
154 * GB18030 codec
155 */
156
157 ENCODER(gb18030)
158 {
159 while (inleft > 0) {
160 ucs4_t c = IN1;
161 DBCHAR code;
162
163 if (c < 0x80) {
164 WRITE1(c)
165 NEXT(1, 1)
166 continue;
167 }
168
169 DECODE_SURROGATE(c)
170 if (c > 0x10FFFF)
171 #if Py_UNICODE_SIZE == 2
172 return 2; /* surrogates pair */
173 #else
174 return 1;
175 #endif
176 else if (c >= 0x10000) {
177 ucs4_t tc = c - 0x10000;
178
179 REQUIRE_OUTBUF(4)
180
181 OUT4((unsigned char)(tc % 10) + 0x30)
182 tc /= 10;
183 OUT3((unsigned char)(tc % 126) + 0x81)
184 tc /= 126;
185 OUT2((unsigned char)(tc % 10) + 0x30)
186 tc /= 10;
187 OUT1((unsigned char)(tc + 0x90))
188
189 #if Py_UNICODE_SIZE == 2
190 NEXT(2, 4) /* surrogates pair */
191 #else
192 NEXT(1, 4)
193 #endif
194 continue;
195 }
196
197 REQUIRE_OUTBUF(2)
198
199 GBK_ENCODE(c, code)
200 else TRYMAP_ENC(gb18030ext, code, c);
201 else {
202 const struct _gb18030_to_unibmp_ranges *utrrange;
203
204 REQUIRE_OUTBUF(4)
205
206 for (utrrange = gb18030_to_unibmp_ranges;
207 utrrange->first != 0;
208 utrrange++)
209 if (utrrange->first <= c &&
210 c <= utrrange->last) {
211 Py_UNICODE tc;
212
213 tc = c - utrrange->first +
214 utrrange->base;
215
216 OUT4((unsigned char)(tc % 10) + 0x30)
217 tc /= 10;
218 OUT3((unsigned char)(tc % 126) + 0x81)
219 tc /= 126;
220 OUT2((unsigned char)(tc % 10) + 0x30)
221 tc /= 10;
222 OUT1((unsigned char)tc + 0x81)
223
224 NEXT(1, 4)
225 break;
226 }
227
228 if (utrrange->first == 0)
229 return 1;
230 continue;
231 }
232
233 OUT1((code >> 8) | 0x80)
234 if (code & 0x8000)
235 OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */
236 else
237 OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */
238
239 NEXT(1, 2)
240 }
241
242 return 0;
243 }
244
245 DECODER(gb18030)
246 {
247 while (inleft > 0) {
248 unsigned char c = IN1, c2;
249
250 REQUIRE_OUTBUF(1)
251
252 if (c < 0x80) {
253 OUT1(c)
254 NEXT(1, 1)
255 continue;
256 }
257
258 REQUIRE_INBUF(2)
259
260 c2 = IN2;
261 if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */
262 const struct _gb18030_to_unibmp_ranges *utr;
263 unsigned char c3, c4;
264 ucs4_t lseq;
265
266 REQUIRE_INBUF(4)
267 c3 = IN3;
268 c4 = IN4;
269 if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)
270 return 4;
271 c -= 0x81; c2 -= 0x30;
272 c3 -= 0x81; c4 -= 0x30;
273
274 if (c < 4) { /* U+0080 - U+FFFF */
275 lseq = ((ucs4_t)c * 10 + c2) * 1260 +
276 (ucs4_t)c3 * 10 + c4;
277 if (lseq < 39420) {
278 for (utr = gb18030_to_unibmp_ranges;
279 lseq >= (utr + 1)->base;
280 utr++) ;
281 OUT1(utr->first - utr->base + lseq)
282 NEXT(4, 1)
283 continue;
284 }
285 }
286 else if (c >= 15) { /* U+10000 - U+10FFFF */
287 lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)
288 * 1260 + (ucs4_t)c3 * 10 + c4;
289 if (lseq <= 0x10FFFF) {
290 WRITEUCS4(lseq);
291 NEXT_IN(4)
292 continue;
293 }
294 }
295 return 4;
296 }
297
298 GBK_DECODE(c, c2, **outbuf)
299 else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);
300 else return 2;
301
302 NEXT(2, 1)
303 }
304
305 return 0;
306 }
307
308
309 /*
310 * HZ codec
311 */
312
/*
 * Initialise the HZ encoder state.  state->i is the mode flag used by the
 * hz encoder in this file: 0 while emitting plain ASCII, 1 after "~{" has
 * been written.
 */
ENCODER_INIT(hz)
{
    state->i = 0;
    return 0;
}
318
319 ENCODER_RESET(hz)
320 {
321 if (state->i != 0) {
322 WRITE2('~', '}')
323 state->i = 0;
324 NEXT_OUT(2)
325 }
326 return 0;
327 }
328
329 ENCODER(hz)
330 {
331 while (inleft > 0) {
332 Py_UNICODE c = IN1;
333 DBCHAR code;
334
335 if (c < 0x80) {
336 if (state->i == 0) {
337 WRITE1((unsigned char)c)
338 NEXT(1, 1)
339 }
340 else {
341 WRITE3('~', '}', (unsigned char)c)
342 NEXT(1, 3)
343 state->i = 0;
344 }
345 continue;
346 }
347
348 UCS4INVALID(c)
349
350 TRYMAP_ENC(gbcommon, code, c);
351 else return 1;
352
353 if (code & 0x8000) /* MSB set: GBK */
354 return 1;
355
356 if (state->i == 0) {
357 WRITE4('~', '{', code >> 8, code & 0xff)
358 NEXT(1, 4)
359 state->i = 1;
360 }
361 else {
362 WRITE2(code >> 8, code & 0xff)
363 NEXT(1, 2)
364 }
365 }
366
367 return 0;
368 }
369
/*
 * Initialise the HZ decoder state.  state->i is the mode flag used by the
 * hz decoder in this file: 0 = ASCII mode, 1 = GB mode.
 */
DECODER_INIT(hz)
{
    state->i = 0;
    return 0;
}
375
/*
 * Reset the HZ decoder: put the mode flag state->i back to 0, the value
 * the hz decoder treats as ASCII mode.
 */
DECODER_RESET(hz)
{
    state->i = 0;
    return 0;
}
381
382 DECODER(hz)
383 {
384 while (inleft > 0) {
385 unsigned char c = IN1;
386
387 if (c == '~') {
388 unsigned char c2 = IN2;
389
390 REQUIRE_INBUF(2)
391 if (c2 == '~') {
392 WRITE1('~')
393 NEXT(2, 1)
394 continue;
395 }
396 else if (c2 == '{' && state->i == 0)
397 state->i = 1; /* set GB */
398 else if (c2 == '}' && state->i == 1)
399 state->i = 0; /* set ASCII */
400 else if (c2 == '\n')
401 ; /* line-continuation */
402 else
403 return 2;
404 NEXT(2, 0);
405 continue;
406 }
407
408 if (c & 0x80)
409 return 1;
410
411 if (state->i == 0) { /* ASCII mode */
412 WRITE1(c)
413 NEXT(1, 1)
414 }
415 else { /* GB mode */
416 REQUIRE_INBUF(2)
417 REQUIRE_OUTBUF(1)
418 TRYMAP_DEC(gb2312, **outbuf, c, IN2) {
419 NEXT(2, 1)
420 }
421 else
422 return 2;
423 }
424 }
425
426 return 0;
427 }
428
429
/*
 * Mapping tables referenced by the codecs in this file.  Here gb2312 and
 * gbkext are only looked up through TRYMAP_DEC, gbcommon only through
 * TRYMAP_ENC / TRYMAP_ENC_COND, and gb18030ext through both.
 */
BEGIN_MAPPINGS_LIST
  MAPPING_DECONLY(gb2312)
  MAPPING_DECONLY(gbkext)
  MAPPING_ENCONLY(gbcommon)
  MAPPING_ENCDEC(gb18030ext)
END_MAPPINGS_LIST
436
/*
 * Codecs defined in this file.  hz is the only one that defines INIT and
 * RESET hooks and keeps a mode flag in its state between characters.
 */
BEGIN_CODECS_LIST
  CODEC_STATELESS(gb2312)
  CODEC_STATELESS(gbk)
  CODEC_STATELESS(gb18030)
  CODEC_STATEFUL(hz)
END_CODECS_LIST
443
/* NOTE(review): presumably expands to the module boilerplate for the "cn"
 * codec collection -- confirm against cjkcodecs.h. */
I_AM_A_MODULE_FOR(cn)