xmltok.cc

Go to the documentation of this file.
00001 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
00002    See the file COPYING for copying permission.
00003 */
00004 
00005 #include <stddef.h>
00006 
00007 #ifdef COMPILED_FROM_DSP
00008 #include "winconfig.h"
00009 #elif defined(MACOS_CLASSIC)
00010 #include "macconfig.h"
00011 #elif defined(__amigaos4__)
00012 #include "amigaconfig.h"
00013 #elif defined(__WATCOMC__)
00014 #include "watcomconfig.h"
00015 #else
00016 #ifdef HAVE_EXPAT_CONFIG_H
00017 #include <expat_config.h>
00018 #endif
00019 #endif /* ndef COMPILED_FROM_DSP */
00020 
00021 #include "expat_external.h"
00022 #include "internal.h"
00023 #include "xmltok.h"
00024 #include "nametab.h"
00025 
/* IGNORE_SECTION_TOK_VTABLE appends the ignoreSectionTok entry to the first
   brace group of VTABLE1 when XML_DTD is defined; otherwise it expands to
   nothing. */
00026 #ifdef XML_DTD
00027 #define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
00028 #else
00029 #define IGNORE_SECTION_TOK_VTABLE /* as nothing */
00030 #endif
00031 
/* VTABLE1 expands to the leading initializers of an ENCODING: one brace
   group of scanners (prolog, content, CDATA section, optionally ignore
   section), one brace group of literal scanners (attribute value, entity
   value), then nine helper functions.  Every name goes through PREFIX(), so
   the result depends on how PREFIX is defined where the macro is expanded. */
00032 #define VTABLE1 \
00033   { PREFIX(prologTok), PREFIX(contentTok), \
00034     PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
00035   { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
00036   PREFIX(sameName), \
00037   PREFIX(nameMatchesAscii), \
00038   PREFIX(nameLength), \
00039   PREFIX(skipS), \
00040   PREFIX(getAtts), \
00041   PREFIX(charRefNumber), \
00042   PREFIX(predefinedEntityName), \
00043   PREFIX(updatePosition), \
00044   PREFIX(isPublicId)
00045 
/* VTABLE is VTABLE1 followed by the PREFIX()-named toUtf8 and toUtf16
   converters. */
00046 #define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
00047 
/* Non-zero when the UCS-2 character with high byte hi and low byte lo has
   its bit set in namingBitmap.  pages[hi] selects a group of eight bitmap
   entries, the top 3 bits of lo pick the entry within that group and the
   bottom 5 bits of lo pick the bit.  The mask is built from 1u because bit
   31 can be selected, and shifting a signed 1 into the sign bit of a
   32-bit int is undefined behaviour.
*/
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
00050 
/* A 2 byte UTF-8 sequence carries 11 character bits: 5 in the first byte
   and 6 in the second.  The top 3 bits index pages, the next 3 bits are
   added to (pages[...] << 3) to pick the namingBitmap entry, and the
   bottom 5 bits pick the bit inside that entry.  byte must point to
   unsigned char.  The mask is built from 1u because bit 31 can be
   selected, and 1 << 31 is undefined for a 32-bit int.
*/
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))
00060 
/* A 3 byte UTF-8 sequence carries 16 character bits: 4 in the first byte
   and 6 in each of the other two.  The top 8 bits index pages, the next 3
   bits are added to (pages[...] << 3) to pick the namingBitmap entry, and
   the bottom 5 bits pick the bit inside that entry.  byte must point to
   unsigned char.  The mask is built from 1u because bit 31 can be
   selected, and 1 << 31 is undefined for a 32-bit int.
*/
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))
00073 
/* Dispatch on the sequence length n: 2 uses UTF8_GET_NAMING2, 3 uses
   UTF8_GET_NAMING3, and any other length yields 0.  p is cast to
   const unsigned char * before being handed to the per-length macro. */
00074 #define UTF8_GET_NAMING(pages, p, n) \
00075   ((n) == 2 \
00076   ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
00077   : ((n) == 3 \
00078      ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
00079      : 0))
00080 
00081 /* Detection of invalid UTF-8 sequences is based on Table 3.1B
00082    of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
00083    with the additional restriction of not allowing the Unicode
00084    code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
00085    Implementation details:
00086      (A & 0x80) == 0     means A < 0x80
00087    and
00088      (A & 0xC0) == 0xC0  means A > 0xBF
00089 */
00090 
/* Non-zero when the 2-byte sequence at p is not well-formed UTF-8: the
   lead byte is below 0xC2 (0xC0 and 0xC1 could only start overlong forms)
   or the second byte lies outside 0x80..0xBF.  p must point to unsigned
   char; every use of p is parenthesized so that an expression argument
   such as (s + 1) is dereferenced as a whole.
*/
#define UTF8_INVALID2(p) \
  ((*(p)) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
00093 
/* Non-zero when the 3-byte sequence at p is not acceptable:
     - the third byte is outside 0x80..0xBF, or, for the EF BF prefix,
       above 0xBD (which excludes U+FFFE and U+FFFF);
     - after a 0xE0 lead, the second byte is outside 0xA0..0xBF (overlong);
     - after a 0xED lead, the second byte is outside 0x80..0x9F
       (surrogates);
     - otherwise, the second byte is outside 0x80..0xBF.
   p must point to unsigned char; every use of p is parenthesized so that
   an expression argument such as (s + 1) is dereferenced as a whole.
*/
#define UTF8_INVALID3(p) \
  (((p)[2] & 0x80) == 0 \
  || \
  ((*(p)) == 0xEF && (p)[1] == 0xBF \
    ? \
    (p)[2] > 0xBD \
    : \
    ((p)[2] & 0xC0) == 0xC0) \
  || \
  ((*(p)) == 0xE0 \
    ? \
    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*(p)) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
00110 
/* Non-zero when the 4-byte sequence at p is not well-formed UTF-8:
     - the third or fourth byte is outside 0x80..0xBF;
     - after a 0xF0 lead, the second byte is outside 0x90..0xBF (overlong);
     - after a 0xF4 lead, the second byte is outside 0x80..0x8F (beyond
       U+10FFFF);
     - otherwise, the second byte is outside 0x80..0xBF.
   p must point to unsigned char; every use of p is parenthesized so that
   an expression argument such as (s + 1) is dereferenced as a whole.
*/
#define UTF8_INVALID4(p) \
  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
  || \
  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
  || \
  ((*(p)) == 0xF0 \
    ? \
    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
    : \
    ((p)[1] & 0x80) == 0 \
    || \
    ((*(p)) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
00123 
00124 static int PTRFASTCALL
00125 isNever(const ENCODING *enc, const char *p)
00126 {
00127   return 0;
00128 }
00129 
00130 static int PTRFASTCALL
00131 utf8_isName2(const ENCODING *enc, const char *p)
00132 {
00133   return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
00134 }
00135 
00136 static int PTRFASTCALL
00137 utf8_isName3(const ENCODING *enc, const char *p)
00138 {
00139   return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
00140 }
00141 
00142 #define utf8_isName4 isNever
00143 
00144 static int PTRFASTCALL
00145 utf8_isNmstrt2(const ENCODING *enc, const char *p)
00146 {
00147   return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
00148 }
00149 
00150 static int PTRFASTCALL
00151 utf8_isNmstrt3(const ENCODING *enc, const char *p)
00152 {
00153   return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
00154 }
00155 
00156 #define utf8_isNmstrt4 isNever
00157 
00158 static int PTRFASTCALL
00159 utf8_isInvalid2(const ENCODING *enc, const char *p)
00160 {
00161   return UTF8_INVALID2((const unsigned char *)p);
00162 }
00163 
00164 static int PTRFASTCALL
00165 utf8_isInvalid3(const ENCODING *enc, const char *p)
00166 {
00167   return UTF8_INVALID3((const unsigned char *)p);
00168 }
00169 
00170 static int PTRFASTCALL
00171 utf8_isInvalid4(const ENCODING *enc, const char *p)
00172 {
00173   return UTF8_INVALID4((const unsigned char *)p);
00174 }
00175 
/* An ENCODING extended with a per-byte classification table and the
   character-class callbacks used by the tokenizers in this file.  enc is
   the first member; AS_NORMAL_ENCODING casts an ENCODING pointer to this
   type. */
00176 struct normal_encoding {
00177   ENCODING enc;
  /* Byte type for each of the 256 byte values; read by SB_BYTE_TYPE. */
00178   unsigned char type[256];
00179 #ifdef XML_MIN_SIZE
  /* Members present only in XML_MIN_SIZE builds; their initializers come
     from STANDARD_VTABLE. */
00180   int (PTRFASTCALL *byteType)(const ENCODING *, const char *);
00181   int (PTRFASTCALL *isNameMin)(const ENCODING *, const char *);
00182   int (PTRFASTCALL *isNmstrtMin)(const ENCODING *, const char *);
00183   int (PTRFASTCALL *byteToAscii)(const ENCODING *, const char *);
00184   int (PTRCALL *charMatches)(const ENCODING *, const char *, int);
00185 #endif /* XML_MIN_SIZE */
  /* Name, name-start and validity tests; the numeric suffix is the byte
     length of the sequence tested.  Their initializers come from
     NORMAL_VTABLE. */
00186   int (PTRFASTCALL *isName2)(const ENCODING *, const char *);
00187   int (PTRFASTCALL *isName3)(const ENCODING *, const char *);
00188   int (PTRFASTCALL *isName4)(const ENCODING *, const char *);
00189   int (PTRFASTCALL *isNmstrt2)(const ENCODING *, const char *);
00190   int (PTRFASTCALL *isNmstrt3)(const ENCODING *, const char *);
00191   int (PTRFASTCALL *isNmstrt4)(const ENCODING *, const char *);
00192   int (PTRFASTCALL *isInvalid2)(const ENCODING *, const char *);
00193   int (PTRFASTCALL *isInvalid3)(const ENCODING *, const char *);
00194   int (PTRFASTCALL *isInvalid4)(const ENCODING *, const char *);
00195 };
00196 
/* View an ENCODING pointer as a pointer to const struct normal_encoding. */
00197 #define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))
00198 
/* STANDARD_VTABLE(E) names the E-prefixed functions that initialize the
   XML_MIN_SIZE-only members of struct normal_encoding, in declaration
   order.  It ends with a comma so that NORMAL_VTABLE can follow it
   directly.  Outside XML_MIN_SIZE builds it expands to nothing. */
00199 #ifdef XML_MIN_SIZE
00200 
00201 #define STANDARD_VTABLE(E) \
00202  E ## byteType, \
00203  E ## isNameMin, \
00204  E ## isNmstrtMin, \
00205  E ## byteToAscii, \
00206  E ## charMatches,
00207 
00208 #else
00209 
00210 #define STANDARD_VTABLE(E) /* as nothing */
00211 
00212 #endif
00213 
/* NORMAL_VTABLE(E) names the E-prefixed functions that initialize the nine
   isName / isNmstrt / isInvalid members of struct normal_encoding, in
   declaration order. */
00214 #define NORMAL_VTABLE(E) \
00215  E ## isName2, \
00216  E ## isName3, \
00217  E ## isName4, \
00218  E ## isNmstrt2, \
00219  E ## isNmstrt3, \
00220  E ## isNmstrt4, \
00221  E ## isInvalid2, \
00222  E ## isInvalid3, \
00223  E ## isInvalid4
00224 
00225 static int FASTCALL checkCharRefNumber(int);
00226 
00227 #include "xmltok_impl.h"
00228 #include "ascii.h"
00229 
/* In XML_MIN_SIZE builds the sb_ (single-byte) encodings use isNever for
   the isNameMin / isNmstrtMin slots. */
00230 #ifdef XML_MIN_SIZE
00231 #define sb_isNameMin isNever
00232 #define sb_isNmstrtMin isNever
00233 #endif
00234 
/* MINBPC for the code included below: read from the encoding at run time
   in XML_MIN_SIZE builds, the constant 1 otherwise. */
00235 #ifdef XML_MIN_SIZE
00236 #define MINBPC(enc) ((enc)->minBytesPerChar)
00237 #else
00238 /* minimum bytes per character */
00239 #define MINBPC(enc) 1
00240 #endif
00241 
/* SB_BYTE_TYPE looks the byte at p up in the encoding's type[] table,
   indexing with the byte converted to unsigned char.
   NOTE(review): the cast drops the const qualifier on enc; the table is
   only read here. */
00242 #define SB_BYTE_TYPE(enc, p) \
00243   (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
00244 
/* BYTE_TYPE for the code included below: an indirect call through the
   encoding's byteType member in XML_MIN_SIZE builds (sb_byteType wraps
   the table lookup for that purpose), the table lookup itself otherwise. */
00245 #ifdef XML_MIN_SIZE
00246 static int PTRFASTCALL
00247 sb_byteType(const ENCODING *enc, const char *p)
00248 {
00249   return SB_BYTE_TYPE(enc, p);
00250 }
00251 #define BYTE_TYPE(enc, p) \
00252  (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
00253 #else
00254 #define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
00255 #endif
00256 
/* BYTE_TO_ASCII for the code included below.  For single-byte encodings
   the result is simply the byte at p: via the byteToAscii member (set to
   sb_byteToAscii) in XML_MIN_SIZE builds, by direct dereference otherwise. */
00257 #ifdef XML_MIN_SIZE
00258 #define BYTE_TO_ASCII(enc, p) \
00259  (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
00260 static int PTRFASTCALL
00261 sb_byteToAscii(const ENCODING *enc, const char *p)
00262 {
00263   return *p;
00264 }
00265 #else
00266 #define BYTE_TO_ASCII(enc, p) (*(p))
00267 #endif
00268 
/* Character-class tests for an n-byte sequence: each macro pastes n onto
   the member name and calls the matching callback in struct
   normal_encoding (for example IS_NAME_CHAR(enc, p, 2) calls isName2). */
00269 #define IS_NAME_CHAR(enc, p, n) \
00270  (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
00271 #define IS_NMSTRT_CHAR(enc, p, n) \
00272  (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
00273 #define IS_INVALID_CHAR(enc, p, n) \
00274  (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
00275 
/* Name tests for a minimum-size character: callbacks in XML_MIN_SIZE
   builds, the constant 0 otherwise. */
00276 #ifdef XML_MIN_SIZE
00277 #define IS_NAME_CHAR_MINBPC(enc, p) \
00278  (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
00279 #define IS_NMSTRT_CHAR_MINBPC(enc, p) \
00280  (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
00281 #else
00282 #define IS_NAME_CHAR_MINBPC(enc, p) (0)
00283 #define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
00284 #endif
00285 
/* CHAR_MATCHES for the code included below: true when the byte at p equals
   c.  XML_MIN_SIZE builds go through the charMatches member (set to
   sb_charMatches); other builds compare inline. */
00286 #ifdef XML_MIN_SIZE
00287 #define CHAR_MATCHES(enc, p, c) \
00288  (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
00289 static int PTRCALL
00290 sb_charMatches(const ENCODING *enc, const char *p, int c)
00291 {
00292   return *p == c;
00293 }
00294 #else
00295 /* c is an ASCII character */
00296 #define CHAR_MATCHES(enc, p, c) (*(p) == c)
00297 #endif
00298 
/* Instantiate the shared tokenizer source for byte-oriented encodings.
   PREFIX names the generated functions normal_*; XML_TOK_IMPL_C is defined
   only around the #include.  The helper macros are then undefined so they
   can be redefined for the next instantiation; PREFIX stays defined, so
   the VTABLE1 uses that follow refer to the normal_* functions. */
00299 #define PREFIX(ident) normal_ ## ident
00300 #define XML_TOK_IMPL_C
00301 #include "xmltok_impl.c"
00302 #undef XML_TOK_IMPL_C
00303 
00304 #undef MINBPC
00305 #undef BYTE_TYPE
00306 #undef BYTE_TO_ASCII
00307 #undef CHAR_MATCHES
00308 #undef IS_NAME_CHAR
00309 #undef IS_NAME_CHAR_MINBPC
00310 #undef IS_NMSTRT_CHAR
00311 #undef IS_NMSTRT_CHAR_MINBPC
00312 #undef IS_INVALID_CHAR
00313 
/* Marker bits of the first byte of a UTF-8 sequence, by sequence length;
   the converters below OR them into the lead byte they emit. */
00314 enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
00315   UTF8_cval1 = 0x00,
00316   UTF8_cval2 = 0xc0,
00317   UTF8_cval3 = 0xe0,
00318   UTF8_cval4 = 0xf0
00319 };
00320 
00321 static void PTRCALL
00322 utf8_toUtf8(const ENCODING *enc,
00323             const char **fromP, const char *fromLim,
00324             char **toP, const char *toLim)
00325 {
00326   char *to;
00327   const char *from;
00328   if (fromLim - *fromP > toLim - *toP) {
00329     /* Avoid copying partial characters. */
00330     for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
00331       if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
00332         break;
00333   }
00334   for (to = *toP, from = *fromP; from != fromLim; from++, to++)
00335     *to = *from;
00336   *fromP = from;
00337   *toP = to;
00338 }
00339 
00340 static void PTRCALL
00341 utf8_toUtf16(const ENCODING *enc,
00342              const char **fromP, const char *fromLim,
00343              unsigned short **toP, const unsigned short *toLim)
00344 {
00345   unsigned short *to = *toP;
00346   const char *from = *fromP;
00347   while (from != fromLim && to != toLim) {
00348     switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
00349     case BT_LEAD2:
00350       *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
00351       from += 2;
00352       break;
00353     case BT_LEAD3:
00354       *to++ = (unsigned short)(((from[0] & 0xf) << 12)
00355                                | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
00356       from += 3;
00357       break;
00358     case BT_LEAD4:
00359       {
00360         unsigned long n;
00361         if (to + 1 == toLim)
00362           goto after;
00363         n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
00364             | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
00365         n -= 0x10000;
00366         to[0] = (unsigned short)((n >> 10) | 0xD800);
00367         to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
00368         to += 2;
00369         from += 4;
00370       }
00371       break;
00372     default:
00373       *to++ = *from++;
00374       break;
00375     }
00376   }
00377 after:
00378   *fromP = from;
00379   *toP = to;
00380 }
00381 
/* UTF-8 encoding tables.  Each initializer supplies the ENCODING part
   (VTABLE1, the two converters and three integers -- presumably
   minBytesPerChar, isUtf8 and isUtf16; confirm against xmltok.h), then the
   256-entry type[] table assembled from asciitab.h and utf8tab.h, then the
   callback members.  The _ns variant exists only when XML_NS is defined
   and includes asciitab.h unmodified. */
00382 #ifdef XML_NS
00383 static const struct normal_encoding utf8_encoding_ns = {
00384   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00385   {
00386 #include "asciitab.h"
00387 #include "utf8tab.h"
00388   },
00389   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00390 };
00391 #endif
00392 
/* Variant that defines BT_COLON as BT_NMSTRT while asciitab.h is included,
   so any entry spelled BT_COLON there becomes BT_NMSTRT in this table. */
00393 static const struct normal_encoding utf8_encoding = {
00394   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00395   {
00396 #define BT_COLON BT_NMSTRT
00397 #include "asciitab.h"
00398 #undef BT_COLON
00399 #include "utf8tab.h"
00400   },
00401   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00402 };
00403 
/* "Internal" UTF-8 encoding tables: identical in shape to utf8_encoding /
   utf8_encoding_ns, but the first half of type[] comes from iasciitab.h
   instead of asciitab.h.  The _ns variant exists only when XML_NS is
   defined. */
00404 #ifdef XML_NS
00405 
00406 static const struct normal_encoding internal_utf8_encoding_ns = {
00407   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00408   {
00409 #include "iasciitab.h"
00410 #include "utf8tab.h"
00411   },
00412   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00413 };
00414 
00415 #endif
00416 
/* Variant that defines BT_COLON as BT_NMSTRT while iasciitab.h is
   included. */
00417 static const struct normal_encoding internal_utf8_encoding = {
00418   { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
00419   {
00420 #define BT_COLON BT_NMSTRT
00421 #include "iasciitab.h"
00422 #undef BT_COLON
00423 #include "utf8tab.h"
00424   },
00425   STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
00426 };
00427 
00428 static void PTRCALL
00429 latin1_toUtf8(const ENCODING *enc,
00430               const char **fromP, const char *fromLim,
00431               char **toP, const char *toLim)
00432 {
00433   for (;;) {
00434     unsigned char c;
00435     if (*fromP == fromLim)
00436       break;
00437     c = (unsigned char)**fromP;
00438     if (c & 0x80) {
00439       if (toLim - *toP < 2)
00440         break;
00441       *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
00442       *(*toP)++ = (char)((c & 0x3f) | 0x80);
00443       (*fromP)++;
00444     }
00445     else {
00446       if (*toP == toLim)
00447         break;
00448       *(*toP)++ = *(*fromP)++;
00449     }
00450   }
00451 }
00452 
00453 static void PTRCALL
00454 latin1_toUtf16(const ENCODING *enc,
00455                const char **fromP, const char *fromLim,
00456                unsigned short **toP, const unsigned short *toLim)
00457 {
00458   while (*fromP != fromLim && *toP != toLim)
00459     *(*toP)++ = (unsigned char)*(*fromP)++;
00460 }
00461 
/* Latin-1 encoding tables: type[] is assembled from asciitab.h and
   latin1tab.h.  Only STANDARD_VTABLE(sb_) follows the table, so the
   isName / isNmstrt / isInvalid members get no explicit initializer and
   are zero-initialized.  The _ns variant exists only when XML_NS is
   defined. */
00462 #ifdef XML_NS
00463 
00464 static const struct normal_encoding latin1_encoding_ns = {
00465   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00466   {
00467 #include "asciitab.h"
00468 #include "latin1tab.h"
00469   },
00470   STANDARD_VTABLE(sb_)
00471 };
00472 
00473 #endif
00474 
/* Variant that defines BT_COLON as BT_NMSTRT while asciitab.h is
   included. */
00475 static const struct normal_encoding latin1_encoding = {
00476   { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
00477   {
00478 #define BT_COLON BT_NMSTRT
00479 #include "asciitab.h"
00480 #undef BT_COLON
00481 #include "latin1tab.h"
00482   },
00483   STANDARD_VTABLE(sb_)
00484 };
00485 
00486 static void PTRCALL
00487 ascii_toUtf8(const ENCODING *enc,
00488              const char **fromP, const char *fromLim,
00489              char **toP, const char *toLim)
00490 {
00491   while (*fromP != fromLim && *toP != toLim)
00492     *(*toP)++ = *(*fromP)++;
00493 }
00494 
/* US-ASCII encoding tables: only asciitab.h is included into type[], so
   the remaining entries are zero (BT_NONXML per the comment in the
   initializer).  The UTF-16 converter is shared with Latin-1.  The _ns
   variant exists only when XML_NS is defined. */
00495 #ifdef XML_NS
00496 
00497 static const struct normal_encoding ascii_encoding_ns = {
00498   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00499   {
00500 #include "asciitab.h"
00501 /* BT_NONXML == 0 */
00502   },
00503   STANDARD_VTABLE(sb_)
00504 };
00505 
00506 #endif
00507 
/* Variant that defines BT_COLON as BT_NMSTRT while asciitab.h is
   included. */
00508 static const struct normal_encoding ascii_encoding = {
00509   { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
00510   {
00511 #define BT_COLON BT_NMSTRT
00512 #include "asciitab.h"
00513 #undef BT_COLON
00514 /* BT_NONXML == 0 */
00515   },
00516   STANDARD_VTABLE(sb_)
00517 };
00518 
00519 static int PTRFASTCALL
00520 unicode_byte_type(char hi, char lo)
00521 {
00522   switch ((unsigned char)hi) {
00523   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
00524     return BT_LEAD4;
00525   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
00526     return BT_TRAIL;
00527   case 0xFF:
00528     switch ((unsigned char)lo) {
00529     case 0xFF:
00530     case 0xFE:
00531       return BT_NONXML;
00532     }
00533     break;
00534   }
00535   return BT_NONASCII;
00536 }
00537 
/* DEFINE_UTF16_TO_UTF8(E) defines the function E##toUtf8, which converts
   UTF-16 code units at *fromP to UTF-8 at *toP.  GET_LO and GET_HI must be
   defined for the desired byte order where the macro is expanded.

   The switch on the high byte picks the output length:
     hi == 0 and lo < 0x80   1 byte
     hi 0x00..0x07 otherwise 2 bytes
     hi 0xD8..0xDB           4 bytes, built from this unit and the next
     anything else           3 bytes
   Before each write the remaining output space is checked; when it is too
   small, *fromP is set to the current unit and the function returns.

   NOTE(review): the loop advances by 2 and tests from != fromLim, so it
   presumably expects an even input length; the surrogate case reads the
   unit at from + 2 without comparing against fromLim -- confirm that
   callers guarantee complete pairs. */
00538 #define DEFINE_UTF16_TO_UTF8(E) \
00539 static void  PTRCALL \
00540 E ## toUtf8(const ENCODING *enc, \
00541             const char **fromP, const char *fromLim, \
00542             char **toP, const char *toLim) \
00543 { \
00544   const char *from; \
00545   for (from = *fromP; from != fromLim; from += 2) { \
00546     int plane; \
00547     unsigned char lo2; \
00548     unsigned char lo = GET_LO(from); \
00549     unsigned char hi = GET_HI(from); \
00550     switch (hi) { \
00551     case 0: \
00552       if (lo < 0x80) { \
00553         if (*toP == toLim) { \
00554           *fromP = from; \
00555           return; \
00556         } \
00557         *(*toP)++ = lo; \
00558         break; \
00559       } \
00560       /* fall through */ \
00561     case 0x1: case 0x2: case 0x3: \
00562     case 0x4: case 0x5: case 0x6: case 0x7: \
00563       if (toLim -  *toP < 2) { \
00564         *fromP = from; \
00565         return; \
00566       } \
00567       *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
00568       *(*toP)++ = ((lo & 0x3f) | 0x80); \
00569       break; \
00570     default: \
00571       if (toLim -  *toP < 3)  { \
00572         *fromP = from; \
00573         return; \
00574       } \
00575       /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
00576       *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
00577       *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
00578       *(*toP)++ = ((lo & 0x3f) | 0x80); \
00579       break; \
00580     case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
00581       if (toLim -  *toP < 4) { \
00582         *fromP = from; \
00583         return; \
00584       } \
00585       plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
00586       *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
00587       *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
00588       from += 2; \
00589       lo2 = GET_LO(from); \
00590       *(*toP)++ = (((lo & 0x3) << 4) \
00591                    | ((GET_HI(from) & 0x3) << 2) \
00592                    | (lo2 >> 6) \
00593                    | 0x80); \
00594       *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
00595       break; \
00596     } \
00597   } \
00598   *fromP = from; \
00599 }
00600 
/* DEFINE_UTF16_TO_UTF16(E) defines the function E##toUtf16, which copies
   UTF-16 code units from *fromP into unsigned short units at *toP,
   assembling each unit from GET_HI and GET_LO (which must be defined for
   the desired byte order where the macro is expanded).

   When the input holds more units than the output has room for and the
   last input unit has a high byte in 0xD8..0xDF, fromLim is first pulled
   back by one unit.  The copy then runs until either limit is reached,
   advancing *fromP and *toP in place.

   NOTE(review): the surrogate test looks at the final input unit
   (fromLim - 2), not at the unit where the output buffer runs out --
   confirm this matches the intent stated in the inline comment. */
00601 #define DEFINE_UTF16_TO_UTF16(E) \
00602 static void  PTRCALL \
00603 E ## toUtf16(const ENCODING *enc, \
00604              const char **fromP, const char *fromLim, \
00605              unsigned short **toP, const unsigned short *toLim) \
00606 { \
00607   /* Avoid copying first half only of surrogate */ \
00608   if (fromLim - *fromP > ((toLim - *toP) << 1) \
00609       && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
00610     fromLim -= 2; \
00611   for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
00612     *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
00613 }
00614 
/* Little-endian byte access: the low byte of a code unit is at offset 0,
   the high byte at offset 1.  With these in place the two DEFINE_ macros
   generate little2_toUtf8 and little2_toUtf16. */
00615 #define SET2(ptr, ch) \
00616   (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
00617 #define GET_LO(ptr) ((unsigned char)(ptr)[0])
00618 #define GET_HI(ptr) ((unsigned char)(ptr)[1])
00619 
00620 DEFINE_UTF16_TO_UTF8(little2_)
00621 DEFINE_UTF16_TO_UTF16(little2_)
00622 
00623 #undef SET2
00624 #undef GET_LO
00625 #undef GET_HI
00626 
/* Big-endian byte access: the high byte of a code unit is at offset 0, the
   low byte at offset 1.  With these in place the two DEFINE_ macros
   generate big2_toUtf8 and big2_toUtf16. */
00627 #define SET2(ptr, ch) \
00628   (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
00629 #define GET_LO(ptr) ((unsigned char)(ptr)[1])
00630 #define GET_HI(ptr) ((unsigned char)(ptr)[0])
00631 
00632 DEFINE_UTF16_TO_UTF8(big2_)
00633 DEFINE_UTF16_TO_UTF16(big2_)
00634 
00635 #undef SET2
00636 #undef GET_LO
00637 #undef GET_HI
00638 
/* Character tests for a little-endian 2-byte unit at p (p[0] is the low
   byte, p[1] the high byte).
     LITTLE2_BYTE_TYPE       type[] lookup on the low byte when the high
                             byte is 0, unicode_byte_type otherwise
     LITTLE2_BYTE_TO_ASCII   the low byte when the high byte is 0, else -1
     LITTLE2_CHAR_MATCHES    true when the unit equals the character c
     LITTLE2_IS_*_MINBPC     bitmap lookup through UCS2_GET_NAMING
   NOTE(review): the last two macros use p without parentheses, so they
   are only safe with a simple identifier argument. */
00639 #define LITTLE2_BYTE_TYPE(enc, p) \
00640  ((p)[1] == 0 \
00641   ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
00642   : unicode_byte_type((p)[1], (p)[0]))
00643 #define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
00644 #define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
00645 #define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
00646   UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
00647 #define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
00648   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
00649 
/* Little-endian UTF-16 tokenizer support.

   XML_MIN_SIZE builds: the LITTLE2_* macros are wrapped in functions (to
   be stored in struct normal_encoding through STANDARD_VTABLE(little2_)),
   and VTABLE is redefined as VTABLE1 plus the little2_ converters.  PREFIX
   is not redefined in this branch.

   Other builds: PREFIX becomes little2_, the helper macros are defined in
   terms of the LITTLE2_* macros with MINBPC fixed at 2, and
   xmltok_impl.c is included again to generate the little2_* functions.
   The multi-byte tests IS_NAME_CHAR / IS_NMSTRT_CHAR are constant 0 here. */
00650 #ifdef XML_MIN_SIZE
00651 
00652 static int PTRFASTCALL
00653 little2_byteType(const ENCODING *enc, const char *p)
00654 {
00655   return LITTLE2_BYTE_TYPE(enc, p);
00656 }
00657 
00658 static int PTRFASTCALL
00659 little2_byteToAscii(const ENCODING *enc, const char *p)
00660 {
00661   return LITTLE2_BYTE_TO_ASCII(enc, p);
00662 }
00663 
00664 static int PTRCALL
00665 little2_charMatches(const ENCODING *enc, const char *p, int c)
00666 {
00667   return LITTLE2_CHAR_MATCHES(enc, p, c);
00668 }
00669 
00670 static int PTRFASTCALL
00671 little2_isNameMin(const ENCODING *enc, const char *p)
00672 {
00673   return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
00674 }
00675 
00676 static int PTRFASTCALL
00677 little2_isNmstrtMin(const ENCODING *enc, const char *p)
00678 {
00679   return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00680 }
00681 
00682 #undef VTABLE
00683 #define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
00684 
00685 #else /* not XML_MIN_SIZE */
00686 
00687 #undef PREFIX
00688 #define PREFIX(ident) little2_ ## ident
00689 #define MINBPC(enc) 2
00690 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
00691 #define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
00692 #define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
00693 #define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
00694 #define IS_NAME_CHAR(enc, p, n) 0
00695 #define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
00696 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00697 #define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00698 
00699 #define XML_TOK_IMPL_C
00700 #include "xmltok_impl.c"
00701 #undef XML_TOK_IMPL_C
00702 
00703 #undef MINBPC
00704 #undef BYTE_TYPE
00705 #undef BYTE_TO_ASCII
00706 #undef CHAR_MATCHES
00707 #undef IS_NAME_CHAR
00708 #undef IS_NAME_CHAR_MINBPC
00709 #undef IS_NMSTRT_CHAR
00710 #undef IS_NMSTRT_CHAR_MINBPC
00711 #undef IS_INVALID_CHAR
00712 
00713 #endif /* not XML_MIN_SIZE */
00714 
/* Little-endian UTF-16 encoding tables.  After VTABLE the ENCODING part
   holds 2, 0 and a flag that is 1 only when BYTEORDER is 1234 (presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against xmltok.h).
   type[] is assembled from asciitab.h and latin1tab.h.  The _ns variant
   exists only when XML_NS is defined. */
00715 #ifdef XML_NS
00716 
00717 static const struct normal_encoding little2_encoding_ns = {
00718   { VTABLE, 2, 0,
00719 #if BYTEORDER == 1234
00720     1
00721 #else
00722     0
00723 #endif
00724   },
00725   {
00726 #include "asciitab.h"
00727 #include "latin1tab.h"
00728   },
00729   STANDARD_VTABLE(little2_)
00730 };
00731 
00732 #endif
00733 
/* Variant that defines BT_COLON as BT_NMSTRT while asciitab.h is
   included. */
00734 static const struct normal_encoding little2_encoding = {
00735   { VTABLE, 2, 0,
00736 #if BYTEORDER == 1234
00737     1
00738 #else
00739     0
00740 #endif
00741   },
00742   {
00743 #define BT_COLON BT_NMSTRT
00744 #include "asciitab.h"
00745 #undef BT_COLON
00746 #include "latin1tab.h"
00747   },
00748   STANDARD_VTABLE(little2_)
00749 };
00750 
/* "Internal" little-endian UTF-16 tables, compiled unless BYTEORDER is
   4321.  They take the first half of type[] from iasciitab.h and always
   set the third integer of the ENCODING part to 1.  The _ns variant
   exists only when XML_NS is defined. */
00751 #if BYTEORDER != 4321
00752 
00753 #ifdef XML_NS
00754 
00755 static const struct normal_encoding internal_little2_encoding_ns = {
00756   { VTABLE, 2, 0, 1 },
00757   {
00758 #include "iasciitab.h"
00759 #include "latin1tab.h"
00760   },
00761   STANDARD_VTABLE(little2_)
00762 };
00763 
00764 #endif
00765 
/* Variant that defines BT_COLON as BT_NMSTRT while iasciitab.h is
   included. */
00766 static const struct normal_encoding internal_little2_encoding = {
00767   { VTABLE, 2, 0, 1 },
00768   {
00769 #define BT_COLON BT_NMSTRT
00770 #include "iasciitab.h"
00771 #undef BT_COLON
00772 #include "latin1tab.h"
00773   },
00774   STANDARD_VTABLE(little2_)
00775 };
00776 
00777 #endif
00778 
00779 
/* Character tests for a big-endian 2-byte unit at p (p[0] is the high
   byte, p[1] the low byte).
     BIG2_BYTE_TYPE       type[] lookup on the low byte when the high byte
                          is 0, unicode_byte_type otherwise
     BIG2_BYTE_TO_ASCII   the low byte when the high byte is 0, else -1
     BIG2_CHAR_MATCHES    true when the unit equals the character c
     BIG2_IS_*_MINBPC     bitmap lookup through UCS2_GET_NAMING
   NOTE(review): the last two macros use p without parentheses, so they
   are only safe with a simple identifier argument. */
00780 #define BIG2_BYTE_TYPE(enc, p) \
00781  ((p)[0] == 0 \
00782   ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
00783   : unicode_byte_type((p)[0], (p)[1]))
00784 #define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
00785 #define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
00786 #define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
00787   UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
00788 #define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
00789   UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
00790 
/* Big-endian UTF-16 tokenizer support.

   XML_MIN_SIZE builds: the BIG2_* macros are wrapped in functions (to be
   stored in struct normal_encoding through STANDARD_VTABLE(big2_)), and
   VTABLE is redefined as VTABLE1 plus the big2_ converters.  PREFIX is not
   redefined in this branch.

   Other builds: PREFIX becomes big2_, the helper macros are defined in
   terms of the BIG2_* macros with MINBPC fixed at 2, and xmltok_impl.c is
   included again to generate the big2_* functions.  The multi-byte tests
   IS_NAME_CHAR / IS_NMSTRT_CHAR are constant 0 here. */
00791 #ifdef XML_MIN_SIZE
00792 
00793 static int PTRFASTCALL
00794 big2_byteType(const ENCODING *enc, const char *p)
00795 {
00796   return BIG2_BYTE_TYPE(enc, p);
00797 }
00798 
00799 static int PTRFASTCALL
00800 big2_byteToAscii(const ENCODING *enc, const char *p)
00801 {
00802   return BIG2_BYTE_TO_ASCII(enc, p);
00803 }
00804 
00805 static int PTRCALL
00806 big2_charMatches(const ENCODING *enc, const char *p, int c)
00807 {
00808   return BIG2_CHAR_MATCHES(enc, p, c);
00809 }
00810 
00811 static int PTRFASTCALL
00812 big2_isNameMin(const ENCODING *enc, const char *p)
00813 {
00814   return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
00815 }
00816 
00817 static int PTRFASTCALL
00818 big2_isNmstrtMin(const ENCODING *enc, const char *p)
00819 {
00820   return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
00821 }
00822 
00823 #undef VTABLE
00824 #define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
00825 
00826 #else /* not XML_MIN_SIZE */
00827 
00828 #undef PREFIX
00829 #define PREFIX(ident) big2_ ## ident
00830 #define MINBPC(enc) 2
00831 /* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
00832 #define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
00833 #define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
00834 #define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
00835 #define IS_NAME_CHAR(enc, p, n) 0
00836 #define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
00837 #define IS_NMSTRT_CHAR(enc, p, n) (0)
00838 #define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
00839 
00840 #define XML_TOK_IMPL_C
00841 #include "xmltok_impl.c"
00842 #undef XML_TOK_IMPL_C
00843 
00844 #undef MINBPC
00845 #undef BYTE_TYPE
00846 #undef BYTE_TO_ASCII
00847 #undef CHAR_MATCHES
00848 #undef IS_NAME_CHAR
00849 #undef IS_NAME_CHAR_MINBPC
00850 #undef IS_NMSTRT_CHAR
00851 #undef IS_NMSTRT_CHAR_MINBPC
00852 #undef IS_INVALID_CHAR
00853 
00854 #endif /* not XML_MIN_SIZE */
00855 
/* Big-endian UTF-16 encoding tables.  After VTABLE the ENCODING part
   holds 2, 0 and a flag that is 1 only when BYTEORDER is 4321 (presumably
   minBytesPerChar, isUtf8 and isUtf16 -- confirm against xmltok.h).
   type[] is assembled from asciitab.h and latin1tab.h.  The _ns variant
   exists only when XML_NS is defined. */
00856 #ifdef XML_NS
00857 
00858 static const struct normal_encoding big2_encoding_ns = {
00859   { VTABLE, 2, 0,
00860 #if BYTEORDER == 4321
00861   1
00862 #else
00863   0
00864 #endif
00865   },
00866   {
00867 #include "asciitab.h"
00868 #include "latin1tab.h"
00869   },
00870   STANDARD_VTABLE(big2_)
00871 };
00872 
00873 #endif
00874 
/* Variant that defines BT_COLON as BT_NMSTRT while asciitab.h is
   included. */
00875 static const struct normal_encoding big2_encoding = {
00876   { VTABLE, 2, 0,
00877 #if BYTEORDER == 4321
00878   1
00879 #else
00880   0
00881 #endif
00882   },
00883   {
00884 #define BT_COLON BT_NMSTRT
00885 #include "asciitab.h"
00886 #undef BT_COLON
00887 #include "latin1tab.h"
00888   },
00889   STANDARD_VTABLE(big2_)
00890 };
00891 
/* "Internal" big-endian UTF-16 tables, compiled unless BYTEORDER is 1234.
   They take the first half of type[] from iasciitab.h and always set the
   third integer of the ENCODING part to 1.  The _ns variant exists only
   when XML_NS is defined. */
00892 #if BYTEORDER != 1234
00893 
00894 #ifdef XML_NS
00895 
00896 static const struct normal_encoding internal_big2_encoding_ns = {
00897   { VTABLE, 2, 0, 1 },
00898   {
00899 #include "iasciitab.h"
00900 #include "latin1tab.h"
00901   },
00902   STANDARD_VTABLE(big2_)
00903 };
00904 
00905 #endif
00906 
/* Variant that defines BT_COLON as BT_NMSTRT while iasciitab.h is
   included. */
00907 static const struct normal_encoding internal_big2_encoding = {
00908   { VTABLE, 2, 0, 1 },
00909   {
00910 #define BT_COLON BT_NMSTRT
00911 #include "iasciitab.h"
00912 #undef BT_COLON
00913 #include "latin1tab.h"
00914   },
00915   STANDARD_VTABLE(big2_)
00916 };
00917 
00918 #endif
00919 
00920 #undef PREFIX
00921 
00922 static int FASTCALL
00923 streqci(const char *s1, const char *s2)
00924 {
00925   for (;;) {
00926     char c1 = *s1++;
00927     char c2 = *s2++;
00928     if (ASCII_a <= c1 && c1 <= ASCII_z)
00929       c1 += ASCII_A - ASCII_a;
00930     if (ASCII_a <= c2 && c2 <= ASCII_z)
00931       c2 += ASCII_A - ASCII_a;
00932     if (c1 != c2)
00933       return 0;
00934     if (!c1)
00935       break;
00936   }
00937   return 1;
00938 }
00939 
00940 static void PTRCALL
00941 initUpdatePosition(const ENCODING *enc, const char *ptr,
00942                    const char *end, POSITION *pos)
00943 {
00944   normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
00945 }
00946 
00947 static int
00948 toAscii(const ENCODING *enc, const char *ptr, const char *end)
00949 {
00950   char buf[1];
00951   char *p = buf;
00952   XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
00953   if (p == buf)
00954     return -1;
00955   else
00956     return buf[0];
00957 }
00958 
00959 static int FASTCALL
00960 isSpace(int c)
00961 {
00962   switch (c) {
00963   case 0x20:
00964   case 0xD:
00965   case 0xA:
00966   case 0x9:
00967     return 1;
00968   }
00969   return 0;
00970 }
00971 
00972 /* Return 1 if there's just optional white space or there's an S
00973    followed by name=val.
00974 */
/* Parse one pseudo-attribute (S name S? '=' S? quoted-value) from
   [ptr, end).  Characters are read with toAscii, and ptr advances by
   enc->minBytesPerChar per character.

   Returns 1 with *namePtr == NULL when the input is empty or holds only
   white space; *nextTokPtr is not written in that case.
   Returns 1 after a full pseudo-attribute, with *namePtr / *nameEndPtr
   bounding the name, *valPtr at the first character after the opening
   quote, and *nextTokPtr just past the closing quote.
   Returns 0 on a syntax error, with *nextTokPtr at the offending
   position.  The value may contain only A-Z, a-z, 0-9, '.', '-', '_'. */
00975 static int
00976 parsePseudoAttribute(const ENCODING *enc,
00977                      const char *ptr,
00978                      const char *end,
00979                      const char **namePtr,
00980                      const char **nameEndPtr,
00981                      const char **valPtr,
00982                      const char **nextTokPtr)
00983 {
00984   int c;
00985   char open;
00986   if (ptr == end) {
00987     *namePtr = NULL;
00988     return 1;
00989   }
  /* A pseudo-attribute must be preceded by white space. */
00990   if (!isSpace(toAscii(enc, ptr, end))) {
00991     *nextTokPtr = ptr;
00992     return 0;
00993   }
00994   do {
00995     ptr += enc->minBytesPerChar;
00996   } while (isSpace(toAscii(enc, ptr, end)));
00997   if (ptr == end) {
00998     *namePtr = NULL;
00999     return 1;
01000   }
  /* Scan the name up to '=' or white space. */
01001   *namePtr = ptr;
01002   for (;;) {
01003     c = toAscii(enc, ptr, end);
01004     if (c == -1) {
01005       *nextTokPtr = ptr;
01006       return 0;
01007     }
01008     if (c == ASCII_EQUALS) {
01009       *nameEndPtr = ptr;
01010       break;
01011     }
01012     if (isSpace(c)) {
01013       *nameEndPtr = ptr;
01014       do {
01015         ptr += enc->minBytesPerChar;
01016       } while (isSpace(c = toAscii(enc, ptr, end)));
01017       if (c != ASCII_EQUALS) {
01018         *nextTokPtr = ptr;
01019         return 0;
01020       }
01021       break;
01022     }
01023     ptr += enc->minBytesPerChar;
01024   }
  /* An empty name ('=' right after the leading white space) is an error. */
01025   if (ptr == *namePtr) {
01026     *nextTokPtr = ptr;
01027     return 0;
01028   }
  /* Step over '=' and any white space, then expect an opening quote. */
01029   ptr += enc->minBytesPerChar;
01030   c = toAscii(enc, ptr, end);
01031   while (isSpace(c)) {
01032     ptr += enc->minBytesPerChar;
01033     c = toAscii(enc, ptr, end);
01034   }
01035   if (c != ASCII_QUOT && c != ASCII_APOS) {
01036     *nextTokPtr = ptr;
01037     return 0;
01038   }
01039   open = (char)c;
01040   ptr += enc->minBytesPerChar;
01041   *valPtr = ptr;
  /* Scan the value up to the matching quote; toAscii yields -1 at end,
     which fails the character test below and ends the loop with 0. */
01042   for (;; ptr += enc->minBytesPerChar) {
01043     c = toAscii(enc, ptr, end);
01044     if (c == open)
01045       break;
01046     if (!(ASCII_a <= c && c <= ASCII_z)
01047         && !(ASCII_A <= c && c <= ASCII_Z)
01048         && !(ASCII_0 <= c && c <= ASCII_9)
01049         && c != ASCII_PERIOD
01050         && c != ASCII_MINUS
01051         && c != ASCII_UNDERSCORE) {
01052       *nextTokPtr = ptr;
01053       return 0;
01054     }
01055   }
01056   *nextTokPtr = ptr + enc->minBytesPerChar;
01057   return 1;
01058 }
01059 
/* Keyword strings spelled with the ASCII_* constants and terminated with
   NUL: the pseudo-attribute names "version", "encoding" and "standalone",
   and the values "yes" and "no". */
01060 static const char KW_version[] = {
01061   ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
01062 };
01063 
01064 static const char KW_encoding[] = {
01065   ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
01066 };
01067 
01068 static const char KW_standalone[] = {
01069   ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
01070   ASCII_n, ASCII_e, '\0'
01071 };
01072 
01073 static const char KW_yes[] = {
01074   ASCII_y, ASCII_e, ASCII_s,  '\0'
01075 };
01076 
01077 static const char KW_no[] = {
01078   ASCII_n, ASCII_o,  '\0'
01079 };
01080 
01081 static int
01082 doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
01083                                                  const char *,
01084                                                  const char *),
01085                int isGeneralTextEntity,
01086                const ENCODING *enc,
01087                const char *ptr,
01088                const char *end,
01089                const char **badPtr,
01090                const char **versionPtr,
01091                const char **versionEndPtr,
01092                const char **encodingName,
01093                const ENCODING **encoding,
01094                int *standalone)
01095 {
01096   const char *val = NULL;
01097   const char *name = NULL;
01098   const char *nameEnd = NULL;
01099   ptr += 5 * enc->minBytesPerChar;
01100   end -= 2 * enc->minBytesPerChar;
01101   if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
01102       || !name) {
01103     *badPtr = ptr;
01104     return 0;
01105   }
01106   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
01107     if (!isGeneralTextEntity) {
01108       *badPtr = name;
01109       return 0;
01110     }
01111   }
01112   else {
01113     if (versionPtr)
01114       *versionPtr = val;
01115     if (versionEndPtr)
01116       *versionEndPtr = ptr;
01117     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
01118       *badPtr = ptr;
01119       return 0;
01120     }
01121     if (!name) {
01122       if (isGeneralTextEntity) {
01123         /* a TextDecl must have an EncodingDecl */
01124         *badPtr = ptr;
01125         return 0;
01126       }
01127       return 1;
01128     }
01129   }
01130   if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
01131     int c = toAscii(enc, val, end);
01132     if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
01133       *badPtr = val;
01134       return 0;
01135     }
01136     if (encodingName)
01137       *encodingName = val;
01138     if (encoding)
01139       *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
01140     if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
01141       *badPtr = ptr;
01142       return 0;
01143     }
01144     if (!name)
01145       return 1;
01146   }
01147   if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
01148       || isGeneralTextEntity) {
01149     *badPtr = name;
01150     return 0;
01151   }
01152   if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
01153     if (standalone)
01154       *standalone = 1;
01155   }
01156   else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
01157     if (standalone)
01158       *standalone = 0;
01159   }
01160   else {
01161     *badPtr = val;
01162     return 0;
01163   }
01164   while (isSpace(toAscii(enc, ptr, end)))
01165     ptr += enc->minBytesPerChar;
01166   if (ptr != end) {
01167     *badPtr = ptr;
01168     return 0;
01169   }
01170   return 1;
01171 }
01172 
01173 static int FASTCALL
01174 checkCharRefNumber(int result)
01175 {
01176   switch (result >> 8) {
01177   case 0xD8: case 0xD9: case 0xDA: case 0xDB:
01178   case 0xDC: case 0xDD: case 0xDE: case 0xDF:
01179     return -1;
01180   case 0:
01181     if (latin1_encoding.type[result] == BT_NONXML)
01182       return -1;
01183     break;
01184   case 0xFF:
01185     if (result == 0xFFFE || result == 0xFFFF)
01186       return -1;
01187     break;
01188   }
01189   return result;
01190 }
01191 
01192 int FASTCALL
01193 XmlUtf8Encode(int c, char *buf)
01194 {
01195   enum {
01196     /* minN is minimum legal resulting value for N byte sequence */
01197     min2 = 0x80,
01198     min3 = 0x800,
01199     min4 = 0x10000
01200   };
01201 
01202   if (c < 0)
01203     return 0;
01204   if (c < min2) {
01205     buf[0] = (char)(c | UTF8_cval1);
01206     return 1;
01207   }
01208   if (c < min3) {
01209     buf[0] = (char)((c >> 6) | UTF8_cval2);
01210     buf[1] = (char)((c & 0x3f) | 0x80);
01211     return 2;
01212   }
01213   if (c < min4) {
01214     buf[0] = (char)((c >> 12) | UTF8_cval3);
01215     buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
01216     buf[2] = (char)((c & 0x3f) | 0x80);
01217     return 3;
01218   }
01219   if (c < 0x110000) {
01220     buf[0] = (char)((c >> 18) | UTF8_cval4);
01221     buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
01222     buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
01223     buf[3] = (char)((c & 0x3f) | 0x80);
01224     return 4;
01225   }
01226   return 0;
01227 }
01228 
01229 int FASTCALL
01230 XmlUtf16Encode(int charNum, unsigned short *buf)
01231 {
01232   if (charNum < 0)
01233     return 0;
01234   if (charNum < 0x10000) {
01235     buf[0] = (unsigned short)charNum;
01236     return 1;
01237   }
01238   if (charNum < 0x110000) {
01239     charNum -= 0x10000;
01240     buf[0] = (unsigned short)((charNum >> 10) + 0xD800);
01241     buf[1] = (unsigned short)((charNum & 0x3FF) + 0xDC00);
01242     return 2;
01243   }
01244   return 0;
01245 }
01246 
01247 struct unknown_encoding {
01248   struct normal_encoding normal;
01249   CONVERTER convert;
01250   void *userData;
01251   unsigned short utf16[256];
01252   char utf8[256][4];
01253 };
01254 
01255 #define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
01256 
01257 int
01258 XmlSizeOfUnknownEncoding(void)
01259 {
01260   return sizeof(struct unknown_encoding);
01261 }
01262 
01263 static int PTRFASTCALL
01264 unknown_isName(const ENCODING *enc, const char *p)
01265 {
01266   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01267   int c = uenc->convert(uenc->userData, p);
01268   if (c & ~0xFFFF)
01269     return 0;
01270   return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
01271 }
01272 
01273 static int PTRFASTCALL
01274 unknown_isNmstrt(const ENCODING *enc, const char *p)
01275 {
01276   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01277   int c = uenc->convert(uenc->userData, p);
01278   if (c & ~0xFFFF)
01279     return 0;
01280   return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
01281 }
01282 
01283 static int PTRFASTCALL
01284 unknown_isInvalid(const ENCODING *enc, const char *p)
01285 {
01286   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01287   int c = uenc->convert(uenc->userData, p);
01288   return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
01289 }
01290 
01291 static void PTRCALL
01292 unknown_toUtf8(const ENCODING *enc,
01293                const char **fromP, const char *fromLim,
01294                char **toP, const char *toLim)
01295 {
01296   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01297   char buf[XML_UTF8_ENCODE_MAX];
01298   for (;;) {
01299     const char *utf8;
01300     int n;
01301     if (*fromP == fromLim)
01302       break;
01303     utf8 = uenc->utf8[(unsigned char)**fromP];
01304     n = *utf8++;
01305     if (n == 0) {
01306       int c = uenc->convert(uenc->userData, *fromP);
01307       n = XmlUtf8Encode(c, buf);
01308       if (n > toLim - *toP)
01309         break;
01310       utf8 = buf;
01311       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
01312                  - (BT_LEAD2 - 2));
01313     }
01314     else {
01315       if (n > toLim - *toP)
01316         break;
01317       (*fromP)++;
01318     }
01319     do {
01320       *(*toP)++ = *utf8++;
01321     } while (--n != 0);
01322   }
01323 }
01324 
01325 static void PTRCALL
01326 unknown_toUtf16(const ENCODING *enc,
01327                 const char **fromP, const char *fromLim,
01328                 unsigned short **toP, const unsigned short *toLim)
01329 {
01330   const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
01331   while (*fromP != fromLim && *toP != toLim) {
01332     unsigned short c = uenc->utf16[(unsigned char)**fromP];
01333     if (c == 0) {
01334       c = (unsigned short)
01335           uenc->convert(uenc->userData, *fromP);
01336       *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
01337                  - (BT_LEAD2 - 2));
01338     }
01339     else
01340       (*fromP)++;
01341     *(*toP)++ = c;
01342   }
01343 }
01344 
01345 ENCODING *
01346 XmlInitUnknownEncoding(void *mem,
01347                        int *table,
01348                        CONVERTER convert, 
01349                        void *userData)
01350 {
01351   int i;
01352   struct unknown_encoding *e = (struct unknown_encoding *)mem;
01353   for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
01354     ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
01355   for (i = 0; i < 128; i++)
01356     if (latin1_encoding.type[i] != BT_OTHER
01357         && latin1_encoding.type[i] != BT_NONXML
01358         && table[i] != i)
01359       return 0;
01360   for (i = 0; i < 256; i++) {
01361     int c = table[i];
01362     if (c == -1) {
01363       e->normal.type[i] = BT_MALFORM;
01364       /* This shouldn't really get used. */
01365       e->utf16[i] = 0xFFFF;
01366       e->utf8[i][0] = 1;
01367       e->utf8[i][1] = 0;
01368     }
01369     else if (c < 0) {
01370       if (c < -4)
01371         return 0;
01372       e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
01373       e->utf8[i][0] = 0;
01374       e->utf16[i] = 0;
01375     }
01376     else if (c < 0x80) {
01377       if (latin1_encoding.type[c] != BT_OTHER
01378           && latin1_encoding.type[c] != BT_NONXML
01379           && c != i)
01380         return 0;
01381       e->normal.type[i] = latin1_encoding.type[c];
01382       e->utf8[i][0] = 1;
01383       e->utf8[i][1] = (char)c;
01384       e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
01385     }
01386     else if (checkCharRefNumber(c) < 0) {
01387       e->normal.type[i] = BT_NONXML;
01388       /* This shouldn't really get used. */
01389       e->utf16[i] = 0xFFFF;
01390       e->utf8[i][0] = 1;
01391       e->utf8[i][1] = 0;
01392     }
01393     else {
01394       if (c > 0xFFFF)
01395         return 0;
01396       if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
01397         e->normal.type[i] = BT_NMSTRT;
01398       else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
01399         e->normal.type[i] = BT_NAME;
01400       else
01401         e->normal.type[i] = BT_OTHER;
01402       e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
01403       e->utf16[i] = (unsigned short)c;
01404     }
01405   }
01406   e->userData = userData;
01407   e->convert = convert;
01408   if (convert) {
01409     e->normal.isName2 = unknown_isName;
01410     e->normal.isName3 = unknown_isName;
01411     e->normal.isName4 = unknown_isName;
01412     e->normal.isNmstrt2 = unknown_isNmstrt;
01413     e->normal.isNmstrt3 = unknown_isNmstrt;
01414     e->normal.isNmstrt4 = unknown_isNmstrt;
01415     e->normal.isInvalid2 = unknown_isInvalid;
01416     e->normal.isInvalid3 = unknown_isInvalid;
01417     e->normal.isInvalid4 = unknown_isInvalid;
01418   }
01419   e->normal.enc.utf8Convert = unknown_toUtf8;
01420   e->normal.enc.utf16Convert = unknown_toUtf16;
01421   return &(e->normal.enc);
01422 }
01423 
/* Indices of the built-in encodings.  If this enumeration is changed,
   getEncodingIndex and encodings must also be changed.
*/
enum {
  UNKNOWN_ENC    = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* must match encodingNames up to here */
  NO_ENC         = 6
};
01437 
01438 static const char KW_ISO_8859_1[] = {
01439   ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
01440   ASCII_MINUS, ASCII_1, '\0'
01441 };
01442 static const char KW_US_ASCII[] = {
01443   ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
01444   '\0'
01445 };
01446 static const char KW_UTF_8[] =  {
01447   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
01448 };
01449 static const char KW_UTF_16[] = {
01450   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
01451 };
01452 static const char KW_UTF_16BE[] = {
01453   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
01454   '\0'
01455 };
01456 static const char KW_UTF_16LE[] = {
01457   ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
01458   '\0'
01459 };
01460 
01461 static int FASTCALL
01462 getEncodingIndex(const char *name)
01463 {
01464   static const char * const encodingNames[] = {
01465     KW_ISO_8859_1,
01466     KW_US_ASCII,
01467     KW_UTF_8,
01468     KW_UTF_16,
01469     KW_UTF_16BE,
01470     KW_UTF_16LE,
01471   };
01472   int i;
01473   if (name == NULL)
01474     return NO_ENC;
01475   for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
01476     if (streqci(name, encodingNames[i]))
01477       return i;
01478   return UNKNOWN_ENC;
01479 }
01480 
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX reads that index back as an int.
   SET_INIT_ENC_INDEX stores index i; the argument is parenthesized so
   that an expression argument is converted as a whole rather than only
   its first operand.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
01487 
01488 /* This is what detects the encoding.  encodingTable maps from
01489    encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
01490    the external (protocol) specified encoding; state is
01491    XML_CONTENT_STATE if we're parsing an external text entity, and
01492    XML_PROLOG_STATE otherwise.
01493 */
01494 
01495 
01496 static int
01497 initScan(const ENCODING * const *encodingTable,
01498          const INIT_ENCODING *enc,
01499          int state,
01500          const char *ptr,
01501          const char *end,
01502          const char **nextTokPtr)
01503 {
01504   const ENCODING **encPtr;
01505 
01506   if (ptr == end)
01507     return XML_TOK_NONE;
01508   encPtr = enc->encPtr;
01509   if (ptr + 1 == end) {
01510     /* only a single byte available for auto-detection */
01511 #ifndef XML_DTD /* FIXME */
01512     /* a well-formed document entity must have more than one byte */
01513     if (state != XML_CONTENT_STATE)
01514       return XML_TOK_PARTIAL;
01515 #endif
01516     /* so we're parsing an external text entity... */
01517     /* if UTF-16 was externally specified, then we need at least 2 bytes */
01518     switch (INIT_ENC_INDEX(enc)) {
01519     case UTF_16_ENC:
01520     case UTF_16LE_ENC:
01521     case UTF_16BE_ENC:
01522       return XML_TOK_PARTIAL;
01523     }
01524     switch ((unsigned char)*ptr) {
01525     case 0xFE:
01526     case 0xFF:
01527     case 0xEF: /* possibly first byte of UTF-8 BOM */
01528       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01529           && state == XML_CONTENT_STATE)
01530         break;
01531       /* fall through */
01532     case 0x00:
01533     case 0x3C:
01534       return XML_TOK_PARTIAL;
01535     }
01536   }
01537   else {
01538     switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
01539     case 0xFEFF:
01540       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01541           && state == XML_CONTENT_STATE)
01542         break;
01543       *nextTokPtr = ptr + 2;
01544       *encPtr = encodingTable[UTF_16BE_ENC];
01545       return XML_TOK_BOM;
01546     /* 00 3C is handled in the default case */
01547     case 0x3C00:
01548       if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
01549            || INIT_ENC_INDEX(enc) == UTF_16_ENC)
01550           && state == XML_CONTENT_STATE)
01551         break;
01552       *encPtr = encodingTable[UTF_16LE_ENC];
01553       return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01554     case 0xFFFE:
01555       if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
01556           && state == XML_CONTENT_STATE)
01557         break;
01558       *nextTokPtr = ptr + 2;
01559       *encPtr = encodingTable[UTF_16LE_ENC];
01560       return XML_TOK_BOM;
01561     case 0xEFBB:
01562       /* Maybe a UTF-8 BOM (EF BB BF) */
01563       /* If there's an explicitly specified (external) encoding
01564          of ISO-8859-1 or some flavour of UTF-16
01565          and this is an external text entity,
01566          don't look for the BOM,
01567          because it might be a legal data.
01568       */
01569       if (state == XML_CONTENT_STATE) {
01570         int e = INIT_ENC_INDEX(enc);
01571         if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
01572             || e == UTF_16LE_ENC || e == UTF_16_ENC)
01573           break;
01574       }
01575       if (ptr + 2 == end)
01576         return XML_TOK_PARTIAL;
01577       if ((unsigned char)ptr[2] == 0xBF) {
01578         *nextTokPtr = ptr + 3;
01579         *encPtr = encodingTable[UTF_8_ENC];
01580         return XML_TOK_BOM;
01581       }
01582       break;
01583     default:
01584       if (ptr[0] == '\0') {
01585         /* 0 isn't a legal data character. Furthermore a document
01586            entity can only start with ASCII characters.  So the only
01587            way this can fail to be big-endian UTF-16 if it it's an
01588            external parsed general entity that's labelled as
01589            UTF-16LE.
01590         */
01591         if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
01592           break;
01593         *encPtr = encodingTable[UTF_16BE_ENC];
01594         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01595       }
01596       else if (ptr[1] == '\0') {
01597         /* We could recover here in the case:
01598             - parsing an external entity
01599             - second byte is 0
01600             - no externally specified encoding
01601             - no encoding declaration
01602            by assuming UTF-16LE.  But we don't, because this would mean when
01603            presented just with a single byte, we couldn't reliably determine
01604            whether we needed further bytes.
01605         */
01606         if (state == XML_CONTENT_STATE)
01607           break;
01608         *encPtr = encodingTable[UTF_16LE_ENC];
01609         return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01610       }
01611       break;
01612     }
01613   }
01614   *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
01615   return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
01616 }
01617 
01618 
01619 #define NS(x) x
01620 #define ns(x) x
01621 #define XML_TOK_NS_C
01622 #include "xmltok_ns.c"
01623 #undef XML_TOK_NS_C
01624 #undef NS
01625 #undef ns
01626 
01627 #ifdef XML_NS
01628 
01629 #define NS(x) x ## NS
01630 #define ns(x) x ## _ns
01631 
01632 #define XML_TOK_NS_C
01633 #include "xmltok_ns.c"
01634 #undef XML_TOK_NS_C
01635 
01636 #undef NS
01637 #undef ns
01638 
01639 ENCODING *
01640 XmlInitUnknownEncodingNS(void *mem,
01641                          int *table,
01642                          CONVERTER convert, 
01643                          void *userData)
01644 {
01645   ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
01646   if (enc)
01647     ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
01648   return enc;
01649 }
01650 
01651 #endif /* XML_NS */

Generated on Mon May 27 17:50:36 2013 for Geant4 by  doxygen 1.4.7