/*************************************************************************** * * charmap.cpp * * $Id: charmap.cpp 650678 2008-04-22 22:24:48Z sebor $ * *************************************************************************** * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. * * Copyright 2001-2008 Rogue Wave Software, Inc. * **************************************************************************/ #include // On Compaq Tru64 UNIX if included after assert.h, the definition of // _XOPEN_SOURCE macro in assert.h selects a different declaration for // iconv than the one used in comp test. #ifndef _MSC_VER # ifndef _RWSTD_NO_ICONV # include # endif # include _RWSTD_CERRNO #else # include #endif // _MSC_VER #include #include #include // for errno #include #include // for LC_CTYPE, setlocale() #include #include #include // for strrchr(), strerror() #include #include #include #include #include #include "aliases.h" #include "scanner.h" #include "charmap.h" #include "loc_exception.h" #include "diagnostic.h" // This value specifies the largest allowed symbolic name length // If necessary this can be increased, but it is very doubtful that // that would ever be necessary #define MAX_SYM_NAME_LEN 256 // this is the maximum size of a single byte of a character in the // charmap file. According to POSIX this cannot be larger then 5 // because all bytes are in the format "\x%x", "\d%x" or "\%o" and // the numeric values cannot be greater then 3 digits long #define MAX_BYTE_LEN 5 #ifndef _RWSTD_NO_ICONV static iconv_t my_iconv_open (const char *to_codeset, const char *from_codeset) { typedef std::vector StrVec; StrVec aliases [2]; const bool to_utf8 = !std::strcmp (to_codeset, "UTF-8"); const bool from_utf8 = !to_utf8; // aliases [to_utf8].push_back (to_codeset); // aliases [from_utf8].push_back (from_codeset); get_cname_aliases (to_codeset, aliases [to_utf8]); get_cname_aliases (from_codeset, aliases [from_utf8]); typedef StrVec::iterator VecIter; std::string tried_names [2]; for (VecIter i = aliases [to_utf8].begin (); i != aliases [to_utf8].end (); ++i) { for (VecIter j = aliases [from_utf8].begin (); j != aliases [from_utf8].end (); ++j) { const char* const to_code = (*i).c_str (); const char* const from_code = (*j).c_str (); const iconv_t ret = iconv_open (to_code, from_code); if (ret != iconv_t (-1)) return ret; if (i == aliases [to_utf8].begin ()) { if (tried_names [from_utf8].size ()) { tried_names [from_utf8] += ','; tried_names [from_utf8] += ' '; } tried_names [from_utf8] += '"'; tried_names [from_utf8] += *j; tried_names [from_utf8] += '"'; } } if (tried_names [to_utf8].size ()) { tried_names [to_utf8] += ','; tried_names [to_utf8] += ' '; } tried_names [to_utf8] += '"'; tried_names [to_utf8] += *i; tried_names [to_utf8] += '"'; } assert (0 != aliases [0].size ()); assert (0 != aliases [1].size ()); issue_diag (W_ICONV, false, 0, "iconv_open(\"%s\", \"%s\") failed; " "tried { %s } and { %s }\n", aliases [to_utf8][0].c_str (), aliases [from_utf8][0].c_str (), tried_names [to_utf8].c_str (), tried_names [from_utf8].c_str ()); return iconv_t (-1); } // open an iconv file descriptor to convert from the codeset to utf8 iconv_t Charmap::open_iconv_to_utf8 () const { if (in_utf8_) return 0; return my_iconv_open ("UTF-8", code_set_name_.c_str ()); } # ifndef _RWSTD_NO_ISO_10646_WCHAR_T iconv_t Charmap::open_iconv_to_ext () { return my_iconv_open (code_set_name_.c_str (), "UTF-8"); } # endif // _RWSTD_NO_ISO_10646_WCHAR_T #endif // _RWSTD_NO_ICONV // utf8_decode translates the UTF-8 encoded character (specified // by the range [from, to) into an object of type wchar_t // algorithm derived from RFC2279 static wchar_t utf8_decode (const char* from, const char* to) { assert (from <= to); const unsigned char* const ch = _RWSTD_REINTERPRET_CAST (const unsigned char*, from); const unsigned char* const ch_end = _RWSTD_REINTERPRET_CAST (const unsigned char*, to); size_t num_bytes; wchar_t ret = 0; // if the first character is below 0x80 then the value of *ch is the // actual value of the character so return that value as a wchar_t if (*ch < 0x80) return wchar_t (*ch); // if *ch is between 0xc2 and 0xe0 there are 2 bytes in the multi-byte // character if (*ch >= 0xc2 && *ch < 0xe0) { ret = (*ch & 0x1f); num_bytes = 2; } // if *ch is between 0xe0 and 0xf0 there are 3 bytes in the multi-byte // character else if (*ch >= 0xe0 && *ch < 0xf0) { ret = *ch & 0x0f; num_bytes = 3; } else if (*ch >= 0xf0 && *ch < 0xf8) { ret = *ch & 0x07; num_bytes = 4; } else if (*ch >= 0xf8 && *ch < 0xfc) { ret = *ch & 0x03; num_bytes = 5; } else if (*ch >= 0xfc && *ch < 0xfe) { ret = *ch & 0x01; num_bytes = 6; } else { issue_diag (E_MBCHAR, true, 0, "illegal multibyte prefix '\\x%02x' in character " "map file\n", *ch); } if (ch_end < ch + num_bytes - 1) { // the input doesn't have enough characters issue_diag (E_MBCHAR, true, 0, "incomplete multibyte character in character " "map file: expecting %u bytes, found %u\n", num_bytes, ch_end - ch); } // for each byte in the character extract the useful data by shifting // and bit or it into the wchar_t for (size_t i = 1; i < num_bytes; ++i) ret = (ret << 6) | (ch [i] & 0x3f); return ret; } // count the number of bytes in a multibyte sequence denoted // by the argument by counting the number of escape characters std::size_t Charmap::mbcharlen (const std::string &str) const { std::size_t count = 1; const char escape = scanner_.escape_char (); for (std::size_t idx = 0; ; ++idx, ++count) { idx = str.find (escape, idx); if (std::string::npos == idx) break; } return count; } /**************************************************************************/ const char* const Charmap:: portable_charset[] = { /* 0x00 */ "", /* 0x01 SOH */ 0, /* 0x02 STX */ 0, /* 0x03 ETX */ 0, /* 0x04 EOT */ 0, /* 0x05 ENQ */ 0, /* 0x06 ACK */ 0, /* 0x07 BEL */ "", /* 0x08 */ "", /* 0x09 TAB */ "", /* 0x0a */ "", /* 0x0b */ "", /* 0x0c */ "", /* 0x0d */ "", /* 0x0e SO */ 0, /* 0x0f SI */ 0, /* 0x10 DLE */ 0, /* 0x11 DC1 */ 0, /* 0x12 DC2 */ 0, /* 0x13 DC3 */ 0, /* 0x14 DC4 */ 0, /* 0x15 NAK */ 0, /* 0x16 SYN */ 0, /* 0x17 ETB */ 0, /* 0x18 CAN */ 0, /* 0x19 EM */ 0, /* 0x1a SUB */ 0, /* 0x1b ESC */ 0, /* 0x1c IS4 */ 0, /* 0x1d IS3 */ 0, /* 0x1e IS2 */ 0, /* 0x1f IS1 */ 0, /* 0x20 SPC */ "", /* 0x21 ! */ "", /* 0x22 ' */ "", /* 0x23 # */ "", /* 0x24 $ */ "", /* 0x25 % */ "", /* 0x26 & */ "", /* 0x27 ' */ "", /* 0x28 ( */ "", /* 0x29 ) */ "", /* 0x2a * */ "", /* 0x2b + */ "", /* 0x2c , */ "", /* 0x2d - */ "", // "", /* 0x2e . */ "", // "", /* 0x2f / */ "", // "", /* 0x30 0 */ "", /* 0x31 1 */ "", /* 0x32 2 */ "", /* 0x33 3 */ "", /* 0x34 4 */ "", /* 0x35 5 */ "", /* 0x36 6 */ "", /* 0x37 7 */ "", /* 0x38 8 */ "", /* 0x39 9 */ "", /* 0x3a : */ "", /* 0x3b ; */ "", /* 0x3c < */ "", /* 0x3d = */ "", /* 0x3e > */ "", /* 0x3f ? */ "", /* 0x40 @ */ "", /* 0x41 A */ "", /* 0x42 B */ "", /* 0x43 C */ "", /* 0x44 D */ "", /* 0x45 E */ "", /* 0x46 F */ "", /* 0x47 G */ "", /* 0x48 H */ "", /* 0x49 I */ "", /* 0x4a J */ "", /* 0x4b K */ "", /* 0x4c L */ "", /* 0x4d M */ "", /* 0x4e N */ "", /* 0x4f O */ "", /* 0x50 P */ "

", /* 0x51 Q */ "", /* 0x52 R */ "", /* 0x53 S */ "", /* 0x54 T */ "", /* 0x55 U */ "", /* 0x56 V */ "", /* 0x57 W */ "", /* 0x58 X */ "", /* 0x59 Y */ "", /* 0x5a Z */ "", /* 0x5b [ */ "", /* 0x5c \ */ "", // "", /* 0x5d ] */ "", /* 0x5e ^ */ "", // "", /* 0x5f _ */ "", // "", /* 0x60 ` */ "", /* 0x61 a */ "", /* 0x62 b */ "", /* 0x63 c */ "", /* 0x64 d */ "", /* 0x65 e */ "", /* 0x66 f */ "", /* 0x67 g */ "", /* 0x68 h */ "", /* 0x69 i */ "", /* 0x6a j */ "", /* 0x6b k */ "", /* 0x6c l */ "", /* 0x6d m */ "", /* 0x6e n */ "", /* 0x6f o */ "", /* 0x70 p */ "

", /* 0x71 q */ "", /* 0x72 r */ "", /* 0x73 s */ "", /* 0x74 t */ "", /* 0x75 u */ "", /* 0x76 v */ "", /* 0x77 w */ "", /* 0x78 x */ "", /* 0x79 y */ "", /* 0x7a z */ "", /* 0x7b { */ "", // "", /* 0x7c | */ "", /* 0x7d } */ "", // "", /* 0x7e ~ */ "", /* 0x7f */ 0 }; // convert a string of narrow character into a wchar_t bool Charmap::convert_to_wc (const std::string& sym_name, const std::string& ext_enc, wchar_t& wc) { #ifndef _RWSTD_NO_ISO_10646_WCHAR_T // the internal wchar_t representation for all characters // in all locales is always ISO-10646 (UCS) on this system return convert_to_ucs (sym_name, ext_enc, wc); #else // if defined _RWSTD_NO_ISO_10646_WCHAR_T if (UCS4_internal_ || Clocale_.empty ()) { // when using UCS as the internal encoding or for a locale // that has no corresponding C library locale convert the // character to ISO-10646 (UCS) return convert_to_ucs (sym_name, ext_enc, wc); } // otherwise use libc to convert the multi-byte character // to its wchar_t value if (-1 == std::mbtowc (&wc, ext_enc.c_str (), ext_enc.size ())) { const char* const locname = std::setlocale (LC_CTYPE, 0); const char* const errtext = std::strerror (errno); // diagnose the failure to convert the character as just // a warning and (try to) convert it to ISO-10646 (UCS) issue_diag (W_CALL, true, &next, "mbtowc failed to convert character in locale " "\"%s\": %s\n", locname, errtext); return convert_to_ucs (sym_name, ext_enc, wc); } return true; #endif // _RWSTD_NO_ISO_10646_WCHAR_T } char* Charmap::convert_to_utf8 (const char *inbuf, size_t inbuf_s, char *outbuf, size_t outbuf_s) const { #ifndef _RWSTD_NO_ICONV if (ic_to_utf8_ == iconv_t (-1)) return 0; char* outbufp = outbuf; # ifndef _RWSTD_NO_ICONV_CONST_CHAR const char* inbufp = inbuf; # else char* inbufp = _RWSTD_CONST_CAST(char*, inbuf); # endif // _RWSTD_NO_ICONV_CONST_CHAR if (std::size_t (-1) == iconv (ic_to_utf8_, &inbufp, &inbuf_s, &outbufp, &outbuf_s)) { const char* const errtext = std::strerror (errno); issue_diag (W_ICONV, false, &next, "iconv failed to convert \"%s\" " "to UTF-8: %s\n", inbuf, errtext); return 0; } return outbufp; #else // if defined (_RWSTD_NO_ICONV) return 0; #endif // _RWSTD_NO_ICONV } std::string Charmap::get_charmap_name () const { const std::string::size_type idx = charmap_name_.rfind (_RWSTD_PATH_SEP); if (idx != std::string::npos) return charmap_name_.substr (idx + 1); return charmap_name_; } wchar_t Charmap::increment_wchar (wchar_t val) const { #ifndef _RWSTD_NO_ISO_10646_WCHAR_T // to increment a wchar_t value and keep the encoding all we have // to do is increment the val because the internal encoding is UCS return val + 1; #else // to increment a wchar_t value and keep the encoding we have to // convert the wchar_t to the external encoding, increment that // string value, and convert back to the internal representation const rmb_cmap_iter it = rmb_cmap_.find (val); if (it != rmb_cmap_.end ()) { mb_cmap_iter ret; // multibyte character corresponding to the wchar_t value std::string encoding = it->second; // continue incrementing the multi-byte value until we get a valid // character. NOTE: this must be done for encodings such as SJIS where // \x7f in the last byte of a multibyte string is not a valid character // NOTE: this will not detect errors in the sequence, since the program // will continue until it finds a valid character do { int last_elm = encoding.size () - 1; while (last_elm >= 0) { typedef unsigned char UChar; const unsigned ic = UChar (encoding [last_elm]) + 1; // if incrementing the last element caused it to exceed // UCHAR_MAX increment the next higher byte if there is // one if (UCHAR_MAX < ic) encoding [last_elm--] = '\0'; else { encoding [last_elm] = char (ic); break; } } if (last_elm < 0) return -1; // error } while ((ret = mb_cmap_.find (encoding)) == mb_cmap_.end ()); return ret->second; } return -1; // error #endif // _RWSTD_NO_ISO_10646_WCHAR_T } bool Charmap:: increment_encoding (std::string &encoding) { // find the last escape character in the human readable representation // of the encoding (i.e., in the multibyte character such as "/xf0/x80") const std::string::size_type pos = encoding.rfind (scanner_.escape_char ()); // the escape character must be there (guaranteed by the scanner) assert (pos < encoding.size ()); const char* end = 0; // convert the last character in the multibyte character to a numeric // value representing the last byte of the sequence unsigned long last_byte = scanner_.convert_escape (encoding.c_str () + pos, &end); // POSIX requires that the incremented value be non-NUL if (UCHAR_MAX <= last_byte || *end) return false; // increment the last byte ++last_byte; // format the last byte in the same notation (octal, decimal, // or hexadecimal escape sequence) static const char xdigits[] = "0123456789ABCDEF"; char byte_str [5]; char *pdig = byte_str; switch (encoding [pos + 1]) { case 'd': { // decimal escape const unsigned hundreds = last_byte / 100; const unsigned tens = (last_byte - hundreds) / 10; const unsigned units = last_byte % 10; *pdig++ = 'd'; if (hundreds) *pdig++ = xdigits [hundreds]; *pdig++ = xdigits [tens]; *pdig++ = xdigits [units]; *pdig = '\0'; break; } case 'x': { // hex escape const unsigned hi = last_byte >> 4; const unsigned lo = last_byte & 0xfU; *pdig++ = 'x'; *pdig++ = xdigits [hi]; *pdig++ = xdigits [lo]; *pdig = '\0'; break; } default: { // octal escape const unsigned hi = last_byte >> 6; const unsigned mid = (last_byte >> 3) & 07U; const unsigned lo = last_byte & 07U; if (hi) *pdig++ = xdigits [hi]; *pdig++ = xdigits [mid]; *pdig++ = xdigits [lo]; *pdig = '\0'; } } // switch // replace the last escape sequence with the new one encoding.replace (pos + 1, std::string::npos, byte_str); return true; } std::string Charmap:: encoding_to_mbchar (const std::string &encoding) const { std::string mbchar; for (const char *pbyte = encoding.c_str (); pbyte && *pbyte; ) mbchar += char (scanner_.convert_escape (pbyte, &pbyte)); return mbchar; } // convert the locale's encoded character to UCS4 wchar_t wchar_t Charmap:: convert_sym_to_ucs (const std::string &sym) const { std::string::const_iterator it (sym.begin ()); if ( sym.size () < 4 || *it != '<' || *++it != 'U' || !(std::isxdigit)(*++it)) { issue_diag (E_UCS, true, 0, "Unable to convert symbolic name %s to UCS.\n", sym.c_str ()); } const unsigned long val = std::strtoul (&*++it, (char**)0, 16); if (_RWSTD_WCHAR_MAX <= val) issue_diag (E_UCS, true, 0, "UCS value %lu of symbolic character %s out of range.\n", val, sym.c_str ()); return wchar_t (val); } // convert the locale's encoded character to UCS4/UCS2 wchar_t bool Charmap::convert_to_ucs (const std::string &sym_name, const std::string &encoding, wchar_t& wc) { #ifndef _MSC_VER if (in_utf8_) { wc = utf8_decode (encoding.c_str (), &*(encoding.end () - 1)); return true; } // allocate enough space for the longest possible UTF-8 character char utf8_enc [8 + 1 /* NUL */]; const char* const ch_end = convert_to_utf8 (encoding.c_str (), encoding.size (), utf8_enc, sizeof utf8_enc); if (ch_end) // only if conversion to utf8 succeeded wc = utf8_decode (utf8_enc, ch_end); else // if not, try to convert the symbolic name directly wc = convert_sym_to_ucs (sym_name); return true; #else if (0 != codepage_) { wchar_t ret[2] = {0}; MultiByteToWideChar (codepage_, 0, encoding.c_str(), -1, ret, 2); if (ret[1] != 0) return false; wc = ret[0]; return true; } else { wc = convert_sym_to_ucs (sym_name); return true; } return false; #endif // _MSC_VER } void Charmap::add_to_cmaps (const std::string &sym_name, const std::string &encoding, bool is_mbchar /* = false */) { // compute the external (multibyte) encoding of the character // if necessary (i.e., unless already done by the caller) const std::string mbchar = is_mbchar ? encoding : encoding_to_mbchar (encoding); symnames_list_.push_back (sym_name); if (1 == mbchar.size ()) { // strval is a single-byte character const unsigned char ch = mbchar [0]; // add the wide character and its symbolic name to the narrow // character maps if (forward_maps) { // the locale utility doesn't need reverse maps n_cmap_.insert (std::make_pair (sym_name, ch)); } if (reverse_maps) rn_cmap_.insert (std::make_pair (ch, sym_name)); if (ch > largest_nchar_) largest_nchar_ = ch; } // (try to) compute the wide character value of the character wchar_t wch; if (convert_to_wc (sym_name, mbchar, wch)) { // add the wide character and its symbolic name to the wide // character maps if (forward_maps) { // the locale utility doesn't need forward maps w_cmap_.insert (std::make_pair (sym_name, wch)); } if (reverse_maps) rw_cmap_.insert (std::make_pair (wch, sym_name)); // add the corresponding multibyte character to the multibyte // character maps mb_cmap_.insert (std::make_pair (mbchar, wch)); rmb_cmap_.insert (std::make_pair (wch, mbchar)); } // compute the UCS value of the character wchar_t uch; if (convert_to_ucs (sym_name, mbchar, uch)) { // add UCS character and its symbolic name to the UCS // character maps ucs4_cmap_.insert (std::make_pair (sym_name, uch)); rucs4_cmap_.insert (std::make_pair (uch, sym_name)); } } // process the characters implicitly defined by using ellipsis between // two explicitly defined characters std::size_t Charmap:: process_ellipsis (const Scanner::token_t &beg_tok, int num_ellipsis) { // get the upper end of the range denoted by the ellipsis const Scanner::token_t end_tok = scanner_.next_token (); // get the human readabale encoding of the character // denoted by the lower end of the ellipsis const std::string encoding = scanner_.next_token ().name; // convert the encoding to a multibyte character std::string mbchar = encoding_to_mbchar (encoding); // add the beg_tok symbol name to the maps add_to_cmaps (beg_tok.name, mbchar, true); // extract the numeric portion of the symbolic character name // denoted by the lower end of the ellipsis std::size_t idx = 0; int base; // numeric base const char *fmat; // sprintf() format specifier const std::size_t beg_len = beg_tok.name.size (); // determine the value of the beginning of the range // denoted by the ellipsis if (2 == num_ellipsis) { base = 16; fmat = "%.*s%0*lX>"; // advance to the first hex digit while (idx < beg_len && !(std::isxdigit)(beg_tok.name [idx])) ++idx; } else { base = 10; fmat = "%.*s%0*ld>"; // advance to the first decimal digit while (idx < beg_len && !(std::isdigit)(beg_tok.name [idx])) ++idx; } // length of non-numeric prefix of the symbolic character name const std::size_t pfx_len = idx; // get the character value plus one (since the first value // has already been added to the map earlier) char *num_end; const unsigned long beg_val = 1 + std::strtoul (beg_tok.name.c_str () + pfx_len, &num_end, base); // the length of the numeric portion const std::size_t num_size = num_end - (beg_tok.name.c_str () + pfx_len); // find the end of the range denoted by the ellipsis idx = 0; const std::size_t end_len = end_tok.name.size (); if (2 == num_ellipsis) { // advance to the next hex digit while (idx < end_len && !(std::isxdigit)(end_tok.name [idx])) ++idx; } else { // advance to the next dec digit while (idx < end_len && !(std::isdigit)(end_tok.name [idx])) ++idx; } const unsigned long end_val = std::strtoul (end_tok.name.c_str () + idx, (char**)0, base); // the ending numeric value must be greater than or equal // to the beginning numeric value if (end_val < beg_val) issue_diag (E_RANGE, true, &end_tok, "invalid range found in character map file\n"); char next_name [MAX_SYM_NAME_LEN]; std::size_t nchars = 0; const char* const pfx = beg_tok.name.c_str (); for (unsigned long val = beg_val; val <= end_val; ++val, ++nchars) { std::sprintf (next_name, fmat, pfx_len, pfx, num_size, val); // increment the last byte of the multibyte character // and if the result is valid (i.e., doesn't contain // an embedded NUL) add the generated name and the // multibyte character to the maps const unsigned char last_byte = mbchar [mbchar.size () - 1]; if (last_byte < UCHAR_MAX) { mbchar [mbchar.size () - 1] = last_byte + 1; add_to_cmaps (next_name, mbchar, true); } else { // an ellipsis must not specify a range that includes // an encoding with an embedded NUL issue_diag (E_RANGE, true, &beg_tok, "encoding of an element in range contains NUL\n"); } } // return the number of characters denoted by the ellipsis return nchars; } // process all the characters in the character map file. void Charmap::process_chars() { issue_diag (I_STAGE, false, 0, "processing CHARMAP section\n"); std::size_t ntokens = 0; std::size_t nellips = 0; std::size_t nchars = 0; next = scanner_.next_token(); Scanner::token_t nextnext; // loop until we find the closing charmap token for ( ; next.token != Scanner::tok_charmap; ++ntokens) { switch (next.token) { case Scanner::tok_nl: case Scanner::tok_end: break; case Scanner::tok_sym_name: // the next token may be either ellipsis if this line // of the charmap is in the form: // "%s...%s %s\n", , , // or an encoding if this line is in the format: // "%s %s\n", , nextnext = scanner_.next_token (); ntokens += 3; switch (nextnext.token) { case Scanner::tok_abs_ellipsis: // absolute ellipsis (see ISO/IEC TR 14652) nchars += process_ellipsis (next, 3); ++nellips; break; case Scanner::tok_hex_ellipsis: // hexadecimal symbolic ellipsis (see ISO/IEC TR 14652) nchars += process_ellipsis (next, 2); ++nellips; break; case Scanner::tok_char_value: // character represented as a numeric constant add_to_cmaps (next.name, nextnext.name); ++nchars; break; default: issue_diag (E_SYNTAX, true, &next, "byte value expected following symbolic " "name in character map file\n"); } scanner_.ignore_line (); break; default: issue_diag (E_SYNTAX, true, &next, "symbolic name expected in character map file\n"); break; } next = scanner_.next_token(); } issue_diag (I_STAGE, false, 0, "done processing CHARMAP section (%lu tokens, " "%lu ellipses, %lu characters)\n", ntokens, nellips, nchars); // make sure that all characters in the portable character set // are in the charmap if (forward_maps) verify_portable_charset(); } void Charmap::verify_portable_charset () const { const std::size_t nchars = sizeof portable_charset / sizeof *portable_charset; for (std::size_t i = 0; i < nchars; ++i) { if (0 == portable_charset [i]) continue; if (n_cmap_.find (portable_charset [i]) == n_cmap_.end ()) issue_diag (W_NOPCS, false, 0, "member of portable character set %s not found " "in the character map\n", portable_charset [i]); } } Charmap::Charmap(const char* Clocale, const char* fname, bool in_utf8, bool create_forward_maps, bool create_reverse_maps, bool use_UCS4) : mb_cur_max_(1), charmap_name_ (fname), Clocale_ (Clocale), largest_nchar_(0), in_utf8_(in_utf8), forward_maps (create_forward_maps), reverse_maps (create_reverse_maps), UCS4_internal_ (use_UCS4) { #ifndef _RWSTD_NO_ICONV ic_to_utf8_ = 0; ic_to_ext_ = 0; #endif // _RWSTD_NO_ICONV scanner_.open (fname, '#', '\\'); // set code_set_name to the name of the character set description // file by default, in case it's not explicitly specified const char* const slash = std::strrchr (fname, _RWSTD_PATH_SEP); code_set_name_ = slash ? slash + 1 : fname; // loop until we reach the end of the file while ((next = scanner_.next_token()).token != Scanner::tok_end_tokens) { switch (next.token) { case Scanner::tok_code_set_name: next = scanner_.next_token (); if (next.token == Scanner::tok_string) { code_set_name_ = next.name.substr (1, next.name.size () - 2); } else if (next.token == Scanner::tok_ndef) { code_set_name_ = next.name; } else issue_diag (E_SYNTAX, true, &next, "string expected following \n"); // we always need a iconv to utf8 so that we can create // the utf8_charmap unless we are on windows #ifndef _RWSTD_NO_ICONV if (!in_utf8_) { ic_to_utf8_ = open_iconv_to_utf8 (); # if !defined (_RWSTD_NO_ISO_10646_WCHAR_T) ic_to_ext_ = open_iconv_to_ext (); # endif // _RWSTD_NO_ISO_10646_WCHAR_T } #else // if defined (_RWSTD_NO_ICONV) # ifdef _MSC_VER codepage_ = get_codepage (code_set_name_); if (codepage_ == 0) { issue_diag (W_ICONV, false, 0, "iconv_open (%s to UTF-8) failed\n", code_set_name_.c_str()); } # endif // _MSC_VER #endif // _RWSTD_NO_ICONV scanner_.ignore_line (); break; case Scanner::tok_mb_cur_max: mb_cur_max_ = std::atoi (scanner_.next_token ().name.c_str ()); scanner_.ignore_line (); break; case Scanner::tok_mb_cur_min: scanner_.ignore_line (); break; case Scanner::tok_charmap: scanner_.ignore_line (); process_chars(); break; case Scanner::tok_width: // ignore the width section of the character map while ((next = scanner_.next_token ()).token != Scanner::tok_width); break; case Scanner::tok_nl: break; default: issue_diag (E_SYNTAX, false, &next, "unknown token %s in character map file\n", next.name.c_str ()); } } }