1084 lines
32 KiB
C++
1084 lines
32 KiB
C++
/***************************************************************************
|
|
*
|
|
* charmap.cpp
|
|
*
|
|
* $Id: charmap.cpp 650678 2008-04-22 22:24:48Z sebor $
|
|
*
|
|
***************************************************************************
|
|
*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
* with this work for additional information regarding copyright
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
* implied. See the License for the specific language governing
|
|
* permissions and limitations under the License.
|
|
*
|
|
* Copyright 2001-2008 Rogue Wave Software, Inc.
|
|
*
|
|
**************************************************************************/
|
|
|
|
#include <rw/_defs.h>
|
|
|
|
// On Compaq Tru64 UNIX if included after assert.h, the definition of
|
|
// _XOPEN_SOURCE macro in assert.h selects a different declaration for
|
|
// iconv than the one used in comp test.
|
|
#ifndef _MSC_VER
|
|
# ifndef _RWSTD_NO_ICONV
|
|
# include <iconv.h>
|
|
# endif
|
|
# include _RWSTD_CERRNO
|
|
#else
|
|
# include <windows.h>
|
|
#endif // _MSC_VER
|
|
|
|
#include <cassert>
|
|
#include <cctype>
|
|
#include <cerrno> // for errno
|
|
#include <climits>
|
|
#include <clocale> // for LC_CTYPE, setlocale()
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring> // for strrchr(), strerror()
|
|
|
|
#include <map>
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
#include <iostream>
|
|
#include <fstream>
|
|
|
|
#include "aliases.h"
|
|
#include "scanner.h"
|
|
#include "charmap.h"
|
|
#include "loc_exception.h"
|
|
#include "diagnostic.h"
|
|
|
|
// This value specifies the largest allowed symbolic name length
|
|
// If necessary this can be increased, but it is very doubtful that
|
|
// that would ever be necessary
|
|
#define MAX_SYM_NAME_LEN 256
|
|
|
|
// this is the maximum size of a single byte of a character in the
|
|
// charmap file. According to POSIX this cannot be larger than 5
|
|
// because all bytes are in the format "\x%x", "\d%x" or "\%o" and
|
|
// the numeric values cannot be greater than 3 digits long
|
|
#define MAX_BYTE_LEN 5
|
|
|
|
#ifndef _RWSTD_NO_ICONV
|
|
|
|
static iconv_t
|
|
my_iconv_open (const char *to_codeset, const char *from_codeset)
|
|
{
|
|
typedef std::vector<std::string> StrVec;
|
|
|
|
StrVec aliases [2];
|
|
|
|
const bool to_utf8 = !std::strcmp (to_codeset, "UTF-8");
|
|
const bool from_utf8 = !to_utf8;
|
|
|
|
// aliases [to_utf8].push_back (to_codeset);
|
|
// aliases [from_utf8].push_back (from_codeset);
|
|
|
|
get_cname_aliases (to_codeset, aliases [to_utf8]);
|
|
get_cname_aliases (from_codeset, aliases [from_utf8]);
|
|
|
|
typedef StrVec::iterator VecIter;
|
|
|
|
std::string tried_names [2];
|
|
|
|
for (VecIter i = aliases [to_utf8].begin (); i != aliases [to_utf8].end ();
|
|
++i) {
|
|
|
|
for (VecIter j = aliases [from_utf8].begin ();
|
|
j != aliases [from_utf8].end (); ++j) {
|
|
|
|
const char* const to_code = (*i).c_str ();
|
|
const char* const from_code = (*j).c_str ();
|
|
|
|
const iconv_t ret = iconv_open (to_code, from_code);
|
|
|
|
if (ret != iconv_t (-1))
|
|
return ret;
|
|
|
|
if (i == aliases [to_utf8].begin ()) {
|
|
|
|
if (tried_names [from_utf8].size ()) {
|
|
tried_names [from_utf8] += ',';
|
|
tried_names [from_utf8] += ' ';
|
|
}
|
|
|
|
tried_names [from_utf8] += '"';
|
|
tried_names [from_utf8] += *j;
|
|
tried_names [from_utf8] += '"';
|
|
}
|
|
}
|
|
|
|
if (tried_names [to_utf8].size ()) {
|
|
tried_names [to_utf8] += ',';
|
|
tried_names [to_utf8] += ' ';
|
|
}
|
|
|
|
tried_names [to_utf8] += '"';
|
|
tried_names [to_utf8] += *i;
|
|
tried_names [to_utf8] += '"';
|
|
}
|
|
|
|
assert (0 != aliases [0].size ());
|
|
assert (0 != aliases [1].size ());
|
|
|
|
issue_diag (W_ICONV, false, 0,
|
|
"iconv_open(\"%s\", \"%s\") failed; "
|
|
"tried { %s } and { %s }\n",
|
|
aliases [to_utf8][0].c_str (),
|
|
aliases [from_utf8][0].c_str (),
|
|
tried_names [to_utf8].c_str (),
|
|
tried_names [from_utf8].c_str ());
|
|
|
|
return iconv_t (-1);
|
|
}
|
|
|
|
// open an iconv file descriptor to convert from the codeset to utf8
|
|
iconv_t Charmap::open_iconv_to_utf8 () const
|
|
{
|
|
if (in_utf8_)
|
|
return 0;
|
|
|
|
return my_iconv_open ("UTF-8", code_set_name_.c_str ());
|
|
}
|
|
|
|
# ifndef _RWSTD_NO_ISO_10646_WCHAR_T
|
|
|
|
iconv_t Charmap::open_iconv_to_ext ()
|
|
{
|
|
return my_iconv_open (code_set_name_.c_str (), "UTF-8");
|
|
}
|
|
|
|
# endif // _RWSTD_NO_ISO_10646_WCHAR_T
|
|
#endif // _RWSTD_NO_ICONV
|
|
|
|
|
|
// utf8_decode translates the UTF-8 encoded character (specified
|
|
// by the range [from, to) into an object of type wchar_t
|
|
// algorithm derived from RFC2279
|
|
static wchar_t utf8_decode (const char* from, const char* to)
|
|
{
|
|
assert (from <= to);
|
|
|
|
const unsigned char* const ch =
|
|
_RWSTD_REINTERPRET_CAST (const unsigned char*, from);
|
|
|
|
const unsigned char* const ch_end =
|
|
_RWSTD_REINTERPRET_CAST (const unsigned char*, to);
|
|
|
|
size_t num_bytes;
|
|
|
|
wchar_t ret = 0;
|
|
|
|
// if the first character is below 0x80 then the value of *ch is the
|
|
// actual value of the character so return that value as a wchar_t
|
|
if (*ch < 0x80)
|
|
return wchar_t (*ch);
|
|
|
|
// if *ch is between 0xc2 and 0xe0 there are 2 bytes in the multi-byte
|
|
// character
|
|
if (*ch >= 0xc2 && *ch < 0xe0) {
|
|
ret = (*ch & 0x1f);
|
|
num_bytes = 2;
|
|
}
|
|
|
|
// if *ch is between 0xe0 and 0xf0 there are 3 bytes in the multi-byte
|
|
// character
|
|
else if (*ch >= 0xe0 && *ch < 0xf0) {
|
|
ret = *ch & 0x0f;
|
|
num_bytes = 3;
|
|
}
|
|
else if (*ch >= 0xf0 && *ch < 0xf8) {
|
|
ret = *ch & 0x07;
|
|
num_bytes = 4;
|
|
}
|
|
else if (*ch >= 0xf8 && *ch < 0xfc) {
|
|
ret = *ch & 0x03;
|
|
num_bytes = 5;
|
|
}
|
|
else if (*ch >= 0xfc && *ch < 0xfe) {
|
|
ret = *ch & 0x01;
|
|
num_bytes = 6;
|
|
}
|
|
else {
|
|
issue_diag (E_MBCHAR, true, 0,
|
|
"illegal multibyte prefix '\\x%02x' in character "
|
|
"map file\n", *ch);
|
|
}
|
|
|
|
if (ch_end < ch + num_bytes - 1) {
|
|
// the input doesn't have enough characters
|
|
issue_diag (E_MBCHAR, true, 0,
|
|
"incomplete multibyte character in character "
|
|
"map file: expecting %u bytes, found %u\n",
|
|
num_bytes, ch_end - ch);
|
|
}
|
|
|
|
// for each byte in the character extract the useful data by shifting
|
|
// and bit or it into the wchar_t
|
|
for (size_t i = 1; i < num_bytes; ++i)
|
|
ret = (ret << 6) | (ch [i] & 0x3f);
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
// count the number of bytes in a multibyte sequence denoted
|
|
// by the argument by counting the number of escape characters
|
|
std::size_t Charmap::mbcharlen (const std::string &str) const
|
|
{
|
|
std::size_t count = 1;
|
|
|
|
const char escape = scanner_.escape_char ();
|
|
|
|
for (std::size_t idx = 0; ; ++idx, ++count) {
|
|
idx = str.find (escape, idx);
|
|
|
|
if (std::string::npos == idx)
|
|
break;
|
|
}
|
|
|
|
return count;
|
|
}
|
|
|
|
|
|
/**************************************************************************/
|
|
|
|
// symbolic names of the members of the portable character set,
// indexed by character code (0x00 through 0x7f); codes that have
// no entry in the table are represented by null pointers
const char* const Charmap::
portable_charset[] = {
    // 0x00 - 0x0f: control characters
    /* 0x00 NUL */ "<NUL>",
    /* 0x01 SOH .. 0x06 ACK */ 0, 0, 0, 0, 0, 0,
    /* 0x07 BEL */ "<alert>",
    /* 0x08 BS  */ "<backspace>",
    /* 0x09 TAB */ "<tab>",
    /* 0x0a LF  */ "<newline>",
    /* 0x0b VT  */ "<vertical-tab>",
    /* 0x0c FF  */ "<form-feed>",
    /* 0x0d CR  */ "<carriage-return>",
    /* 0x0e SO, 0x0f SI */ 0, 0,

    // 0x10 - 0x1f: control characters without a symbolic name
    /* 0x10 DLE .. 0x17 ETB */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 CAN .. 0x1f IS1 */ 0, 0, 0, 0, 0, 0, 0, 0,

    // 0x20 - 0x2f: space and punctuation
    /* 0x20 SPC */ "<space>",
    /* 0x21 !   */ "<exclamation-mark>",
    /* 0x22 "   */ "<quotation-mark>",
    /* 0x23 #   */ "<number-sign>",
    /* 0x24 $   */ "<dollar-sign>",
    /* 0x25 %   */ "<percent-sign>",
    /* 0x26 &   */ "<ampersand>",
    /* 0x27 '   */ "<apostrophe>",
    /* 0x28 (   */ "<left-parenthesis>",
    /* 0x29 )   */ "<right-parenthesis>",
    /* 0x2a *   */ "<asterisk>",
    /* 0x2b +   */ "<plus-sign>",
    /* 0x2c ,   */ "<comma>",
    /* 0x2d -   */ "<hyphen>",                // "<hyphen-minus>"
    /* 0x2e .   */ "<period>",                // "<full-stop>"
    /* 0x2f /   */ "<slash>",                 // "<solidus>"

    // 0x30 - 0x39: decimal digits
    /* 0x30 0   */ "<zero>",
    /* 0x31 1   */ "<one>",
    /* 0x32 2   */ "<two>",
    /* 0x33 3   */ "<three>",
    /* 0x34 4   */ "<four>",
    /* 0x35 5   */ "<five>",
    /* 0x36 6   */ "<six>",
    /* 0x37 7   */ "<seven>",
    /* 0x38 8   */ "<eight>",
    /* 0x39 9   */ "<nine>",

    // 0x3a - 0x40: punctuation
    /* 0x3a :   */ "<colon>",
    /* 0x3b ;   */ "<semicolon>",
    /* 0x3c <   */ "<less-than-sign>",
    /* 0x3d =   */ "<equals-sign>",
    /* 0x3e >   */ "<greater-than-sign>",
    /* 0x3f ?   */ "<question-mark>",
    /* 0x40 @   */ "<commercial-at>",

    // 0x41 - 0x5a: uppercase letters
    /* 0x41 A   */ "<A>",
    /* 0x42 B   */ "<B>",
    /* 0x43 C   */ "<C>",
    /* 0x44 D   */ "<D>",
    /* 0x45 E   */ "<E>",
    /* 0x46 F   */ "<F>",
    /* 0x47 G   */ "<G>",
    /* 0x48 H   */ "<H>",
    /* 0x49 I   */ "<I>",
    /* 0x4a J   */ "<J>",
    /* 0x4b K   */ "<K>",
    /* 0x4c L   */ "<L>",
    /* 0x4d M   */ "<M>",
    /* 0x4e N   */ "<N>",
    /* 0x4f O   */ "<O>",
    /* 0x50 P   */ "<P>",
    /* 0x51 Q   */ "<Q>",
    /* 0x52 R   */ "<R>",
    /* 0x53 S   */ "<S>",
    /* 0x54 T   */ "<T>",
    /* 0x55 U   */ "<U>",
    /* 0x56 V   */ "<V>",
    /* 0x57 W   */ "<W>",
    /* 0x58 X   */ "<X>",
    /* 0x59 Y   */ "<Y>",
    /* 0x5a Z   */ "<Z>",

    // 0x5b - 0x60: punctuation
    /* 0x5b [   */ "<left-square-bracket>",
    /* 0x5c \   */ "<backslash>",             // "<reverse-solidus>"
    /* 0x5d ]   */ "<right-square-bracket>",
    /* 0x5e ^   */ "<circumflex>",            // "<circumflex-accent>"
    /* 0x5f _   */ "<underscore>",            // "<low-line>"
    /* 0x60 `   */ "<grave-accent>",

    // 0x61 - 0x7a: lowercase letters
    /* 0x61 a   */ "<a>",
    /* 0x62 b   */ "<b>",
    /* 0x63 c   */ "<c>",
    /* 0x64 d   */ "<d>",
    /* 0x65 e   */ "<e>",
    /* 0x66 f   */ "<f>",
    /* 0x67 g   */ "<g>",
    /* 0x68 h   */ "<h>",
    /* 0x69 i   */ "<i>",
    /* 0x6a j   */ "<j>",
    /* 0x6b k   */ "<k>",
    /* 0x6c l   */ "<l>",
    /* 0x6d m   */ "<m>",
    /* 0x6e n   */ "<n>",
    /* 0x6f o   */ "<o>",
    /* 0x70 p   */ "<p>",
    /* 0x71 q   */ "<q>",
    /* 0x72 r   */ "<r>",
    /* 0x73 s   */ "<s>",
    /* 0x74 t   */ "<t>",
    /* 0x75 u   */ "<u>",
    /* 0x76 v   */ "<v>",
    /* 0x77 w   */ "<w>",
    /* 0x78 x   */ "<x>",
    /* 0x79 y   */ "<y>",
    /* 0x7a z   */ "<z>",

    // 0x7b - 0x7f: punctuation and DEL
    /* 0x7b {   */ "<left-brace>",            // "<left-curly-bracket>"
    /* 0x7c |   */ "<vertical-line>",
    /* 0x7d }   */ "<right-brace>",           // "<right-curly-bracket>"
    /* 0x7e ~   */ "<tilde>",
    /* 0x7f DEL */ 0
};
|
|
|
|
|
|
// convert a string of narrow character into a wchar_t
|
|
bool Charmap::convert_to_wc (const std::string& sym_name,
|
|
const std::string& ext_enc, wchar_t& wc)
|
|
{
|
|
#ifndef _RWSTD_NO_ISO_10646_WCHAR_T
|
|
|
|
// the internal wchar_t representation for all characters
|
|
// in all locales is always ISO-10646 (UCS) on this system
|
|
return convert_to_ucs (sym_name, ext_enc, wc);
|
|
|
|
#else // if defined _RWSTD_NO_ISO_10646_WCHAR_T
|
|
|
|
if (UCS4_internal_ || Clocale_.empty ()) {
|
|
|
|
// when using UCS as the internal encoding or for a locale
|
|
// that has no corresponding C library locale convert the
|
|
// character to ISO-10646 (UCS)
|
|
return convert_to_ucs (sym_name, ext_enc, wc);
|
|
}
|
|
|
|
// otherwise use libc to convert the multi-byte character
|
|
// to its wchar_t value
|
|
if (-1 == std::mbtowc (&wc, ext_enc.c_str (), ext_enc.size ())) {
|
|
|
|
const char* const locname = std::setlocale (LC_CTYPE, 0);
|
|
const char* const errtext = std::strerror (errno);
|
|
|
|
// diagnose the failure to convert the character as just
|
|
// a warning and (try to) convert it to ISO-10646 (UCS)
|
|
issue_diag (W_CALL, true, &next,
|
|
"mbtowc failed to convert character in locale "
|
|
"\"%s\": %s\n", locname, errtext);
|
|
|
|
return convert_to_ucs (sym_name, ext_enc, wc);
|
|
}
|
|
|
|
return true;
|
|
|
|
#endif // _RWSTD_NO_ISO_10646_WCHAR_T
|
|
|
|
}
|
|
|
|
|
|
char* Charmap::convert_to_utf8 (const char *inbuf, size_t inbuf_s,
|
|
char *outbuf, size_t outbuf_s) const
|
|
{
|
|
#ifndef _RWSTD_NO_ICONV
|
|
|
|
if (ic_to_utf8_ == iconv_t (-1))
|
|
return 0;
|
|
|
|
char* outbufp = outbuf;
|
|
|
|
# ifndef _RWSTD_NO_ICONV_CONST_CHAR
|
|
const char* inbufp = inbuf;
|
|
# else
|
|
char* inbufp = _RWSTD_CONST_CAST(char*, inbuf);
|
|
# endif // _RWSTD_NO_ICONV_CONST_CHAR
|
|
|
|
if (std::size_t (-1) ==
|
|
iconv (ic_to_utf8_, &inbufp, &inbuf_s, &outbufp, &outbuf_s)) {
|
|
const char* const errtext = std::strerror (errno);
|
|
|
|
issue_diag (W_ICONV, false, &next,
|
|
"iconv failed to convert \"%s\" "
|
|
"to UTF-8: %s\n", inbuf, errtext);
|
|
|
|
return 0;
|
|
}
|
|
|
|
return outbufp;
|
|
|
|
#else // if defined (_RWSTD_NO_ICONV)
|
|
|
|
return 0;
|
|
|
|
#endif // _RWSTD_NO_ICONV
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string Charmap::get_charmap_name () const
|
|
{
|
|
const std::string::size_type idx = charmap_name_.rfind (_RWSTD_PATH_SEP);
|
|
|
|
if (idx != std::string::npos)
|
|
return charmap_name_.substr (idx + 1);
|
|
|
|
return charmap_name_;
|
|
}
|
|
|
|
|
|
wchar_t Charmap::increment_wchar (wchar_t val) const
|
|
{
|
|
#ifndef _RWSTD_NO_ISO_10646_WCHAR_T
|
|
|
|
// to increment a wchar_t value and keep the encoding all we have
|
|
// to do is increment the val because the internal encoding is UCS
|
|
return val + 1;
|
|
|
|
#else
|
|
// to increment a wchar_t value and keep the encoding we have to
|
|
// convert the wchar_t to the external encoding, increment that
|
|
// string value, and convert back to the internal representation
|
|
const rmb_cmap_iter it = rmb_cmap_.find (val);
|
|
|
|
if (it != rmb_cmap_.end ()) {
|
|
|
|
mb_cmap_iter ret;
|
|
|
|
// multibyte character corresponding to the wchar_t value
|
|
std::string encoding = it->second;
|
|
|
|
// continue incrementing the multi-byte value until we get a valid
|
|
// character. NOTE: this must be done for encodings such as SJIS where
|
|
// \x7f in the last byte of a multibyte string is not a valid character
|
|
// NOTE: this will not detect errors in the sequence, since the program
|
|
// will continue until it finds a valid character
|
|
do {
|
|
int last_elm = encoding.size () - 1;
|
|
|
|
while (last_elm >= 0) {
|
|
|
|
typedef unsigned char UChar;
|
|
|
|
const unsigned ic = UChar (encoding [last_elm]) + 1;
|
|
|
|
// if incrementing the last element caused it to exceed
|
|
// UCHAR_MAX increment the next higher byte if there is
|
|
// one
|
|
if (UCHAR_MAX < ic)
|
|
encoding [last_elm--] = '\0';
|
|
else {
|
|
encoding [last_elm] = char (ic);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (last_elm < 0)
|
|
return -1; // error
|
|
|
|
} while ((ret = mb_cmap_.find (encoding)) == mb_cmap_.end ());
|
|
|
|
return ret->second;
|
|
}
|
|
|
|
return -1; // error
|
|
|
|
#endif // _RWSTD_NO_ISO_10646_WCHAR_T
|
|
|
|
}
|
|
|
|
|
|
bool Charmap::
|
|
increment_encoding (std::string &encoding)
|
|
{
|
|
// find the last escape character in the human readable representation
|
|
// of the encoding (i.e., in the multibyte character such as "/xf0/x80")
|
|
const std::string::size_type pos =
|
|
encoding.rfind (scanner_.escape_char ());
|
|
|
|
// the escape character must be there (guaranteed by the scanner)
|
|
assert (pos < encoding.size ());
|
|
|
|
const char* end = 0;
|
|
|
|
// convert the last character in the multibyte character to a numeric
|
|
// value representing the last byte of the sequence
|
|
unsigned long last_byte =
|
|
scanner_.convert_escape (encoding.c_str () + pos, &end);
|
|
|
|
// POSIX requires that the incremented value be non-NUL
|
|
if (UCHAR_MAX <= last_byte || *end)
|
|
return false;
|
|
|
|
// increment the last byte
|
|
++last_byte;
|
|
|
|
// format the last byte in the same notation (octal, decimal,
|
|
// or hexadecimal escape sequence)
|
|
static const char xdigits[] = "0123456789ABCDEF";
|
|
|
|
char byte_str [5];
|
|
char *pdig = byte_str;
|
|
|
|
switch (encoding [pos + 1]) {
|
|
case 'd': { // decimal escape
|
|
const unsigned hundreds = last_byte / 100;
|
|
const unsigned tens = (last_byte - hundreds) / 10;
|
|
const unsigned units = last_byte % 10;
|
|
|
|
*pdig++ = 'd';
|
|
|
|
if (hundreds)
|
|
*pdig++ = xdigits [hundreds];
|
|
|
|
*pdig++ = xdigits [tens];
|
|
*pdig++ = xdigits [units];
|
|
*pdig = '\0';
|
|
break;
|
|
}
|
|
|
|
case 'x': { // hex escape
|
|
const unsigned hi = last_byte >> 4;
|
|
const unsigned lo = last_byte & 0xfU;
|
|
|
|
*pdig++ = 'x';
|
|
*pdig++ = xdigits [hi];
|
|
*pdig++ = xdigits [lo];
|
|
*pdig = '\0';
|
|
break;
|
|
}
|
|
default: { // octal escape
|
|
const unsigned hi = last_byte >> 6;
|
|
const unsigned mid = (last_byte >> 3) & 07U;
|
|
const unsigned lo = last_byte & 07U;
|
|
|
|
if (hi)
|
|
*pdig++ = xdigits [hi];
|
|
|
|
*pdig++ = xdigits [mid];
|
|
*pdig++ = xdigits [lo];
|
|
*pdig = '\0';
|
|
}
|
|
} // switch
|
|
|
|
// replace the last escape sequence with the new one
|
|
encoding.replace (pos + 1, std::string::npos, byte_str);
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
std::string Charmap::
|
|
encoding_to_mbchar (const std::string &encoding) const
|
|
{
|
|
std::string mbchar;
|
|
|
|
for (const char *pbyte = encoding.c_str (); pbyte && *pbyte; )
|
|
mbchar += char (scanner_.convert_escape (pbyte, &pbyte));
|
|
|
|
return mbchar;
|
|
}
|
|
|
|
|
|
// convert the locale's encoded character to UCS4 wchar_t
|
|
wchar_t Charmap::
|
|
convert_sym_to_ucs (const std::string &sym) const
|
|
{
|
|
std::string::const_iterator it (sym.begin ());
|
|
|
|
if ( sym.size () < 4 || *it != '<' || *++it != 'U'
|
|
|| !(std::isxdigit)(*++it)) {
|
|
issue_diag (E_UCS, true, 0,
|
|
"Unable to convert symbolic name %s to UCS.\n",
|
|
sym.c_str ());
|
|
}
|
|
|
|
const unsigned long val = std::strtoul (&*++it, (char**)0, 16);
|
|
|
|
if (_RWSTD_WCHAR_MAX <= val)
|
|
issue_diag (E_UCS, true, 0,
|
|
"UCS value %lu of symbolic character %s out of range.\n",
|
|
val, sym.c_str ());
|
|
|
|
return wchar_t (val);
|
|
}
|
|
|
|
|
|
// convert the locale's encoded character to UCS4/UCS2 wchar_t
|
|
bool Charmap::convert_to_ucs (const std::string &sym_name,
|
|
const std::string &encoding, wchar_t& wc)
|
|
{
|
|
#ifndef _MSC_VER
|
|
|
|
if (in_utf8_) {
|
|
wc = utf8_decode (encoding.c_str (), &*(encoding.end () - 1));
|
|
return true;
|
|
}
|
|
|
|
// allocate enough space for the longest possible UTF-8 character
|
|
char utf8_enc [8 + 1 /* NUL */];
|
|
|
|
const char* const ch_end =
|
|
convert_to_utf8 (encoding.c_str (), encoding.size (),
|
|
utf8_enc, sizeof utf8_enc);
|
|
if (ch_end)
|
|
// only if conversion to utf8 succeeded
|
|
wc = utf8_decode (utf8_enc, ch_end);
|
|
else
|
|
// if not, try to convert the symbolic name directly
|
|
wc = convert_sym_to_ucs (sym_name);
|
|
|
|
return true;
|
|
|
|
#else
|
|
|
|
if (0 != codepage_) {
|
|
wchar_t ret[2] = {0};
|
|
MultiByteToWideChar (codepage_, 0, encoding.c_str(), -1, ret, 2);
|
|
if (ret[1] != 0)
|
|
return false;
|
|
|
|
wc = ret[0];
|
|
return true;
|
|
} else {
|
|
wc = convert_sym_to_ucs (sym_name);
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
|
|
#endif // _MSC_VER
|
|
}
|
|
|
|
|
|
void Charmap::add_to_cmaps (const std::string &sym_name,
|
|
const std::string &encoding,
|
|
bool is_mbchar /* = false */)
|
|
{
|
|
// compute the external (multibyte) encoding of the character
|
|
// if necessary (i.e., unless already done by the caller)
|
|
const std::string mbchar =
|
|
is_mbchar ? encoding : encoding_to_mbchar (encoding);
|
|
|
|
symnames_list_.push_back (sym_name);
|
|
|
|
if (1 == mbchar.size ()) {
|
|
// strval is a single-byte character
|
|
|
|
const unsigned char ch = mbchar [0];
|
|
|
|
// add the wide character and its symbolic name to the narrow
|
|
// character maps
|
|
if (forward_maps) {
|
|
// the locale utility doesn't need reverse maps
|
|
n_cmap_.insert (std::make_pair (sym_name, ch));
|
|
}
|
|
|
|
if (reverse_maps)
|
|
rn_cmap_.insert (std::make_pair (ch, sym_name));
|
|
|
|
if (ch > largest_nchar_)
|
|
largest_nchar_ = ch;
|
|
}
|
|
|
|
// (try to) compute the wide character value of the character
|
|
wchar_t wch;
|
|
|
|
if (convert_to_wc (sym_name, mbchar, wch)) {
|
|
|
|
// add the wide character and its symbolic name to the wide
|
|
// character maps
|
|
if (forward_maps) {
|
|
// the locale utility doesn't need forward maps
|
|
w_cmap_.insert (std::make_pair (sym_name, wch));
|
|
}
|
|
|
|
if (reverse_maps)
|
|
rw_cmap_.insert (std::make_pair (wch, sym_name));
|
|
|
|
// add the corresponding multibyte character to the multibyte
|
|
// character maps
|
|
mb_cmap_.insert (std::make_pair (mbchar, wch));
|
|
rmb_cmap_.insert (std::make_pair (wch, mbchar));
|
|
}
|
|
|
|
// compute the UCS value of the character
|
|
wchar_t uch;
|
|
|
|
if (convert_to_ucs (sym_name, mbchar, uch)) {
|
|
|
|
// add UCS character and its symbolic name to the UCS
|
|
// character maps
|
|
ucs4_cmap_.insert (std::make_pair (sym_name, uch));
|
|
rucs4_cmap_.insert (std::make_pair (uch, sym_name));
|
|
}
|
|
}
|
|
|
|
|
|
// process the characters implicitly defined by using ellipsis between
|
|
// two explicitly defined characters
|
|
std::size_t Charmap::
|
|
process_ellipsis (const Scanner::token_t &beg_tok, int num_ellipsis)
|
|
{
|
|
// get the upper end of the range denoted by the ellipsis
|
|
const Scanner::token_t end_tok = scanner_.next_token ();
|
|
|
|
// get the human readabale encoding of the character
|
|
// denoted by the lower end of the ellipsis
|
|
const std::string encoding = scanner_.next_token ().name;
|
|
|
|
// convert the encoding to a multibyte character
|
|
std::string mbchar = encoding_to_mbchar (encoding);
|
|
|
|
// add the beg_tok symbol name to the maps
|
|
add_to_cmaps (beg_tok.name, mbchar, true);
|
|
|
|
// extract the numeric portion of the symbolic character name
|
|
// denoted by the lower end of the ellipsis
|
|
std::size_t idx = 0;
|
|
|
|
int base; // numeric base
|
|
const char *fmat; // sprintf() format specifier
|
|
|
|
const std::size_t beg_len = beg_tok.name.size ();
|
|
|
|
// determine the value of the beginning of the range
|
|
// denoted by the ellipsis
|
|
if (2 == num_ellipsis) {
|
|
base = 16;
|
|
fmat = "%.*s%0*lX>";
|
|
|
|
// advance to the first hex digit
|
|
while (idx < beg_len && !(std::isxdigit)(beg_tok.name [idx]))
|
|
++idx;
|
|
}
|
|
else {
|
|
base = 10;
|
|
fmat = "%.*s%0*ld>";
|
|
|
|
// advance to the first decimal digit
|
|
while (idx < beg_len && !(std::isdigit)(beg_tok.name [idx]))
|
|
++idx;
|
|
}
|
|
|
|
// length of non-numeric prefix of the symbolic character name
|
|
const std::size_t pfx_len = idx;
|
|
|
|
// get the character value plus one (since the first value
|
|
// has already been added to the map earlier)
|
|
char *num_end;
|
|
const unsigned long beg_val =
|
|
1 + std::strtoul (beg_tok.name.c_str () + pfx_len, &num_end, base);
|
|
|
|
// the length of the numeric portion
|
|
const std::size_t num_size =
|
|
num_end - (beg_tok.name.c_str () + pfx_len);
|
|
|
|
// find the end of the range denoted by the ellipsis
|
|
idx = 0;
|
|
|
|
const std::size_t end_len = end_tok.name.size ();
|
|
|
|
if (2 == num_ellipsis) {
|
|
// advance to the next hex digit
|
|
while (idx < end_len && !(std::isxdigit)(end_tok.name [idx]))
|
|
++idx;
|
|
}
|
|
else {
|
|
// advance to the next dec digit
|
|
while (idx < end_len && !(std::isdigit)(end_tok.name [idx]))
|
|
++idx;
|
|
}
|
|
|
|
const unsigned long end_val =
|
|
std::strtoul (end_tok.name.c_str () + idx, (char**)0, base);
|
|
|
|
// the ending numeric value must be greater than or equal
|
|
// to the beginning numeric value
|
|
if (end_val < beg_val)
|
|
issue_diag (E_RANGE, true, &end_tok,
|
|
"invalid range found in character map file\n");
|
|
|
|
char next_name [MAX_SYM_NAME_LEN];
|
|
|
|
std::size_t nchars = 0;
|
|
|
|
const char* const pfx = beg_tok.name.c_str ();
|
|
|
|
for (unsigned long val = beg_val; val <= end_val; ++val, ++nchars) {
|
|
|
|
std::sprintf (next_name, fmat, pfx_len, pfx, num_size, val);
|
|
|
|
// increment the last byte of the multibyte character
|
|
// and if the result is valid (i.e., doesn't contain
|
|
// an embedded NUL) add the generated name and the
|
|
// multibyte character to the maps
|
|
const unsigned char last_byte = mbchar [mbchar.size () - 1];
|
|
if (last_byte < UCHAR_MAX) {
|
|
mbchar [mbchar.size () - 1] = last_byte + 1;
|
|
add_to_cmaps (next_name, mbchar, true);
|
|
}
|
|
else {
|
|
// an ellipsis must not specify a range that includes
|
|
// an encoding with an embedded NUL
|
|
issue_diag (E_RANGE, true, &beg_tok,
|
|
"encoding of an element in range contains NUL\n");
|
|
}
|
|
}
|
|
|
|
// return the number of characters denoted by the ellipsis
|
|
return nchars;
|
|
}
|
|
|
|
|
|
// process all the characters in the character map file.
|
|
void Charmap::process_chars()
|
|
{
|
|
issue_diag (I_STAGE, false, 0, "processing CHARMAP section\n");
|
|
|
|
std::size_t ntokens = 0;
|
|
std::size_t nellips = 0;
|
|
std::size_t nchars = 0;
|
|
|
|
next = scanner_.next_token();
|
|
Scanner::token_t nextnext;
|
|
|
|
// loop until we find the closing charmap token
|
|
for ( ; next.token != Scanner::tok_charmap; ++ntokens) {
|
|
|
|
switch (next.token) {
|
|
|
|
case Scanner::tok_nl:
|
|
case Scanner::tok_end:
|
|
break;
|
|
|
|
case Scanner::tok_sym_name:
|
|
// the next token may be either ellipsis if this line
|
|
// of the charmap is in the form:
|
|
// "%s...%s %s\n", <sym_name>, <sym_name>, <encoding>
|
|
// or an encoding if this line is in the format:
|
|
// "%s %s\n", <sym_name>, <encoding>
|
|
nextnext = scanner_.next_token ();
|
|
ntokens += 3;
|
|
|
|
switch (nextnext.token) {
|
|
|
|
case Scanner::tok_abs_ellipsis:
|
|
// absolute ellipsis (see ISO/IEC TR 14652)
|
|
nchars += process_ellipsis (next, 3);
|
|
++nellips;
|
|
break;
|
|
|
|
case Scanner::tok_hex_ellipsis:
|
|
// hexadecimal symbolic ellipsis (see ISO/IEC TR 14652)
|
|
nchars += process_ellipsis (next, 2);
|
|
++nellips;
|
|
break;
|
|
|
|
case Scanner::tok_char_value:
|
|
// character represented as a numeric constant
|
|
add_to_cmaps (next.name, nextnext.name);
|
|
++nchars;
|
|
break;
|
|
|
|
default:
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"byte value expected following symbolic "
|
|
"name in character map file\n");
|
|
}
|
|
|
|
scanner_.ignore_line ();
|
|
break;
|
|
|
|
default:
|
|
issue_diag (E_SYNTAX, true, &next,
|
|
"symbolic name expected in character map file\n");
|
|
break;
|
|
}
|
|
|
|
next = scanner_.next_token();
|
|
}
|
|
|
|
issue_diag (I_STAGE, false, 0,
|
|
"done processing CHARMAP section (%lu tokens, "
|
|
"%lu ellipses, %lu characters)\n",
|
|
ntokens, nellips, nchars);
|
|
|
|
// make sure that all characters in the portable character set
|
|
// are in the charmap
|
|
if (forward_maps)
|
|
verify_portable_charset();
|
|
}
|
|
|
|
|
|
void Charmap::verify_portable_charset () const
|
|
{
|
|
const std::size_t nchars =
|
|
sizeof portable_charset / sizeof *portable_charset;
|
|
|
|
for (std::size_t i = 0; i < nchars; ++i) {
|
|
if (0 == portable_charset [i])
|
|
continue;
|
|
|
|
if (n_cmap_.find (portable_charset [i]) == n_cmap_.end ())
|
|
issue_diag (W_NOPCS, false, 0,
|
|
"member of portable character set %s not found "
|
|
"in the character map\n", portable_charset [i]);
|
|
}
|
|
}
|
|
|
|
|
|
// construct a character map from the character set description file
// FNAME: open the file and process each of its sections up to the
// end of the token stream, entering the characters defined in the
// CHARMAP section into the character maps
Charmap::Charmap(const char* Clocale,
                 const char* fname,
                 bool in_utf8, bool create_forward_maps,
                 bool create_reverse_maps, bool use_UCS4)
    : mb_cur_max_(1),
      charmap_name_ (fname),
      Clocale_ (Clocale),
      largest_nchar_(0),
      in_utf8_(in_utf8),
      forward_maps (create_forward_maps),
      reverse_maps (create_reverse_maps),
      UCS4_internal_ (use_UCS4)
{
#ifndef _RWSTD_NO_ICONV
    // no conversion descriptors have been opened yet
    ic_to_utf8_ = 0;
    ic_to_ext_  = 0;
#endif   // _RWSTD_NO_ICONV

    // '#' introduces a comment and '\\' an escape sequence
    scanner_.open (fname, '#', '\\');

    // unless explicitly specified in the file, the name of the codeset
    // defaults to the name of the character set description file with
    // any leading directory components removed
    const char* const last_sep = std::strrchr (fname, _RWSTD_PATH_SEP);

    if (last_sep)
        code_set_name_ = last_sep + 1;
    else
        code_set_name_ = fname;

    // loop until we reach the end of the file
    for (next = scanner_.next_token ();
         next.token != Scanner::tok_end_tokens;
         next = scanner_.next_token ()) {

        switch (next.token) {

        case Scanner::tok_nl:
            break;

        case Scanner::tok_mb_cur_min:
            scanner_.ignore_line ();
            break;

        case Scanner::tok_mb_cur_max:
            mb_cur_max_ = std::atoi (scanner_.next_token ().name.c_str ());
            scanner_.ignore_line ();
            break;

        case Scanner::tok_code_set_name:
            next = scanner_.next_token ();

            switch (next.token) {
            case Scanner::tok_string:
                // strip the first and the last character (the quotes)
                code_set_name_ = next.name.substr (1, next.name.size () - 2);
                break;

            case Scanner::tok_ndef:
                code_set_name_ = next.name;
                break;

            default:
                issue_diag (E_SYNTAX, true, &next,
                            "string expected following <code_set_name>\n");
            }

            // we always need a iconv to utf8 so that we can create
            // the utf8_charmap unless we are on windows
#ifndef _RWSTD_NO_ICONV
            if (!in_utf8_) {
                ic_to_utf8_ = open_iconv_to_utf8 ();
#  if !defined (_RWSTD_NO_ISO_10646_WCHAR_T)
                ic_to_ext_ = open_iconv_to_ext ();
#  endif   // _RWSTD_NO_ISO_10646_WCHAR_T
            }

#else   // if defined (_RWSTD_NO_ICONV)

#  ifdef _MSC_VER
            codepage_ = get_codepage (code_set_name_);
            if (codepage_ == 0) {
                issue_diag (W_ICONV, false, 0,
                            "iconv_open (%s to UTF-8) failed\n",
                            code_set_name_.c_str());
            }

#  endif   // _MSC_VER
#endif   // _RWSTD_NO_ICONV

            scanner_.ignore_line ();
            break;

        case Scanner::tok_charmap:
            scanner_.ignore_line ();
            process_chars();
            break;

        case Scanner::tok_width:
            // ignore the width section of the character map, i.e.,
            // everything up to the next width token
            // NOTE(review): the loop has no end-of-input check --
            // confirm the scanner cannot run off the end of a file
            // with an unterminated width section
            do {
                next = scanner_.next_token ();
            } while (next.token != Scanner::tok_width);
            break;

        default:
            issue_diag (E_SYNTAX, false, &next,
                        "unknown token %s in character map file\n",
                        next.name.c_str ());
        }
    }
}
|