611 lines
18 KiB
C++
611 lines
18 KiB
C++
/***************************************************************************
|
|
*
|
|
* def.cpp
|
|
*
|
|
* $Id: def.cpp 522614 2007-03-26 20:25:09Z sebor $
|
|
*
|
|
***************************************************************************
|
|
*
|
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
* contributor license agreements. See the NOTICE file distributed
|
|
* with this work for additional information regarding copyright
|
|
* ownership. The ASF licenses this file to you under the Apache
|
|
* License, Version 2.0 (the "License"); you may not use this file
|
|
* except in compliance with the License. You may obtain a copy of
|
|
* the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
|
* implied. See the License for the specific language governing
|
|
* permissions and limitations under the License.
|
|
*
|
|
* Copyright 2001-2006 Rogue Wave Software.
|
|
*
|
|
**************************************************************************/
|
|
|
|
// #ifndef _RWSTD_NO_PURE_C_HEADERS
|
|
// # define _RWSTD_NO_PURE_C_HEADERS
|
|
// #endif // _RWSTD_NO_PURE_C_HEADERS
|
|
|
|
// #ifndef _RWSTD_NO_DEPRECATED_C_HEADERS
|
|
// # define _RWSTD_NO_DEPRECATED_C_HEADERS
|
|
// #endif // _RWSTD_NO_DEPRECATED_C_HEADERS
|
|
|
|
#ifdef __DECCXX
|
|
# undef __PURE_CNAME
|
|
#endif // __DECCXX
|
|
|
|
#include <algorithm>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <locale>
|
|
#include <map>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include <cassert>
|
|
#include <cctype>
|
|
#include <cerrno>
|
|
#include <climits>
|
|
#include <clocale>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring> // for memset()
|
|
|
|
#include "aliases.h"
|
|
#include "def.h"
|
|
#include "diagnostic.h"
|
|
#include "loc_exception.h"
|
|
#include "path.h"
|
|
|
|
|
|
// upper bound on the number of bytes Def::utf8_encode() emits for a
// single character (it also sizes that function's scratch buffer)
#define UTF8_MAX_SIZE 6
|
|
|
|
|
|
// convert_to_ext converts a wchar_t value with some encoding into
|
|
// a narrow character string in the current locale's encoding
|
|
std::string Def::convert_to_ext (wchar_t val)
|
|
{
|
|
rmb_cmap_iter it;
|
|
if ((it = charmap_.get_rmb_cmap().find(val))
|
|
!= charmap_.get_rmb_cmap().end()){
|
|
return it->second;
|
|
|
|
}
|
|
|
|
issue_diag (E_CVT2EXT, true, 0,
|
|
"unable to convert character %d to external "
|
|
"representation\n", val);
|
|
|
|
return std::string("");
|
|
}
|
|
|
|
|
|
// convert the wchar_t value into a utf8 string
|
|
std::string Def::utf8_encode (wchar_t wc)
|
|
{
|
|
unsigned int wc_int = _RWSTD_STATIC_CAST (unsigned int, wc);
|
|
|
|
std::string ret;
|
|
std::size_t size = 0;
|
|
char buf[UTF8_MAX_SIZE + 1];
|
|
char* bufp = buf;
|
|
|
|
if (wc_int < 0x80)
|
|
{
|
|
size = 1;
|
|
*bufp++ = wc_int;
|
|
}
|
|
else
|
|
{
|
|
int b;
|
|
|
|
for (b = 2; b < UTF8_MAX_SIZE; b++)
|
|
if ((wc_int & (~(wchar_t)0 << (5 * b + 1))) == 0)
|
|
break;
|
|
size = b;
|
|
|
|
*bufp = (unsigned char) (~0xff >> b);
|
|
--b;
|
|
do
|
|
{
|
|
bufp[b] = 0x80 | (wc_int & 0x3f);
|
|
wc_int >>= 6;
|
|
}
|
|
while (--b > 0);
|
|
*bufp |= wc_int;
|
|
}
|
|
buf[size] = (char)0;
|
|
for (unsigned int i = 0; i < size; i++)
|
|
ret += buf[i];
|
|
return ret;
|
|
|
|
}
|
|
|
|
void Def::copy_file (const std::string& name, const std::string& outname)
|
|
{
|
|
assert (name.size() > 0);
|
|
assert (outname.size() > 0);
|
|
|
|
std::ifstream from (name.c_str(), std::ios::binary);
|
|
if (!from) {
|
|
issue_diag (E_OPENRD, true,
|
|
&next, "unable to open locale database %s\n",
|
|
name.c_str());
|
|
}
|
|
from.exceptions (std::ios::badbit);
|
|
|
|
std::ofstream to (outname.c_str(), std::ios::binary);
|
|
if (!to) {
|
|
issue_diag (E_OPENWR, true,
|
|
&next, "unable to create locale database %s\n",
|
|
outname.c_str());
|
|
}
|
|
to.exceptions (std::ios::failbit | std::ios::badbit);
|
|
|
|
// copy the file
|
|
to << from.rdbuf ();
|
|
}
|
|
|
|
|
|
void Def::copy_category(int category, std::string name)
|
|
{
|
|
assert (name.size() > 0);
|
|
|
|
// create the name of the file to copy to and call copy_file
|
|
std::string outname (output_name_);
|
|
|
|
makedir (outname.c_str ());
|
|
|
|
switch (category) {
|
|
// append the category name to both 'name' and 'outname'
|
|
// and call the copy_file routine
|
|
// the xxx_written variable is set to true so that write_xxx
|
|
// does not overwrite the file that is written here
|
|
case LC_CTYPE:
|
|
(name += _RWSTD_PATH_SEP) += "LC_CTYPE";
|
|
(outname += _RWSTD_PATH_SEP) += "LC_CTYPE";
|
|
copy_file (name, outname);
|
|
ctype_written_ = true;
|
|
|
|
break;
|
|
case LC_COLLATE:
|
|
(name += _RWSTD_PATH_SEP) += "LC_COLLATE";
|
|
(outname += _RWSTD_PATH_SEP) += "LC_COLLATE";
|
|
copy_file(name, outname);
|
|
collate_written_ = true;
|
|
|
|
break;
|
|
case LC_MONETARY:
|
|
(name += _RWSTD_PATH_SEP) += "LC_MONETARY";
|
|
(outname += _RWSTD_PATH_SEP) += "LC_MONETARY";
|
|
copy_file(name, outname);
|
|
mon_written_ = true;
|
|
|
|
break;
|
|
|
|
case LC_NUMERIC:
|
|
(name += _RWSTD_PATH_SEP) += "LC_NUMERIC";
|
|
(outname += _RWSTD_PATH_SEP) += "LC_NUMERIC";
|
|
copy_file(name, outname);
|
|
num_written_ = true;
|
|
|
|
break;
|
|
case LC_TIME:
|
|
(name += _RWSTD_PATH_SEP) += "LC_TIME";
|
|
(outname += _RWSTD_PATH_SEP) += "LC_TIME";
|
|
copy_file(name, outname);
|
|
time_written_ = true;
|
|
|
|
break;
|
|
|
|
#ifdef LC_MESSAGES
|
|
case LC_MESSAGES:
|
|
(name += _RWSTD_PATH_SEP) += "LC_MESSAGES";
|
|
(outname += _RWSTD_PATH_SEP) += "LC_MESSAGES";
|
|
copy_file(name, outname);
|
|
messages_written_ = true;
|
|
break;
|
|
#endif // LC_MESSAGES
|
|
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
|
|
// strip a pair, which should be in the form '(<sym>,<sym2>)'
|
|
void Def::strip_pair (const std::string &tok, std::string &sym,
|
|
std::string &sym2)
|
|
{
|
|
std::size_t i = 0;
|
|
|
|
if(tok[i] == '(') {
|
|
if(tok[++i] == '<')
|
|
while (tok[i] != '>'){
|
|
if (tok[i] == scanner_.escape_char ())
|
|
i++;
|
|
sym.push_back(tok[i++]);
|
|
}
|
|
// this push_back is safe because the while loop above ends when
|
|
// tok[i] == '>'
|
|
sym.push_back(tok[i++]);
|
|
if (tok[i++] != ',')
|
|
issue_diag (E_PAIR, true, &next,
|
|
"invalid pair %s\n", tok.c_str());
|
|
if (tok[i] == '<')
|
|
while (tok[i] != '>'){
|
|
if (tok[i] == scanner_.escape_char ())
|
|
sym2.push_back(tok[i++]);
|
|
if ('\0' != tok[i])
|
|
sym2.push_back(tok[i++]);
|
|
else
|
|
issue_diag (E_PAIR, true, &next,
|
|
"invalid pair %s\n", tok.c_str());
|
|
}
|
|
|
|
// this push_back is safe because the while loop above ends when
|
|
// tok[i] == '>'
|
|
sym2.push_back(tok[i++]);
|
|
|
|
}
|
|
}
|
|
|
|
// converts str, which is a string in the following format
|
|
// "[<sym_name>][char]" including the quotes to a string of characters
|
|
// str is not a const reference because if the string spans multiple lines
|
|
// str is modified
|
|
std::string Def::convert_string (const std::string &str1)
|
|
{
|
|
assert (str1[0] == '\"');
|
|
|
|
std::string ret;
|
|
|
|
std::string sym;
|
|
// the index starts at 1 so that we ignore the initial '"'
|
|
int idx = 1;
|
|
|
|
const char* str = str1.c_str();
|
|
while (str[idx] != '\"') {
|
|
sym.clear();
|
|
// if we reach the null-terminator before we see an end-quote
|
|
// then we must have a multi-line string, so get the next token
|
|
if (str[idx] == '\0') {
|
|
if((next = scanner_.next_token()).token == Scanner::tok_string)
|
|
break;
|
|
str = next.name.c_str();
|
|
idx = 0;
|
|
}
|
|
|
|
// '<' marks the beginning of a symbolic name
|
|
// construct the name and look up its value in the cmap
|
|
if (str[idx] == '<') {
|
|
while (str [idx] && str [idx] != '>') {
|
|
if (str[idx] == scanner_.escape_char ())
|
|
idx++;
|
|
sym += str[idx++];
|
|
}
|
|
|
|
// this is safe because the while loop ended with *str == '>'
|
|
if (str [idx])
|
|
sym += str [idx++];
|
|
|
|
w_cmap_iter w_pos = charmap_.get_w_cmap().find (sym);
|
|
if (w_pos != charmap_.get_w_cmap().end()) {
|
|
ret += convert_to_ext(w_pos->second);
|
|
}
|
|
else {
|
|
return std::string();
|
|
}
|
|
}
|
|
|
|
// the definition file contains a sting with non-symbol names.
|
|
// process each character as it's actual character value.
|
|
// Locale definitions that use this may not be portable.
|
|
else {
|
|
ret += (char)str[idx++];
|
|
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
|
|
|
|
}
|
|
|
|
#ifndef _RWSTD_NO_WCHAR_T
|
|
// converts a collating element definition to an array of wide characters
|
|
// (the wide characters the collating element is composed of).
|
|
|
|
// this overload deals with collating elements defined through
|
|
// a sequence of symbolic names, NOT enclosed within quotes.
|
|
std::wstring
|
|
Def::convert_wstring (const StringVector& sym_array)
|
|
{
|
|
std::wstring ret;
|
|
StringVector::const_iterator it = sym_array.begin ();
|
|
while (it != sym_array.end ()) {
|
|
// lookup the symbol we just constructed
|
|
w_cmap_iter w_pos = charmap_.get_w_cmap().find (*it);
|
|
if (w_pos != charmap_.get_w_cmap().end()) {
|
|
ret += w_pos->second;
|
|
it++;
|
|
}
|
|
else {
|
|
// we return an empty string if we couldn't find any character
|
|
// in the character map
|
|
ret.clear();
|
|
return ret;
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
// this overload deals with collating elements defined through
|
|
// a sequence of characters or symbolic names, enclosed within quotes.
|
|
std::wstring
|
|
Def::convert_wstring (const token_t& t)
|
|
{
|
|
std::wstring ret;
|
|
std::string sym;
|
|
|
|
std::string str1 (t.name);
|
|
|
|
int idx = 0;
|
|
char term = 0;
|
|
const char* str = str1.c_str();
|
|
|
|
// skip first character if quote
|
|
if (str[idx] == '\"') {
|
|
term = '\"', idx++;
|
|
}
|
|
|
|
while (str[idx] != term) {
|
|
sym.clear();
|
|
|
|
// '<' marks the beginning of a symbolic name
|
|
// construct the name and look up its value in the cmap
|
|
if (str[idx] == '<') {
|
|
while (str[idx] != '>') {
|
|
if (str[idx] == scanner_.escape_char ()) {
|
|
// sym += str[idx++];
|
|
idx++;
|
|
}
|
|
|
|
if ('\0' != str[idx])
|
|
sym += str[idx++];
|
|
else
|
|
issue_diag (E_SYMEND, true, &t,
|
|
"end of symbolic name not found\n");
|
|
}
|
|
|
|
// this is safe because the while loop ended with *str == '>'
|
|
sym += str[idx++];
|
|
|
|
// lookup the symbol we just constructed
|
|
w_cmap_iter w_pos = charmap_.get_w_cmap().find (sym);
|
|
if (w_pos != charmap_.get_w_cmap().end()) {
|
|
ret += w_pos->second;
|
|
}
|
|
else {
|
|
// if we can't find a symbol then return an empty string,
|
|
// most likely this will happen if inside a collating-element
|
|
// the user uses a character that is not in the current
|
|
// codeset, in this case the collating element will be ignored
|
|
ret.clear();
|
|
return ret;
|
|
}
|
|
}
|
|
// the definition file contains a string with non-symbol names.
|
|
// process each character as it's actual character value.
|
|
// Locale definitions that use this may not be portable.
|
|
else
|
|
ret += (wchar_t)str[idx++];
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
#endif // _RWSTD_NO_WCHAR_T
|
|
|
|
|
|
// automatically fill any categories that depend on other categories
|
|
void Def::auto_fill ()
|
|
{
|
|
|
|
mask_iter mask_pos;
|
|
|
|
for (std::size_t i = 0; i <= UCHAR_MAX; i++) {
|
|
if ( ctype_out_.mask_tab[i] & std::ctype_base::upper
|
|
|| ctype_out_.mask_tab[i] & std::ctype_base::lower
|
|
|| ctype_out_.mask_tab[i] & std::ctype_base::alpha
|
|
|| ctype_out_.mask_tab[i] & std::ctype_base::digit
|
|
|| ctype_out_.mask_tab[i] & std::ctype_base::xdigit
|
|
|| ctype_out_.mask_tab[i] & std::ctype_base::punct)
|
|
|
|
ctype_out_.mask_tab[i] |= std::ctype_base::print;
|
|
|
|
if ( ctype_out_.mask_tab[i] & std::ctype_base::upper
|
|
|| ctype_out_.mask_tab[i] & std::ctype_base::lower)
|
|
|
|
ctype_out_.mask_tab[i] |= std::ctype_base::alpha;
|
|
|
|
if ( ctype_out_.mask_tab[i] & std::ctype_base::upper
|
|
|| ctype_out_.mask_tab[i] & std::ctype_base::lower
|
|
|| ctype_out_.mask_tab[i] & std::ctype_base::alpha
|
|
|| ctype_out_.mask_tab[i] & std::ctype_base::digit
|
|
|| ctype_out_.mask_tab[i] & std::ctype_base::xdigit
|
|
|| ctype_out_.mask_tab[i] & std::ctype_base::punct)
|
|
|
|
ctype_out_.mask_tab[i] |= std::ctype_base::graph;
|
|
}
|
|
|
|
for (mask_pos = mask_.begin(); mask_pos != mask_.end(); mask_pos++) {
|
|
// all lower, alpha, digit, xdigit, and punct, and space
|
|
// characters are automatically print
|
|
|
|
if ( mask_pos->second & std::ctype_base::upper
|
|
|| mask_pos->second & std::ctype_base::lower
|
|
|| mask_pos->second & std::ctype_base::alpha
|
|
|| mask_pos->second & std::ctype_base::digit
|
|
|| mask_pos->second & std::ctype_base::xdigit
|
|
|| mask_pos->second & std::ctype_base::punct)
|
|
// || mask_pos->second & std::ctype_base::space)
|
|
|
|
mask_pos->second |= std::ctype_base::print;
|
|
|
|
// all upper and lower characters are alpha
|
|
if ( mask_pos->second & std::ctype_base::upper
|
|
|| mask_pos->second & std::ctype_base::lower)
|
|
|
|
mask_pos->second |= std::ctype_base::alpha;
|
|
|
|
// all upper, lower, alpha, digit, xdigit, and punct characters
|
|
// are graph characters
|
|
if ( mask_pos->second & std::ctype_base::upper
|
|
|| mask_pos->second & std::ctype_base::lower
|
|
|| mask_pos->second & std::ctype_base::alpha
|
|
|| mask_pos->second & std::ctype_base::digit
|
|
|| mask_pos->second & std::ctype_base::xdigit
|
|
|| mask_pos->second & std::ctype_base::punct)
|
|
|
|
mask_pos->second |= std::ctype_base::graph;
|
|
|
|
|
|
}
|
|
}
|
|
|
|
|
|
void Def::process_input ()
|
|
{
|
|
while ((next = scanner_.next_token ()).token != Scanner::tok_end_tokens) {
|
|
|
|
switch (next.token) {
|
|
|
|
case Scanner::tok_comment:
|
|
scanner_.ignore_line ();
|
|
break;
|
|
|
|
case Scanner::tok_ctype:
|
|
process_ctype ();
|
|
break;
|
|
|
|
case Scanner::tok_collate:
|
|
process_collate ();
|
|
break;
|
|
|
|
case Scanner::tok_monetary:
|
|
process_monetary ();
|
|
break;
|
|
|
|
case Scanner::tok_numeric:
|
|
process_numeric ();
|
|
break;
|
|
|
|
case Scanner::tok_time:
|
|
process_time ();
|
|
break;
|
|
|
|
case Scanner::tok_messages:
|
|
process_messages ();
|
|
break;
|
|
|
|
case Scanner::tok_nl:
|
|
break;
|
|
|
|
default:
|
|
scanner_.ignore_line ();
|
|
break;
|
|
}
|
|
}
|
|
|
|
auto_fill ();
|
|
}
|
|
|
|
|
|
// Constructs the object for the locale definition file `filename'.
//
// out_name is stored in output_name_, char_map in charmap_ and
// no_position in no_position_; every xxx_written_ and xxx_def_found_
// flag starts out false.  The output structures are then put into
// their initial state and, as the last step, the definition file is
// opened with scanner_.open().
Def::Def (const char* filename, const char* out_name, Charmap& char_map,
          bool no_position)
    : warnings_occurred_ (false),
      scan_ahead_ (false),
      next_offset_ (0),
      output_name_ (out_name),
      charmap_ (char_map),
      ctype_written_ (false),
      codecvt_written_ (false),
      collate_written_ (false),
      time_written_ (false),
      num_written_ (false),
      mon_written_ (false),
      messages_written_ (false),
      ctype_def_found_ (false),
      collate_def_found_ (false),
      time_def_found_ (false),
      num_def_found_ (false),
      mon_def_found_ (false),
      messages_def_found_ (false),
      undefined_keyword_found_ (false),
      no_position_ (no_position)
{
    // start with zeroed-out ctype and time structures
    std::memset (&ctype_out_, 0, sizeof ctype_out_);
    std::memset (&time_out_, 0, sizeof time_out_);

    // invalidate the monetary format characters by setting each to
    // CHAR_MAX as specified by the C function localeconv(); element
    // [0] of each array is set first, element [1] holds the int'l
    // formats
    for (int fmt = 0; fmt != 2; ++fmt) {
        mon_out_.frac_digits    [fmt] = CHAR_MAX;
        mon_out_.p_cs_precedes  [fmt] = CHAR_MAX;
        mon_out_.p_sep_by_space [fmt] = CHAR_MAX;
        mon_out_.n_cs_precedes  [fmt] = CHAR_MAX;
        mon_out_.n_sep_by_space [fmt] = CHAR_MAX;
        mon_out_.p_sign_posn    [fmt] = CHAR_MAX;
        mon_out_.n_sign_posn    [fmt] = CHAR_MAX;
    }

    // the grouping strings start out holding a single CHAR_MAX
    mon_st_.mon_grouping += CHAR_MAX;
    num_st_.grouping     += CHAR_MAX;

    // collation defaults
    collate_out_.largest_ce     = 1;
    collate_out_.longest_weight = 1;
    collate_out_.num_wchars     = 0;
    std::memset (collate_out_.weight_type, 0,
                 sizeof collate_out_.weight_type);

    // initialize all extensions to 0
    ctype_out_.ctype_ext_off     = 0;
    num_out_.numeric_ext_off     = 0;
    collate_out_.collate_ext_off = 0;
    mon_out_.monetary_ext_off    = 0;
    time_out_.time_ext_off       = 0;

    // actual processing
    scanner_.open (filename);
}
|
|
|
|
|
|
// Releases the weights array owned by each entry of coll_map_.
Def::~Def ()
{
    for (coll_map_iter entry = coll_map_.begin ();
         entry != coll_map_.end (); ++entry)
        delete[] entry->second.weights;
}
|