/*************************************************************************** * * def.cpp * * $Id: def.cpp 522614 2007-03-26 20:25:09Z sebor $ * *************************************************************************** * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. * * Copyright 2001-2006 Rogue Wave Software. * **************************************************************************/ // #ifndef _RWSTD_NO_PURE_C_HEADERS // # define _RWSTD_NO_PURE_C_HEADERS // #endif // _RWSTD_NO_PURE_C_HEADERS // #ifndef _RWSTD_NO_DEPRECATED_C_HEADERS // # define _RWSTD_NO_DEPRECATED_C_HEADERS // #endif // _RWSTD_NO_DEPRECATED_C_HEADERS #ifdef __DECCXX # undef __PURE_CNAME #endif // __DECCXX #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include // for memset() #include "aliases.h" #include "def.h" #include "diagnostic.h" #include "loc_exception.h" #include "path.h" #define UTF8_MAX_SIZE 6 // convert_to_ext converts a wchar_t value with some encoding into // a narrow character string in the current locale's encoding std::string Def::convert_to_ext (wchar_t val) { rmb_cmap_iter it; if ((it = charmap_.get_rmb_cmap().find(val)) != charmap_.get_rmb_cmap().end()){ return it->second; } issue_diag (E_CVT2EXT, true, 0, "unable to convert character %d to external " "representation\n", val); return std::string(""); } // convert the wchar_t value into a utf8 string std::string Def::utf8_encode (wchar_t wc) { unsigned int wc_int = _RWSTD_STATIC_CAST (unsigned int, wc); std::string ret; std::size_t size = 0; char buf[UTF8_MAX_SIZE + 1]; char* bufp = buf; if (wc_int < 0x80) { size = 1; *bufp++ = wc_int; } else { int b; for (b = 2; b < UTF8_MAX_SIZE; b++) if ((wc_int & (~(wchar_t)0 << (5 * b + 1))) == 0) break; size = b; *bufp = (unsigned char) (~0xff >> b); --b; do { bufp[b] = 0x80 | (wc_int & 0x3f); wc_int >>= 6; } while (--b > 0); *bufp |= wc_int; } buf[size] = (char)0; for (unsigned int i = 0; i < size; i++) ret += buf[i]; return ret; } void Def::copy_file (const std::string& name, const std::string& outname) { assert (name.size() > 0); assert (outname.size() > 0); std::ifstream from (name.c_str(), std::ios::binary); if (!from) { issue_diag (E_OPENRD, true, &next, "unable to open locale database %s\n", name.c_str()); } from.exceptions (std::ios::badbit); std::ofstream to (outname.c_str(), std::ios::binary); if (!to) { issue_diag (E_OPENWR, true, &next, "unable to create locale database %s\n", outname.c_str()); } to.exceptions (std::ios::failbit | std::ios::badbit); // copy the file to << from.rdbuf (); } void Def::copy_category(int category, std::string name) { assert (name.size() > 0); // create the name of the file to copy to and call copy_file std::string outname (output_name_); makedir (outname.c_str ()); switch (category) { // append the category name to both 'name' and 'outname' // and call the copy_file routine // the xxx_written variable is set to true so that write_xxx // does not overwrite the file that is written here case LC_CTYPE: (name += _RWSTD_PATH_SEP) += "LC_CTYPE"; (outname += _RWSTD_PATH_SEP) += "LC_CTYPE"; copy_file (name, outname); ctype_written_ = true; break; case LC_COLLATE: (name += _RWSTD_PATH_SEP) += "LC_COLLATE"; (outname += _RWSTD_PATH_SEP) += "LC_COLLATE"; copy_file(name, outname); collate_written_ = true; break; case LC_MONETARY: (name += _RWSTD_PATH_SEP) += "LC_MONETARY"; (outname += _RWSTD_PATH_SEP) += "LC_MONETARY"; copy_file(name, outname); mon_written_ = true; break; case LC_NUMERIC: (name += _RWSTD_PATH_SEP) += "LC_NUMERIC"; (outname += _RWSTD_PATH_SEP) += "LC_NUMERIC"; copy_file(name, outname); num_written_ = true; break; case LC_TIME: (name += _RWSTD_PATH_SEP) += "LC_TIME"; (outname += _RWSTD_PATH_SEP) += "LC_TIME"; copy_file(name, outname); time_written_ = true; break; #ifdef LC_MESSAGES case LC_MESSAGES: (name += _RWSTD_PATH_SEP) += "LC_MESSAGES"; (outname += _RWSTD_PATH_SEP) += "LC_MESSAGES"; copy_file(name, outname); messages_written_ = true; break; #endif // LC_MESSAGES default: break; } } // strip a pair, which should be in the form '(,)' void Def::strip_pair (const std::string &tok, std::string &sym, std::string &sym2) { std::size_t i = 0; if(tok[i] == '(') { if(tok[++i] == '<') while (tok[i] != '>'){ if (tok[i] == scanner_.escape_char ()) i++; sym.push_back(tok[i++]); } // this push_back is safe because the while loop above ends when // tok[i] == '>' sym.push_back(tok[i++]); if (tok[i++] != ',') issue_diag (E_PAIR, true, &next, "invalid pair %s\n", tok.c_str()); if (tok[i] == '<') while (tok[i] != '>'){ if (tok[i] == scanner_.escape_char ()) sym2.push_back(tok[i++]); if ('\0' != tok[i]) sym2.push_back(tok[i++]); else issue_diag (E_PAIR, true, &next, "invalid pair %s\n", tok.c_str()); } // this push_back is safe because the while loop above ends when // tok[i] == '>' sym2.push_back(tok[i++]); } } // converts str, which is a string in the following format // "[][char]" including the quotes to a string of characters // str is not a const reference because if the string spans multiple lines // str is modified std::string Def::convert_string (const std::string &str1) { assert (str1[0] == '\"'); std::string ret; std::string sym; // the index starts at 1 so that we ignore the initial '"' int idx = 1; const char* str = str1.c_str(); while (str[idx] != '\"') { sym.clear(); // if we reach the null-terminator before we see an end-quote // then we must have a multi-line string, so get the next token if (str[idx] == '\0') { if((next = scanner_.next_token()).token == Scanner::tok_string) break; str = next.name.c_str(); idx = 0; } // '<' marks the beginning of a symbolic name // construct the name and look up its value in the cmap if (str[idx] == '<') { while (str [idx] && str [idx] != '>') { if (str[idx] == scanner_.escape_char ()) idx++; sym += str[idx++]; } // this is safe because the while loop ended with *str == '>' if (str [idx]) sym += str [idx++]; w_cmap_iter w_pos = charmap_.get_w_cmap().find (sym); if (w_pos != charmap_.get_w_cmap().end()) { ret += convert_to_ext(w_pos->second); } else { return std::string(); } } // the definition file contains a sting with non-symbol names. // process each character as it's actual character value. // Locale definitions that use this may not be portable. else { ret += (char)str[idx++]; } } return ret; } #ifndef _RWSTD_NO_WCHAR_T // converts a collating element definition to an array of wide characters // (the wide characters the collating element is composed of). // this overload deals with collating elements defined through // a sequence of symbolic names, NOT enclosed within quotes. std::wstring Def::convert_wstring (const StringVector& sym_array) { std::wstring ret; StringVector::const_iterator it = sym_array.begin (); while (it != sym_array.end ()) { // lookup the symbol we just constructed w_cmap_iter w_pos = charmap_.get_w_cmap().find (*it); if (w_pos != charmap_.get_w_cmap().end()) { ret += w_pos->second; it++; } else { // we return an empty string if we couldn't find any character // in the character map ret.clear(); return ret; } } return ret; } // this overload deals with collating elements defined through // a sequence of characters or symbolic names, enclosed within quotes. std::wstring Def::convert_wstring (const token_t& t) { std::wstring ret; std::string sym; std::string str1 (t.name); int idx = 0; char term = 0; const char* str = str1.c_str(); // skip first character if quote if (str[idx] == '\"') { term = '\"', idx++; } while (str[idx] != term) { sym.clear(); // '<' marks the beginning of a symbolic name // construct the name and look up its value in the cmap if (str[idx] == '<') { while (str[idx] != '>') { if (str[idx] == scanner_.escape_char ()) { // sym += str[idx++]; idx++; } if ('\0' != str[idx]) sym += str[idx++]; else issue_diag (E_SYMEND, true, &t, "end of symbolic name not found\n"); } // this is safe because the while loop ended with *str == '>' sym += str[idx++]; // lookup the symbol we just constructed w_cmap_iter w_pos = charmap_.get_w_cmap().find (sym); if (w_pos != charmap_.get_w_cmap().end()) { ret += w_pos->second; } else { // if we can't find a symbol then return an empty string, // most likely this will happen if inside a collating-element // the user uses a character that is not in the current // codeset, in this case the collating element will be ignored ret.clear(); return ret; } } // the definition file contains a string with non-symbol names. // process each character as it's actual character value. // Locale definitions that use this may not be portable. else ret += (wchar_t)str[idx++]; } return ret; } #endif // _RWSTD_NO_WCHAR_T // automatically fill any categories that depend on other categories void Def::auto_fill () { mask_iter mask_pos; for (std::size_t i = 0; i <= UCHAR_MAX; i++) { if ( ctype_out_.mask_tab[i] & std::ctype_base::upper || ctype_out_.mask_tab[i] & std::ctype_base::lower || ctype_out_.mask_tab[i] & std::ctype_base::alpha || ctype_out_.mask_tab[i] & std::ctype_base::digit || ctype_out_.mask_tab[i] & std::ctype_base::xdigit || ctype_out_.mask_tab[i] & std::ctype_base::punct) ctype_out_.mask_tab[i] |= std::ctype_base::print; if ( ctype_out_.mask_tab[i] & std::ctype_base::upper || ctype_out_.mask_tab[i] & std::ctype_base::lower) ctype_out_.mask_tab[i] |= std::ctype_base::alpha; if ( ctype_out_.mask_tab[i] & std::ctype_base::upper || ctype_out_.mask_tab[i] & std::ctype_base::lower || ctype_out_.mask_tab[i] & std::ctype_base::alpha || ctype_out_.mask_tab[i] & std::ctype_base::digit || ctype_out_.mask_tab[i] & std::ctype_base::xdigit || ctype_out_.mask_tab[i] & std::ctype_base::punct) ctype_out_.mask_tab[i] |= std::ctype_base::graph; } for (mask_pos = mask_.begin(); mask_pos != mask_.end(); mask_pos++) { // all lower, alpha, digit, xdigit, and punct, and space // characters are automatically print if ( mask_pos->second & std::ctype_base::upper || mask_pos->second & std::ctype_base::lower || mask_pos->second & std::ctype_base::alpha || mask_pos->second & std::ctype_base::digit || mask_pos->second & std::ctype_base::xdigit || mask_pos->second & std::ctype_base::punct) // || mask_pos->second & std::ctype_base::space) mask_pos->second |= std::ctype_base::print; // all upper and lower characters are alpha if ( mask_pos->second & std::ctype_base::upper || mask_pos->second & std::ctype_base::lower) mask_pos->second |= std::ctype_base::alpha; // all upper, lower, alpha, digit, xdigit, and punct characters // are graph characters if ( mask_pos->second & std::ctype_base::upper || mask_pos->second & std::ctype_base::lower || mask_pos->second & std::ctype_base::alpha || mask_pos->second & std::ctype_base::digit || mask_pos->second & std::ctype_base::xdigit || mask_pos->second & std::ctype_base::punct) mask_pos->second |= std::ctype_base::graph; } } void Def::process_input () { while ((next = scanner_.next_token ()).token != Scanner::tok_end_tokens) { switch (next.token) { case Scanner::tok_comment: scanner_.ignore_line (); break; case Scanner::tok_ctype: process_ctype (); break; case Scanner::tok_collate: process_collate (); break; case Scanner::tok_monetary: process_monetary (); break; case Scanner::tok_numeric: process_numeric (); break; case Scanner::tok_time: process_time (); break; case Scanner::tok_messages: process_messages (); break; case Scanner::tok_nl: break; default: scanner_.ignore_line (); break; } } auto_fill (); } Def::Def (const char* filename, const char* out_name, Charmap& char_map, bool no_position) : warnings_occurred_ (false), scan_ahead_ (false), next_offset_ (0), output_name_ (out_name), charmap_ (char_map), ctype_written_ (false), codecvt_written_ (false), collate_written_ (false), time_written_ (false), num_written_ (false), mon_written_ (false), messages_written_ (false), ctype_def_found_ (false), collate_def_found_ (false), time_def_found_ (false), num_def_found_ (false), mon_def_found_ (false), messages_def_found_ (false), undefined_keyword_found_ (false), no_position_ (no_position) { // make sure ctype_out object is cleared std::memset (&ctype_out_, 0, sizeof (ctype_out_)); std::memset (&time_out_, 0, sizeof (time_out_)); // invalidate format characters by setting each to CHAR_MAX // as specified by the C function localeconv() mon_out_.frac_digits [0] = CHAR_MAX; mon_out_.frac_digits [1] = CHAR_MAX; mon_out_.p_cs_precedes [0] = CHAR_MAX; mon_out_.p_sep_by_space [0] = CHAR_MAX; mon_out_.n_cs_precedes [0] = CHAR_MAX; mon_out_.n_sep_by_space [0] = CHAR_MAX; mon_out_.p_sign_posn [0] = CHAR_MAX; mon_out_.n_sign_posn [0] = CHAR_MAX; mon_st_.mon_grouping += CHAR_MAX; // invalidate int'l formats mon_out_.p_cs_precedes [1] = CHAR_MAX; mon_out_.p_sep_by_space [1] = CHAR_MAX; mon_out_.n_cs_precedes [1] = CHAR_MAX; mon_out_.n_sep_by_space [1] = CHAR_MAX; mon_out_.p_sign_posn [1] = CHAR_MAX; mon_out_.n_sign_posn [1] = CHAR_MAX; num_st_.grouping += CHAR_MAX; collate_out_.largest_ce = 1; collate_out_.longest_weight = 1; collate_out_.num_wchars = 0; std::memset (collate_out_.weight_type, 0, sizeof (collate_out_.weight_type)); // initialize all extensions to 0 ctype_out_.ctype_ext_off = 0; num_out_.numeric_ext_off = 0; collate_out_.collate_ext_off = 0; mon_out_.monetary_ext_off = 0; time_out_.time_ext_off = 0; // actual processing scanner_.open (filename); } Def::~Def () { // free up the memory that was allocated coll_map_iter coll_map_pos; for (coll_map_pos = coll_map_.begin(); coll_map_pos != coll_map_.end(); coll_map_pos ++) { delete[] (coll_map_pos->second.weights); } }