/*************************************************************************** * * collate.cpp * * $Id: collate.cpp 648752 2008-04-16 17:01:56Z faridz $ * *************************************************************************** * * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed * with this work for additional information regarding copyright * ownership. The ASF licenses this file to you under the Apache * License, Version 2.0 (the "License"); you may not use this file * except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. * * Copyright 2001-2008 Rogue Wave Software, Inc. * **************************************************************************/ #include "def.h" // for Def #include "diagnostic.h" // for issue_diag() #include "path.h" // for get_pathname() #include "scanner.h" // for scanner #include // for ofstream #include // for cout #include // for string #include // for vector #include // for assert() #include // for toupper() #include // for sprintf() #include // for strtoul() #include // for strchr() void Def:: create_wchar_utf8_table () { if (!wchar_utf8_to_ext_.empty()) return; typedef std::map::const_iterator n_cmap_citer2; const n_cmap_citer2 n_cmap_end = charmap_.get_mb_cmap ().end (); for (n_cmap_citer2 n_cmap_it = charmap_.get_mb_cmap ().begin (); n_cmap_it != n_cmap_end; ++n_cmap_it) { const std::string wchar_utf8 = utf8_encode (n_cmap_it->second); wchar_utf8_to_ext_.insert (std::make_pair (wchar_utf8, n_cmap_it->first)); } } void Def::process_weights (collate_entry_t& entry) { // iterator of weights tokens token_list_t::iterator w_it = entry.second.begin (); w_cmap_iter wcmap_it; ce_map_iter ce_map_it; coll_map_iter coll_map_it; Weights_t* weights = new Weights_t[collate_out_.num_weights]; std::size_t i = 0; for (i = 0; i < collate_out_.num_weights && w_it != entry.second.end (); ++i, ++w_it){ get_weight (*w_it, weights, i); } const std::map& w_cmap = charmap_.get_w_cmap (); // the remaining weights - see above - are given as weight the // collating value of the symbol at start of line while (i < collate_out_.num_weights) { weights[i].size = 1; wcmap_it = w_cmap.find (entry.first.name); if (wcmap_it != w_cmap.end()) { coll_map_it = coll_map_.find (wcmap_it->second); weights[i].weight[0] = coll_map_it->second.coll_val; } else if ((ce_map_it = ce_map_.find (entry.first.name)) != ce_map_.end()) { weights[i].weight[0] = ce_map_it->second.coll_val; } else { warnings_occurred_ = issue_diag (W_COLSYM, false, &entry.first, "symbolic name %s not defined as character or " "collating-element, ignoring\n", entry.first.name.c_str()) || warnings_occurred_; } i++; } wcmap_it = w_cmap.find (entry.first.name); if (wcmap_it != w_cmap.end()) { coll_map_it = coll_map_.find (wcmap_it->second); coll_map_it->second.weights = weights; } else if ((ce_map_it = ce_map_.find (entry.first.name)) != ce_map_.end()) { ce_map_it->second.weights = weights; } else { warnings_occurred_ = issue_diag (W_COLSYM, false, &entry.first, "symbolic name %s not defined as character or " "collating-element, ignoring\n", entry.first.name.c_str()) || warnings_occurred_; } } unsigned int Def::process_order_stmt (collate_section_t& section) { // number of orders in the section collate_out_.num_weights = section.order.size (); if (collate_out_.num_weights == 0) { collate_out_.num_weights = 1; collate_out_.weight_type[0] = 0; } else { collate_out_.num_weights = 0; token_list_t::iterator ord_it = section.order.begin (); for (; ord_it != section.order.end (); ++ord_it, ++collate_out_.num_weights) { if (ord_it->token == Scanner::tok_forward) { collate_out_.weight_type[collate_out_.num_weights] = 0; } else if (ord_it->token == Scanner::tok_backward) { collate_out_.weight_type[collate_out_.num_weights] = 1; } else if (ord_it->token == Scanner::tok_position) { // this is equivalent to "forward,position" if (no_position_) collate_out_.weight_type[collate_out_.num_weights] = 0; else collate_out_.weight_type[collate_out_.num_weights] = 2; } else { std::string::size_type comma_pos = ord_it->name.find (','); if (comma_pos != std::string::npos) { std::string first = ord_it->name.substr (0, comma_pos); std::string second = ord_it->name.substr (comma_pos + 1); // verify a valid weight ordering if ((first == "forward" && second != "position") || first == "backward" && second != "position") { issue_diag (E_COLORD, true, &*ord_it, "unrecognized collating order\n"); break; } else if (first == "forward") { if (no_position_) collate_out_.weight_type [collate_out_.num_weights] = 0; else collate_out_.weight_type [collate_out_.num_weights] = 2; } else collate_out_.weight_type[collate_out_.num_weights] = 3; } else { issue_diag (E_COLORD, true, &*ord_it, "unrecognized collating order\n"); } } } } // build the order value; unsigned int order = 0; for (int i = 0; i < collate_out_.num_weights; ++i) { order <<= 2; order |= collate_out_.weight_type[i]; } return order; } // decimally increment the symbolic name, turning something like // into , or to static std::string dec_increment (const std::string &sym) { const char *pdig = sym.c_str (); for (; !(std::isdigit (*pdig)); ++pdig); char *end; const unsigned long sym_val = std::strtoul (pdig, &end, 10); char numstr [64]; std::sprintf (numstr, "%lu", sym_val + 1); std::string next_sym = sym.substr (0, pdig - sym.c_str ()); next_sym.append (numstr); next_sym.append (end); return next_sym; } // hexadecimally increment the symbolic name static std::string hex_increment (const std::string& sym) { const char *pdig = sym.c_str (); for (; !(std::isxdigit (*pdig)); ++pdig); char *end; const unsigned long sym_val = std::strtoul (pdig, &end, 16); char numstr [64]; std::sprintf (numstr, "%lx", sym_val + 1); std::string next_sym = sym.substr (0, pdig - sym.c_str ()); next_sym.append (numstr); next_sym.append (end); return next_sym; } // scanning ahead and adding all symbols to the collating symbols map void Def::preprocess_collation_definitions() { // start with collation values from 2 unsigned int coll_value = 2; // previous_elm and next_elm are used for processing ellipsis. std::string prev_elm; std::string next_elm; static unsigned int max_orders = 0; token_list_t::iterator cs_it = sym_list_.begin (); for (; cs_it != sym_list_.end (); ++cs_it) { cs_map_iter csm_it = cs_map_.find (cs_it->name); if (csm_it != cs_map_.end ()) csm_it->second = coll_value++; } std::list::iterator s_it = section_list_.begin (); if (s_it != section_list_.end()) { // get maximum weights count max_orders = s_it->order.size (); } for (; s_it != section_list_.end (); ++s_it) { if (max_orders != s_it->order.size ()) issue_diag (E_COLNUM, true, &*s_it->order.begin (), "number of collation orders " "different for this section\n"); } const std::map& w_cmap = charmap_.get_w_cmap (); // preprocess sections' collation definitions for (s_it = section_list_.begin (); s_it != section_list_.end (); ++s_it) { // process the order statement and get the weight unsigned int order = process_order_stmt (*s_it); std::list::iterator e_it = s_it->entries.begin (); for (; e_it != s_it->entries.end (); ++e_it) { switch (e_it->first.token) { case Scanner::tok_sym_name: // store the previous name prev_elm = e_it->first.name; // process each of the collation definitions process_collation_definition (false, *e_it, coll_value++, order); break; case Scanner::tok_abs_ellipsis: // "..." case Scanner::tok_hex_ellipsis: // ".." case Scanner::tok_dec_ellipsis: // "...." { if (++e_it != s_it->entries.end ()) next_elm = e_it->first.name; else { issue_diag (E_SYNTAX, true, &e_it->first, "unexpected end of collation section while" " processing ellipsis during scan_ahead\n"); } if (e_it->first.token == Scanner::tok_abs_ellipsis) { wchar_t w_cmap_cur_val = w_cmap.find(prev_elm)->second; wchar_t w_cmap_end_val = w_cmap.find(next_elm)->second; // the first value has already been // added so don't add it again w_cmap_cur_val = charmap_.increment_wchar (w_cmap_cur_val); while (w_cmap_cur_val != w_cmap_end_val) { // process iteration collate_info_t ci = {UINT_MAX, UINT_MAX, 0, 0}; ci.coll_val = coll_value++; ci.order = order; coll_map_.insert (std::make_pair (w_cmap_cur_val, ci)); w_cmap_cur_val = charmap_.increment_wchar (w_cmap_cur_val); } // add last element "next_elm" to array collate_info_t ci = {UINT_MAX, UINT_MAX, 0, 0}; ci.coll_val = coll_value++; ci.order = order; coll_map_.insert (std::make_pair (w_cmap_cur_val, ci)); } else { // we are incrementing the symbolic names std::string next_name = prev_elm; do { if (e_it->first.token == Scanner::tok_hex_ellipsis) next_name = hex_increment (next_name); else next_name = dec_increment (next_name); w_cmap_iter it = w_cmap.find (next_name); if (it != w_cmap.end()) { // process iteration collate_info_t ci = {UINT_MAX, UINT_MAX, 0, 0}; ci.coll_val = coll_value++; ci.order = order; coll_map_.insert ( std::make_pair (it->second, ci)); } } while (next_name != next_elm); } prev_elm = next_elm; break; } case Scanner::tok_dbl_ellipsis: issue_diag (W_NOTSUP, true, &e_it->first, "ellipsis not supported" " processing ellipsis during scan_ahead\n"); default: break; } } } } void Def::process_collation_definition ( bool do_weights, collate_entry_t& entry, unsigned int coll_value, unsigned int order) { const std::map& w_cmap = charmap_.get_w_cmap (); // iterators w_cmap_iter w_cmap_pos = w_cmap.find(entry.first.name); ce_map_iter ce_map_pos = ce_map_.find(entry.first.name); // look up the symbolic name in the wide character map if (w_cmap_pos != w_cmap.end()) { wchar_t wval = w_cmap_pos->second; coll_map_iter coll_map_pos = coll_map_.find (wval); coll_map_pos->second.coll_val = coll_value; coll_map_pos->second.order = order; // process the weights if (do_weights) { unsigned int offset = next_offset_++; coll_map_pos->second.offset = offset; off_mapr_.insert(std::make_pair(offset, entry.first.name)); process_weights (entry); } } else if (ce_map_pos != ce_map_.end()) { ce_map_pos->second.coll_val = coll_value; ce_map_pos->second.order = order; // process the weights if (do_weights) { unsigned int offset = next_offset_++; ce_map_pos->second.offset = offset; off_mapr_.insert(std::make_pair(offset, entry.first.name)); process_weights (entry); } } else if (cs_map_.find(entry.first.name) != cs_map_.end()) { cs_map_.find(entry.first.name)->second = coll_value; } else { // the symbol is missing from all three maps; // add it in cs map as "sym <-> collating_value" pair cs_map_.insert (std::make_pair(entry.first.name, coll_value)); warnings_occurred_ = issue_diag (W_COLSYM, false, &entry.first, "unknown symbol name %s found in " "LC_COLLATE definition\n", entry.first.name.c_str()) || warnings_occurred_; } } void Def::process_order(collate_section_t& section, unsigned int& coll_value) { std::string sym; // get the orders unsigned int section_order = process_order_stmt (section); // current element std::string curr_elm; // previous_elm and next_elm are used for processing ellipsis. std::string prev_elm; std::string next_elm; const std::map& w_cmap = charmap_.get_w_cmap (); // iterate thru section's entries std::list::iterator e_it = section.entries.begin (); for (; e_it != section.entries.end (); ++e_it) { if (e_it->first.token == Scanner::tok_sym_name) { prev_elm = e_it->first.name; process_collation_definition (true, *e_it, coll_value++, section_order); } else if ( e_it->first.token == Scanner::tok_abs_ellipsis || e_it->first.token == Scanner::tok_dec_ellipsis || e_it->first.token == Scanner::tok_hex_ellipsis) { // process line with ellipsis Weights_t* weights_template = new Weights_t[collate_out_.num_weights]; std::vector ordinal_weights; token_list_t::iterator w_it = e_it->second.begin (); for (int i = 0; i < collate_out_.num_weights && w_it != e_it->second.end (); ++i, ++w_it) { ordinal_weights.push_back ( get_weight (*w_it,weights_template, i)); } while (ordinal_weights.size() < collate_out_.num_weights) ordinal_weights.push_back (true); // next line if (++e_it != section.entries.end ()) next_elm = e_it->first.name; else { issue_diag (E_SYNTAX, true, &e_it->first, "unexpected end of collation section while" " processing ellipsis\n"); } if (e_it->first.token == Scanner::tok_abs_ellipsis) { wchar_t w_cmap_cur_val = w_cmap.find(prev_elm)->second; wchar_t w_cmap_end_val = w_cmap.find(next_elm)->second; // the first value has already been added so don't add it again w_cmap_cur_val = charmap_.increment_wchar (w_cmap_cur_val); while (w_cmap_cur_val != w_cmap_end_val) { add_to_coll (w_cmap_cur_val, weights_template, coll_value++, ordinal_weights, false); w_cmap_cur_val = charmap_.increment_wchar (w_cmap_cur_val); } // add the end element to the collation array. add_to_coll (w_cmap_end_val, weights_template, coll_value++, ordinal_weights, false); } else { // we are incrementing the symbolic names std::string next_name = prev_elm; do { if (e_it->first.token == Scanner::tok_hex_ellipsis) next_name = hex_increment (next_name); else next_name = dec_increment (next_name); w_cmap_iter it = w_cmap.find (next_name); if (it != w_cmap.end()) { add_to_coll (it->second, weights_template, coll_value++, ordinal_weights, false); } } while (next_name != next_elm); } // store current left-hand symbol next_elm = prev_elm; delete[] weights_template; } else if (e_it->first.token == Scanner::tok_undefined) { // add all characters that were not explicitely given a collation // value in increasing order based on their encoded values undefined_keyword_found_ = true; collate_out_.undefined_optimization = true; undef_char_info_.order = section_order; Weights_t* weights_template = new Weights_t[collate_out_.num_weights]; std::vector ordinal_weights; token_list_t::iterator w_it = e_it->second.begin (); for (int i = 0; i < collate_out_.num_weights && w_it != e_it->second.end(); ++i, ++w_it) { if (w_it->token == Scanner::tok_abs_ellipsis) collate_out_.undefined_optimization = false; ordinal_weights.push_back ( !get_weight (*w_it, weights_template, i)); } while (ordinal_weights.size() < collate_out_.num_weights) ordinal_weights.push_back (false); add_missing_values (ordinal_weights, weights_template, coll_value, false); delete[] weights_template; } } } void Def::add_missing_values (const std::vector &ordinal_weights, const Weights_t* weights_template, unsigned int &coll_value, bool give_warning) { const std::map& w_cmap = charmap_.get_w_cmap (); // we want to print a warning message once if there are characters that // were not given collation values. bool warning_issued = false; symnames_list_iter symnames_it; for (symnames_it = charmap_.get_symnames_list ().begin (); symnames_it != charmap_.get_symnames_list ().end (); ++symnames_it) { std::map::const_iterator w_cmap_it; if ((w_cmap_it = w_cmap.find(*symnames_it)) != w_cmap.end()) { wchar_t wchar_val = (*w_cmap_it).second; coll_map_iter coll_map_it; if ((coll_map_it = coll_map_.find(wchar_val)) != coll_map_.end()) { if (coll_map_it->second.offset == UINT_MAX) { if (give_warning && !warning_issued) { warning_issued = true; warnings_occurred_ = issue_diag (W_MISSING, false, 0, "some characters in the codeset " "were not explicitly given a " "collation value\n") || warnings_occurred_; } if (!collate_out_.undefined_optimization) { add_to_coll (wchar_val, weights_template, coll_value++, ordinal_weights, true); } } } } } if (collate_out_.undefined_optimization){ collate_out_.undefined_weight_idx = next_offset_++; undef_char_info_.offset = collate_out_.undefined_weight_idx; undef_char_info_.coll_val = coll_value++; // initialize the weight template with the undefined collation info Weights_t* weights = new Weights_t[collate_out_.num_weights]; if (0 != weights_template) { for (int k = 0; k < collate_out_.num_weights; ++k) { weights[k].size = weights_template[k].size; for (int j = 0; j < 256; ++j) weights[k].weight[j] = weights_template[k].weight[j]; } } std::vector::const_iterator it; // now go through the vector that tells us if a particular // weight is based on the ordinal position of the element // and place the correct ordinal value in the weight vector // if it is. int i = 0; for (it = ordinal_weights.begin(); it != ordinal_weights.end(); ++it, ++i) { // FIXME: this may not be correct if (!*it){ weights[i].size = 1; weights[i].weight[0] = undef_char_info_.coll_val; } } undef_char_info_.weights = weights; } } void Def::add_to_coll (const wchar_t w_cmap_cur_val, const Weights_t* weights_template, const unsigned int coll_value, const std::vector& ordinal_weights, bool undefined_value) { Weights_t* weights = new Weights_t[collate_out_.num_weights]; if (0 != weights_template) { for (int k = 0; k < collate_out_.num_weights; ++k) { weights[k].size = weights_template[k].size; for (int j = 0; j < 256; ++j) weights[k].weight[j] = weights_template[k].weight[j]; } } // get the symbolic name from the value std::string name = charmap_.get_rw_cmap().find (w_cmap_cur_val)->second; coll_map_.find (w_cmap_cur_val)->second.coll_val = coll_value; unsigned int offset; if (undefined_value && collate_out_.undefined_optimization) offset = next_offset_; else offset = next_offset_++; const std::map& w_cmap = charmap_.get_w_cmap (); w_cmap_iter w_cmap_pos = w_cmap.find(name); coll_map_iter coll_it = coll_map_.find (w_cmap_pos->second); coll_it->second.offset = offset; off_mapr_.insert(std::make_pair(offset, name)); std::vector::const_iterator it; // now go through the vector that tells us if a particular // weight is based on the ordinal position of the element // and place the correct ordinal value in the weight vector // if it is. int i = 0; for (it = ordinal_weights.begin(); it != ordinal_weights.end(); ++it, ++i) { if (*it){ weights[i].size = 1; weights[i].weight[0] = coll_value; } } coll_it->second.weights = weights; } // if undefined optimization is on then only those characters that have // been defined should go into the valid set, otherwise all characters go // into the set. void Def::gen_valid_coll_mb_set () { if (!valid_coll_mb_set_.empty()) return; for (coll_map_iter coll_it = coll_map_.begin(); coll_it != coll_map_.end(); ++coll_it) { if (coll_it->second.offset != UINT_MAX || !collate_out_.undefined_optimization){ rmb_cmap_iter rn_cmap_it = charmap_.get_rmb_cmap().find (coll_it->first); std::string valid = rn_cmap_it->second.substr (0, rn_cmap_it->second.size() - 1); while (valid.size() > 0){ valid_coll_mb_set_.insert (valid); valid = valid.substr(0, valid.size() - 1); } } } } // In processing the collate section of the locale definition file // we will filter out the characters that are not present in the // charmap file; these characters when encountered on the left-hand side // of a weight definition for a symbolic name will be left in place // and used as a mere collating symbol for the rest of the parsing - // i.e. its weights ignored and assigned the weight corresponding to // its position in the collate section. void Def::process_collate() { issue_diag (I_STAGE, false, 0, "processing LC_COLLATE section\n"); // update flags collate_def_found_ = true; // initialization init_coll_map (); // first preprocess the collate section preprocess_collate (); const std::map& w_cmap = charmap_.get_w_cmap (); // process the collating symbols list token_list_t::iterator cs_it = cs_list_.begin (); for (;cs_it != cs_list_.end (); ++cs_it) { if (w_cmap.find (cs_it->name) != w_cmap.end()) issue_diag (E_SYNTAX, true, &*cs_it, "collating-symbol %s found in character map\n", cs_it->name.c_str ()); else if (ce_map_.find (cs_it->name) != ce_map_.end()) issue_diag (E_SYNTAX, true, &*cs_it, "collating-symbol %s has already been defined as " "a collating-element\n", cs_it->name.c_str ()); cs_map_.insert (std::make_pair (cs_it->name, 0)); } // process the collating elements list std::list::iterator ce_it = ce_list_.begin (); for (; ce_it != ce_list_.end (); ++ce_it) { if (w_cmap.find (ce_it->first.name) != w_cmap.end()) { issue_diag (E_SYNTAX, true, &ce_it->first, "collating element %s found in charmap\n", ce_it->first.name.c_str()); } // collating element info to be stored ce_info_t ce_tmp; ce_tmp.offset = UINT_MAX; ce_tmp.coll_val = UINT_MAX; ce_tmp.order = 0; ce_tmp.weights = 0; // array of symbolic names std::vector ce_sym_array; // get the definition of this collating element token_list_t& ce_def_list = ce_it->second; token_list_t::iterator ce_def_it = ce_def_list.begin (); bool invalid = false; if (ce_def_it->token == Scanner::tok_string) { if ((ce_tmp.ce_wstr = convert_wstring (*ce_def_it)).empty()) invalid = true; } else if (ce_def_it->token == Scanner::tok_sym_name) { for (; ce_def_it != ce_def_list.end (); ++ce_def_it) ce_sym_array.push_back (ce_def_it->name); if ((ce_tmp.ce_wstr = convert_wstring (ce_sym_array)).empty()) invalid = true; } else { issue_diag (E_SYNTAX, true, &*ce_def_it, "illegal collating-element expression: %s\n", ce_def_it->name.c_str ()); } if (invalid) issue_diag (W_SYNTAX, false, &*ce_def_it, "character in collating element definition " "not found in character map\n"); if (!invalid) { ce_map_.insert (std::make_pair(ce_it->first.name, ce_tmp)); std::string ce_str; std::string ce_utf8_wstr; for (unsigned int i = 0; i < ce_tmp.ce_wstr.size(); ++i){ ce_str += convert_to_ext(ce_tmp.ce_wstr[i]); ce_utf8_wstr += utf8_encode (ce_tmp.ce_wstr[i]); } ce_sym_map_.insert (std::make_pair(ce_str, ce_it->first.name)); ce_wsym_map_.insert (std::make_pair (ce_utf8_wstr, ce_it->first.name)); } } // preprocess the collation definitions and make known all // the symbolic names available preprocess_collation_definitions(); // collation values 0 and 1 are reserved unsigned int coll_value = 2; // FIXME: assign values to the collating-elements AGAIN cs_it = sym_list_.begin (); for (; cs_it != sym_list_.end (); ++cs_it) { cs_map_iter csm_it = cs_map_.find (cs_it->name); if (csm_it != cs_map_.end ()) csm_it->second = coll_value++; } // process the sections std::list::iterator sect_it = section_list_.begin (); for (; sect_it != section_list_.end (); ++sect_it) process_order (*sect_it, coll_value); // check to make sure that all the elements in the codeset were added if (!undefined_keyword_found_) { std::vector ordinal_weights; for (int i = 0; i < collate_out_.num_weights; ++i) ordinal_weights.push_back (false); collate_out_.undefined_optimization = true; add_missing_values (ordinal_weights, 0, coll_value, true); } } void Def::gen_n_to_w_coll_tables (const std::string &charp, unsigned int tab_num) { gen_valid_coll_mb_set(); offset_tab_t tab; for (unsigned int k = 0; k <= UCHAR_MAX; ++k) { tab.off[k] = UINT_MAX; } tab.first_offset = -1; mb_cmap_iter n_cmap_it; for (unsigned int i = 0; i <= UCHAR_MAX; ++i){ unsigned char cur_char = (unsigned char)i; std::string mb_char = charp; mb_char += (char)cur_char; if ((n_cmap_it = charmap_.get_mb_cmap().find (mb_char)) != charmap_.get_mb_cmap().end()) { coll_map_iter coll_map_it = coll_map_.find (n_cmap_it->second); if (coll_map_it->second.offset != UINT_MAX || !collate_out_.undefined_optimization) { tab.off[cur_char] = coll_map_it->second.offset; if (tab.first_offset == -1) tab.first_offset = cur_char; } } else { if (valid_coll_mb_set_.find (mb_char) != valid_coll_mb_set_.end()){ ++next_tab_num_; tab.off[cur_char] = next_tab_num_ | 0x80000000; gen_n_to_w_coll_tables (mb_char, next_tab_num_); if (tab.first_offset == -1) tab.first_offset = cur_char; } else { tab.off[cur_char] = UINT_MAX; } } } char_offs_.insert (std::make_pair (tab_num, tab)); } void Def::gen_w_to_n_coll_tables (const std::string &charp, unsigned int tab_num) { // initialize a table used in ctype and collate create_wchar_utf8_table(); gen_valid_coll_wchar_set(); offset_tab_t tab; for (unsigned int k = 0; k <= UCHAR_MAX; ++k) tab.off[k] = UINT_MAX; tab.first_offset = -1; wchar_utf8_iter wu_it; mb_cmap_iter n_cmap_it; for (unsigned int i = 0; i <= UCHAR_MAX; ++i) { unsigned char cur_char = (unsigned char)i; std::string mb_char = (charp); mb_char += (char)cur_char; wu_it = wchar_utf8_to_ext_.find (mb_char); if (wu_it != wchar_utf8_to_ext_.end()) { n_cmap_it = charmap_.get_mb_cmap().find (wu_it->second); if (n_cmap_it != charmap_.get_mb_cmap().end ()) { coll_map_iter coll_map_it = coll_map_.find (n_cmap_it->second); if (coll_map_it->second.offset != UINT_MAX || !collate_out_.undefined_optimization) { tab.off[cur_char] = coll_map_it->second.offset; if (tab.first_offset == -1) tab.first_offset = cur_char; } } } else if (valid_coll_wchar_set_.find (mb_char) != valid_coll_wchar_set_.end()) { ++next_wchar_coll_tab_num_; tab.off[cur_char] = next_wchar_coll_tab_num_ | 0x80000000; gen_w_to_n_coll_tables (mb_char, next_wchar_coll_tab_num_); if (tab.first_offset == -1) tab.first_offset = cur_char; } else tab.off[cur_char] = UINT_MAX; } w_to_n_coll_.insert (std::make_pair (tab_num, tab)); } void Def::gen_n_ce_tables (const std::setce_elms, unsigned int idx, unsigned int tab_num) { if (ce_elms.size() > 0) { ce_offset_tab_t tab; for (unsigned int k = 0; k <= UCHAR_MAX; ++k) { tab.off[k] = UINT_MAX; } tab.first_offset = -1; tab.last_offset = 0; ce_sym_map_iter ce_sym_map_it; ce_map_iter ce_map_it; std::set::const_iterator ce_elms_it; std::set next_elms; for (unsigned int i = 0; i <= UCHAR_MAX; ++i) { next_elms.clear(); for (ce_elms_it = ce_elms.begin(); ce_elms_it != ce_elms.end(); ce_elms_it ++) { if ((unsigned char)i == (unsigned char)((*ce_elms_it)[idx])) { if ((*ce_elms_it).size() == idx + 1) { if ((ce_sym_map_it = ce_sym_map_.find (*ce_elms_it)) != ce_sym_map_.end()) { ce_map_it = ce_map_.find(ce_sym_map_it->second); tab.off[i] = ce_map_it->second.offset; if (tab.first_offset == -1) tab.first_offset = i; if ((unsigned int)tab.last_offset < i) tab.last_offset = i; } } else { next_elms.insert (*ce_elms_it); if (tab.off[i] == UINT_MAX) { ++ next_n_ce_tab_num_; tab.off[i] = next_n_ce_tab_num_ | 0x80000000; } if (tab.first_offset == -1) tab.first_offset = i; if ((unsigned int)tab.last_offset < i) tab.last_offset = i; } } } // now recursively call gen_n_ce_tables if any collating // elements with this character were found; if (next_elms.size() != 0) gen_n_ce_tables (next_elms, idx + 1, next_n_ce_tab_num_); } n_ce_offs_.insert(std::make_pair(tab_num, tab)); } } void Def::gen_w_ce_tables (const std::setce_elms, unsigned int idx, unsigned int tab_num) { if (ce_elms.size() > 0) { ce_offset_tab_t tab; for (unsigned int k = 0; k <= UCHAR_MAX; ++k) { tab.off[k] = UINT_MAX; } tab.first_offset = -1; tab.last_offset = 0; ce_sym_map_iter ce_sym_map_it; ce_map_iter ce_map_it; std::set::const_iterator ce_elms_it; std::set next_elms; for (unsigned int i = 0; i <= UCHAR_MAX; ++i) { next_elms.clear(); for (ce_elms_it = ce_elms.begin(); ce_elms_it != ce_elms.end(); ce_elms_it ++) { if ((unsigned char)i == (unsigned char)(*ce_elms_it)[idx]) { if ((*ce_elms_it).size() == idx + 1) { if ((ce_sym_map_it = ce_wsym_map_.find (*ce_elms_it)) != ce_wsym_map_.end()) { ce_map_it = ce_map_.find(ce_sym_map_it->second); tab.off[i] = ce_map_it->second.offset; if (tab.first_offset == -1) tab.first_offset = i; if ((unsigned int)tab.last_offset < i) tab.last_offset = i; } } else { next_elms.insert (*ce_elms_it); if (tab.off[i] == UINT_MAX) { ++ next_w_ce_tab_num_; tab.off[i] = next_w_ce_tab_num_ | 0x80000000; } if (tab.first_offset == -1) tab.first_offset = i; if ((unsigned int)tab.last_offset < i) tab.last_offset = i; } } } // now recursively call gen_w_ce_tables if any collating // elements with this character were found; if (next_elms.size() != 0) gen_w_ce_tables (next_elms, idx + 1, next_w_ce_tab_num_); } w_ce_offs_.insert(std::make_pair(tab_num, tab)); } } void Def::dump_collate () { std::cout << "LC_COLLATE\n"; const std::map& w_cmap = charmap_.get_w_cmap (); token_list_t::iterator sl_it = sym_list_.begin (); for (; sl_it != sym_list_.end (); ++sl_it) { std::cout << sl_it->name << " % "; cs_map_iter cs_it = cs_map_.find (sl_it->name); if (cs_it != cs_map_.end ()) { std::cout << cs_it->second << '\n'; continue; } ce_map_iter ce_it = ce_map_.find (sl_it->name); if (ce_it != ce_map_.end ()) { std::cout << ce_it->second.coll_val << '\n'; continue; } std::map::const_iterator cw_it = w_cmap.find (sl_it->name); if (cw_it != w_cmap.end ()) { coll_map_iter cm_it = coll_map_.find (cw_it->second); if (cm_it != coll_map_.end ()) { std::cout << cm_it->second.coll_val << '\n'; continue; } } std::cout << '\n'; } std::cout << "\n\n"; std::list::iterator sect_it = section_list_.begin (); while (sect_it != section_list_.end ()) { // dump the orders std::cout << "order_start " << sect_it->name; token_list_t::iterator o_it = sect_it->order.begin (); for (; o_it != sect_it->order.end (); ++o_it) std::cout << ";" << o_it->name; std::cout << '\n'; // for each entry in the entries list collate_entry_list_t::iterator e_it = sect_it->entries.begin (); for (; e_it != sect_it->entries.end (); ++e_it) { // dump the collation definition (weights included) std::cout << e_it->first.name << " "; if (!e_it->second.empty ()) { token_list_t::iterator w_it = e_it->second.begin (); std::cout << (w_it++)->name; for (; w_it != e_it->second.end (); ++w_it) std::cout << ";" << w_it->name; } // lookup the value associated with this collation symbol cs_map_iter cs_it = cs_map_.find (e_it->first.name); if (cs_it != cs_map_.end ()) { std::cout << " % " << cs_it->second << '\n'; continue; } ce_map_iter ce_it = ce_map_.find (e_it->first.name); if (ce_it != ce_map_.end ()) { std::cout << " % " << ce_it->second.coll_val << '\n'; continue; } std::map::const_iterator cw_it = w_cmap.find (e_it->first.name); if (cw_it != w_cmap.end ()) { coll_map_iter cm_it = coll_map_.find (cw_it->second); if (cm_it != coll_map_.end ()) { std::cout << " % " << cm_it->second.coll_val << '\n'; continue; } } std::cout << '\n'; } std::cout << "order_end\n"; ++sect_it; } std::cout << "END LC_COLLATE\n"; } void Def::write_collate (std::string dir_name) { assert (!dir_name.empty()); static const char lc_name[] = "LC_COLLATE"; if (collate_written_) return; if (!collate_def_found_) { issue_diag (I_SKIP, false, 0, "%s section not found, skipping\n", lc_name); return; } next_tab_num_ = 0; next_wchar_coll_tab_num_ = 0; std::set ce_strs; ce_sym_map_iter it; ce_map_iter ce_mit; for (it = ce_sym_map_.begin (); it != ce_sym_map_.end (); ++it) { if ((ce_mit = ce_map_.find (it->second))->second.coll_val != UINT_MAX) { // check to see of the largest_ce needs to be changed if (ce_mit->second.ce_wstr.size() + 1 > collate_out_.largest_ce) collate_out_.largest_ce = ce_mit->second.ce_wstr.size(); ce_strs.insert (it->first); } } next_n_ce_tab_num_ = 0; gen_n_ce_tables(ce_strs, 0, 0); ce_strs.clear(); for (it = ce_wsym_map_.begin (); it != ce_wsym_map_.end (); ++it) { if (ce_map_.find (it->second)->second.coll_val != UINT_MAX) ce_strs.insert (it->first); } next_w_ce_tab_num_ = 0; gen_w_ce_tables (ce_strs, 0, 0); gen_n_to_w_coll_tables ("", next_tab_num_); gen_w_to_n_coll_tables ("", next_wchar_coll_tab_num_); (dir_name += _RWSTD_PATH_SEP) += lc_name; issue_diag (I_OPENWR, false, 0, "writing %s\n", dir_name.c_str ()); std::ofstream out (dir_name.c_str(), std::ios::binary); out.exceptions (std::ios::failbit | std::ios::badbit); unsigned int i; // calculate the size of an individual weight element collate_out_.elm_size = collate_out_.num_weights * collate_out_.longest_weight * sizeof (unsigned int) + sizeof (unsigned int); // the first section of the collate database is the collating // element information collate_out_.n_ce_tab_off = 0; collate_out_.w_ce_tab_off = collate_out_.n_ce_tab_off; n_ce_offs_iter n_ce_offs_it; for (n_ce_offs_it = n_ce_offs_.begin(); n_ce_offs_it != n_ce_offs_.end (); ++n_ce_offs_it) { collate_out_.w_ce_tab_off += (n_ce_offs_it->second.last_offset - n_ce_offs_it->second.first_offset + 1)* sizeof (int); } // next comes the weight information collate_out_.weight_tab_off = collate_out_.w_ce_tab_off; w_ce_offs_iter w_ce_offs_it; for (w_ce_offs_it = w_ce_offs_.begin(); w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) { collate_out_.weight_tab_off += (w_ce_offs_it->second.last_offset - w_ce_offs_it->second.first_offset + 1)* sizeof (int); } coll_map_iter coll_map_pos; collate_out_.num_elms = off_mapr_.size(); if (collate_out_.undefined_optimization) ++collate_out_.num_elms; // now calculate the offset for the first narrow character table collate_out_.n_char_tab_off = collate_out_.weight_tab_off + collate_out_.num_elms * collate_out_.elm_size; // now calculate the offset fo the fist wide character table // but first we need the size of the narrow tables char_offs_iter char_offs_it; unsigned int char_offs_size = 0; for (char_offs_it = char_offs_.begin(); char_offs_it != char_offs_.end(); ++char_offs_it) { char_offs_size += (UCHAR_MAX + 1 - char_offs_it->second.first_offset) * sizeof (unsigned int); } collate_out_.w_char_tab_off = collate_out_.n_char_tab_off + char_offs_size; // now calculate the offset for the narrow character offset table // but first we need the size of the wide tables w_to_n_coll_iter w_to_n_coll_it; unsigned int w_to_n_size = 0; for (w_to_n_coll_it = w_to_n_coll_.begin(); w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) { w_to_n_size += (UCHAR_MAX + 1 - w_to_n_coll_it->second.first_offset) * sizeof (unsigned int); } collate_out_.n_char_off_tab_off = collate_out_.w_char_tab_off + w_to_n_size; // now calculate the offset for the wide character offset table collate_out_.w_char_off_tab_off = collate_out_.n_char_off_tab_off + char_offs_.size() * sizeof (unsigned int); // calculate the offset for the narrow collating element offset table collate_out_.n_ce_off_tab_off = collate_out_.w_char_off_tab_off + w_to_n_coll_.size() * sizeof (unsigned int); // calculate the offset for the wide collating element offset table collate_out_.w_ce_off_tab_off = collate_out_.n_ce_off_tab_off + n_ce_offs_.size() * sizeof (unsigned int); // now calculate the offset of the first character information collate_out_.n_char_first_char_off = collate_out_.w_ce_off_tab_off + w_ce_offs_.size() * sizeof (unsigned int); // now calculate the offset of the wide table first char info collate_out_.w_char_first_char_off = collate_out_.n_char_first_char_off + char_offs_.size() * sizeof (unsigned char); // now calculate the offset of the narrow ce first character info collate_out_.n_ce_first_char_off = collate_out_.w_char_first_char_off + w_to_n_coll_.size() * sizeof (unsigned char); // now calculate the offset of the wide ce first character info collate_out_.w_ce_first_char_off = collate_out_.n_ce_first_char_off + n_ce_offs_.size() * sizeof (unsigned char); // now calculate the offset of the narrow ce last character info collate_out_.n_ce_last_char_off = collate_out_.w_ce_first_char_off + w_ce_offs_.size() * sizeof (unsigned char); // now calculate the offset of the wide ce last character info collate_out_.w_ce_last_char_off = collate_out_.n_ce_last_char_off + n_ce_offs_.size() * sizeof (unsigned char); // now calculate the offset of the codeset name collate_out_.codeset_off = collate_out_.w_ce_last_char_off + w_ce_offs_.size() * sizeof (unsigned char); // finally calculate the offset of the charmap name collate_out_.charmap_off = collate_out_.codeset_off + charmap_.get_code_set_name().size() + 1; // print out the collate struct out.write ((char*)&collate_out_, sizeof(collate_out_)); for (n_ce_offs_it = n_ce_offs_.begin(); n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) { for (i = (unsigned int)n_ce_offs_it->second.first_offset; i <= (unsigned int)n_ce_offs_it->second.last_offset; ++i) out.write ((char*)&n_ce_offs_it->second.off[i], sizeof (n_ce_offs_it->second.off[i])); } for (w_ce_offs_it = w_ce_offs_.begin(); w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) { for (i = (unsigned int)w_ce_offs_it->second.first_offset; i <= (unsigned int)w_ce_offs_it->second.last_offset; ++i) out.write ((char*)&w_ce_offs_it->second.off[i], sizeof (w_ce_offs_it->second.off[i])); } // now print out the weight array unsigned int maxw = UINT_MAX; Weights_t* weights; bool undefined_written = false; const std::map& w_cmap = charmap_.get_w_cmap (); std::size_t off_idx = 0; off_mapr_iter current_off = off_mapr_.begin(); for (; current_off != off_mapr_.end(); ++off_idx) { // check the current offset if (current_off->first != off_idx && off_idx != undef_char_info_.offset) issue_diag (E_RANGE, true, 0, "current_off : %d, index : %d, giving up\n", current_off->first, off_idx); if (collate_out_.undefined_optimization && off_idx == undef_char_info_.offset) { undefined_written = true; weights = undef_char_info_.weights; out.write ((char*)&undef_char_info_.order, sizeof (undef_char_info_.order)); for (int j = 0; j < collate_out_.num_weights; ++j) { for (int k = 0; k < weights[j].size; ++k) { out.write ((char*)&weights[j].weight[k], sizeof (weights[j].weight[k])); } for (int c = weights[j].size; c < collate_out_.longest_weight; ++c) out.write ((char*)&maxw, sizeof (maxw)); } continue; } w_cmap_iter w_cmap_pos = w_cmap.find (current_off->second); if (w_cmap_pos != w_cmap.end()) { coll_map_pos = coll_map_.find (w_cmap_pos->second); out.write ((char*)&coll_map_pos->second.order, sizeof (coll_map_pos->second.order)); weights = coll_map_pos->second.weights; for (int j = 0; j < collate_out_.num_weights; ++j) { for (int k = 0; k < weights[j].size; ++k) { out.write ((char*)&weights[j].weight[k], sizeof (weights[j].weight[k])); } for (int c = weights[j].size; c < collate_out_.longest_weight; ++c) out.write ((char*)&maxw, sizeof (maxw)); } ++current_off; } else { ce_map_iter ce_map_it = ce_map_.find (current_off->second); if (ce_map_it != ce_map_.end() && ce_map_it->second.coll_val != UINT_MAX) { out.write ((char*)&ce_map_it->second.order, sizeof (ce_map_it->second.order)); weights = ce_map_it->second.weights; for (int j = 0; j < collate_out_.num_weights; ++j) { for (int k = 0; k < weights[j].size; ++k) { out.write ((char*)&weights[j].weight[k], sizeof (weights[j].weight[k])); } for (int c = weights[j].size; c < collate_out_.longest_weight; ++c) out.write ((char*)&maxw, sizeof (maxw)); } } ++current_off; } } // maske sure that we got to the undefined value if (!undefined_written && collate_out_.undefined_optimization) { if (off_idx != undef_char_info_.offset) issue_diag (E_RANGE, true, 0, "current_off : %d, index : %d, giving up\n", undef_char_info_.offset, off_idx); weights = undef_char_info_.weights; out.write ((char*)&undef_char_info_.order, sizeof (undef_char_info_.order)); for (int j = 0; j < collate_out_.num_weights; ++j) { for (int k = 0; k < weights[j].size; ++k) { out.write ((char*)&weights[j].weight[k], sizeof (weights[j].weight[k])); } for (int c = weights[j].size; c < collate_out_.longest_weight; ++c) out.write ((char*)&maxw, sizeof (maxw)); } } // now print out the narrow character tables for (char_offs_it = char_offs_.begin(); char_offs_it != char_offs_.end(); ++char_offs_it) { for (unsigned int c = char_offs_it->second.first_offset; c <= UCHAR_MAX; ++c) { out.write ((const char*)&char_offs_it->second.off[c], sizeof (char_offs_it->second.off[c])); } } // now print out the wide character tables for (w_to_n_coll_it = w_to_n_coll_.begin(); w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) { for (unsigned int c = w_to_n_coll_it->second.first_offset; c <= UCHAR_MAX; ++c) { out.write ((const char*)&w_to_n_coll_it->second.off[c], sizeof (w_to_n_coll_it->second.off[c])); } } // now print the narrow character table offsets unsigned int current_offset = 0; for (char_offs_it = char_offs_.begin(); char_offs_it != char_offs_.end(); ++char_offs_it) { out.write ((const char*)¤t_offset, sizeof (current_offset)); current_offset += (UCHAR_MAX + 1 - char_offs_it->second.first_offset); } // now print the wide character table offsets current_offset = 0; for (w_to_n_coll_it = w_to_n_coll_.begin(); w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) { out.write ((const char*)¤t_offset, sizeof (current_offset)); current_offset += (UCHAR_MAX + 1 - w_to_n_coll_it->second.first_offset); } // now print the narrow ce table offsets current_offset = 0; for (n_ce_offs_it = n_ce_offs_.begin(); n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) { out.write ((const char*)¤t_offset, sizeof (current_offset)); current_offset += (n_ce_offs_it->second.last_offset - n_ce_offs_it->second.first_offset + 1); } // now print the wide ce table offsets current_offset = 0; for (w_ce_offs_it = w_ce_offs_.begin(); w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) { out.write ((const char*)¤t_offset, sizeof (current_offset)); current_offset += (w_ce_offs_it->second.last_offset - w_ce_offs_it->second.first_offset + 1); } // now print out the narrow character tables starting character for (char_offs_it = char_offs_.begin(); char_offs_it != char_offs_.end(); ++char_offs_it) { const char off = char ((char_offs_it->second).first_offset); out << off; } // now print out the wide character tables starting character for (w_to_n_coll_it = w_to_n_coll_.begin(); w_to_n_coll_it != w_to_n_coll_.end(); ++w_to_n_coll_it) { const char off = char ((w_to_n_coll_it->second).first_offset); out << off; } // now print out the narrow ce tables starting character for (n_ce_offs_it = n_ce_offs_.begin(); n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) { const char off = char ((n_ce_offs_it->second).first_offset); out << off; } // now print out the wide ce tables starting character for (w_ce_offs_it = w_ce_offs_.begin(); w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) { const char off = char ((w_ce_offs_it->second).first_offset); out << off; } // now print out the narrow ce tables ending character for (n_ce_offs_it = n_ce_offs_.begin(); n_ce_offs_it != n_ce_offs_.end(); ++n_ce_offs_it) { const char off = char ((n_ce_offs_it->second).last_offset); out << off; } // now print out the wide ce tables ending character for (w_ce_offs_it = w_ce_offs_.begin(); w_ce_offs_it != w_ce_offs_.end(); ++w_ce_offs_it) { const char off = char ((w_ce_offs_it->second).last_offset); out << off; } // finally write the codeset and charmap names out << charmap_.get_code_set_name() << std::ends << charmap_.get_charmap_name() << std::ends; } void Def::init_coll_map() { rw_cmap_iter rw_cmap_pos; collate_info_t tmp = {UINT_MAX, UINT_MAX, 0, 0}; for (rw_cmap_pos = charmap_.get_rw_cmap().begin(); rw_cmap_pos != charmap_.get_rw_cmap().end(); ++rw_cmap_pos) { coll_map_.insert (std::make_pair (rw_cmap_pos->first, tmp)); } } // In the preprocess_collate member function the collate section is // preprocessed and the result of the preprocessing is stored in-memory // as a number of lists; the content of these lists is then preprocessed // acoording to the reorder statements; the result is then passed // on to process_collate. void Def::preprocess_collate () { int nesting_level = 0; do { // fetch next token next = scanner_.next_token(); switch (next.token) { case Scanner::tok_end: next = scanner_.next_token(); if (next.token == Scanner::tok_collate) { // end of collation block if (nesting_level == 0) return; nesting_level--; scanner_.close (); } else issue_diag (E_SYNTAX, true, &next, "wrong section name in END directive\n"); break; case Scanner::tok_copy: { next = scanner_.next_token(); if (next.token != Scanner::tok_string) issue_diag (E_SYNTAX, true, &next, "expected string following \"copy\" directive\n"); // bump up the nesting level ++nesting_level; issue_diag (I_STAGE, false, 0, "processing copy directive\n"); // open the file scanner_.open (get_pathname (strip_quotes (next.name), next.file)); // get comment char and escape char; // these informations are stored by the scanner while ((next = scanner_.next_token ()).token != Scanner::tok_collate ){ // the LC_IDENTIFICATION section may also have a // LC_COLLATE token that will mess up the parsing if (next.token == Scanner::tok_ident) { while ((next = scanner_.next_token()).token != Scanner::tok_end ); next = scanner_.next_token(); } } break; } // a collating element definition defines a symbolic name that // represents symbolically the congregation of two other symbolic // names which have to be present in the character map; // the form of the phrase is: // collating-element sym from ("string" | (sym sym+)) case Scanner::tok_coll_elem: { next = scanner_.next_token(); // we expect a symbolic name if (next.token != Scanner::tok_sym_name) issue_diag (E_SYNTAX, true, &next, "symbolic name expected following " "collating-element\n"); // symbolic name token_t sym (next); next = scanner_.next_token (); if (next.token != Scanner::tok_from) issue_diag (E_SYNTAX, true, &next, "illegal collating-element expression\n"); token_list_t ce_def_list; next = scanner_.next_token (); if (next.token == Scanner::tok_string) { ce_def_list.push_back (next); ce_list_.push_back (std::make_pair(sym,ce_def_list)); } else if (next.token == Scanner::tok_sym_name) { do { ce_def_list.push_back (next); next = scanner_.next_token(); } while (next.token != Scanner::tok_nl); ce_list_.push_back (std::make_pair(sym,ce_def_list)); } else issue_diag (E_SYNTAX, true, &next, "illegal collating-element expression\n"); break; } case Scanner::tok_coll_sym: next = scanner_.next_token(); if (next.token != Scanner::tok_sym_name) issue_diag (E_SYNTAX, true, &next, "symbolic name expected following " "collating-symbol\n"); cs_list_.push_back (next); break; case Scanner::tok_script: next = scanner_.next_token (); if (next.token != Scanner::tok_sym_name) issue_diag (W_SYNTAX, false, &next, "expecting script name, got %s\n", next.name.c_str ()); script_list_.push_back(next); break; case Scanner::tok_sym_name: sym_list_.push_back (next); break; case Scanner::tok_order_start: preprocess_order(); break; case Scanner::tok_reorder: preprocess_reorder (); break; case Scanner::tok_reorder_section: preprocess_reorder_section (); break; default: break; } } while (Scanner::tok_end_tokens != next.token); } // The task of preprocess_order is to parse and model the content of the // order sections in the input files void Def::preprocess_order () { // one order-start keyword has been encountered; // push a collate_section down the list and use it while (next.token != Scanner::tok_order_end) { section_list_.push_back (collate_section_t()); collate_section_t& section = section_list_.back (); next = scanner_.next_token (); // store the orders while (next.token != Scanner::tok_nl) { if (next.token == Scanner::tok_sym_name) { // symbolic name, has to be a section name section.name = next.name; } else { // might be a combination of order and position // do the same until a better way is found section.order.push_back (next); } // fetch next token next = scanner_.next_token (); } // unnamed sections if (section.name == "") section.name = "unnamed"; issue_diag (I_STAGE, false, 0, "processing %s order\n", section.name.c_str ()); // store the collation statements while (!(next.token == Scanner::tok_order_end || next.token == Scanner::tok_order_start )) { next = scanner_.next_token (); switch (next.token) { case Scanner::tok_end: issue_diag (E_SYNTAX, true, &next, "unexpected END directive while " "parsing collate section\n"); case Scanner::tok_nl: continue; case Scanner::tok_comment: scanner_.ignore_line (); break; case Scanner::tok_undefined: case Scanner::tok_sym_name: { token_t sym (next); section.entries.push_back ( std::make_pair(sym,token_list_t())); collate_entry_t& entry = section.entries.back (); next = scanner_.next_token (); while (next.token != Scanner::tok_nl) { entry.second.push_back (next); next = scanner_.next_token (); } break; } default: break; } } } } // Handles the reorder statements in the form: // reorder-after // ... // OR // reorder-after // .. ..... // followed by reorder-end statement or another reorder-after void Def::preprocess_reorder () { while (true) { // process one or more statements grouped // under a reorder section // retrieve the symbol next = scanner_.next_token (); if (next.token != Scanner::tok_sym_name) issue_diag (E_SYNTAX, true, &next, "expecting symbolic name, got %s\n", next.name.c_str ()); // store the symbolic name token_t sym (next); collate_entry_list_t entries_list; while (true) { // process the statements next = scanner_.next_token (); if (next.token == Scanner::tok_nl ) continue; else if ( next.token == Scanner::tok_sym_name || next.token == Scanner::tok_hex_ellipsis) { // the line will contain one single symbolic name // or a complete collation statement collate_entry_t entry; entry.first = next; next = scanner_.next_token (); while (next.token != Scanner::tok_nl) { entry.second.push_back (next); next = scanner_.next_token (); } // add the entry to the list entries_list.push_back (entry); } else if (next.token == Scanner::tok_reorder) { // call insert_entry if (!entries_list.empty ()) insert_entries (sym, entries_list); entries_list.clear (); break; } else if (next.token == Scanner::tok_reorder_end ) { // call insert_entry // call insert_entry if (!entries_list.empty ()) insert_entries (sym, entries_list); return; } else { issue_diag (E_SYNTAX, true, &next, "unexpected token : %s in a reorder block\n", next.name.c_str ()); } } } } void Def::preprocess_reorder_section () { while (next.token != Scanner::tok_reorder_section_end) { // process one or more statements grouped // under a reorder section // retrieve the symbol next = scanner_.next_token (); token_t sym (next); // process the statements while (!(next.token == Scanner::tok_reorder_section || next.token == Scanner::tok_reorder_section_end)) { next = scanner_.next_token (); switch (next.token) { case Scanner::tok_nl: continue; case Scanner::tok_sym_name: { // the line will contain one single symbolic name // which is the name of a section next = scanner_.next_token (); if (next.token != Scanner::tok_sym_name) { issue_diag (E_SYNTAX, true, &next, "expecting section name, got %s\n", next.name.c_str ()); } std::list::iterator beg = section_list_.begin (); std::list::iterator end = section_list_.end (); std::list::iterator ref_it, mov_it; for (ref_it = beg; ref_it->name != sym.name && ref_it != end; ++ref_it); for (mov_it = beg; mov_it->name != next.name && mov_it != end; ++mov_it); if (ref_it == end || mov_it == end) issue_diag (E_SYNTAX, true, &next, "missing sections %s %s requested\n", sym.name.c_str (), next.name.c_str ()); // replace it section_list_.splice (++ref_it, section_list_, mov_it, mov_it); // replace the insertion point sym = next; break; } default: issue_diag (E_SYNTAX, true, &next, "unexpected token : %s in a " "reorder section block\n", next.name.c_str ()); break; } } } } void Def::list_collate () { // print the script list token_list_t::iterator script_it = script_list_.begin (); while (script_it != script_list_.end ()) { std::cout << "script " << (script_it++)->name << '\n'; } std::cout << '\n'; // print the collating symbols' list token_list_t::iterator cs_it = cs_list_.begin (); while (cs_it != cs_list_.end ()) { std::cout << "collating-symbol " << (cs_it++)->name << '\n'; } std::cout << '\n'; // print the collating elements list std::list::iterator ce_it = ce_list_.begin (); while (ce_it != ce_list_.end ()) { std::cout << "collating-element " << ce_it->first.name << " from "; token_list_t::iterator ce_def_it = ce_it->second.begin (); for (;ce_def_it != ce_it->second.end (); ce_def_it++) { std::cout << ce_def_it->name << ";"; } std::cout << '\n'; } std::cout << '\n'; // print out general/global symbols' list token_list_t::iterator sym_it = sym_list_.begin (); while (sym_it != sym_list_.end ()) { std::cout << (sym_it++)->name << '\n'; } std::cout << '\n'; // print out sections std::list::iterator sc_it = section_list_.begin (); while (sc_it != section_list_.end ()) { // print prolog std::cout << "order_start "; if (sc_it->name != "unnamed") std::cout << sc_it->name << ";"; token_list_t::iterator it = sc_it->order.begin (); while (it != sc_it->order.end ()) std::cout << (it++)->name << ";"; std::cout << '\n'; std::list::iterator e_it = sc_it->entries.begin (); while (e_it != sc_it->entries.end ()) { std::cout << e_it->first.name << " "; token_list_t::iterator w_it = e_it->second.begin (); while (w_it != e_it->second.end ()) std::cout << (w_it++)->name << ";"; std::cout << '\n'; ++e_it; } ++sc_it; } std::cout << '\n' << "order_end\n"; } // FIXME - modify the algorithm to get log(N) complexity // Inserts an entry inside the section information; // searching for the appropriate entry in the list is // done in linear time bool Def::insert_entries (token_t& s, collate_entry_list_t& e) { // first remove these entries if found collate_entry_list_t::iterator r_it = e.begin (); for (; r_it != e.end (); ++r_it) remove_entry (*r_it); // first search through the symbols list; if found check // the collate_entry object; it should not have weights token_list_t::iterator it = sym_list_.begin (); for (; it != sym_list_.end (); ++it) { if (it->name != s.name) continue; // FIXME - all statements have to have no weights if they are // to be inserted after collation symbols outside section boundaries; // found it; check one collate_entry_t object if (!e.begin ()->second.empty ()) { issue_diag (E_REORD, true, &s, "requested reorder-after: reference " "symbol %s was found outside section boundaries " "and the objects to be reordered are collation " "definitions\n", s.name.c_str ()); } // insert the symbolic name there collate_entry_list_t::iterator e_it = e.begin (); for (; e_it != e.end (); ++e_it) it = sym_list_.insert (++it, e_it->first); return true; } // if not successful then search in each section std::list::iterator sect_it = section_list_.begin (); while (sect_it != section_list_.end ()) { collate_entry_list_t::iterator e_it = sect_it->entries.begin (); for (; e_it != sect_it->entries.end (); ++e_it) { if (e_it->first.name != s.name) continue; // found it; insert entry sect_it->entries.insert (++e_it, e.begin (), e.end ()); return true; } ++sect_it; } issue_diag (W_REORD, false, &s, "requested reorder-after: reference " "symbol %s was not found \n", s.name.c_str ()); return false; } // FIXME - modify the algorithm to get log(N) complexity void Def::remove_entry (collate_entry_t& e) { // search in the sym_list_ and in the sections if (e.second.empty ()) { token_list_t::iterator it = sym_list_.begin (); for (; it != sym_list_.end (); ++it) { if (it->name != e.first.name) continue; sym_list_.erase (it); return; } } // search in the sections std::list::iterator sect_it = section_list_.begin (); while (sect_it != section_list_.end ()) { std::list::iterator e_it = sect_it->entries.begin (); for (; e_it != sect_it->entries.end (); ++e_it) { if (e_it->first.name != e.first.name) continue; sect_it->entries.erase (e_it); return; } ++sect_it; } return ; } bool Def::get_weight ( token_t& w, Weights_t* weights, int weight_num) { const std::map& w_cmap = charmap_.get_w_cmap (); bool ret = false; weights[weight_num].size = 1; ce_map_iter ce_map_it; if(w.token == Scanner::tok_sym_name) { w_cmap_iter w_cmap_pos = w_cmap.find (w.name); if (w_cmap_pos != w_cmap.end()){ coll_map_iter coll_map_pos = coll_map_.find(w_cmap_pos->second); if (coll_map_pos->second.coll_val == UINT_MAX) issue_diag (E_SYMUSED, true, &w, "symbolic name %s " "used as weight before being assigned a " "collation value\n", w.name.c_str()); // store weight weights[weight_num].weight[0] = coll_map_pos->second.coll_val; } else if ((ce_map_it = ce_map_.find (w.name)) != ce_map_.end()) weights[weight_num].weight[0] = ce_map_it->second.coll_val; else { cs_map_iter cs_it = cs_map_.find (w.name); if (cs_it == cs_map_.end ()) issue_diag (E_SYNTAX, true, &w, "symbolic name %s not found\n", w.name.c_str()); weights[weight_num].weight[0] = cs_it->second; } } else if (w.token == Scanner::tok_char_value) { // the weight is given in numerical form const char* next_val = std::strchr (w.name.c_str (), scanner_.escape_char ()); assert (0 != next_val); const char* next_wt = std::strchr (w.name.c_str (), ';'); while (weight_num < collate_out_.num_weights) { std::size_t c; for (c = 0; *next_val && (!next_wt || next_val < next_wt); ++c) { const char* end = 0; weights [weight_num].weight [c] = scanner_.convert_escape (next_val, &end, true); assert (0 != end); next_val = end; } weights [weight_num++].size = c; if (next_wt) next_wt = std::strchr (next_val, ';'); } } else if(w.token == Scanner::tok_ignore) { // use the special weight 0 for IGNORE weights weights[weight_num].size = 1; weights[weight_num].weight[0] = 0; } else if (w.token == Scanner::tok_string) { // the weights are given either in symbolic name form (e.g., // "" or in the form of a quoted multibyte // character string (e.g., "\001\d010\x16\") const std::string tmp (w.name.substr (1, w.name.size() - 2)); // keeps track of the length of the weight unsigned char k = 0; // iterate thru the string content and retrieve the symbols std::string::const_iterator it = tmp.begin (); const char escape = scanner_.escape_char (); while (it != tmp.end ()) { // weight in string form std::string wsym; // next comes a symbolic name if (*it == '<') { while (*it != '>') { if (*it == escape) ++it; wsym += *it++; } wsym += *it++; // wsym has the symbolic name, lookup for it in // the character map, collating-symbol map, // and collating-element map w_cmap_iter w_cm_pos = w_cmap.find(wsym); cs_map_iter cs_it = cs_map_.find (wsym); ce_map_iter ce_it = ce_map_.find (wsym); if (w_cm_pos != w_cmap.end()) { // is in the character map, check its associated // collation value coll_map_iter coll_it = coll_map_.find(w_cm_pos->second); if (coll_it == coll_map_.end()) issue_diag (E_SYMUSED, true, &w, "weight %s not defined\n", wsym.c_str()); if (coll_it->second.coll_val == UINT_MAX) issue_diag (E_SYMUSED, true, &w, "symbolic name %s " "used as weight before being assigned a " "collation value\n", wsym.c_str()); weights[weight_num].weight[k++] = coll_it->second.coll_val; } else if (ce_it != ce_map_.end()) { // it is in the collating-element map weights[weight_num].weight[k++] = ce_it->second.coll_val; } else if (cs_it != cs_map_.end()) { // it is in the collating-symbol map weights[weight_num].weight[k++] = cs_it->second; } else { // it is not in any of the maps, that's an error issue_diag (E_SYNTAX, true, &w, "symbolic name %s not found\n", wsym.c_str()); } } else if (*it == escape) { // weight is given in a quoted escape form const char* const beg = tmp.c_str () + (it - tmp.begin ()); const char* end = 0; weights [weight_num].weight [k++] = scanner_.convert_escape (beg, &end, true); assert (0 != end); it += end - beg; break; } else issue_diag (E_SYNTAX, true, &w, "illegal string content as a weight"); } weights[weight_num].size = k; if (k > collate_out_.longest_weight) collate_out_.longest_weight = k; } else if (w.token == Scanner::tok_abs_ellipsis) // return true if ellipsis are embedded in the weight ret = true; else warnings_occurred_ = issue_diag (W_SYNTAX, false, &w, "illegal token %s found in collation definition\n", w.name.c_str()) || warnings_occurred_; return ret; } // if undefined optimization is on then only those characters that have // been defined should go into the valid set, otherwise all characters go // into the set. void Def::gen_valid_coll_wchar_set () { if (!valid_coll_wchar_set_.empty()) return; create_wchar_utf8_table(); for (coll_map_iter coll_it = coll_map_.begin(); coll_it != coll_map_.end(); ++coll_it) { if (coll_it->second.offset != UINT_MAX || !collate_out_.undefined_optimization){ std::string valid = utf8_encode (coll_it->first); valid = valid.substr (0, valid.size() - 1); while (valid.size() > 0){ valid_coll_wchar_set_.insert (valid); valid = valid.substr(0, valid.size() - 1); } } } }